RLE now works on nibbles

This commit is contained in:
2026-04-12 21:26:36 -07:00
parent 3fb10b78e3
commit bba9db290e
2 changed files with 398 additions and 360 deletions

View File

@@ -1,26 +1,30 @@
//! # Benchmark: BlockQ4K vs BlockQ4KRle //! # Benchmark: BlockQ4K vs BlockQ4KRle
//! //!
//! Measures three operations across two weight distributions: //! Measures three operations across three weight distributions, encoded with
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
//! runs to use RLE mode).
//! //!
//! | Group | What is timed | //! | Group | What is timed |
//! |--------------|--------------------------------------------------| //! |--------------|-----------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks | //! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths | //! | `dequantize` | Single-block dequantisation across all four paths |
//! | `matmul` | Full A×B multiply at three matrix sizes | //! | `matmul` | Full A×B multiply at three matrix sizes |
//! //!
//! ## Weight distributions //! ## Weight distributions
//! //!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG). //! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
//! Consecutive bytes almost never repeat, so each block produces ~128 //! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes, //! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
//! which exceeds the 128-byte raw payload, so `encode` always keeps these //! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//! //!
//! **rle_optimal** — every byte in a block's qs field is the same value. //! **rle_optimal** — every qs byte is the same value. All 256 nibbles are
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 — //! identical, giving 100 % coverage and just 16 nibble entries. This is the
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and //! theoretical RLE maximum and represents highly structured weight blocks.
//! is representative of highly sparse or dead-neuron weight matrices. //!
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{ use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
s s
} }
/// Return `count` blocks whose qs bytes are pseudo-random. /// Return `count` blocks whose qs bytes are pseudo-random (LCG).
/// ///
/// With uniformly distributed bytes, consecutive bytes match with probability /// Adjacent nibbles match with probability 1/16, giving each block roughly
/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value, /// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to
/// count) pairs would need ~256 bytes — more than the 128-byte raw payload — /// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
/// so `encode` will always select **raw mode** (IS_RLE = 0).
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> { fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234); let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
let scales = make_scales(7, 2); let scales = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
/// Return `count` blocks where every qs byte is the same value. /// Return `count` blocks where every qs byte is the same value.
/// ///
/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes /// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1). /// with exactly 16 entries (256 nibbles / 16 per entry).
/// Each block uses a fresh pseudo-random byte so no two blocks are identical, /// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
/// avoiding degenerate cache-warm effects across the batch.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> { fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0); let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
let scales = make_scales(7, 2); let scales = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
vec![f32_to_fp16(1.0); k * n] vec![f32_to_fp16(1.0); k * n]
} }
/// Construct a single block with exactly 0 % nibble coverage: the nibble
/// stream, read in output-sequential order, never has two equal neighbours.
///
/// Byte `i` packs lo nibble `i % 16` and hi nibble `(i + 8) % 16`. Within a
/// 32-byte group each nibble stream walks all 16 values twice with no
/// immediate repeat, and the offset of 8 between the streams keeps the
/// lo→hi and hi→lo hand-offs at group boundaries distinct as well.
///
/// Because coverage is zero, `encode` leaves this block in **raw mode**
/// whenever `min_coverage > 0.0`.
fn zero_coverage_block() -> BlockQ4K {
    let mut qs = [0u8; QK_K / 2];
    for (i, byte) in qs.iter_mut().enumerate() {
        let lo_nibble = (i & 0x0F) as u8;
        let hi_nibble = ((i + 8) & 0x0F) as u8;
        *byte = (hi_nibble << 4) | lo_nibble;
    }
    BlockQ4K {
        d: f32_to_fp16(0.01),
        dmin: f32_to_fp16(0.001),
        scales: make_scales(7, 2),
        qs,
    }
}
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Group 1 — encode // Group 1 — encode
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
/// Number of blocks encoded per iteration in `bench_encode`. /// Number of blocks encoded per iteration in `bench_encode`.
const ENCODE_BATCH: usize = 512; const ENCODE_BATCH: usize = 512;
/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output. /// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
/// ///
/// Both distributions perform the same O(128) run-length scan. The only /// Both distributions perform the same O(256) nibble scan. The output differs:
/// divergence is at the output stage: /// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes. /// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE.
fn bench_encode(c: &mut Criterion) { fn bench_encode(c: &mut Criterion) {
let uniform = uniform_blocks(ENCODE_BATCH); let uniform = uniform_blocks(ENCODE_BATCH);
let rle_opt = rle_optimal_blocks(ENCODE_BATCH); let rle_opt = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("uniform", |b| { group.bench_function("uniform", |b| {
b.iter(|| { b.iter(|| {
for blk in &uniform { for blk in &uniform {
black_box(encode(black_box(blk), 0.0)); black_box(encode(black_box(blk), 0.01));
} }
}); });
}); });
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("rle_optimal", |b| { group.bench_function("rle_optimal", |b| {
b.iter(|| { b.iter(|| {
for blk in &rle_opt { for blk in &rle_opt {
black_box(encode(black_box(blk), 0.0)); black_box(encode(black_box(blk), 0.01));
} }
}); });
}); });
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
// Group 2 — dequantize (single block) // Group 2 — dequantize (single block)
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Compares the three single-block dequantisation code paths. /// Compares four single-block dequantisation code paths.
/// ///
/// | Variant | Block type | Encoding | Extra work vs baseline | /// | Variant | Block type | Encoding | IS_RLE | Entries |
/// |------------------|-------------|----------|-------------------------------| /// |--------------------|-------------|-----------|--------|---------|
/// | `q4k_baseline` | BlockQ4K | — | none | /// | `q4k_baseline` | BlockQ4K | — | — | — |
/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) | /// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf | /// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
///
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
/// stays in raw mode at any positive threshold.
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
/// representative of actual trained Q4_K weight blocks.
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
/// ///
/// Throughput is the number of dequantised weights produced per second. /// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) { fn bench_dequantize(c: &mut Criterion) {
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap(); let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap(); let q4k_zero_cov = zero_coverage_block();
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let rle_raw = encode(&q4k_uniform, 0.0); // IS_RLE = 0 let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
let rle_rle = encode(&q4k_rle_opt, 0.0); // IS_RLE = 1 let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
// Confirm the fixtures ended up in the right encoding modes. assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode"); assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode"); assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
let mut group = c.benchmark_group("dequantize"); let mut group = c.benchmark_group("dequantize");
// Throughput = QK_K (256) weights dequantised per second. // Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
group.bench_function("q4k_baseline", |b| { group.bench_function("q4k_baseline", |b| {
b.iter(|| { b.iter(|| {
let mut out = [0.0f32; QK_K]; let mut out = [0.0f32; QK_K];
dequantize_block_q4k(black_box(&q4k_uniform), &mut out); dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
black_box(out) black_box(out)
}); });
}); });
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
}); });
}); });
group.bench_function("rle_rle_mode", |b| { group.bench_function("rle_sparse", |b| {
b.iter(|| { b.iter(|| {
let mut out = [0.0f32; QK_K]; let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_rle), &mut out); dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
black_box(out)
});
});
group.bench_function("rle_dense", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
black_box(out) black_box(out)
}); });
}); });
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[
/// Full matrix-multiply benchmark across weight distributions and matrix sizes. /// Full matrix-multiply benchmark across weight distributions and matrix sizes.
/// ///
/// Four variants per size: /// Four variants per size (`min_coverage = 0.01`):
/// ///
/// | Label | A type | RLE mode? | /// | Label | A type | IS_RLE | Entries/block |
/// |----------------------|-------------|-----------| /// |----------------------|-------------|--------|---------------|
/// | `baseline/uniform` | BlockQ4K | — | /// | `baseline/uniform` | BlockQ4K | — | — |
/// | `rle/uniform` | BlockQ4KRle | raw | /// | `rle/uniform` | BlockQ4KRle | 1 | ~235 |
/// | `baseline/rle_opt` | BlockQ4K | — | /// | `baseline/rle_opt` | BlockQ4K | — | — |
/// | `rle/rle_opt` | BlockQ4KRle | rle | /// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 |
/// ///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per /// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, allowing fair cross-size comparison. /// second, allowing fair cross-size comparison.
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {
// Build all four A variants and the shared B matrix for this config. // Build all four A variants and the shared B matrix for this config.
let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr); let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr);
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.0)).collect(); let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr); let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr);
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.0)).collect(); let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
let b = fp16_ones(k, n); let b = fp16_ones(k, n);

View File

@@ -1,26 +1,29 @@
//! RLE-optional Q4_K super-block encoding. //! RLE-optional Q4_K super-block encoding.
//! //!
//! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that //! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that
//! optionally compresses the 128-byte weight payload using **byte-level //! optionally compresses the 128-byte weight payload using **nibble-level
//! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`] //! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`]
//! field indicates which mode is active: //! field indicates which mode is active:
//! //!
//! | `IS_RLE` bit | `qs` interpretation | //! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------| //! |--------------|------------------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] | //! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of `(value, count)` byte-pairs | //! | 1 | RLE stream of single-byte entries `(nibble_val << 4 | count_minus_1)` |
//! //!
//! ## RLE format (when `IS_RLE` = 1) //! ## RLE format (when `IS_RLE` = 1)
//! //!
//! - `n_pairs` gives the number of `(value, count)` pairs stored in `qs`. //! - `n_pairs` gives the number of nibble-level RLE entries stored in `qs`.
//! - For each pair `i`: //! - For each entry `i` (one byte each):
//!     - `qs[2*i]` — the byte value (two packed 4-bit weights, same packing //!     - bits 7–4: nibble value (0–15)
//!       as the raw format). //!     - bits 3–0: `count - 1` (0–15, meaning run length 1–16)
//! - `qs[2*i + 1]` — the run length in bytes (1..=255). //! - The run lengths must sum to exactly 256 (the number of nibbles in QK_K).
//! - The run lengths must sum to exactly 128 (the uncompressed `qs` size).
//! //!
//! The 256-byte `qs` field can hold up to 128 `(value, count)` pairs — enough //! Nibbles are read in output-sequential order: for each 32-byte group, first
//! to represent even fully-random blocks where every byte differs from its //! all 32 lo nibbles, then all 32 hi nibbles. The nibble at stream position
//! `p` maps directly to output element `p` in sub-block `p / 32`.
//!
//! The 256-byte `qs` field can hold up to 256 single-byte entries — enough to
//! represent even fully-random blocks where every nibble differs from its
//! neighbour. //! neighbour.
//! //!
//! ## Constructing blocks //! ## Constructing blocks
@@ -45,12 +48,12 @@ pub const IS_RLE: u8 = 0x01;
// Block definition // Block definition
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// A Q4_K super-block with optional byte-level RLE compression on the weights. /// A Q4_K super-block with optional nibble-level RLE compression on the weights.
/// ///
/// Unlike [`crate::BlockQ4K`], this format is **not** binary-compatible with /// Unlike [`crate::BlockQ4K`], this format is **not** binary-compatible with
/// the GGUF on-disk layout. It uses a 256-byte `qs` field (vs the 128-byte /// the GGUF on-disk layout. It uses a 256-byte `qs` field (vs the 128-byte
/// field in `BlockQ4K`) so the RLE stream can store up to 128 `(value, count)` /// field in `BlockQ4K`) so the RLE stream can store up to 256 single-byte
/// pairs — enough to represent even fully-random blocks where every byte /// entries — enough to represent even fully-random blocks where every nibble
/// differs from its neighbour. /// differs from its neighbour.
/// ///
/// Memory layout (`repr C`): /// Memory layout (`repr C`):
@@ -61,17 +64,18 @@ pub const IS_RLE: u8 = 0x01;
/// | 2 | `dmin` | 2 B | fp16 super-block min-scale | /// | 2 | `dmin` | 2 B | fp16 super-block min-scale |
/// | 4 | `scales` | 12 B | packed 6-bit sub-block params | /// | 4 | `scales` | 12 B | packed 6-bit sub-block params |
/// | 2 | `dmin` | 2 B | fp16 super-block min-scale | /// | 2 | `dmin` | 2 B | fp16 super-block min-scale |
/// | 4 | `scales` | 12 B | packed 6-bit sub-block params | /// | 4 | `scales` | 12 B | packed 6-bit sub-block params |
/// | 16 | `flags` | 1 B | bit 0 = `IS_RLE`; bits 1–7 unused | /// | 16 | `flags` | 1 B | bit 0 = `IS_RLE`; bits 1–7 unused |
/// | 17 | `n_pairs` | 1 B | RLE pair count (0 when raw) | /// | 17 | (pad) | 1 B | alignment padding for `n_pairs` |
/// | 18 | `qs` | 256 B | raw nibbles (first 128 B) or RLE | /// | 18 | `n_pairs` | 2 B | RLE entry count (0 when raw) |
/// | 20 | `qs` | 256 B | raw nibbles (first 128 B) or RLE |
/// ///
/// **sizeof = 274 bytes.** /// **sizeof = 276 bytes.**
/// ///
/// ## `qs` interpretation /// ## `qs` interpretation
/// ///
/// | `IS_RLE` | Meaning | /// | `IS_RLE` | Meaning |
/// |----------|--------------------------------------------------------------| /// |----------|---------------------------------------------------------------------|
/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) | /// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) |
/// | 1 | `qs[0..n_pairs*2]` holds `(value, count)` byte-pairs | /// | 1 | `qs[0..n_pairs]` holds nibble-level RLE entries (1 byte each) |
#[repr(C)] #[repr(C)]
#[derive(Clone, Copy, Debug)] #[derive(Clone, Copy, Debug)]
pub struct BlockQ4KRle { pub struct BlockQ4KRle {
@@ -80,11 +84,11 @@ pub struct BlockQ4KRle {
pub scales: [u8; K_SCALE_SIZE], pub scales: [u8; K_SCALE_SIZE],
/// Encoding flags. Only bit 0 (`IS_RLE`) is used; bits 1-7 are reserved. /// Encoding flags. Only bit 0 (`IS_RLE`) is used; bits 1-7 are reserved.
pub flags: u8, pub flags: u8,
/// When `IS_RLE` is set: number of `(value, count)` byte-pairs in `qs`. /// When `IS_RLE` is set: number of nibble-level RLE entries in `qs`.
/// Zero when in raw mode. /// Zero when in raw mode.
pub n_pairs: u8, pub n_pairs: u16,
/// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or RLE stream /// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or nibble-level
/// (IS_RLE = 1, first `n_pairs * 2` bytes). /// RLE stream (IS_RLE = 1, first `n_pairs` bytes; one byte per entry).
pub qs: [u8; QK_K], // 256 bytes pub qs: [u8; QK_K], // 256 bytes
} }
@@ -95,7 +99,7 @@ impl BlockQ4KRle {
self.flags & IS_RLE != 0 self.flags & IS_RLE != 0
} }
/// Number of `(value, count)` byte-pairs in `qs`. /// Number of nibble-level RLE entries in `qs`.
/// Only meaningful when `is_rle()` is true. /// Only meaningful when `is_rle()` is true.
#[inline] #[inline]
pub fn rle_len(&self) -> usize { pub fn rle_len(&self) -> usize {
@@ -103,74 +107,112 @@ impl BlockQ4KRle {
} }
} }
// ---------------------------------------------------------------------------
// Nibble extraction / packing helpers
// ---------------------------------------------------------------------------
/// Unpack a 128-byte `qs` payload into its 256 nibbles, arranged in
/// output-sequential order: for every 32-byte group, the 32 lo nibbles come
/// first, then the 32 hi nibbles. The nibble at stream position `p`
/// therefore corresponds to dequantised output element `p`.
fn extract_nibbles(raw: &[u8; QK_K / 2]) -> [u8; QK_K] {
    let mut nibbles = [0u8; QK_K];
    // Each 32-byte input group expands to a 64-nibble output group.
    for (g, bytes) in raw.chunks_exact(32).enumerate() {
        let out = &mut nibbles[g * 64..g * 64 + 64];
        for (l, &byte) in bytes.iter().enumerate() {
            out[l] = byte & 0x0F; // lo nibble
            out[l + 32] = byte >> 4; // hi nibble
        }
    }
    nibbles
}
/// Inverse of [`extract_nibbles`]: fold a 256-nibble output-sequential array
/// back into the packed 128-byte `qs` layout (lo nibble in bits 3–0, hi
/// nibble in bits 7–4 of each byte).
fn pack_nibbles(nibbles: &[u8; QK_K]) -> [u8; QK_K / 2] {
    let mut raw = [0u8; QK_K / 2];
    // Each 64-nibble group collapses back into a 32-byte output group.
    for (g, group) in nibbles.chunks_exact(64).enumerate() {
        let (lo, hi) = group.split_at(32);
        for (l, dst) in raw[g * 32..g * 32 + 32].iter_mut().enumerate() {
            *dst = lo[l] | (hi[l] << 4);
        }
    }
    raw
}
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Encoding // Encoding
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block. /// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block.
/// ///
/// The `qs` payload is scanned for runs of equal consecutive bytes. RLE mode /// The `qs` payload is scanned for runs of equal consecutive nibbles in
/// is chosen when **both** conditions hold: /// output-sequential order. RLE mode is chosen when **both** conditions hold:
/// ///
/// 1. **Coverage**: at least `min_coverage` fraction of the 128 `qs` bytes /// 1. **Coverage**: at least `min_coverage` fraction of the 256 nibbles
/// belong to runs of length ≥ 2. These are the bytes whose weights can be /// belong to runs of length ≥ 2. These are the nibbles whose weights can
/// batched in `accumulate_rle_block`, replacing `2 * run_len` multiplies /// be batched in `accumulate_rle_block`, replacing one multiply per nibble
/// with just 2 per group-segment. /// with one multiply per output column per segment.
/// ///
/// 2. **Capacity**: the pair count does not exceed 128 (the physical limit of /// 2. **Capacity**: the entry count does not exceed 256 (the physical limit of
/// the 256-byte `qs` field at 2 bytes per pair). /// the 256-byte `qs` field at 1 byte per entry).
/// ///
/// | `min_coverage` | Effect | /// | `min_coverage` | Effect |
/// |----------------|------------------------------------------------------| /// |----------------|-------------------------------------------------------|
/// | `0.0` | RLE whenever pairs fit (≤ 128), regardless of runs | /// | `0.0` | RLE whenever entries fit (≤ 256), regardless of runs |
/// | `0.5` | RLE only if ≥ 50 % of bytes are in repeated runs | /// | `0.5` | RLE only if ≥ 50 % of nibbles are in repeated runs |
/// | `1.0` | RLE only when every byte is part of a run | /// | `1.0` | RLE only when every nibble is part of a run |
pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle { pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
debug_assert!( debug_assert!(
(0.0..=1.0).contains(&min_coverage), (0.0..=1.0).contains(&min_coverage),
"min_coverage must be in [0.0, 1.0], got {min_coverage}" "min_coverage must be in [0.0, 1.0], got {min_coverage}"
); );
let raw = &block.qs; // [u8; 128] let nibbles = extract_nibbles(&block.qs);
// Scan for runs of equal consecutive bytes. // Scan for runs of equal consecutive nibbles.
// Track long_run_bytes: bytes in runs of length ≥ 2 (the bytes that let mut entries = Vec::<u8>::with_capacity(QK_K);
// benefit from RLE in the matmul). let mut long_run_nibbles = 0usize;
let mut pairs: Vec<(u8, u8)> = Vec::with_capacity(QK_K / 2); let mut i = 0usize;
let mut long_run_bytes = 0usize;
let mut i = 0usize; while i < QK_K {
while i < raw.len() { let val = nibbles[i];
let val = raw[i]; let mut run = 0usize;
let mut run = 1u8; while i + run < QK_K && nibbles[i + run] == val {
while i + (run as usize) < raw.len()
&& raw[i + (run as usize)] == val
&& run < u8::MAX
{
run += 1; run += 1;
} }
pairs.push((val, run));
if run >= 2 { if run >= 2 {
long_run_bytes += run as usize; long_run_nibbles += run;
} }
i += run as usize; // Split runs longer than 16 into max-16 chunks (4-bit count field).
let mut rem = run;
while rem > 0 {
let chunk = rem.min(16);
entries.push((val << 4) | ((chunk - 1) as u8));
rem -= chunk;
}
i += run;
} }
// Coverage: fraction of qs bytes that are in non-singleton runs. // Coverage: fraction of the 256 nibbles that are in non-singleton runs.
let coverage = long_run_bytes as f32 / raw.len() as f32; let coverage = long_run_nibbles as f32 / QK_K as f32;
if pairs.len() <= QK_K / 2 && coverage >= min_coverage { // Use RLE when entries fit in qs (≤ 256) and coverage meets the threshold.
let n = pairs.len(); if entries.len() <= QK_K && coverage >= min_coverage {
let n = entries.len();
let mut qs = [0u8; QK_K]; let mut qs = [0u8; QK_K];
for (k, &(val, count)) in pairs.iter().enumerate() { qs[..n].copy_from_slice(&entries);
qs[2 * k] = val;
qs[2 * k + 1] = count;
}
BlockQ4KRle { BlockQ4KRle {
d: block.d, d: block.d,
dmin: block.dmin, dmin: block.dmin,
scales: block.scales, scales: block.scales,
flags: IS_RLE, flags: IS_RLE,
n_pairs: n as u8, n_pairs: n as u16,
qs, qs,
} }
} else { } else {
@@ -196,29 +238,32 @@ pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
/// ///
/// # Panics (debug builds only) /// # Panics (debug builds only)
/// ///
/// Panics if the decoded RLE stream does not sum to exactly 128 bytes. /// Panics if the decoded RLE nibble stream does not sum to exactly 256 nibbles.
fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] { fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
if !block.is_rle() { if !block.is_rle() {
// First QK_K/2 bytes of qs hold the raw packed nibbles. // First QK_K/2 bytes of qs hold the raw packed nibbles.
block.qs[..QK_K / 2].try_into().unwrap() return block.qs[..QK_K / 2].try_into().unwrap();
} else {
let n = block.rle_len();
let mut raw = [0u8; QK_K / 2];
let mut pos = 0usize;
for i in 0..n {
let val = block.qs[2 * i];
let count = block.qs[2 * i + 1] as usize;
raw[pos..pos + count].fill(val);
pos += count;
}
debug_assert_eq!(
pos,
QK_K / 2,
"RLE run lengths sum to {pos}, expected {}",
QK_K / 2
);
raw
} }
let n = block.rle_len();
let mut nibbles = [0u8; QK_K];
let mut pos = 0usize;
for i in 0..n {
let entry = block.qs[i];
let val = entry >> 4;
let count = (entry & 0x0F) as usize + 1;
nibbles[pos..pos + count].fill(val);
pos += count;
}
debug_assert_eq!(
pos,
QK_K,
"nibble RLE lengths sum to {pos}, expected {QK_K}"
);
pack_nibbles(&nibbles)
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@@ -227,9 +272,9 @@ fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
/// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values. /// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values.
/// ///
/// When `IS_RLE` is set the RLE stream is first expanded into a 128-byte raw /// When `IS_RLE` is set the RLE stream is first expanded into a 256-nibble
/// buffer; thereafter the dequantisation is identical to /// buffer and packed back into a 128-byte raw representation; thereafter the
/// [`crate::dequantize_block_q4k`]: /// dequantisation is identical to [`crate::dequantize_block_q4k`]:
/// ///
/// ```text /// ```text
/// out[i] = d * scale[s] * nibble[i] - dmin * min[s] /// out[i] = d * scale[s] * nibble[i] - dmin * min[s]
@@ -273,86 +318,69 @@ pub fn dequantize_block_q4k_rle(block: &BlockQ4KRle, out: &mut [f32; QK_K]) {
/// Accumulate the contribution of one RLE-encoded block into `c_row`. /// Accumulate the contribution of one RLE-encoded block into `c_row`.
/// ///
/// For each `(value, count)` pair the dequantised weight is constant within /// With nibble-level RLE and output-sequential ordering, nibble position `p`
/// every 32-byte sub-block group, so the per-output-column dot-product /// maps directly to output element `p` in sub-block `p / 32`. For each entry
/// contribution reduces from `2 * run_len` multiplies to just `2`: /// the dequantised weight `dq` is constant within each sub-block segment, so
/// the per-output-column contribution reduces to:
/// ///
/// ```text /// ```text
/// original: Σ_{l} ( dq_lo * B[ki_lo+l, j] + dq_hi * B[ki_hi+l, j] ) /// c_row[j] += dq * Σ_{l in seg} B[ki_base + pos + l, j]
///
/// optimised: dq_lo * Σ_{l} B[ki_lo+l, j] + dq_hi * Σ_{l} B[ki_hi+l, j]
/// ``` /// ```
/// ///
/// A run that crosses a 32-byte group boundary (and thus a scale/min change) /// A run that crosses a 32-nibble sub-block boundary is split at the boundary;
/// is split at the boundary; each resulting segment is handled independently. /// each resulting segment is handled independently.
/// ///
/// `sum_lo` and `sum_hi` are caller-provided scratch slices (length `≥ n`) /// `sum_b` is a caller-provided scratch slice (length `≥ n`) reused across
/// reused across calls to avoid repeated allocation. /// calls to avoid repeated allocation.
fn accumulate_rle_block( fn accumulate_rle_block(
block: &BlockQ4KRle, block: &BlockQ4KRle,
b: &[u16], b: &[u16],
ki_base: usize, // first B-row index for this block (= b_idx * QK_K) ki_base: usize, // first B-row index for this block (= b_idx * QK_K)
n: usize, n: usize,
c_row: &mut [f32], c_row: &mut [f32],
sum_lo: &mut [f32], sum_b: &mut [f32], // scratch, length ≥ n
sum_hi: &mut [f32],
) { ) {
let d = fp16_to_f32(block.d); let d = fp16_to_f32(block.d);
let dmin = fp16_to_f32(block.dmin); let dmin = fp16_to_f32(block.dmin);
let mut byte_pos = 0usize; // running cursor into the 128-byte qs payload let mut nibble_pos = 0usize; // current position in the 256-nibble output stream
for p in 0..block.rle_len() { for p in 0..block.rle_len() {
let val = block.qs[2 * p]; let entry = block.qs[p];
        let run = block.qs[2 * p + 1] as usize; let val = (entry >> 4) as f32; // nibble value 0–15
        let lo = (val & 0x0F) as f32; let run = (entry & 0x0F) as usize + 1; // count 1–16
let hi = (val >> 4) as f32;
let mut remaining = run; let mut remaining = run;
let mut pos = byte_pos; let mut pos = nibble_pos;
while remaining > 0 { while remaining > 0 {
// Clip the current run to the boundary of the 32-byte group so // Sub-block at this position; split at sub-block boundaries (every 32 nibbles).
// that the sub-block scale/min stays constant over the segment. let sub_block = pos / 32; // 0..8
let group = pos / 32; // 0..4 let in_sb = pos % 32;
let in_group = pos % 32; // byte offset within this group let seg_len = remaining.min(32 - in_sb);
let seg_len = remaining.min((group + 1) * 32 - pos);
// Constant dequantised values for both nibble levels in this group. let (sc, mn) = get_scale_min(sub_block, &block.scales);
let (sc_lo, mn_lo) = get_scale_min(group * 2, &block.scales); let dq = d * sc as f32 * val - dmin * mn as f32;
let (sc_hi, mn_hi) = get_scale_min(group * 2 + 1, &block.scales);
let dq_lo = d * sc_lo as f32 * lo - dmin * mn_lo as f32;
let dq_hi = d * sc_hi as f32 * hi - dmin * mn_hi as f32;
// Map byte positions to dequantised-output indices (0..QK_K): // Accumulate B-column sums for this segment (stride-1 per B row).
// lo nibbles → group*64 + in_group .. + seg_len sum_b[..n].fill(0.0);
// hi nibbles → group*64 + 32 + in_group .. + seg_len
let out_lo = group * 64 + in_group;
let out_hi = group * 64 + 32 + in_group;
// Sum B rows for every j across the segment (B accessed stride-1
// within each row — cache-friendly).
sum_lo[..n].fill(0.0);
sum_hi[..n].fill(0.0);
for l in 0..seg_len { for l in 0..seg_len {
let base_lo = (ki_base + out_lo + l) * n; let b_base = (ki_base + pos + l) * n;
let base_hi = (ki_base + out_hi + l) * n;
for j in 0..n { for j in 0..n {
sum_lo[j] += fp16_to_f32(b[base_lo + j]); sum_b[j] += fp16_to_f32(b[b_base + j]);
sum_hi[j] += fp16_to_f32(b[base_hi + j]);
} }
} }
// One multiply per output column instead of one per weight element. // One multiply per output column instead of one per nibble.
for j in 0..n { for j in 0..n {
c_row[j] += dq_lo * sum_lo[j] + dq_hi * sum_hi[j]; c_row[j] += dq * sum_b[j];
} }
pos += seg_len; pos += seg_len;
remaining -= seg_len; remaining -= seg_len;
} }
byte_pos += run; nibble_pos += run;
} }
} }
@@ -361,20 +389,16 @@ fn accumulate_rle_block(
///
/// For blocks in **RLE mode** (`IS_RLE = 1`) the intermediate decompressed row
/// is eliminated entirely. [`accumulate_rle_block`] works directly over the
/// nibble-level RLE entries: within each run the dequantised weight is constant
/// across all elements, so each output column `j` requires only one multiply
/// per sub-block segment rather than one per nibble:
///
/// ```text
/// c[i, j] += dq * Σ B[ki_base + pos + l, j]
///                 ──────────────────────────
///                 summed over seg_len nibble positions
/// ```
///
/// For blocks in **raw mode** (`IS_RLE = 0`) the block is dequantised into a
/// scratch buffer and its contribution is accumulated via a saxpy loop
/// (weight-outer, column-inner), which accesses B in row-major order.
@@ -431,8 +455,7 @@ pub fn matmul_q4k_rle_fp16(
// Scratch for raw-mode block dequantisation. // Scratch for raw-mode block dequantisation.
let mut block_buf = [0.0f32; QK_K]; let mut block_buf = [0.0f32; QK_K];
// Scratch for RLE-mode B-column sums; allocated once and reused per segment. // Scratch for RLE-mode B-column sums; allocated once and reused per segment.
let mut sum_lo = vec![0.0f32; n]; let mut sum_b = vec![0.0f32; n];
let mut sum_hi = vec![0.0f32; n];
for i in 0..m { for i in 0..m {
let c_row = &mut c[i * n..(i + 1) * n]; let c_row = &mut c[i * n..(i + 1) * n];
@@ -442,10 +465,10 @@ pub fn matmul_q4k_rle_fp16(
let ki_base = b_idx * QK_K; let ki_base = b_idx * QK_K;
if block.is_rle() { if block.is_rle() {
// RLE path: accumulate directly from runs, no decompression. // RLE path: accumulate directly from nibble runs, no decompression.
accumulate_rle_block( accumulate_rle_block(
block, b, ki_base, n, c_row, block, b, ki_base, n, c_row,
&mut sum_lo, &mut sum_hi, &mut sum_b,
); );
} else { } else {
// Raw path: dequantise once, then saxpy into c_row. // Raw path: dequantise once, then saxpy into c_row.
@@ -585,18 +608,17 @@ mod tests {
// ========================================================================= // =========================================================================
#[test]
fn block_q4k_rle_size_is_276_bytes() {
    // d(2) + dmin(2) + scales(12) + flags(1) + pad(1) + n_pairs(2) + qs(256) = 276 bytes.
    assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 276);
}
#[test]
fn block_q4k_rle_is_132_bytes_larger_than_block_q4k() {
    // BlockQ4K = 144 bytes, BlockQ4KRle = 276 bytes, delta = 132.
    assert_eq!(
        core::mem::size_of::<BlockQ4KRle>(),
        core::mem::size_of::<BlockQ4K>() + 132,
    );
}
@@ -633,11 +655,11 @@ mod tests {
#[test]
fn rle_len_reports_pair_count_from_n_pairs() {
    // n_pairs is now u16, so entry counts up to 256 are representable.
    for n in [0usize, 1, 7, 31, 63, 128, 256] {
        let b = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
            flags: if n > 0 { IS_RLE } else { 0 },
            n_pairs: n as u16,
            qs: [0; QK_K],
        };
        assert_eq!(b.rle_len(), n, "expected rle_len {n}");
    }
}
@@ -650,30 +672,37 @@ mod tests {
#[test]
fn encode_uniform_qs_uses_rle() {
    // All identical bytes → 256 identical nibbles → RLE mode.
    let src = make_block(1.0, 0.0, 1, 0, 0x77);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle(), "uniform qs should trigger RLE mode");
}
#[test]
fn encode_uniform_qs_rle_entry_count_is_sixteen() {
    // 256 identical nibbles → max chunk size 16 → 16 entries.
    let src = make_block(1.0, 0.0, 1, 0, 0x55);
    let rle = encode(&src, 0.0);
    assert_eq!(rle.rle_len(), 16, "256 identical nibbles → 16 chunks of 16");
}
#[test]
fn encode_uniform_qs_first_entry_is_correct() {
    // 0xAB: lo nibble = 0xB = 11, hi nibble = 0xA = 10.
    // Output-sequential: [0xB×32, 0xA×32] × 4 groups = 8 runs of 32.
    // Each run of 32 → 2 entries of 16.
    // First entry: val=0xB=11, count=16 → (11<<4)|15 = 0xBF.
    let src = make_block(1.0, 0.0, 1, 0, 0xAB);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle());
    assert_eq!(rle.qs[0], (0xBu8 << 4) | 0xFu8, "first entry: val=0xB, count=16");
    assert_eq!(rle.rle_len(), 16, "8 runs of 32, each split into 2 → 16 entries");
}
#[test] #[test]
fn encode_alternating_bytes_stays_raw() { fn encode_alternating_bytes_stays_raw() {
// Alternating 0xAA / 0x55 → 128 singleton pairs, coverage = 0%. // Alternating 0xAA / 0x55 → nibble stream alternates 10,5,10,5,...
// No two adjacent nibbles are equal → 0% nibble coverage.
// At threshold 0.01 the 0% coverage fails → raw mode. // At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2]; let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() { for (i, b) in qs.iter_mut().enumerate() {
@@ -686,7 +715,7 @@ mod tests {
#[test] #[test]
fn encode_raw_mode_copies_qs_verbatim() { fn encode_raw_mode_copies_qs_verbatim() {
// Three-byte cycle of distinct values → 128 runs of 1 byte each, // Three-byte cycle of distinct values → no adjacent nibble repeats,
// coverage = 0%. At threshold 0.01 the 0% coverage fails → raw mode. // coverage = 0%. At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2]; let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() { for (i, b) in qs.iter_mut().enumerate() {
@@ -700,138 +729,99 @@ mod tests {
} }
#[test]
fn encode_two_run_block_stores_correct_entries() {
    // qs = [0x11×64, 0x22×64]: both nibbles of 0x11 are 1, of 0x22 are 2.
    // Nibble stream (output-sequential): [1×128, 2×128] → 2 runs of 128.
    // Each run of 128 → 8 entries of 16 → 16 entries total.
    let mut qs = [0u8; QK_K / 2];
    qs[..64].fill(0x11);
    qs[64..].fill(0x22);
    let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle());
    // 2 runs of 128 nibbles each → 8 entries per run → 16 total entries.
    assert_eq!(rle.rle_len(), 16);
    // First entry: nibble val=1, count=16 → (1<<4)|15 = 0x1F.
    assert_eq!(rle.qs[0], 0x1F, "first entry should be nibble=1, count=16");
    // 9th entry (first for nibble 2): (2<<4)|15 = 0x2F.
    assert_eq!(rle.qs[8], 0x2F, "9th entry should be nibble=2, count=16");
}
#[test]
fn encode_nibble_coverage_determines_rle_mode() {
    // A block with all-same nibbles → 100% coverage → always RLE.
    let src_uniform = make_block(1.0, 0.0, 1, 0, 0x77);
    assert!(encode(&src_uniform, 0.0).is_rle());
    assert!(encode(&src_uniform, 1.0).is_rle()); // 100% coverage meets any threshold

    // Build a block whose nibble stream has 0% coverage:
    //   lo = (i*2+1) % 16, hi = (i*2+2) % 16
    // Adjacent lo nibbles differ by 2 (mod 16); adjacent hi nibbles differ by 2.
    // Group boundaries also do not align (verified analytically).
    let mut qs = [0u8; QK_K / 2];
    for (i, b) in qs.iter_mut().enumerate() {
        let lo = ((i * 2 + 1) % 16) as u8;
        let hi = ((i * 2 + 2) % 16) as u8;
        *b = lo | (hi << 4);
    }
    let src_varied = make_block_with_qs(1.0, 0.0, 1, 0, qs);
    // At 0.0 threshold: RLE (≤ 256 entries always fit, 0.0 ≥ 0.0).
    assert!(encode(&src_varied, 0.0).is_rle());
    // At any positive threshold: 0% nibble coverage → raw.
    assert!(!encode(&src_varied, 0.01).is_rle());
}
#[test]
fn encode_nibble_max_count_is_sixteen() {
    // A run of 256 identical nibbles should be stored as 16 entries of 16 nibbles.
    // 0x33: lo nibble = 3, hi nibble = 3 → all 256 nibbles are 3.
    let src = make_block(1.0, 0.0, 1, 0, 0x33);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle());
    assert_eq!(rle.rle_len(), 16);
    // Each entry: nibble=3, count=16 → (3<<4)|15 = 0x3F.
    assert!(rle.qs[..16].iter().all(|&e| e == 0x3F));
}
#[test]
fn encode_256_nibbles_fits_in_qs_at_zero_threshold() {
    // Worst case: all 256 nibbles are singletons.
    // Construct lo = (i*2+1) % 16, hi = (i*2+2) % 16: gives 0% coverage.
    let mut qs = [0u8; QK_K / 2];
    for (i, b) in qs.iter_mut().enumerate() {
        let lo = ((i * 2 + 1) % 16) as u8;
        let hi = ((i * 2 + 2) % 16) as u8;
        *b = lo | (hi << 4);
    }
    let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
    let rle = encode(&src, 0.0);
    // At threshold 0.0 the block should be RLE (256 entries ≤ 256 capacity,
    // and 0.0 ≥ 0.0). The dequantised output must match the baseline.
    let mut got = [0.0f32; QK_K];
    let mut expected = [0.0f32; QK_K];
    dequantize_block_q4k_rle(&rle, &mut got);
    dequantize_block_q4k(&src, &mut expected);
    assert_slices_close(&got, &expected, 1e-5);
}
#[test]
fn encode_nibble_coverage_threshold_controls_rle_selection() {
    // Build a block with moderate nibble-level coverage.
    // First 32 bytes all 0x77 (both nibbles 7) → group 0 in output-sequential
    // order is entirely 7 (64 nibbles in one run). The remaining bytes
    // cycle through values with no adjacent nibble repeats.
    // Coverage ≈ 64/256 = 25% (plus a few boundary matches, ~26-27%).
    let mut qs = [0u8; QK_K / 2];
    qs[..32].fill(0x77);
    for i in 32..128usize {
        qs[i] = ((i % 15 + 1) as u8) | (((i + 8) % 16) as u8) << 4;
    }
    let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);

    // Coverage is roughly 25–27%, which is between 0.20 and 0.30.
    assert!(encode(&src, 0.20).is_rle(), "~26% coverage meets 20% threshold");
    assert!(encode(&src, 0.24).is_rle(), "~26% coverage meets 24% threshold");
    assert!(!encode(&src, 0.30).is_rle(), "~26% coverage does not meet 30% threshold");
}
#[test] #[test]
@@ -859,30 +849,58 @@ mod tests {
} }
#[test]
fn decode_qs_rle_expands_two_run_stream() {
    // [0xAA × 64 bytes, 0xBB × 64 bytes] in nibble-level RLE.
    // 0xAA: lo=hi=0xA=10. 0xBB: lo=hi=0xB=11.
    // Output-sequential: 128 nibbles of 10 (from 64 bytes of 0xAA),
    // then 128 nibbles of 11 (from 64 bytes of 0xBB).
    // Each run of 128 → 8 entries of 16: 0xAF × 8, then 0xBF × 8.
    let mut qs = [0u8; QK_K];
    for i in 0..8 { qs[i] = 0xAF; } // val=0xA, count-1=0xF → 16 nibbles each
    for i in 8..16 { qs[i] = 0xBF; }
    let rle = BlockQ4KRle {
        d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
        flags: IS_RLE, n_pairs: 16, qs,
    };
    let expanded = decode_qs(&rle);
    assert!(expanded[..64].iter().all(|&b| b == 0xAA), "first 64 bytes must be 0xAA");
    assert!(expanded[64..].iter().all(|&b| b == 0xBB), "last 64 bytes must be 0xBB");
}
#[test]
fn decode_qs_rle_single_byte_value_covers_all() {
    // 128 bytes of 0xCD: lo nibble = 0xD = 13, hi nibble = 0xC = 12.
    // Nibble stream in output-sequential order:
    //   [0xD×32, 0xC×32] × 4 groups = 8 runs of 32.
    // Each run of 32 → 2 entries of 16.
    // Entries in pairs: DF, DF, CF, CF, DF, DF, CF, CF, ...
    // i.e. (i/2) % 2 == 0 → 0xDF, else 0xCF.
    let mut qs = [0u8; QK_K];
    for i in 0..16 {
        qs[i] = if (i / 2) % 2 == 0 { 0xDF } else { 0xCF };
    }
    let rle = BlockQ4KRle {
        d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
        flags: IS_RLE, n_pairs: 16, qs,
    };
    let expanded = decode_qs(&rle);
    assert!(expanded.iter().all(|&b| b == 0xCD), "all bytes should be 0xCD");
}
// =========================================================================
// extract_nibbles / pack_nibbles round-trip
// =========================================================================

#[test]
fn extract_and_pack_nibbles_round_trip() {
    // Pseudo-random byte pattern; pack(extract(qs)) must be the identity.
    let mut qs = [0u8; QK_K / 2];
    for (i, b) in qs.iter_mut().enumerate() {
        *b = (i.wrapping_mul(37).wrapping_add(13) & 0xFF) as u8;
    }
    let nibbles = extract_nibbles(&qs);
    let repacked = pack_nibbles(&nibbles);
    assert_eq!(repacked, qs, "pack(extract(qs)) must equal qs");
}
// ========================================================================= // =========================================================================
@@ -972,7 +990,7 @@ mod tests {
#[test] #[test]
fn roundtrip_raw_mode_matches_original() { fn roundtrip_raw_mode_matches_original() {
// Alternating bytes → 128 singleton pairs, coverage = 0%. // Alternating bytes → no adjacent nibble repeats, coverage = 0%.
// Use threshold 0.01 to force raw mode (0% < 0.01). // Use threshold 0.01 to force raw mode (0% < 0.01).
let mut qs = [0u8; QK_K / 2]; let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() { for (i, b) in qs.iter_mut().enumerate() {
@@ -1007,7 +1025,7 @@ mod tests {
#[test] #[test]
fn roundtrip_many_short_runs_matches_original() { fn roundtrip_many_short_runs_matches_original() {
// Four distinct runs of varying lengths → still compresses. // Four distinct byte runs → multiple nibble runs (RLE still compresses).
let mut qs = [0u8; QK_K / 2]; let mut qs = [0u8; QK_K / 2];
qs[..10].fill(0x11); qs[..10].fill(0x11);
qs[10..30].fill(0x22); qs[10..30].fill(0x22);
@@ -1015,8 +1033,7 @@ mod tests {
qs[31..].fill(0x44); qs[31..].fill(0x44);
let src = make_block_with_qs(1.0, 0.5, 7, 3, qs); let src = make_block_with_qs(1.0, 0.5, 7, 3, qs);
let rle = encode(&src, 0.0); let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "4-run block should compress"); assert!(rle.is_rle(), "multi-run block should compress");
assert_eq!(rle.rle_len(), 4);
let mut got = [0.0f32; QK_K]; let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K]; let mut expected = [0.0f32; QK_K];
@@ -1053,26 +1070,6 @@ mod tests {
assert_close(got[32], 3.0, 1e-5); assert_close(got[32], 3.0, 1e-5);
} }
#[test]
fn roundtrip_128_singleton_pairs_matches_original() {
// All-distinct bytes → 128 pairs, 0% coverage.
// encode at threshold 0.0 → RLE; dequantize must match baseline.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = (i as u8).wrapping_mul(3).wrapping_add(7);
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 128);
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
dequantize_block_q4k_rle(&rle, &mut got);
dequantize_block_q4k(&src, &mut expected);
assert_slices_close(&got, &expected, 1e-5);
}
// ========================================================================= // =========================================================================
// matmul_q4k_rle_fp16 // matmul_q4k_rle_fp16
// ========================================================================= // =========================================================================