From bba9db290e139d7e74dc82b88c957d1175c4418a Mon Sep 17 00:00:00 2001 From: charles Date: Sun, 12 Apr 2026 21:26:36 -0700 Subject: [PATCH] RLE now works on nibbles --- benches/matmul.rs | 155 +++++++----- src/rle.rs | 603 +++++++++++++++++++++++----------------------- 2 files changed, 398 insertions(+), 360 deletions(-) diff --git a/benches/matmul.rs b/benches/matmul.rs index 71b6fe3..75546dc 100644 --- a/benches/matmul.rs +++ b/benches/matmul.rs @@ -1,26 +1,30 @@ //! # Benchmark: BlockQ4K vs BlockQ4KRle //! -//! Measures three operations across two weight distributions: +//! Measures three operations across three weight distributions, encoded with +//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated +//! runs to use RLE mode). //! -//! | Group | What is timed | -//! |--------------|--------------------------------------------------| -//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks | -//! | `dequantize` | Single-block dequantisation for all three paths | -//! | `matmul` | Full A×B multiply at three matrix sizes | +//! | Group | What is timed | +//! |--------------|-----------------------------------------------------| +//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks | +//! | `dequantize` | Single-block dequantisation across all four paths | +//! | `matmul` | Full A×B multiply at three matrix sizes | //! //! ## Weight distributions //! -//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG). -//! Consecutive bytes almost never repeat, so each block produces ~128 -//! single-byte runs. At 2 bytes per pair that would require ~256 bytes, -//! which exceeds the 128-byte raw payload, so `encode` always keeps these -//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical -//! unstructured LLM weight matrices. +//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence. +//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage. +//! 
At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1) +//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights. //! -//! **rle_optimal** — every byte in a block's qs field is the same value. -//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 — -//! and sets IS_RLE = 1. This is the theoretical compression maximum, and -//! is representative of highly sparse or dead-neuron weight matrices. +//! **rle_optimal** — every qs byte is the same value. Each 32-nibble lo/hi +//! stream is constant, giving 100 % coverage and just 16 nibble entries. This +//! is the theoretical RLE maximum and represents highly structured weight blocks. +//! +//! **zero_coverage** — nibbles cycle deterministically so no two consecutive +//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %; +//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive +//! threshold. Used only in the `dequantize` group to benchmark the raw path. use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use matrix_testing::{ @@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] { s } -/// Return `count` blocks whose qs bytes are pseudo-random. +/// Return `count` blocks whose qs bytes are pseudo-random (LCG). /// -/// With uniformly distributed bytes, consecutive bytes match with probability -/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value, -/// count) pairs would need ~256 bytes — more than the 128-byte raw payload — -/// so `encode` will always select **raw mode** (IS_RLE = 0). +/// Adjacent nibbles match with probability 1/16, giving each block roughly +/// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to +/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block. 
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> { let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234); let scales = make_scales(7, 2); @@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> { /// Return `count` blocks where every qs byte is the same value. /// -/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes -/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1). -/// Each block uses a fresh pseudo-random byte so no two blocks are identical, -/// avoiding degenerate cache-warm effects across the batch. +/// Each 32-nibble lo/hi stream is constant → 100 % nibble coverage → always **RLE mode** +/// with exactly 16 entries (256 nibbles / 16 per entry). +/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts. fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> { let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0); let scales = make_scales(7, 2); @@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> { vec![f32_to_fp16(1.0); k * n] } +/// Build one block whose nibbles cycle so that no two consecutive nibbles +/// (in output-sequential order) are ever equal → 0 % nibble coverage. +/// +/// Lo nibble of byte `i` = `i % 16`; hi nibble = `(i + 8) % 16`. +/// Within every 32-byte group the lo and hi streams each visit all 16 values +/// twice without repetition, and across group boundaries the last nibble of +/// one stream differs from the first nibble of the next. +/// +/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**. 
+fn zero_coverage_block() -> BlockQ4K { + let scales = make_scales(7, 2); + let d = f32_to_fp16(0.01); + let dmin = f32_to_fp16(0.001); + let mut qs = [0u8; QK_K / 2]; + for (i, b) in qs.iter_mut().enumerate() { + let lo = (i % 16) as u8; + let hi = ((i + 8) % 16) as u8; + *b = lo | (hi << 4); + } + BlockQ4K { d, dmin, scales, qs } +} + // --------------------------------------------------------------------------- // Group 1 — encode // --------------------------------------------------------------------------- @@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec { /// Number of blocks encoded per iteration in `bench_encode`. const ENCODE_BATCH: usize = 512; -/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output. +/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output. /// -/// Both distributions perform the same O(128) run-length scan. The only -/// divergence is at the output stage: -/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes. -/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE. +/// Both distributions perform the same O(256) nibble scan. The output differs: +/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written. +/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written. 
fn bench_encode(c: &mut Criterion) { let uniform = uniform_blocks(ENCODE_BATCH); let rle_opt = rle_optimal_blocks(ENCODE_BATCH); @@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) { group.bench_function("uniform", |b| { b.iter(|| { for blk in &uniform { - black_box(encode(black_box(blk), 0.0)); + black_box(encode(black_box(blk), 0.01)); } }); }); @@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) { group.bench_function("rle_optimal", |b| { b.iter(|| { for blk in &rle_opt { - black_box(encode(black_box(blk), 0.0)); + black_box(encode(black_box(blk), 0.01)); } }); }); @@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) { // Group 2 — dequantize (single block) // --------------------------------------------------------------------------- -/// Compares the three single-block dequantisation code paths. +/// Compares four single-block dequantisation code paths. /// -/// | Variant | Block type | Encoding | Extra work vs baseline | -/// |------------------|-------------|----------|-------------------------------| -/// | `q4k_baseline` | BlockQ4K | — | none | -/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) | -/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf | +/// | Variant | Block type | Encoding | IS_RLE | Entries | +/// |--------------------|-------------|-----------|--------|---------| +/// | `q4k_baseline` | BlockQ4K | — | — | — | +/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — | +/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 | +/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 | +/// +/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which +/// stays in raw mode at any positive threshold. +/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries), +/// representative of actual trained Q4_K weight blocks. +/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries). 
/// /// Throughput is the number of dequantised weights produced per second. fn bench_dequantize(c: &mut Criterion) { - let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap(); - let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap(); + let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap(); + let q4k_zero_cov = zero_coverage_block(); + let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap(); + let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap(); - let rle_raw = encode(&q4k_uniform, 0.0); // IS_RLE = 0 - let rle_rle = encode(&q4k_rle_opt, 0.0); // IS_RLE = 1 + let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage) + let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage) + let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage) - // Confirm the fixtures ended up in the right encoding modes. - assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode"); - assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode"); + assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode"); + assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold"); + assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode"); let mut group = c.benchmark_group("dequantize"); // Throughput = QK_K (256) weights dequantised per second. 
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) { group.bench_function("q4k_baseline", |b| { b.iter(|| { let mut out = [0.0f32; QK_K]; - dequantize_block_q4k(black_box(&q4k_uniform), &mut out); + dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out); black_box(out) }); }); @@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) { }); }); - group.bench_function("rle_rle_mode", |b| { + group.bench_function("rle_sparse", |b| { b.iter(|| { let mut out = [0.0f32; QK_K]; - dequantize_block_q4k_rle(black_box(&rle_rle), &mut out); + dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out); + black_box(out) + }); + }); + + group.bench_function("rle_dense", |b| { + b.iter(|| { + let mut out = [0.0f32; QK_K]; + dequantize_block_q4k_rle(black_box(&rle_dense), &mut out); black_box(out) }); }); @@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[ /// Full matrix-multiply benchmark across weight distributions and matrix sizes. /// -/// Four variants per size: +/// Four variants per size (`min_coverage = 0.01`): /// -/// | Label | A type | RLE mode? | -/// |----------------------|-------------|-----------| -/// | `baseline/uniform` | BlockQ4K | — | -/// | `rle/uniform` | BlockQ4KRle | raw | -/// | `baseline/rle_opt` | BlockQ4K | — | -/// | `rle/rle_opt` | BlockQ4KRle | rle | +/// | Label | A type | IS_RLE | Entries/block | +/// |----------------------|-------------|--------|---------------| +/// | `baseline/uniform` | BlockQ4K | — | — | +/// | `rle/uniform` | BlockQ4KRle | 1 | ~235 | +/// | `baseline/rle_opt` | BlockQ4K | — | — | +/// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 | /// /// Throughput is reported as multiply-accumulate operations (M × K × N) per /// second, allowing fair cross-size comparison. @@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) { // Build all four A variants and the shared B matrix for this config. 
let a_q4k_u: Vec = uniform_blocks(m * bpr); - let a_rle_u: Vec = a_q4k_u.iter().map(|b| encode(b, 0.0)).collect(); + let a_rle_u: Vec = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect(); let a_q4k_r: Vec = rle_optimal_blocks(m * bpr); - let a_rle_r: Vec = a_q4k_r.iter().map(|b| encode(b, 0.0)).collect(); + let a_rle_r: Vec = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect(); let b = fp16_ones(k, n); diff --git a/src/rle.rs b/src/rle.rs index c560d96..2d23052 100644 --- a/src/rle.rs +++ b/src/rle.rs @@ -1,26 +1,29 @@ //! RLE-optional Q4_K super-block encoding. //! //! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that -//! optionally compresses the 128-byte weight payload using **byte-level +//! optionally compresses the 128-byte weight payload using **nibble-level //! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`] //! field indicates which mode is active: //! -//! | `IS_RLE` bit | `qs` interpretation | -//! |--------------|------------------------------------------------------------| -//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] | -//! | 1 | RLE stream of `(value, count)` byte-pairs | +//! | `IS_RLE` bit | `qs` interpretation | +//! |--------------|------------------------------------------------------------------------| +//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] | +//! | 1 | RLE stream of single-byte entries `(nibble_val << 4 | count_minus_1)` | //! //! ## RLE format (when `IS_RLE` = 1) //! -//! - `n_pairs` gives the number of `(value, count)` pairs stored in `qs`. -//! - For each pair `i`: -//! - `qs[2*i]` — the byte value (two packed 4-bit weights, same packing -//! as the raw format). -//! - `qs[2*i + 1]` — the run length in bytes (1..=255). -//! - The run lengths must sum to exactly 128 (the uncompressed `qs` size). +//! - `n_pairs` gives the number of nibble-level RLE entries stored in `qs`. +//! - For each entry `i` (one byte each): +//! 
- bits 7–4: nibble value (0–15) +//! - bits 3–0: `count - 1` (0–15, meaning run length 1–16) +//! - The run lengths must sum to exactly 256 (the number of nibbles in QK_K). //! -//! The 256-byte `qs` field can hold up to 128 `(value, count)` pairs — enough -//! to represent even fully-random blocks where every byte differs from its +//! Nibbles are read in output-sequential order: for each 32-byte group, first +//! all 32 lo nibbles, then all 32 hi nibbles. The nibble at stream position +//! `p` maps directly to output element `p` in sub-block `p / 32`. +//! +//! The 256-byte `qs` field can hold up to 256 single-byte entries — enough to +//! represent even fully-random blocks where every nibble differs from its //! neighbour. //! //! ## Constructing blocks @@ -45,12 +48,12 @@ pub const IS_RLE: u8 = 0x01; // Block definition // --------------------------------------------------------------------------- -/// A Q4_K super-block with optional byte-level RLE compression on the weights. +/// A Q4_K super-block with optional nibble-level RLE compression on the weights. /// /// Unlike [`crate::BlockQ4K`], this format is **not** binary-compatible with /// the GGUF on-disk layout. It uses a 256-byte `qs` field (vs the 128-byte -/// field in `BlockQ4K`) so the RLE stream can store up to 128 `(value, count)` -/// pairs — enough to represent even fully-random blocks where every byte +/// field in `BlockQ4K`) so the RLE stream can store up to 256 single-byte +/// entries — enough to represent even fully-random blocks where every nibble /// differs from its neighbour. 
/// /// Memory layout (`repr C`): @@ -61,17 +64,18 @@ pub const IS_RLE: u8 = 0x01; /// | 2 | `dmin` | 2 B | fp16 super-block min-scale | /// | 4 | `scales` | 12 B | packed 6-bit sub-block params | /// | 16 | `flags` | 1 B | bit 0 = `IS_RLE`; bits 1–7 unused | -/// | 17 | `n_pairs` | 1 B | RLE pair count (0 when raw) | -/// | 18 | `qs` | 256 B | raw nibbles (first 128 B) or RLE | +/// | 17 | (pad) | 1 B | alignment padding for `n_pairs` | +/// | 18 | `n_pairs` | 2 B | RLE entry count (0 when raw) | +/// | 20 | `qs` | 256 B | raw nibbles (first 128 B) or RLE | /// -/// **sizeof = 274 bytes.** +/// **sizeof = 276 bytes.** /// /// ## `qs` interpretation /// -/// | `IS_RLE` | Meaning | -/// |----------|--------------------------------------------------------------| -/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) | -/// | 1 | `qs[0..n_pairs*2]` holds `(value, count)` byte-pairs | +/// | `IS_RLE` | Meaning | +/// |----------|---------------------------------------------------------------------| +/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) | +/// | 1 | `qs[0..n_pairs]` holds nibble-level RLE entries (1 byte each) | #[repr(C)] #[derive(Clone, Copy, Debug)] pub struct BlockQ4KRle { @@ -80,11 +84,11 @@ pub struct BlockQ4KRle { pub scales: [u8; K_SCALE_SIZE], /// Encoding flags. Only bit 0 (`IS_RLE`) is used; bits 1-7 are reserved. pub flags: u8, - /// When `IS_RLE` is set: number of `(value, count)` byte-pairs in `qs`. + /// When `IS_RLE` is set: number of nibble-level RLE entries in `qs`. /// Zero when in raw mode. - pub n_pairs: u8, - /// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or RLE stream - /// (IS_RLE = 1, first `n_pairs * 2` bytes). + pub n_pairs: u16, + /// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or nibble-level + /// RLE stream (IS_RLE = 1, first `n_pairs` bytes; one byte per entry). 
pub qs: [u8; QK_K], // 256 bytes } @@ -95,7 +99,7 @@ impl BlockQ4KRle { self.flags & IS_RLE != 0 } - /// Number of `(value, count)` byte-pairs in `qs`. + /// Number of nibble-level RLE entries in `qs`. /// Only meaningful when `is_rle()` is true. #[inline] pub fn rle_len(&self) -> usize { @@ -103,74 +107,112 @@ impl BlockQ4KRle { } } +// --------------------------------------------------------------------------- +// Nibble extraction / packing helpers +// --------------------------------------------------------------------------- + +/// Extract all 256 nibbles from a 128-byte `qs` payload in output-sequential +/// order: for each 32-byte group, first all 32 lo nibbles, then all 32 hi +/// nibbles. Nibble at position `p` maps to output element `p`. +fn extract_nibbles(raw: &[u8; QK_K / 2]) -> [u8; QK_K] { + let mut nibbles = [0u8; QK_K]; + let mut q_off = 0usize; // byte cursor + let mut n_off = 0usize; // nibble cursor + while n_off < QK_K { + for l in 0..32 { + nibbles[n_off + l] = raw[q_off + l] & 0x0F; // lo + nibbles[n_off + 32 + l] = raw[q_off + l] >> 4; // hi + } + q_off += 32; + n_off += 64; + } + nibbles +} + +/// Inverse of [`extract_nibbles`]: pack a 256-nibble output-sequential array +/// back into the 128-byte `qs` layout. +fn pack_nibbles(nibbles: &[u8; QK_K]) -> [u8; QK_K / 2] { + let mut raw = [0u8; QK_K / 2]; + let mut q_off = 0usize; + let mut n_off = 0usize; + while n_off < QK_K { + for l in 0..32 { + raw[q_off + l] = nibbles[n_off + l] | (nibbles[n_off + 32 + l] << 4); + } + q_off += 32; + n_off += 64; + } + raw +} + // --------------------------------------------------------------------------- // Encoding // --------------------------------------------------------------------------- /// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block. /// -/// The `qs` payload is scanned for runs of equal consecutive bytes. 
RLE mode -/// is chosen when **both** conditions hold: +/// The `qs` payload is scanned for runs of equal consecutive nibbles in +/// output-sequential order. RLE mode is chosen when **both** conditions hold: /// -/// 1. **Coverage**: at least `min_coverage` fraction of the 128 `qs` bytes -/// belong to runs of length ≥ 2. These are the bytes whose weights can be -/// batched in `accumulate_rle_block`, replacing `2 * run_len` multiplies -/// with just 2 per group-segment. +/// 1. **Coverage**: at least `min_coverage` fraction of the 256 nibbles +/// belong to runs of length ≥ 2. These are the nibbles whose weights can +/// be batched in `accumulate_rle_block`, replacing one multiply per nibble +/// with one multiply per output column per segment. /// -/// 2. **Capacity**: the pair count does not exceed 128 (the physical limit of -/// the 256-byte `qs` field at 2 bytes per pair). +/// 2. **Capacity**: the entry count does not exceed 256 (the physical limit of +/// the 256-byte `qs` field at 1 byte per entry). /// -/// | `min_coverage` | Effect | -/// |----------------|------------------------------------------------------| -/// | `0.0` | RLE whenever pairs fit (≤ 128), regardless of runs | -/// | `0.5` | RLE only if ≥ 50 % of bytes are in repeated runs | -/// | `1.0` | RLE only when every byte is part of a run | +/// | `min_coverage` | Effect | +/// |----------------|-------------------------------------------------------| +/// | `0.0` | RLE whenever entries fit (≤ 256), regardless of runs | +/// | `0.5` | RLE only if ≥ 50 % of nibbles are in repeated runs | +/// | `1.0` | RLE only when every nibble is part of a run | pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle { debug_assert!( (0.0..=1.0).contains(&min_coverage), "min_coverage must be in [0.0, 1.0], got {min_coverage}" ); - let raw = &block.qs; // [u8; 128] + let nibbles = extract_nibbles(&block.qs); - // Scan for runs of equal consecutive bytes. 
- // Track long_run_bytes: bytes in runs of length ≥ 2 (the bytes that - // benefit from RLE in the matmul). - let mut pairs: Vec<(u8, u8)> = Vec::with_capacity(QK_K / 2); - let mut long_run_bytes = 0usize; - let mut i = 0usize; - while i < raw.len() { - let val = raw[i]; - let mut run = 1u8; - while i + (run as usize) < raw.len() - && raw[i + (run as usize)] == val - && run < u8::MAX - { + // Scan for runs of equal consecutive nibbles. + let mut entries = Vec::<u8>::with_capacity(QK_K); + let mut long_run_nibbles = 0usize; + let mut i = 0usize; + + while i < QK_K { + let val = nibbles[i]; + let mut run = 0usize; + while i + run < QK_K && nibbles[i + run] == val { run += 1; } - pairs.push((val, run)); if run >= 2 { - long_run_bytes += run as usize; + long_run_nibbles += run; } - i += run as usize; + // Split runs longer than 16 into max-16 chunks (4-bit count field). + let mut rem = run; + while rem > 0 { + let chunk = rem.min(16); + entries.push((val << 4) | ((chunk - 1) as u8)); + rem -= chunk; + } + i += run; } - // Coverage: fraction of qs bytes that are in non-singleton runs. - let coverage = long_run_bytes as f32 / raw.len() as f32; + // Coverage: fraction of the 256 nibbles that are in non-singleton runs. + let coverage = long_run_nibbles as f32 / QK_K as f32; - if pairs.len() <= QK_K / 2 && coverage >= min_coverage { - let n = pairs.len(); + // Use RLE when entries fit in qs (≤ 256) and coverage meets the threshold. 
+ if entries.len() <= QK_K && coverage >= min_coverage { + let n = entries.len(); let mut qs = [0u8; QK_K]; - for (k, &(val, count)) in pairs.iter().enumerate() { - qs[2 * k] = val; - qs[2 * k + 1] = count; - } + qs[..n].copy_from_slice(&entries); BlockQ4KRle { d: block.d, dmin: block.dmin, scales: block.scales, flags: IS_RLE, - n_pairs: n as u8, + n_pairs: n as u16, qs, } } else { @@ -196,29 +238,32 @@ pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle { /// /// # Panics (debug builds only) /// -/// Panics if the decoded RLE stream does not sum to exactly 128 bytes. +/// Panics if the decoded RLE nibble stream does not sum to exactly 256 nibbles. fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] { if !block.is_rle() { // First QK_K/2 bytes of qs hold the raw packed nibbles. - block.qs[..QK_K / 2].try_into().unwrap() - } else { - let n = block.rle_len(); - let mut raw = [0u8; QK_K / 2]; - let mut pos = 0usize; - for i in 0..n { - let val = block.qs[2 * i]; - let count = block.qs[2 * i + 1] as usize; - raw[pos..pos + count].fill(val); - pos += count; - } - debug_assert_eq!( - pos, - QK_K / 2, - "RLE run lengths sum to {pos}, expected {}", - QK_K / 2 - ); - raw + return block.qs[..QK_K / 2].try_into().unwrap(); } + + let n = block.rle_len(); + let mut nibbles = [0u8; QK_K]; + let mut pos = 0usize; + + for i in 0..n { + let entry = block.qs[i]; + let val = entry >> 4; + let count = (entry & 0x0F) as usize + 1; + nibbles[pos..pos + count].fill(val); + pos += count; + } + + debug_assert_eq!( + pos, + QK_K, + "nibble RLE lengths sum to {pos}, expected {QK_K}" + ); + + pack_nibbles(&nibbles) } // --------------------------------------------------------------------------- @@ -227,9 +272,9 @@ fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] { /// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values. 
/// -/// When `IS_RLE` is set the RLE stream is first expanded into a 128-byte raw -/// buffer; thereafter the dequantisation is identical to -/// [`crate::dequantize_block_q4k`]: +/// When `IS_RLE` is set the RLE stream is first expanded into a 256-nibble +/// buffer and packed back into a 128-byte raw representation; thereafter the +/// dequantisation is identical to [`crate::dequantize_block_q4k`]: /// /// ```text /// out[i] = d * scale[s] * nibble[i] - dmin * min[s] @@ -273,86 +318,69 @@ pub fn dequantize_block_q4k_rle(block: &BlockQ4KRle, out: &mut [f32; QK_K]) { /// Accumulate the contribution of one RLE-encoded block into `c_row`. /// -/// For each `(value, count)` pair the dequantised weight is constant within -/// every 32-byte sub-block group, so the per-output-column dot-product -/// contribution reduces from `2 * run_len` multiplies to just `2`: +/// With nibble-level RLE and output-sequential ordering, nibble position `p` +/// maps directly to output element `p` in sub-block `p / 32`. For each entry +/// the dequantised weight `dq` is constant within each sub-block segment, so +/// the per-output-column contribution reduces to: /// /// ```text -/// original: Σ_{l} ( dq_lo * B[ki_lo+l, j] + dq_hi * B[ki_hi+l, j] ) -/// -/// optimised: dq_lo * Σ_{l} B[ki_lo+l, j] + dq_hi * Σ_{l} B[ki_hi+l, j] +/// c_row[j] += dq * Σ_{l in seg} B[ki_base + pos + l, j] /// ``` /// -/// A run that crosses a 32-byte group boundary (and thus a scale/min change) -/// is split at the boundary; each resulting segment is handled independently. +/// A run that crosses a 32-nibble sub-block boundary is split at the boundary; +/// each resulting segment is handled independently. /// -/// `sum_lo` and `sum_hi` are caller-provided scratch slices (length `≥ n`) -/// reused across calls to avoid repeated allocation. +/// `sum_b` is a caller-provided scratch slice (length `≥ n`) reused across +/// calls to avoid repeated allocation. 
fn accumulate_rle_block( block: &BlockQ4KRle, b: &[u16], ki_base: usize, // first B-row index for this block (= b_idx * QK_K) n: usize, c_row: &mut [f32], - sum_lo: &mut [f32], - sum_hi: &mut [f32], + sum_b: &mut [f32], // scratch, length ≥ n ) { let d = fp16_to_f32(block.d); let dmin = fp16_to_f32(block.dmin); - let mut byte_pos = 0usize; // running cursor into the 128-byte qs payload + let mut nibble_pos = 0usize; // current position in the 256-nibble output stream for p in 0..block.rle_len() { - let val = block.qs[2 * p]; - let run = block.qs[2 * p + 1] as usize; - let lo = (val & 0x0F) as f32; - let hi = (val >> 4) as f32; + let entry = block.qs[p]; + let val = (entry >> 4) as f32; // nibble value 0–15 + let run = (entry & 0x0F) as usize + 1; // count 1–16 let mut remaining = run; - let mut pos = byte_pos; + let mut pos = nibble_pos; while remaining > 0 { - // Clip the current run to the boundary of the 32-byte group so - // that the sub-block scale/min stays constant over the segment. - let group = pos / 32; // 0..4 - let in_group = pos % 32; // byte offset within this group - let seg_len = remaining.min((group + 1) * 32 - pos); + // Sub-block at this position; split at sub-block boundaries (every 32 nibbles). + let sub_block = pos / 32; // 0..8 + let in_sb = pos % 32; + let seg_len = remaining.min(32 - in_sb); - // Constant dequantised values for both nibble levels in this group. - let (sc_lo, mn_lo) = get_scale_min(group * 2, &block.scales); - let (sc_hi, mn_hi) = get_scale_min(group * 2 + 1, &block.scales); - let dq_lo = d * sc_lo as f32 * lo - dmin * mn_lo as f32; - let dq_hi = d * sc_hi as f32 * hi - dmin * mn_hi as f32; + let (sc, mn) = get_scale_min(sub_block, &block.scales); + let dq = d * sc as f32 * val - dmin * mn as f32; - // Map byte positions to dequantised-output indices (0..QK_K): - // lo nibbles → group*64 + in_group .. + seg_len - // hi nibbles → group*64 + 32 + in_group .. 
+ seg_len - let out_lo = group * 64 + in_group; - let out_hi = group * 64 + 32 + in_group; - - // Sum B rows for every j across the segment (B accessed stride-1 - // within each row — cache-friendly). - sum_lo[..n].fill(0.0); - sum_hi[..n].fill(0.0); + // Accumulate B-column sums for this segment (stride-1 per B row). + sum_b[..n].fill(0.0); for l in 0..seg_len { - let base_lo = (ki_base + out_lo + l) * n; - let base_hi = (ki_base + out_hi + l) * n; + let b_base = (ki_base + pos + l) * n; for j in 0..n { - sum_lo[j] += fp16_to_f32(b[base_lo + j]); - sum_hi[j] += fp16_to_f32(b[base_hi + j]); + sum_b[j] += fp16_to_f32(b[b_base + j]); } } - // One multiply per output column instead of one per weight element. + // One multiply per output column instead of one per nibble. for j in 0..n { - c_row[j] += dq_lo * sum_lo[j] + dq_hi * sum_hi[j]; + c_row[j] += dq * sum_b[j]; } pos += seg_len; remaining -= seg_len; } - byte_pos += run; + nibble_pos += run; } } @@ -361,20 +389,16 @@ fn accumulate_rle_block( /// /// For blocks in **RLE mode** (`IS_RLE = 1`) the intermediate decompressed row /// is eliminated entirely. 
[`accumulate_rle_block`] works directly over the -/// `(value, count)` pairs: within each run the dequantised weight is constant -/// across all elements in the run, so each output column `j` requires only -/// **2 multiplies per group-segment** rather than 2 per weight element: +/// nibble-level RLE entries: within each run the dequantised weight is constant +/// across all elements, so each output column `j` requires only one multiply +/// per sub-block segment rather than one per nibble: /// /// ```text -/// c[i, j] += dq_lo * Σ B[ki_lo, j] + dq_hi * Σ B[ki_hi, j] -/// ─────────────────────────────────────────── -/// summed over seg_len consecutive positions +/// c[i, j] += dq * Σ B[ki_base + pos + l, j] +/// ────────────────────────── +/// summed over seg_len nibble positions /// ``` /// -/// For a single-run block (all bytes identical) this reduces the multiply -/// count from `2 * QK_K = 512` to `2 * 4 = 8` per output column (4 groups, -/// 2 nibble levels each), while B is still read exactly once. -/// /// For blocks in **raw mode** (`IS_RLE = 0`) the block is dequantised into a /// scratch buffer and its contribution is accumulated via a saxpy loop /// (weight-outer, column-inner), which accesses B in row-major order. @@ -431,8 +455,7 @@ pub fn matmul_q4k_rle_fp16( // Scratch for raw-mode block dequantisation. let mut block_buf = [0.0f32; QK_K]; // Scratch for RLE-mode B-column sums; allocated once and reused per segment. - let mut sum_lo = vec![0.0f32; n]; - let mut sum_hi = vec![0.0f32; n]; + let mut sum_b = vec![0.0f32; n]; for i in 0..m { let c_row = &mut c[i * n..(i + 1) * n]; @@ -442,10 +465,10 @@ pub fn matmul_q4k_rle_fp16( let ki_base = b_idx * QK_K; if block.is_rle() { - // RLE path: accumulate directly from runs, no decompression. + // RLE path: accumulate directly from nibble runs, no decompression. 
accumulate_rle_block( block, b, ki_base, n, c_row, - &mut sum_lo, &mut sum_hi, + &mut sum_b, ); } else { // Raw path: dequantise once, then saxpy into c_row. @@ -585,18 +608,17 @@ mod tests { // ========================================================================= #[test] - fn block_q4k_rle_size_is_274_bytes() { - // d(2) + dmin(2) + scales(12) + flags(1) + n_pairs(1) + qs(256) = 274 bytes. - // No padding needed: struct is already 2-byte aligned and 274 is even. - assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 274); + fn block_q4k_rle_size_is_276_bytes() { + // d(2) + dmin(2) + scales(12) + flags(1) + pad(1) + n_pairs(2) + qs(256) = 276 bytes. + assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 276); } #[test] - fn block_q4k_rle_is_130_bytes_larger_than_block_q4k() { - // BlockQ4K = 144 bytes, BlockQ4KRle = 274 bytes, delta = 130. + fn block_q4k_rle_is_132_bytes_larger_than_block_q4k() { + // BlockQ4K = 144 bytes, BlockQ4KRle = 276 bytes, delta = 132. assert_eq!( core::mem::size_of::<BlockQ4KRle>(), - core::mem::size_of::<BlockQ4K>() + 130, + core::mem::size_of::<BlockQ4K>() + 132, ); } @@ -633,11 +655,11 @@ mod tests { #[test] fn rle_len_reports_pair_count_from_n_pairs() { - for n in [0usize, 1, 7, 31, 63, 128] { + for n in [0usize, 1, 7, 31, 63, 128, 256] { let b = BlockQ4KRle { d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], flags: if n > 0 { IS_RLE } else { 0 }, - n_pairs: n as u8, + n_pairs: n as u16, qs: [0; QK_K], }; assert_eq!(b.rle_len(), n, "expected rle_len {n}"); @@ -650,30 +672,37 @@ mod tests { #[test] fn encode_uniform_qs_uses_rle() { - // 128 identical bytes → 1 pair → 2 bytes stored in qs. + // All identical bytes → 256 identical nibbles → RLE mode. let src = make_block(1.0, 0.0, 1, 0, 0x77); let rle = encode(&src, 0.0); assert!(rle.is_rle(), "uniform qs should trigger RLE mode"); } #[test] - fn encode_uniform_qs_rle_len_is_one() { + fn encode_uniform_qs_rle_entry_count_is_sixteen() { + // 256 identical nibbles → max chunk size 16 → 16 entries. 
let src = make_block(1.0, 0.0, 1, 0, 0x55); let rle = encode(&src, 0.0); - assert_eq!(rle.rle_len(), 1); + assert_eq!(rle.rle_len(), 16, "256 identical nibbles → 16 chunks of 16"); } #[test] - fn encode_uniform_qs_rle_entry_is_correct() { + fn encode_uniform_qs_first_entry_is_correct() { + // 0xAB: lo nibble = 0xB = 11, hi nibble = 0xA = 10. + // Output-sequential: [0xB×32, 0xA×32] × 4 groups = 8 runs of 32. + // Each run of 32 → 2 entries of 16. + // First entry: val=0xB=11, count=16 → (11<<4)|15 = 0xBF. let src = make_block(1.0, 0.0, 1, 0, 0xAB); let rle = encode(&src, 0.0); - assert_eq!(rle.qs[0], 0xAB, "RLE value byte should equal the repeated byte"); - assert_eq!(rle.qs[1], 128, "RLE run length should be 128 bytes"); + assert!(rle.is_rle()); + assert_eq!(rle.qs[0], (0xBu8 << 4) | 0xFu8, "first entry: val=0xB, count=16"); + assert_eq!(rle.rle_len(), 16, "8 runs of 32, each split into 2 → 16 entries"); } #[test] fn encode_alternating_bytes_stays_raw() { - // Alternating 0xAA / 0x55 → 128 singleton pairs, coverage = 0%. + // Alternating 0xAA / 0x55 → nibble stream alternates 10,5,10,5,... + // No two adjacent nibbles are equal → 0% nibble coverage. // At threshold 0.01 the 0% coverage fails → raw mode. let mut qs = [0u8; QK_K / 2]; for (i, b) in qs.iter_mut().enumerate() { @@ -686,7 +715,7 @@ mod tests { #[test] fn encode_raw_mode_copies_qs_verbatim() { - // Three-byte cycle of distinct values → 128 runs of 1 byte each, + // Three-byte cycle of distinct values → no adjacent nibble repeats, // coverage = 0%. At threshold 0.01 the 0% coverage fails → raw mode. let mut qs = [0u8; QK_K / 2]; for (i, b) in qs.iter_mut().enumerate() { @@ -700,138 +729,99 @@ mod tests { } #[test] - fn encode_two_runs_uses_rle_and_stores_correct_pairs() { - // Two distinct runs: 64 bytes of 0x11 followed by 64 bytes of 0x22. - // → 2 pairs = 4 bytes. + fn encode_two_run_block_stores_correct_entries() { + // qs = [0x11×64, 0x22×64]: both nibbles of 0x11 are 1, of 0x22 are 2. 
+ // Nibble stream (output-sequential): [1×128, 2×128] → 2 runs of 128. + // Each run of 128 → 8 entries of 16 → 16 entries total. let mut qs = [0u8; QK_K / 2]; qs[..64].fill(0x11); qs[64..].fill(0x22); let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); let rle = encode(&src, 0.0); assert!(rle.is_rle()); - assert_eq!(rle.rle_len(), 2); - assert_eq!(rle.qs[0], 0x11, "first pair: value"); - assert_eq!(rle.qs[1], 64, "first pair: run length"); - assert_eq!(rle.qs[2], 0x22, "second pair: value"); - assert_eq!(rle.qs[3], 64, "second pair: run length"); + // 2 runs of 128 nibbles each → 8 entries per run → 16 total entries. + assert_eq!(rle.rle_len(), 16); + // First entry: nibble val=1, count=16 → (1<<4)|15 = 0x1F. + assert_eq!(rle.qs[0], 0x1F, "first entry should be nibble=1, count=16"); + // 9th entry (first for nibble 2): (2<<4)|15 = 0x2F. + assert_eq!(rle.qs[8], 0x2F, "9th entry should be nibble=2, count=16"); } #[test] - fn encode_63_pairs_uses_rle() { - // Build 62 runs of 2 bytes each (124 bytes) + 1 run of 4 bytes = 128 bytes. - // 63 pairs × 2 = 126 bytes; 63 ≤ 128 → RLE should be chosen. - let mut qs = [0u8; QK_K / 2]; - let mut pos = 0usize; - for run in 0..62usize { - // Use a stride-3 sequence so consecutive values are always distinct. - let v = (run as u8).wrapping_mul(3).wrapping_add(1); - qs[pos] = v; - qs[pos + 1] = v; - pos += 2; - } - // Final run: 4 bytes, value chosen to differ from the previous one. - qs[pos..].fill(0xFE); + fn encode_nibble_coverage_determines_rle_mode() { + // A block with all-same nibbles → 100% coverage → always RLE. 
+ let src_uniform = make_block(1.0, 0.0, 1, 0, 0x77); + assert!(encode(&src_uniform, 0.0).is_rle()); + assert!(encode(&src_uniform, 1.0).is_rle()); // 100% coverage meets any threshold - let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - let rle = encode(&src, 0.0); - assert!(rle.is_rle(), "63 pairs should use RLE"); - assert_eq!(rle.rle_len(), 63); - } - - #[test] - fn encode_64_pairs_uses_rle_at_zero_threshold() { - // 64 runs of 2 bytes each = 128 bytes total, coverage = 100%. - // pairs (64) ≤ 128 AND 100% ≥ 0.0 → RLE mode. - let mut qs = [0u8; QK_K / 2]; - let mut pos = 0usize; - for run in 0..64usize { - let v = (run as u8).wrapping_mul(3).wrapping_add(1); - qs[pos] = v; - qs[pos + 1] = v; - pos += 2; - } - let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - let rle = encode(&src, 0.0); - assert!(rle.is_rle(), "64 pairs, 100% coverage, threshold 0.0 → RLE"); - assert_eq!(rle.rle_len(), 64); - } - - #[test] - fn encode_128_pairs_uses_rle_at_zero_threshold() { - // 128 distinct consecutive bytes = 128 singleton runs = 128 pairs. - // With old cap (64 pairs), this was always raw. - // With new cap (128 pairs), threshold 0.0 accepts it. - // Coverage = 0 % (all singletons) → threshold > 0.0 rejects it. + // Build a block whose nibble stream has 0% coverage: + // lo = (i*2+1) % 16, hi = (i*2+2) % 16 + // Adjacent lo nibbles differ by 2 (mod 16); adjacent hi nibbles differ by 2. + // Group boundaries also do not align (verified analytically). 
let mut qs = [0u8; QK_K / 2]; for (i, b) in qs.iter_mut().enumerate() { - *b = i as u8; // 0x00, 0x01, ..., 0x7F — all distinct, all singletons + let lo = ((i * 2 + 1) % 16) as u8; + let hi = ((i * 2 + 2) % 16) as u8; + *b = lo | (hi << 4); } - let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - - assert!( - encode(&src, 0.0).is_rle(), - "128 pairs ≤ 128 limit AND 0% ≥ 0.0 → RLE at zero threshold" - ); - assert_eq!(encode(&src, 0.0).rle_len(), 128); - - assert!( - !encode(&src, 0.01).is_rle(), - "0% coverage fails any threshold > 0" - ); + let src_varied = make_block_with_qs(1.0, 0.0, 1, 0, qs); + // At 0.0 threshold: RLE (≤ 256 entries always fit, 0.0 ≥ 0.0). + assert!(encode(&src_varied, 0.0).is_rle()); + // At any positive threshold: 0% nibble coverage → raw. + assert!(!encode(&src_varied, 0.01).is_rle()); } #[test] - fn encode_coverage_threshold_rejects_low_coverage_block() { - // Construct: 63 singletons + 1 run of 65 bytes = 64 pairs. - // coverage = 65/128 ≈ 50.8%. - // threshold 0.50 accepts it; threshold 0.60 rejects it. - let mut qs = [0u8; QK_K / 2]; - qs[0] = 0x01; - for i in 1..63usize { - // Distinct odd bytes, none equal to 0x01 or adjacent values. - qs[i] = (i as u8).wrapping_mul(2).wrapping_add(5); - } - qs[63..].fill(0xAB); // 65-byte run; qs[62] = 62*2+5 = 129 → wraps to 0x81 ≠ 0xAB ✓ - - let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - assert!( - encode(&src, 0.50).is_rle(), - "50.8% coverage should meet 50% threshold" - ); - assert!( - !encode(&src, 0.60).is_rle(), - "50.8% coverage should fail 60% threshold" - ); + fn encode_nibble_max_count_is_sixteen() { + // A run of 256 identical nibbles should be stored as 16 entries of 16 nibbles. + // 0x33: lo nibble = 3, hi nibble = 3 → all 256 nibbles are 3. + let src = make_block(1.0, 0.0, 1, 0, 0x33); + let rle = encode(&src, 0.0); + assert!(rle.is_rle()); + assert_eq!(rle.rle_len(), 16); + // Each entry: nibble=3, count=16 → (3<<4)|15 = 0x3F. 
+ assert!(rle.qs[..16].iter().all(|&e| e == 0x3F)); } #[test] - fn encode_coverage_zero_threshold_always_uses_rle_when_pairs_fit() { - // Any block whose runs produce ≤ 128 pairs uses RLE at threshold 0.0, - // regardless of how many singletons it contains. - // Use the 63-pair block from encode_63_pairs_uses_rle. + fn encode_256_nibbles_fits_in_qs_at_zero_threshold() { + // Worst case: all 256 nibbles are singletons. + // Construct lo = (i*2+1) % 16, hi = (i*2+2) % 16: gives 0% coverage. let mut qs = [0u8; QK_K / 2]; - let mut pos = 0usize; - for run in 0..62usize { - let v = (run as u8).wrapping_mul(3).wrapping_add(1); - qs[pos] = v; - qs[pos + 1] = v; - pos += 2; + for (i, b) in qs.iter_mut().enumerate() { + let lo = ((i * 2 + 1) % 16) as u8; + let hi = ((i * 2 + 2) % 16) as u8; + *b = lo | (hi << 4); } - qs[pos..].fill(0xFE); let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - assert!(encode(&src, 0.0).is_rle()); + let rle = encode(&src, 0.0); + // At threshold 0.0 the block should be RLE (256 entries ≤ 256 capacity, + // and 0.0 ≥ 0.0). The dequantised output must match the baseline. + let mut got = [0.0f32; QK_K]; + let mut expected = [0.0f32; QK_K]; + dequantize_block_q4k_rle(&rle, &mut got); + dequantize_block_q4k(&src, &mut expected); + assert_slices_close(&got, &expected, 1e-5); } #[test] - fn encode_coverage_one_threshold_requires_total_coverage() { - // A block with even one singleton byte fails the 100% threshold. - // Build: 1 singleton + 1 run of 127 bytes = 2 pairs, coverage = 127/128 ≈ 99.2%. + fn encode_nibble_coverage_threshold_controls_rle_selection() { + // Build a block with moderate nibble-level coverage. + // First 32 bytes all 0x77 (both nibbles 7) → group 0 in output-sequential + // order is entirely 7 (64 nibbles in one run). The remaining bytes + // cycle through values with no adjacent nibble repeats. + // Coverage ≈ 64/256 = 25% (plus a few boundary matches, ~26-27%). 
let mut qs = [0u8; QK_K / 2]; - qs[0] = 0x01; // singleton (value distinct from rest) - qs[1..].fill(0x02); // 127-byte run + qs[..32].fill(0x77); + for i in 32..128usize { + qs[i] = ((i % 15 + 1) as u8) | (((i + 8) % 16) as u8) << 4; + } let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - assert!(!encode(&src, 1.0).is_rle(), "99.2% coverage should fail 100% threshold"); - assert!(encode(&src, 0.99).is_rle(), "99.2% coverage should meet 99% threshold"); + + // Coverage is roughly 25–27%, which is between 0.20 and 0.30. + assert!(encode(&src, 0.20).is_rle(), "~26% coverage meets 20% threshold"); + assert!(encode(&src, 0.24).is_rle(), "~26% coverage meets 24% threshold"); + assert!(!encode(&src, 0.30).is_rle(), "~26% coverage does not meet 30% threshold"); } #[test] @@ -859,30 +849,58 @@ mod tests { } #[test] - fn decode_qs_rle_expands_two_pair_stream() { - // Hand-craft an RLE block: [0xAA × 64, 0xBB × 64]. + fn decode_qs_rle_expands_two_run_stream() { + // [0xAA × 64 bytes, 0xBB × 64 bytes] in nibble-level RLE. + // 0xAA: lo=hi=0xA=10. 0xBB: lo=hi=0xB=11. + // Output-sequential: 128 nibbles of 10 (from 64 bytes of 0xAA), + // then 128 nibbles of 11 (from 64 bytes of 0xBB). + // Each run of 128 → 8 entries of 16: 0xAF × 8, then 0xBF × 8. let mut qs = [0u8; QK_K]; - qs[0] = 0xAA; qs[1] = 64; - qs[2] = 0xBB; qs[3] = 64; + for i in 0..8 { qs[i] = 0xAF; } // val=0xA, count-1=0xF → 16 nibbles each + for i in 8..16 { qs[i] = 0xBF; } let rle = BlockQ4KRle { d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], - flags: IS_RLE, n_pairs: 2, qs, + flags: IS_RLE, n_pairs: 16, qs, }; let expanded = decode_qs(&rle); + // First 64 bytes should be 0xAA, last 64 bytes should be 0xBB. 
assert!(expanded[..64].iter().all(|&b| b == 0xAA), "first 64 bytes must be 0xAA"); assert!(expanded[64..].iter().all(|&b| b == 0xBB), "last 64 bytes must be 0xBB"); } #[test] - fn decode_qs_rle_single_run_covers_all() { + fn decode_qs_rle_single_byte_value_covers_all() { + // 128 bytes of 0xCD: lo nibble = 0xD = 13, hi nibble = 0xC = 12. + // Nibble stream in output-sequential order: + // [0xD×32, 0xC×32] × 4 groups = 8 runs of 32. + // Each run of 32 → 2 entries of 16. + // Entries in pairs: DF, DF, CF, CF, DF, DF, CF, CF, ... + // i.e. (i/2) % 2 == 0 → 0xDF, else 0xCF. let mut qs = [0u8; QK_K]; - qs[0] = 0xCD; qs[1] = 128; // one run of 128 bytes + for i in 0..16 { + qs[i] = if (i / 2) % 2 == 0 { 0xDF } else { 0xCF }; + } let rle = BlockQ4KRle { d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], - flags: IS_RLE, n_pairs: 1, qs, + flags: IS_RLE, n_pairs: 16, qs, }; let expanded = decode_qs(&rle); - assert!(expanded.iter().all(|&b| b == 0xCD)); + assert!(expanded.iter().all(|&b| b == 0xCD), "all bytes should be 0xCD"); + } + + // ========================================================================= + // extract_nibbles / pack_nibbles round-trip + // ========================================================================= + + #[test] + fn extract_and_pack_nibbles_round_trip() { + let mut qs = [0u8; QK_K / 2]; + for (i, b) in qs.iter_mut().enumerate() { + *b = (i.wrapping_mul(37).wrapping_add(13) & 0xFF) as u8; + } + let nibbles = extract_nibbles(&qs); + let repacked = pack_nibbles(&nibbles); + assert_eq!(repacked, qs, "pack(extract(qs)) must equal qs"); } // ========================================================================= @@ -972,7 +990,7 @@ mod tests { #[test] fn roundtrip_raw_mode_matches_original() { - // Alternating bytes → 128 singleton pairs, coverage = 0%. + // Alternating bytes → no adjacent nibble repeats, coverage = 0%. // Use threshold 0.01 to force raw mode (0% < 0.01). 
let mut qs = [0u8; QK_K / 2]; for (i, b) in qs.iter_mut().enumerate() { @@ -1007,7 +1025,7 @@ mod tests { #[test] fn roundtrip_many_short_runs_matches_original() { - // Four distinct runs of varying lengths → still compresses. + // Four distinct byte runs → multiple nibble runs (RLE still compresses). let mut qs = [0u8; QK_K / 2]; qs[..10].fill(0x11); qs[10..30].fill(0x22); @@ -1015,8 +1033,7 @@ mod tests { qs[31..].fill(0x44); let src = make_block_with_qs(1.0, 0.5, 7, 3, qs); let rle = encode(&src, 0.0); - assert!(rle.is_rle(), "4-run block should compress"); - assert_eq!(rle.rle_len(), 4); + assert!(rle.is_rle(), "multi-run block should compress"); let mut got = [0.0f32; QK_K]; let mut expected = [0.0f32; QK_K]; @@ -1053,26 +1070,6 @@ mod tests { assert_close(got[32], 3.0, 1e-5); } - #[test] - fn roundtrip_128_singleton_pairs_matches_original() { - // All-distinct bytes → 128 pairs, 0% coverage. - // encode at threshold 0.0 → RLE; dequantize must match baseline. - let mut qs = [0u8; QK_K / 2]; - for (i, b) in qs.iter_mut().enumerate() { - *b = (i as u8).wrapping_mul(3).wrapping_add(7); - } - let src = make_block_with_qs(1.0, 0.0, 1, 0, qs); - let rle = encode(&src, 0.0); - assert!(rle.is_rle()); - assert_eq!(rle.rle_len(), 128); - - let mut got = [0.0f32; QK_K]; - let mut expected = [0.0f32; QK_K]; - dequantize_block_q4k_rle(&rle, &mut got); - dequantize_block_q4k(&src, &mut expected); - assert_slices_close(&got, &expected, 1e-5); - } - // ========================================================================= // matmul_q4k_rle_fp16 // =========================================================================