RLE now works on nibbles

2026-04-12 21:26:36 -07:00
parent 3fb10b78e3
commit bba9db290e
2 changed files with 398 additions and 360 deletions


@@ -1,26 +1,30 @@
//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//! Measures three operations across three weight distributions, encoded with
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
//! runs to use RLE mode).
//!
//! | Group | What is timed |
//! |--------------|--------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//! | Group | What is timed |
//! |--------------|-----------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation across all four paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.
//! **rle_optimal** — every qs byte is the same value, so the nibble stream
//! collapses into maximal runs: 100 % coverage and just 16 nibble entries.
//! This is the theoretical RLE maximum and represents highly structured
//! weight blocks.
//!
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
s
}
/// Return `count` blocks whose qs bytes are pseudo-random.
/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
///
/// With uniformly distributed bytes, consecutive bytes match with probability
/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value,
/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
/// so `encode` will always select **raw mode** (IS_RLE = 0).
/// Adjacent nibbles match with probability 1/16, giving each block roughly
/// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to
/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
let scales = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
/// Return `count` blocks where every qs byte is the same value.
///
/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1).
/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
/// avoiding degenerate cache-warm effects across the batch.
/// The nibble stream collapses into maximal runs → 100 % nibble coverage →
/// always **RLE mode** with exactly 16 entries (256 nibbles / 16 per entry).
/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
let scales = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
vec![f32_to_fp16(1.0); k * n]
}
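// Observation, not something the benchmarks depend on: with B filled with
// 1.0, each element of A×B is simply the sum of one dequantised row of A,
// which makes the matmul variants easy to spot-check for agreement.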
/// Build one block whose nibbles cycle so that no two consecutive nibbles
/// (in output-sequential order) are ever equal → 0 % nibble coverage.
///
/// Lo nibble of byte `i` = `i % 16`; hi nibble = `(i + 8) % 16`.
/// Within every 32-byte group the lo and hi streams each visit all 16 values
/// twice without repetition, and across group boundaries the last nibble of
/// one stream differs from the first nibble of the next.
///
/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**.
fn zero_coverage_block() -> BlockQ4K {
let scales = make_scales(7, 2);
let d = f32_to_fp16(0.01);
let dmin = f32_to_fp16(0.001);
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
let lo = (i % 16) as u8;
let hi = ((i + 8) % 16) as u8;
*b = lo | (hi << 4);
}
BlockQ4K { d, dmin, scales, qs }
}
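/// Illustrative sanity check for the fixture above (a sketch, not wired into
/// any benchmark; the lo-then-hi walk assumes the standard Q4_K output order
/// described in the doc comment). It rebuilds the output-sequential nibble
/// stream and asserts that no two consecutive nibbles match.
#[allow(dead_code)]
fn assert_zero_coverage(qs: &[u8; QK_K / 2]) {
    let mut seq = Vec::with_capacity(QK_K);
    for group in qs.chunks_exact(32) {
        for b in group {
            seq.push(b & 0x0F); // the group's 32 lo nibbles come first…
        }
        for b in group {
            seq.push(b >> 4); // …followed by its 32 hi nibbles
        }
    }
    for pair in seq.windows(2) {
        assert_ne!(pair[0], pair[1], "zero-coverage fixture has a repeat");
    }
}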
// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
/// Number of blocks encoded per iteration in `bench_encode`.
const ENCODE_BATCH: usize = 512;
/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
///
/// Both distributions perform the same O(128) run-length scan. The only
/// divergence is at the output stage:
/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes.
/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE.
/// Both distributions perform the same O(256) nibble scan. The output differs:
/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
fn bench_encode(c: &mut Criterion) {
let uniform = uniform_blocks(ENCODE_BATCH);
let rle_opt = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("uniform", |b| {
b.iter(|| {
for blk in &uniform {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("rle_optimal", |b| {
b.iter(|| {
for blk in &rle_opt {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------
/// Compares the three single-block dequantisation code paths.
/// Compares four single-block dequantisation code paths.
///
/// | Variant | Block type | Encoding | Extra work vs baseline |
/// |------------------|-------------|----------|-------------------------------|
/// | `q4k_baseline` | BlockQ4K | — | none |
/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf |
/// | Variant | Block type | Encoding | IS_RLE | Entries |
/// |--------------------|-------------|-----------|--------|---------|
/// | `q4k_baseline` | BlockQ4K | — | — | — |
/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
///
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
/// stays in raw mode at any positive threshold.
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
/// representative of actual trained Q4_K weight blocks.
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
///
/// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) {
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
let q4k_zero_cov = zero_coverage_block();
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let rle_raw = encode(&q4k_uniform, 0.0); // IS_RLE = 0
let rle_rle = encode(&q4k_rle_opt, 0.0); // IS_RLE = 1
let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
// Confirm the fixtures ended up in the right encoding modes.
assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode");
assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode");
assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
let mut group = c.benchmark_group("dequantize");
// Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
group.bench_function("q4k_baseline", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k(black_box(&q4k_uniform), &mut out);
dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
black_box(out)
});
});
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
});
});
group.bench_function("rle_rle_mode", |b| {
group.bench_function("rle_sparse", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_rle), &mut out);
dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
black_box(out)
});
});
group.bench_function("rle_dense", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
black_box(out)
});
});
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[
/// Full matrix-multiply benchmark across weight distributions and matrix sizes.
///
/// Four variants per size:
/// Four variants per size (`min_coverage = 0.01`):
///
/// | Label | A type | RLE mode? |
/// |----------------------|-------------|-----------|
/// | `baseline/uniform` | BlockQ4K | — |
/// | `rle/uniform` | BlockQ4KRle | raw |
/// | `baseline/rle_opt` | BlockQ4K | — |
/// | `rle/rle_opt` | BlockQ4KRle | rle |
/// | Label | A type | IS_RLE | Entries/block |
/// |----------------------|-------------|--------|---------------|
/// | `baseline/uniform` | BlockQ4K | — | — |
/// | `rle/uniform` | BlockQ4KRle | 1 | ~235 |
/// | `baseline/rle_opt` | BlockQ4K | — | — |
/// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 |
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, allowing fair cross-size comparison.
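///
/// For example (an illustrative size, since the `CONFIGS` entries are not
/// shown here): at M = K = N = 256 one multiply performs 256³ ≈ 16.8 M
/// multiply-accumulates, so an iteration taking 1 ms would be reported as
/// ≈ 16.8 G MACs per second.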
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {
// Build all four A variants and the shared B matrix for this config.
let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr);
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr);
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
let b = fp16_ones(k, n);