RLE now works on nibbles

Allow variable coverage
Try sorting
2026-04-12 21:26:36 -07:00 · 2026-04-12 20:51:19 -07:00 · 2026-04-12 19:51:59 -07:00
4 changed files with 533 additions and 344 deletions
--- a/benches/matmul.rs
+++ b/benches/matmul.rs
@@ -1,26 +1,30 @@
 //! # Benchmark: BlockQ4K vs BlockQ4KRle
 //!
-//! Measures three operations across two weight distributions:
+//! Measures three operations over up to three weight distributions, encoded
+//! with `min_coverage = 0.01` (a block needs ≥ 1 % of its 256 nibbles inside
+//! runs of repeated nibbles to use RLE mode).
 //!
 //! | Group        | What is timed                                       |
-//! |--------------|--------------------------------------------------|
+//! |--------------|-----------------------------------------------------|
 //! | `encode`     | BlockQ4K → BlockQ4KRle for a batch of 512 blocks    |
-//! | `dequantize` | Single-block dequantisation for all three paths  |
+//! | `dequantize` | Single-block dequantisation across all four paths   |
 //! | `matmul`     | Full A×B multiply at three matrix sizes             |
 //!
 //! ## Weight distributions
 //!
-//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
-//! Consecutive bytes almost never repeat, so each block produces ~128
-//! single-byte runs.  At 2 bytes per pair that would require ~256 bytes,
-//! which exceeds the 128-byte raw payload, so `encode` always keeps these
-//! blocks in **raw mode** (IS_RLE = 0).  This is representative of typical
-//! unstructured LLM weight matrices.
+//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
+//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
+//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
+//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
 //!
-//! **rle_optimal** — every byte in a block's qs field is the same value.
-//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
-//! and sets IS_RLE = 1.  This is the theoretical compression maximum, and
-//! is representative of highly sparse or dead-neuron weight matrices.
+//! **rle_optimal** — every qs byte is the same value.  All 256 nibbles are
+//! identical, giving 100 % coverage and just 16 nibble entries.  This is the
+//! theoretical RLE maximum and represents highly structured weight blocks.
+//!
+//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
+//! nibbles (in output-sequential order) are ever equal.  Coverage = 0 %;
+//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
+//! threshold.  Used only in the `dequantize` group to benchmark the raw path.

 use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
    s
 }

-/// Return `count` blocks whose qs bytes are pseudo-random.
+/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
 ///
-/// With uniformly distributed bytes, consecutive bytes match with probability
-/// 1/256 ≈ 0.4%, yielding ~128 runs per block.  Storing those as (value,
-/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
-/// so `encode` will always select **raw mode** (IS_RLE = 0).
+/// Adjacent nibbles match with probability 1/16, giving each block roughly
+/// 12 % nibble coverage.  At `min_coverage = 0.01` these blocks encode to
+/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
 fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
    let scales  = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {

 /// Return `count` blocks where every qs byte is the same value.
 ///
-/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
-/// instead of 128.  `encode` will always select **RLE mode** (IS_RLE = 1).
-/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
-/// avoiding degenerate cache-warm effects across the batch.
+/// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
+/// with exactly 16 entries (256 nibbles / 16 per entry).
+/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
 fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
    let scales  = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
    vec![f32_to_fp16(1.0); k * n]
 }

+/// Build one block in which no two consecutive nibbles (in output-sequential
+/// order) are ever equal, i.e. 0 % nibble coverage.
+///
+/// Byte `i` packs lo nibble `i % 16` and hi nibble `(i + 8) % 16`.  Assuming
+/// each 32-byte group emits its lo nibbles and then its hi nibbles, every
+/// stream cycles through all 16 values twice with no back-to-back repeat, and
+/// the seams differ too (lo ends on 15 → hi starts on 8; hi ends on 7 → lo on 0).
+///
+/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**.
+fn zero_coverage_block() -> BlockQ4K {
+    let scales = make_scales(7, 2);
+    let d      = f32_to_fp16(0.01);
+    let dmin   = f32_to_fp16(0.001);
+    let mut qs = [0u8; QK_K / 2]; // two nibbles are packed into each byte
+    for (i, b) in qs.iter_mut().enumerate() {
+        let lo = (i % 16) as u8;
+        let hi = ((i + 8) % 16) as u8; // offset by 8 so hi never equals lo
+        *b = lo | (hi << 4); // lo in bits 0–3, hi in bits 4–7
+    }
+    BlockQ4K { d, dmin, scales, qs }
+}
+
 // ---------------------------------------------------------------------------
 // Group 1 — encode
 // ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
 /// Number of blocks encoded per iteration in `bench_encode`.
 const ENCODE_BATCH: usize = 512;

-/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
+/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
 ///
-/// Both distributions perform the same O(128) run-length scan.  The only
-/// divergence is at the output stage:
-/// * **uniform**    — run count > 63 → fall through to memcpy of 128 bytes.
-/// * **rle_optimal** — run count = 1  → write 2 bytes and set IS_RLE.
+/// Both distributions perform the same O(256) nibble scan.  The output differs:
+/// * **uniform**    — ~12 % coverage → RLE mode, ~230–240 entries written.
+/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
 fn bench_encode(c: &mut Criterion) {
    let uniform  = uniform_blocks(ENCODE_BATCH);
    let rle_opt  = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
    group.bench_function("uniform", |b| {
        b.iter(|| {
            for blk in &uniform {
-                black_box(encode(black_box(blk)));
+                black_box(encode(black_box(blk), 0.01));
            }
        });
    });
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
    group.bench_function("rle_optimal", |b| {
        b.iter(|| {
            for blk in &rle_opt {
-                black_box(encode(black_box(blk)));
+                black_box(encode(black_box(blk), 0.01));
            }
        });
    });
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
 // Group 2 — dequantize (single block)
 // ---------------------------------------------------------------------------

-/// Compares the three single-block dequantisation code paths.
+/// Compares four single-block dequantisation code paths.
 ///
-/// | Variant          | Block type  | Encoding | Extra work vs baseline        |
-/// |------------------|-------------|----------|-------------------------------|
-/// | `q4k_baseline`   | BlockQ4K    | —        | none                          |
-/// | `rle_raw_mode`   | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
-/// | `rle_rle_mode`   | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf  |
+/// | Variant            | Block type  | Encoding  | IS_RLE | Entries |
+/// |--------------------|-------------|-----------|--------|---------|
+/// | `q4k_baseline`     | BlockQ4K    | —         | —      | —       |
+/// | `rle_raw_mode`     | BlockQ4KRle | raw       | 0      | —       |
+/// | `rle_sparse`       | BlockQ4KRle | RLE       | 1      | ~235    |
+/// | `rle_dense`        | BlockQ4KRle | RLE       | 1      | 16      |
+///
+/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
+/// stays in raw mode at any positive threshold.
+/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
+/// representative of actual trained Q4_K weight blocks.
+/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
 ///
 /// Throughput is the number of dequantised weights produced per second.
 fn bench_dequantize(c: &mut Criterion) {
+    let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
+    let q4k_zero_cov       = zero_coverage_block();
    let q4k_uniform        = uniform_blocks(1).into_iter().next().unwrap();
    let q4k_rle_opt        = rle_optimal_blocks(1).into_iter().next().unwrap();

-    let rle_raw = encode(&q4k_uniform); // IS_RLE = 0
-    let rle_rle = encode(&q4k_rle_opt); // IS_RLE = 1
+    let rle_raw    = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0  (0 % coverage)
+    let rle_sparse = encode(&q4k_uniform,  0.01); // IS_RLE = 1  (~12 % coverage)
+    let rle_dense  = encode(&q4k_rle_opt,  0.01); // IS_RLE = 1  (100 % coverage)

-    // Confirm the fixtures ended up in the right encoding modes.
-    assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode");
-    assert!(rle_rle.is_rle(),  "rle-optimal block should encode to rle mode");
+    assert!(!rle_raw.is_rle(),    "zero-coverage block must be raw mode");
+    assert!(rle_sparse.is_rle(),  "uniform block must be RLE at 0.01 threshold");
+    assert!(rle_dense.is_rle(),   "rle-optimal block must be RLE mode");

    let mut group = c.benchmark_group("dequantize");
    // Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
    group.bench_function("q4k_baseline", |b| {
        b.iter(|| {
            let mut out = [0.0f32; QK_K];
-            dequantize_block_q4k(black_box(&q4k_uniform), &mut out);
+            dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
            black_box(out)
        });
    });
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
        });
    });

-    group.bench_function("rle_rle_mode", |b| {
+    group.bench_function("rle_sparse", |b| {
        b.iter(|| {
            let mut out = [0.0f32; QK_K];
-            dequantize_block_q4k_rle(black_box(&rle_rle), &mut out);
+            dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
+            black_box(out)
+        });
+    });
+
+    group.bench_function("rle_dense", |b| {
+        b.iter(|| {
+            let mut out = [0.0f32; QK_K];
+            dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
            black_box(out)
        });
    });
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[

 /// Full matrix-multiply benchmark across weight distributions and matrix sizes.
 ///
-/// Four variants per size:
+/// Four variants per size (`min_coverage = 0.01`):
 ///
-/// | Label                | A type      | RLE mode? |
-/// |----------------------|-------------|-----------|
-/// | `baseline/uniform`   | BlockQ4K    | —         |
-/// | `rle/uniform`        | BlockQ4KRle | raw       |
-/// | `baseline/rle_opt`   | BlockQ4K    | —         |
-/// | `rle/rle_opt`        | BlockQ4KRle | rle       |
+/// | Label                | A type      | IS_RLE | Entries/block |
+/// |----------------------|-------------|--------|---------------|
+/// | `baseline/uniform`   | BlockQ4K    | —      | —             |
+/// | `rle/uniform`        | BlockQ4KRle | 1      | ~235          |
+/// | `baseline/rle_opt`   | BlockQ4K    | —      | —             |
+/// | `rle/rle_opt`        | BlockQ4KRle | 1      | 16            |
 ///
 /// Throughput is reported as multiply-accumulate operations (M × K × N) per
 /// second, allowing fair cross-size comparison.
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {

        // Build all four A variants and the shared B matrix for this config.
        let a_q4k_u: Vec<BlockQ4K>    = uniform_blocks(m * bpr);
-        let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(encode).collect();
+        let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();

        let a_q4k_r: Vec<BlockQ4K>    = rle_optimal_blocks(m * bpr);
-        let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(encode).collect();
+        let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();

        let b = fp16_ones(k, n);

--- a/src/bin/gguf_matmul.rs
+++ b/src/bin/gguf_matmul.rs
@@ -145,7 +145,7 @@ fn main() -> Result<(), Box<dyn Error>> {

    // ── RLE encode (best of `trials`) ────────────────────────────────────────
    let (rle_blocks, t_enc) = bench(trials, || -> Vec<BlockQ4KRle> {
-        blocks.iter().map(encode).collect()
+        blocks.iter().map(|b| encode(b, 0.0)).collect()
    });

    let n_rle     = rle_blocks.iter().filter(|b| b.is_rle()).count();
--- a/src/bin/gguf_scan.rs
+++ b/src/bin/gguf_scan.rs
@@ -101,11 +101,32 @@ fn fixed(s: &str, width: usize) -> String {
 fn main() -> Result<(), Box<dyn Error>> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
-        eprintln!("usage: {} <model.gguf>", args[0]);
+        eprintln!("usage: {} <model.gguf> [--threshold <0.0..1.0>]", args[0]);
+        eprintln!();
+        eprintln!("  --threshold  Minimum fraction of qs bytes that must be in runs of");
+        eprintln!("               length ≥ 2 for a block to use RLE mode.  Default: 0.0");
+        eprintln!("               (use RLE whenever the pair count fits in 64 pairs).");
        std::process::exit(1);
    }
    let path = &args[1];

+    // Parse optional --threshold flag from the remaining arguments.
+    let mut threshold = 0.0f32;
+    let mut idx = 2usize;
+    while idx < args.len() {
+        if args[idx] == "--threshold" {
+            idx += 1;
+            threshold = args.get(idx)
+                .and_then(|s| s.parse::<f32>().ok())
+                .filter(|&v| (0.0..=1.0).contains(&v))
+                .unwrap_or_else(|| {
+                    eprintln!("error: --threshold requires a value in [0.0, 1.0]");
+                    std::process::exit(1);
+                });
+        }
+        idx += 1;
+    }
+
    // ── Parse header ─────────────────────────────────────────────────────────
    eprintln!("Parsing {path} …");
    let (tensors, data_start) = parse_header(path)?;
@@ -122,6 +143,8 @@ fn main() -> Result<(), Box<dyn Error>> {
        q4k_tensors.len(),
        other_count,
    );
+    eprintln!("  RLE threshold: {threshold:.2} (blocks need ≥ {:.0}% of bytes in runs)",
+        threshold * 100.0);
    eprintln!();

    // ── Header row ───────────────────────────────────────────────────────────
@@ -145,7 +168,7 @@ fn main() -> Result<(), Box<dyn Error>> {
        let mut stats = TensorStats::new();

        for_each_block(&mut file, data_start, tensor, |block| {
-            let rle_block = encode(block);
+            let rle_block = encode(block, threshold);
            stats.observe(rle_block.is_rle(), rle_block.rle_len());
        })?;

@@ -187,10 +210,15 @@ fn main() -> Result<(), Box<dyn Error>> {

    if !any_rle {
        println!();
-        println!("No blocks compressed with RLE — all weights are effectively random at");
-        println!("the byte level, which is typical for trained Q4_K quantised weights.");
-        println!("RLE compression only helps for structured weight matrices (binary,");
-        println!("ternary, heavily pruned, or synthetic).");
+        println!("No blocks used RLE at threshold {threshold:.2}.");
+        if threshold < 0.01 {
+            println!("All weights are effectively random at the byte level — typical for");
+            println!("trained Q4_K weights.  RLE only helps for structured weight matrices");
+            println!("(binary, ternary, heavily pruned, or synthetic).");
+        } else {
+            println!("Try a lower --threshold (e.g. --threshold 0.0) to see whether any");
+            println!("blocks have enough run structure to qualify at a looser threshold.");
+        }
    }

    Ok(())
--- a/src/rle.rs
+++ b/src/rle.rs
Author	SHA1	Message	Date
charles	bba9db290e	RLE now works on nibbles	2026-04-12 21:26:36 -07:00
charles	3fb10b78e3	Allow variable coverage	2026-04-12 20:51:19 -07:00
charles	59b5eade7e	Try sorting	2026-04-12 19:51:59 -07:00