RLE now works on nibbles

This commit is contained in:
2026-04-12 21:26:36 -07:00
parent 3fb10b78e3
commit bba9db290e
2 changed files with 398 additions and 360 deletions

View File

@@ -1,26 +1,30 @@
//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//! Measures three operations across three weight distributions, encoded with
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
//! runs to use RLE mode).
//!
//! | Group | What is timed |
//! |--------------|--------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//! | Group | What is timed |
//! |--------------|-----------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation across all four paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.
//! **rle_optimal** — every qs byte is the same value. All 256 nibbles are
//! identical, giving 100 % coverage and just 16 nibble entries. This is the
//! theoretical RLE maximum and represents highly structured weight blocks.
//!
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
s
}
/// Return `count` blocks whose qs bytes are pseudo-random.
/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
///
/// With uniformly distributed bytes, consecutive bytes match with probability
/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value,
/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
/// so `encode` will always select **raw mode** (IS_RLE = 0).
/// Adjacent nibbles match with probability 1/16, giving each block roughly
/// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to
/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
let scales = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
/// Return `count` blocks where every qs byte is the same value.
///
/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1).
/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
/// avoiding degenerate cache-warm effects across the batch.
/// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
/// with exactly 16 entries (256 nibbles / 16 per entry).
/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
let scales = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
vec![f32_to_fp16(1.0); k * n]
}
/// Build one block whose nibbles cycle so that no two consecutive nibbles
/// (in output-sequential order) are ever equal → 0 % nibble coverage.
///
/// Lo nibble of byte `i` = `i % 16`; hi nibble = `(i + 8) % 16`.
/// Within every 32-byte group the lo and hi streams each visit all 16 values
/// twice without repetition, and across group boundaries the last nibble of
/// one stream differs from the first nibble of the next.
///
/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**.
fn zero_coverage_block() -> BlockQ4K {
    let mut qs = [0u8; QK_K / 2];
    for (i, byte) in qs.iter_mut().enumerate() {
        let lo_nibble = (i % 16) as u8;
        let hi_nibble = ((i + 8) % 16) as u8;
        *byte = (hi_nibble << 4) | lo_nibble;
    }
    BlockQ4K {
        d: f32_to_fp16(0.01),
        dmin: f32_to_fp16(0.001),
        scales: make_scales(7, 2),
        qs,
    }
}
// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
/// Number of blocks encoded per iteration in `bench_encode`.
const ENCODE_BATCH: usize = 512;
/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
///
/// Both distributions perform the same O(128) run-length scan. The only
/// divergence is at the output stage:
/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes.
/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE.
/// Both distributions perform the same O(256) nibble scan. The output differs:
/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
fn bench_encode(c: &mut Criterion) {
let uniform = uniform_blocks(ENCODE_BATCH);
let rle_opt = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("uniform", |b| {
b.iter(|| {
for blk in &uniform {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("rle_optimal", |b| {
b.iter(|| {
for blk in &rle_opt {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------
/// Compares the three single-block dequantisation code paths.
/// Compares four single-block dequantisation code paths.
///
/// | Variant | Block type | Encoding | Extra work vs baseline |
/// |------------------|-------------|----------|-------------------------------|
/// | `q4k_baseline` | BlockQ4K | — | none |
/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf |
/// | Variant | Block type | Encoding | IS_RLE | Entries |
/// |--------------------|-------------|-----------|--------|---------|
/// | `q4k_baseline` | BlockQ4K | — | — | — |
/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
///
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
/// stays in raw mode at any positive threshold.
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
/// representative of actual trained Q4_K weight blocks.
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
///
/// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) {
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
let q4k_zero_cov = zero_coverage_block();
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let rle_raw = encode(&q4k_uniform, 0.0); // IS_RLE = 0
let rle_rle = encode(&q4k_rle_opt, 0.0); // IS_RLE = 1
let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
// Confirm the fixtures ended up in the right encoding modes.
assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode");
assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode");
assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
let mut group = c.benchmark_group("dequantize");
// Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
group.bench_function("q4k_baseline", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k(black_box(&q4k_uniform), &mut out);
dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
black_box(out)
});
});
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
});
});
group.bench_function("rle_rle_mode", |b| {
group.bench_function("rle_sparse", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_rle), &mut out);
dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
black_box(out)
});
});
group.bench_function("rle_dense", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
black_box(out)
});
});
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[
/// Full matrix-multiply benchmark across weight distributions and matrix sizes.
///
/// Four variants per size:
/// Four variants per size (`min_coverage = 0.01`):
///
/// | Label | A type | RLE mode? |
/// |----------------------|-------------|-----------|
/// | `baseline/uniform` | BlockQ4K | — |
/// | `rle/uniform` | BlockQ4KRle | raw |
/// | `baseline/rle_opt` | BlockQ4K | — |
/// | `rle/rle_opt` | BlockQ4KRle | rle |
/// | Label | A type | IS_RLE | Entries/block |
/// |----------------------|-------------|--------|---------------|
/// | `baseline/uniform` | BlockQ4K | — | — |
/// | `rle/uniform` | BlockQ4KRle | 1 | ~235 |
/// | `baseline/rle_opt` | BlockQ4K | — | — |
/// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 |
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, allowing fair cross-size comparison.
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {
// Build all four A variants and the shared B matrix for this config.
let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr);
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr);
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
let b = fp16_ones(k, n);

View File

@@ -1,26 +1,29 @@
//! RLE-optional Q4_K super-block encoding.
//!
//! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that
//! optionally compresses the 128-byte weight payload using **byte-level
//! optionally compresses the 128-byte weight payload using **nibble-level
//! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`]
//! field indicates which mode is active:
//!
//! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of `(value, count)` byte-pairs |
//! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of single-byte entries `(nibble_val << 4 | count_minus_1)` |
//!
//! ## RLE format (when `IS_RLE` = 1)
//!
//! - `n_pairs` gives the number of `(value, count)` pairs stored in `qs`.
//! - For each pair `i`:
//! - `qs[2*i]` — the byte value (two packed 4-bit weights, same packing
//! as the raw format).
//! - `qs[2*i + 1]` — the run length in bytes (1..=255).
//! - The run lengths must sum to exactly 128 (the uncompressed `qs` size).
//! - `n_pairs` gives the number of nibble-level RLE entries stored in `qs`.
//! - For each entry `i` (one byte each):
//!   - bits 7–4: nibble value (0–15)
//!   - bits 3–0: `count - 1` (0–15, meaning run length 1–16)
//! - The run lengths must sum to exactly 256 (the number of nibbles in QK_K).
//!
//! The 256-byte `qs` field can hold up to 128 `(value, count)` pairs — enough
//! to represent even fully-random blocks where every byte differs from its
//! Nibbles are read in output-sequential order: for each 32-byte group, first
//! all 32 lo nibbles, then all 32 hi nibbles. The nibble at stream position
//! `p` maps directly to output element `p` in sub-block `p / 32`.
//!
//! The 256-byte `qs` field can hold up to 256 single-byte entries — enough to
//! represent even fully-random blocks where every nibble differs from its
//! neighbour.
//!
//! ## Constructing blocks
@@ -45,12 +48,12 @@ pub const IS_RLE: u8 = 0x01;
// Block definition
// ---------------------------------------------------------------------------
/// A Q4_K super-block with optional byte-level RLE compression on the weights.
/// A Q4_K super-block with optional nibble-level RLE compression on the weights.
///
/// Unlike [`crate::BlockQ4K`], this format is **not** binary-compatible with
/// the GGUF on-disk layout. It uses a 256-byte `qs` field (vs the 128-byte
/// field in `BlockQ4K`) so the RLE stream can store up to 128 `(value, count)`
/// pairs — enough to represent even fully-random blocks where every byte
/// field in `BlockQ4K`) so the RLE stream can store up to 256 single-byte
/// entries — enough to represent even fully-random blocks where every nibble
/// differs from its neighbour.
///
/// Memory layout (`repr C`):
@@ -61,17 +64,18 @@ pub const IS_RLE: u8 = 0x01;
/// | 2 | `dmin` | 2 B | fp16 super-block min-scale |
/// | 4 | `scales` | 12 B | packed 6-bit sub-block params |
/// | 16     | `flags`   | 1 B   | bit 0 = `IS_RLE`; bits 1–7 unused   |
/// | 17 | `n_pairs` | 1 B | RLE pair count (0 when raw) |
/// | 18 | `qs` | 256 B | raw nibbles (first 128 B) or RLE |
/// | 17 | (pad) | 1 B | alignment padding for `n_pairs` |
/// | 18 | `n_pairs` | 2 B | RLE entry count (0 when raw) |
/// | 20 | `qs` | 256 B | raw nibbles (first 128 B) or RLE |
///
/// **sizeof = 274 bytes.**
/// **sizeof = 276 bytes.**
///
/// ## `qs` interpretation
///
/// | `IS_RLE` | Meaning |
/// |----------|--------------------------------------------------------------|
/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) |
/// | 1 | `qs[0..n_pairs*2]` holds `(value, count)` byte-pairs |
/// | `IS_RLE` | Meaning |
/// |----------|---------------------------------------------------------------------|
/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) |
/// | 1 | `qs[0..n_pairs]` holds nibble-level RLE entries (1 byte each) |
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct BlockQ4KRle {
@@ -80,11 +84,11 @@ pub struct BlockQ4KRle {
pub scales: [u8; K_SCALE_SIZE],
/// Encoding flags. Only bit 0 (`IS_RLE`) is used; bits 1-7 are reserved.
pub flags: u8,
/// When `IS_RLE` is set: number of `(value, count)` byte-pairs in `qs`.
/// When `IS_RLE` is set: number of nibble-level RLE entries in `qs`.
/// Zero when in raw mode.
pub n_pairs: u8,
/// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or RLE stream
/// (IS_RLE = 1, first `n_pairs * 2` bytes).
pub n_pairs: u16,
/// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or nibble-level
/// RLE stream (IS_RLE = 1, first `n_pairs` bytes; one byte per entry).
pub qs: [u8; QK_K], // 256 bytes
}
@@ -95,7 +99,7 @@ impl BlockQ4KRle {
self.flags & IS_RLE != 0
}
/// Number of `(value, count)` byte-pairs in `qs`.
/// Number of nibble-level RLE entries in `qs`.
/// Only meaningful when `is_rle()` is true.
#[inline]
pub fn rle_len(&self) -> usize {
@@ -103,74 +107,112 @@ impl BlockQ4KRle {
}
}
// ---------------------------------------------------------------------------
// Nibble extraction / packing helpers
// ---------------------------------------------------------------------------
/// Extract all 256 nibbles from a 128-byte `qs` payload in output-sequential
/// order: for each 32-byte group, first all 32 lo nibbles, then all 32 hi
/// nibbles. Nibble at position `p` maps to output element `p`.
fn extract_nibbles(raw: &[u8; QK_K / 2]) -> [u8; QK_K] {
    let mut nibbles = [0u8; QK_K];
    // Each 32-byte group expands to 64 nibbles: lo nibbles first, then hi.
    for (g, group) in raw.chunks_exact(32).enumerate() {
        let base = g * 64;
        for (l, &byte) in group.iter().enumerate() {
            nibbles[base + l] = byte & 0x0F; // lo nibble
            nibbles[base + 32 + l] = byte >> 4; // hi nibble
        }
    }
    nibbles
}
/// Inverse of [`extract_nibbles`]: pack a 256-nibble output-sequential array
/// back into the 128-byte `qs` layout.
fn pack_nibbles(nibbles: &[u8; QK_K]) -> [u8; QK_K / 2] {
    let mut raw = [0u8; QK_K / 2];
    // Each 32-byte group packs 64 nibbles: positions base..base+32 are lo,
    // base+32..base+64 are hi.
    for (g, group) in raw.chunks_exact_mut(32).enumerate() {
        let base = g * 64;
        for (l, byte) in group.iter_mut().enumerate() {
            *byte = nibbles[base + l] | (nibbles[base + 32 + l] << 4);
        }
    }
    raw
}
// ---------------------------------------------------------------------------
// Encoding
// ---------------------------------------------------------------------------
/// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block.
///
/// The `qs` payload is scanned for runs of equal consecutive bytes. RLE mode
/// is chosen when **both** conditions hold:
/// The `qs` payload is scanned for runs of equal consecutive nibbles in
/// output-sequential order. RLE mode is chosen when **both** conditions hold:
///
/// 1. **Coverage**: at least `min_coverage` fraction of the 128 `qs` bytes
/// belong to runs of length ≥ 2. These are the bytes whose weights can be
/// batched in `accumulate_rle_block`, replacing `2 * run_len` multiplies
/// with just 2 per group-segment.
/// 1. **Coverage**: at least `min_coverage` fraction of the 256 nibbles
/// belong to runs of length ≥ 2. These are the nibbles whose weights can
/// be batched in `accumulate_rle_block`, replacing one multiply per nibble
/// with one multiply per output column per segment.
///
/// 2. **Capacity**: the pair count does not exceed 128 (the physical limit of
/// the 256-byte `qs` field at 2 bytes per pair).
/// 2. **Capacity**: the entry count does not exceed 256 (the physical limit of
/// the 256-byte `qs` field at 1 byte per entry).
///
/// | `min_coverage` | Effect |
/// |----------------|------------------------------------------------------|
/// | `0.0` | RLE whenever pairs fit (≤ 128), regardless of runs |
/// | `0.5` | RLE only if ≥ 50 % of bytes are in repeated runs |
/// | `1.0` | RLE only when every byte is part of a run |
/// | `min_coverage` | Effect |
/// |----------------|-------------------------------------------------------|
/// | `0.0` | RLE whenever entries fit (≤ 256), regardless of runs |
/// | `0.5` | RLE only if ≥ 50 % of nibbles are in repeated runs |
/// | `1.0` | RLE only when every nibble is part of a run |
pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
debug_assert!(
(0.0..=1.0).contains(&min_coverage),
"min_coverage must be in [0.0, 1.0], got {min_coverage}"
);
let raw = &block.qs; // [u8; 128]
let nibbles = extract_nibbles(&block.qs);
// Scan for runs of equal consecutive bytes.
// Track long_run_bytes: bytes in runs of length ≥ 2 (the bytes that
// benefit from RLE in the matmul).
let mut pairs: Vec<(u8, u8)> = Vec::with_capacity(QK_K / 2);
let mut long_run_bytes = 0usize;
let mut i = 0usize;
while i < raw.len() {
let val = raw[i];
let mut run = 1u8;
while i + (run as usize) < raw.len()
&& raw[i + (run as usize)] == val
&& run < u8::MAX
{
// Scan for runs of equal consecutive nibbles.
let mut entries = Vec::<u8>::with_capacity(QK_K);
let mut long_run_nibbles = 0usize;
let mut i = 0usize;
while i < QK_K {
let val = nibbles[i];
let mut run = 0usize;
while i + run < QK_K && nibbles[i + run] == val {
run += 1;
}
pairs.push((val, run));
if run >= 2 {
long_run_bytes += run as usize;
long_run_nibbles += run;
}
i += run as usize;
// Split runs longer than 16 into max-16 chunks (4-bit count field).
let mut rem = run;
while rem > 0 {
let chunk = rem.min(16);
entries.push((val << 4) | ((chunk - 1) as u8));
rem -= chunk;
}
i += run;
}
// Coverage: fraction of qs bytes that are in non-singleton runs.
let coverage = long_run_bytes as f32 / raw.len() as f32;
// Coverage: fraction of the 256 nibbles that are in non-singleton runs.
let coverage = long_run_nibbles as f32 / QK_K as f32;
if pairs.len() <= QK_K / 2 && coverage >= min_coverage {
let n = pairs.len();
// Use RLE when entries fit in qs (≤ 256) and coverage meets the threshold.
if entries.len() <= QK_K && coverage >= min_coverage {
let n = entries.len();
let mut qs = [0u8; QK_K];
for (k, &(val, count)) in pairs.iter().enumerate() {
qs[2 * k] = val;
qs[2 * k + 1] = count;
}
qs[..n].copy_from_slice(&entries);
BlockQ4KRle {
d: block.d,
dmin: block.dmin,
scales: block.scales,
flags: IS_RLE,
n_pairs: n as u8,
n_pairs: n as u16,
qs,
}
} else {
@@ -196,29 +238,32 @@ pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
///
/// # Panics (debug builds only)
///
/// Panics if the decoded RLE nibble stream does not sum to exactly 256 nibbles.
fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
    if !block.is_rle() {
        // Raw mode: the first QK_K/2 bytes of qs hold the packed nibbles.
        return block.qs[..QK_K / 2].try_into().unwrap();
    }
    // RLE mode: expand each single-byte entry (val << 4 | count - 1) into its
    // run of nibbles, then pack back into the 128-byte layout.
    let n = block.rle_len();
    let mut nibbles = [0u8; QK_K];
    let mut pos = 0usize;
    for i in 0..n {
        let entry = block.qs[i];
        let val = entry >> 4; // bits 7-4: nibble value
        let count = (entry & 0x0F) as usize + 1; // bits 3-0: count - 1
        nibbles[pos..pos + count].fill(val);
        pos += count;
    }
    debug_assert_eq!(
        pos,
        QK_K,
        "nibble RLE lengths sum to {pos}, expected {QK_K}"
    );
    pack_nibbles(&nibbles)
}
// ---------------------------------------------------------------------------
@@ -227,9 +272,9 @@ fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
/// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values.
///
/// When `IS_RLE` is set the RLE stream is first expanded into a 128-byte raw
/// buffer; thereafter the dequantisation is identical to
/// [`crate::dequantize_block_q4k`]:
/// When `IS_RLE` is set the RLE stream is first expanded into a 256-nibble
/// buffer and packed back into a 128-byte raw representation; thereafter the
/// dequantisation is identical to [`crate::dequantize_block_q4k`]:
///
/// ```text
/// out[i] = d * scale[s] * nibble[i] - dmin * min[s]
@@ -273,86 +318,69 @@ pub fn dequantize_block_q4k_rle(block: &BlockQ4KRle, out: &mut [f32; QK_K]) {
/// Accumulate the contribution of one RLE-encoded block into `c_row`.
///
/// With nibble-level RLE and output-sequential ordering, nibble position `p`
/// maps directly to output element `p` in sub-block `p / 32`. For each entry
/// the dequantised weight `dq` is constant within each sub-block segment, so
/// the per-output-column contribution reduces to:
///
/// ```text
/// c_row[j] += dq * Σ_{l in seg} B[ki_base + pos + l, j]
/// ```
///
/// A run that crosses a 32-nibble sub-block boundary is split at the boundary;
/// each resulting segment is handled independently.
///
/// `sum_b` is a caller-provided scratch slice (length `≥ n`) reused across
/// calls to avoid repeated allocation.
fn accumulate_rle_block(
    block: &BlockQ4KRle,
    b: &[u16],
    ki_base: usize, // first B-row index for this block (= b_idx * QK_K)
    n: usize,
    c_row: &mut [f32],
    sum_b: &mut [f32], // scratch, length ≥ n
) {
    let d = fp16_to_f32(block.d);
    let dmin = fp16_to_f32(block.dmin);
    let mut nibble_pos = 0usize; // current position in the 256-nibble output stream
    for p in 0..block.rle_len() {
        let entry = block.qs[p];
        let val = (entry >> 4) as f32; // nibble value 0-15
        let run = (entry & 0x0F) as usize + 1; // count 1-16
        let mut remaining = run;
        let mut pos = nibble_pos;
        while remaining > 0 {
            // Sub-block at this position; split at sub-block boundaries
            // (every 32 nibbles) so the scale/min stays constant per segment.
            let sub_block = pos / 32; // 0..8
            let in_sb = pos % 32;
            let seg_len = remaining.min(32 - in_sb);
            let (sc, mn) = get_scale_min(sub_block, &block.scales);
            let dq = d * sc as f32 * val - dmin * mn as f32;
            // Accumulate B-column sums for this segment (stride-1 per B row —
            // cache-friendly).
            sum_b[..n].fill(0.0);
            for l in 0..seg_len {
                let b_base = (ki_base + pos + l) * n;
                for j in 0..n {
                    sum_b[j] += fp16_to_f32(b[b_base + j]);
                }
            }
            // One multiply per output column instead of one per nibble.
            for j in 0..n {
                c_row[j] += dq * sum_b[j];
            }
            pos += seg_len;
            remaining -= seg_len;
        }
        nibble_pos += run;
    }
}
@@ -361,20 +389,16 @@ fn accumulate_rle_block(
///
/// For blocks in **RLE mode** (`IS_RLE = 1`) the intermediate decompressed row
/// is eliminated entirely. [`accumulate_rle_block`] works directly over the
/// `(value, count)` pairs: within each run the dequantised weight is constant
/// across all elements in the run, so each output column `j` requires only
/// **2 multiplies per group-segment** rather than 2 per weight element:
/// nibble-level RLE entries: within each run the dequantised weight is constant
/// across all elements, so each output column `j` requires only one multiply
/// per sub-block segment rather than one per nibble:
///
/// ```text
/// c[i, j] += dq_lo * Σ B[ki_lo, j] + dq_hi * Σ B[ki_hi, j]
/// ───────────────────────────────────────────
/// summed over seg_len consecutive positions
/// c[i, j] += dq * Σ B[ki_base + pos + l, j]
/// ──────────────────────────
/// summed over seg_len nibble positions
/// ```
///
/// For a single-run block (all bytes identical) this reduces the multiply
/// count from `2 * QK_K = 512` to `2 * 4 = 8` per output column (4 groups,
/// 2 nibble levels each), while B is still read exactly once.
///
/// For blocks in **raw mode** (`IS_RLE = 0`) the block is dequantised into a
/// scratch buffer and its contribution is accumulated via a saxpy loop
/// (weight-outer, column-inner), which accesses B in row-major order.
@@ -431,8 +455,7 @@ pub fn matmul_q4k_rle_fp16(
// Scratch for raw-mode block dequantisation.
let mut block_buf = [0.0f32; QK_K];
// Scratch for RLE-mode B-column sums; allocated once and reused per segment.
let mut sum_lo = vec![0.0f32; n];
let mut sum_hi = vec![0.0f32; n];
let mut sum_b = vec![0.0f32; n];
for i in 0..m {
let c_row = &mut c[i * n..(i + 1) * n];
@@ -442,10 +465,10 @@ pub fn matmul_q4k_rle_fp16(
let ki_base = b_idx * QK_K;
if block.is_rle() {
// RLE path: accumulate directly from runs, no decompression.
// RLE path: accumulate directly from nibble runs, no decompression.
accumulate_rle_block(
block, b, ki_base, n, c_row,
&mut sum_lo, &mut sum_hi,
&mut sum_b,
);
} else {
// Raw path: dequantise once, then saxpy into c_row.
@@ -585,18 +608,17 @@ mod tests {
// =========================================================================
#[test]
fn block_q4k_rle_size_is_276_bytes() {
    // d(2) + dmin(2) + scales(12) + flags(1) + pad(1) + n_pairs(2) + qs(256) = 276 bytes.
    // The 1-byte pad aligns the u16 `n_pairs` field under repr(C).
    assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 276);
}
#[test]
fn block_q4k_rle_is_132_bytes_larger_than_block_q4k() {
    // BlockQ4K = 144 bytes, BlockQ4KRle = 276 bytes, delta = 132.
    assert_eq!(
        core::mem::size_of::<BlockQ4KRle>(),
        core::mem::size_of::<BlockQ4K>() + 132,
    );
}
@@ -633,11 +655,11 @@ mod tests {
#[test]
fn rle_len_reports_pair_count_from_n_pairs() {
for n in [0usize, 1, 7, 31, 63, 128] {
for n in [0usize, 1, 7, 31, 63, 128, 256] {
let b = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: if n > 0 { IS_RLE } else { 0 },
n_pairs: n as u8,
n_pairs: n as u16,
qs: [0; QK_K],
};
assert_eq!(b.rle_len(), n, "expected rle_len {n}");
@@ -650,30 +672,37 @@ mod tests {
#[test]
fn encode_uniform_qs_uses_rle() {
    // All identical bytes → 256 identical nibbles → RLE mode.
    let src = make_block(1.0, 0.0, 1, 0, 0x77);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle(), "uniform qs should trigger RLE mode");
}
#[test]
fn encode_uniform_qs_rle_len_is_one() {
fn encode_uniform_qs_rle_entry_count_is_sixteen() {
// 256 identical nibbles → max chunk size 16 → 16 entries.
let src = make_block(1.0, 0.0, 1, 0, 0x55);
let rle = encode(&src, 0.0);
assert_eq!(rle.rle_len(), 1);
assert_eq!(rle.rle_len(), 16, "256 identical nibbles → 16 chunks of 16");
}
#[test]
fn encode_uniform_qs_rle_entry_is_correct() {
fn encode_uniform_qs_first_entry_is_correct() {
// 0xAB: lo nibble = 0xB = 11, hi nibble = 0xA = 10.
// Output-sequential: [0xB×32, 0xA×32] × 4 groups = 8 runs of 32.
// Each run of 32 → 2 entries of 16.
// First entry: val=0xB=11, count=16 → (11<<4)|15 = 0xBF.
let src = make_block(1.0, 0.0, 1, 0, 0xAB);
let rle = encode(&src, 0.0);
assert_eq!(rle.qs[0], 0xAB, "RLE value byte should equal the repeated byte");
assert_eq!(rle.qs[1], 128, "RLE run length should be 128 bytes");
assert!(rle.is_rle());
assert_eq!(rle.qs[0], (0xBu8 << 4) | 0xFu8, "first entry: val=0xB, count=16");
assert_eq!(rle.rle_len(), 16, "8 runs of 32, each split into 2 → 16 entries");
}
#[test]
fn encode_alternating_bytes_stays_raw() {
// Alternating 0xAA / 0x55 → 128 singleton pairs, coverage = 0%.
// Alternating 0xAA / 0x55 → nibble stream alternates 10,5,10,5,...
// No two adjacent nibbles are equal → 0% nibble coverage.
// At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -686,7 +715,7 @@ mod tests {
#[test]
fn encode_raw_mode_copies_qs_verbatim() {
// Three-byte cycle of distinct values → 128 runs of 1 byte each,
// Three-byte cycle of distinct values → no adjacent nibble repeats,
// coverage = 0%. At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -700,138 +729,99 @@ mod tests {
}
#[test]
fn encode_two_runs_uses_rle_and_stores_correct_pairs() {
// Two distinct runs: 64 bytes of 0x11 followed by 64 bytes of 0x22.
// → 2 pairs = 4 bytes.
fn encode_two_run_block_stores_correct_entries() {
// qs = [0x11×64, 0x22×64]: both nibbles of 0x11 are 1, of 0x22 are 2.
// Nibble stream (output-sequential): [1×128, 2×128] → 2 runs of 128.
// Each run of 128 → 8 entries of 16 → 16 entries total.
let mut qs = [0u8; QK_K / 2];
qs[..64].fill(0x11);
qs[64..].fill(0x22);
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 2);
assert_eq!(rle.qs[0], 0x11, "first pair: value");
assert_eq!(rle.qs[1], 64, "first pair: run length");
assert_eq!(rle.qs[2], 0x22, "second pair: value");
assert_eq!(rle.qs[3], 64, "second pair: run length");
// 2 runs of 128 nibbles each → 8 entries per run → 16 total entries.
assert_eq!(rle.rle_len(), 16);
// First entry: nibble val=1, count=16 → (1<<4)|15 = 0x1F.
assert_eq!(rle.qs[0], 0x1F, "first entry should be nibble=1, count=16");
// 9th entry (first for nibble 2): (2<<4)|15 = 0x2F.
assert_eq!(rle.qs[8], 0x2F, "9th entry should be nibble=2, count=16");
}
#[test]
fn encode_63_pairs_uses_rle() {
// Build 62 runs of 2 bytes each (124 bytes) + 1 run of 4 bytes = 128 bytes.
// 63 pairs × 2 = 126 bytes; 63 ≤ 128 → RLE should be chosen.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..62usize {
// Use a stride-3 sequence so consecutive values are always distinct.
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
}
// Final run: 4 bytes, value chosen to differ from the previous one.
qs[pos..].fill(0xFE);
fn encode_nibble_coverage_determines_rle_mode() {
// A block with all-same nibbles → 100% coverage → always RLE.
let src_uniform = make_block(1.0, 0.0, 1, 0, 0x77);
assert!(encode(&src_uniform, 0.0).is_rle());
assert!(encode(&src_uniform, 1.0).is_rle()); // 100% coverage meets any threshold
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "63 pairs should use RLE");
assert_eq!(rle.rle_len(), 63);
}
#[test]
fn encode_64_pairs_uses_rle_at_zero_threshold() {
// 64 runs of 2 bytes each = 128 bytes total, coverage = 100%.
// pairs (64) ≤ 128 AND 100% ≥ 0.0 → RLE mode.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..64usize {
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "64 pairs, 100% coverage, threshold 0.0 → RLE");
assert_eq!(rle.rle_len(), 64);
}
#[test]
fn encode_128_pairs_uses_rle_at_zero_threshold() {
// 128 distinct consecutive bytes = 128 singleton runs = 128 pairs.
// With old cap (64 pairs), this was always raw.
// With new cap (128 pairs), threshold 0.0 accepts it.
// Coverage = 0 % (all singletons) → threshold > 0.0 rejects it.
// Build a block whose nibble stream has 0% coverage:
// lo = (i*2+1) % 16, hi = (i*2+2) % 16
// Adjacent lo nibbles differ by 2 (mod 16); adjacent hi nibbles differ by 2.
// Group boundaries also do not align (verified analytically).
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = i as u8; // 0x00, 0x01, ..., 0x7F — all distinct, all singletons
let lo = ((i * 2 + 1) % 16) as u8;
let hi = ((i * 2 + 2) % 16) as u8;
*b = lo | (hi << 4);
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(
encode(&src, 0.0).is_rle(),
"128 pairs ≤ 128 limit AND 0% ≥ 0.0 → RLE at zero threshold"
);
assert_eq!(encode(&src, 0.0).rle_len(), 128);
assert!(
!encode(&src, 0.01).is_rle(),
"0% coverage fails any threshold > 0"
);
let src_varied = make_block_with_qs(1.0, 0.0, 1, 0, qs);
// At 0.0 threshold: RLE (≤ 256 entries always fit, 0.0 ≥ 0.0).
assert!(encode(&src_varied, 0.0).is_rle());
// At any positive threshold: 0% nibble coverage → raw.
assert!(!encode(&src_varied, 0.01).is_rle());
}
#[test]
fn encode_coverage_threshold_rejects_low_coverage_block() {
// Construct: 63 singletons + 1 run of 65 bytes = 64 pairs.
// coverage = 65/128 ≈ 50.8%.
// threshold 0.50 accepts it; threshold 0.60 rejects it.
let mut qs = [0u8; QK_K / 2];
qs[0] = 0x01;
for i in 1..63usize {
// Distinct odd bytes, none equal to 0x01 or adjacent values.
qs[i] = (i as u8).wrapping_mul(2).wrapping_add(5);
}
qs[63..].fill(0xAB); // 65-byte run; qs[62] = 62*2+5 = 129 → wraps to 0x81 ≠ 0xAB ✓
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(
encode(&src, 0.50).is_rle(),
"50.8% coverage should meet 50% threshold"
);
assert!(
!encode(&src, 0.60).is_rle(),
"50.8% coverage should fail 60% threshold"
);
fn encode_nibble_max_count_is_sixteen() {
// A run of 256 identical nibbles should be stored as 16 entries of 16 nibbles.
// 0x33: lo nibble = 3, hi nibble = 3 → all 256 nibbles are 3.
let src = make_block(1.0, 0.0, 1, 0, 0x33);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 16);
// Each entry: nibble=3, count=16 → (3<<4)|15 = 0x3F.
assert!(rle.qs[..16].iter().all(|&e| e == 0x3F));
}
#[test]
fn encode_coverage_zero_threshold_always_uses_rle_when_pairs_fit() {
// Any block whose runs produce ≤ 128 pairs uses RLE at threshold 0.0,
// regardless of how many singletons it contains.
// Use the 63-pair block from encode_63_pairs_uses_rle.
fn encode_256_nibbles_fits_in_qs_at_zero_threshold() {
// Worst case: all 256 nibbles are singletons.
// Construct lo = (i*2+1) % 16, hi = (i*2+2) % 16: gives 0% coverage.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..62usize {
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
for (i, b) in qs.iter_mut().enumerate() {
let lo = ((i * 2 + 1) % 16) as u8;
let hi = ((i * 2 + 2) % 16) as u8;
*b = lo | (hi << 4);
}
qs[pos..].fill(0xFE);
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(encode(&src, 0.0).is_rle());
let rle = encode(&src, 0.0);
// At threshold 0.0 the block should be RLE (256 entries ≤ 256 capacity,
// and 0.0 ≥ 0.0). The dequantised output must match the baseline.
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
dequantize_block_q4k_rle(&rle, &mut got);
dequantize_block_q4k(&src, &mut expected);
assert_slices_close(&got, &expected, 1e-5);
}
#[test]
fn encode_coverage_one_threshold_requires_total_coverage() {
// A block with even one singleton byte fails the 100% threshold.
// Build: 1 singleton + 1 run of 127 bytes = 2 pairs, coverage = 127/128 ≈ 99.2%.
fn encode_nibble_coverage_threshold_controls_rle_selection() {
// Build a block with moderate nibble-level coverage.
// First 32 bytes all 0x77 (both nibbles 7) → group 0 in output-sequential
// order is entirely 7 (64 nibbles in one run). The remaining bytes
// cycle through values with no adjacent nibble repeats.
// Coverage ≈ 64/256 = 25% (plus a few boundary matches, ~26-27%).
let mut qs = [0u8; QK_K / 2];
qs[0] = 0x01; // singleton (value distinct from rest)
qs[1..].fill(0x02); // 127-byte run
qs[..32].fill(0x77);
for i in 32..128usize {
qs[i] = ((i % 15 + 1) as u8) | (((i + 8) % 16) as u8) << 4;
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(!encode(&src, 1.0).is_rle(), "99.2% coverage should fail 100% threshold");
assert!(encode(&src, 0.99).is_rle(), "99.2% coverage should meet 99% threshold");
// Coverage is roughly 2527%, which is between 0.20 and 0.30.
assert!(encode(&src, 0.20).is_rle(), "~26% coverage meets 20% threshold");
assert!(encode(&src, 0.24).is_rle(), "~26% coverage meets 24% threshold");
assert!(!encode(&src, 0.30).is_rle(), "~26% coverage does not meet 30% threshold");
}
#[test]
@@ -859,30 +849,58 @@ mod tests {
}
#[test]
fn decode_qs_rle_expands_two_pair_stream() {
// Hand-craft an RLE block: [0xAA × 64, 0xBB × 64].
fn decode_qs_rle_expands_two_run_stream() {
// [0xAA × 64 bytes, 0xBB × 64 bytes] in nibble-level RLE.
// 0xAA: lo=hi=0xA=10. 0xBB: lo=hi=0xB=11.
// Output-sequential: 128 nibbles of 10 (from 64 bytes of 0xAA),
// then 128 nibbles of 11 (from 64 bytes of 0xBB).
// Each run of 128 → 8 entries of 16: 0xAF × 8, then 0xBF × 8.
let mut qs = [0u8; QK_K];
qs[0] = 0xAA; qs[1] = 64;
qs[2] = 0xBB; qs[3] = 64;
for i in 0..8 { qs[i] = 0xAF; } // val=0xA, count-1=0xF → 16 nibbles each
for i in 8..16 { qs[i] = 0xBF; }
let rle = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: IS_RLE, n_pairs: 2, qs,
flags: IS_RLE, n_pairs: 16, qs,
};
let expanded = decode_qs(&rle);
// First 64 bytes should be 0xAA, last 64 bytes should be 0xBB.
assert!(expanded[..64].iter().all(|&b| b == 0xAA), "first 64 bytes must be 0xAA");
assert!(expanded[64..].iter().all(|&b| b == 0xBB), "last 64 bytes must be 0xBB");
}
#[test]
fn decode_qs_rle_single_run_covers_all() {
fn decode_qs_rle_single_byte_value_covers_all() {
// 128 bytes of 0xCD: lo nibble = 0xD = 13, hi nibble = 0xC = 12.
// Nibble stream in output-sequential order:
// [0xD×32, 0xC×32] × 4 groups = 8 runs of 32.
// Each run of 32 → 2 entries of 16.
// Entries in pairs: DF, DF, CF, CF, DF, DF, CF, CF, ...
// i.e. (i/2) % 2 == 0 → 0xDF, else 0xCF.
let mut qs = [0u8; QK_K];
qs[0] = 0xCD; qs[1] = 128; // one run of 128 bytes
for i in 0..16 {
qs[i] = if (i / 2) % 2 == 0 { 0xDF } else { 0xCF };
}
let rle = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: IS_RLE, n_pairs: 1, qs,
flags: IS_RLE, n_pairs: 16, qs,
};
let expanded = decode_qs(&rle);
assert!(expanded.iter().all(|&b| b == 0xCD));
assert!(expanded.iter().all(|&b| b == 0xCD), "all bytes should be 0xCD");
}
// =========================================================================
// extract_nibbles / pack_nibbles round-trip
// =========================================================================
#[test]
fn extract_and_pack_nibbles_round_trip() {
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = (i.wrapping_mul(37).wrapping_add(13) & 0xFF) as u8;
}
let nibbles = extract_nibbles(&qs);
let repacked = pack_nibbles(&nibbles);
assert_eq!(repacked, qs, "pack(extract(qs)) must equal qs");
}
// =========================================================================
@@ -972,7 +990,7 @@ mod tests {
#[test]
fn roundtrip_raw_mode_matches_original() {
// Alternating bytes → 128 singleton pairs, coverage = 0%.
// Alternating bytes → no adjacent nibble repeats, coverage = 0%.
// Use threshold 0.01 to force raw mode (0% < 0.01).
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -1007,7 +1025,7 @@ mod tests {
#[test]
fn roundtrip_many_short_runs_matches_original() {
// Four distinct runs of varying lengths → still compresses.
// Four distinct byte runs → multiple nibble runs (RLE still compresses).
let mut qs = [0u8; QK_K / 2];
qs[..10].fill(0x11);
qs[10..30].fill(0x22);
@@ -1015,8 +1033,7 @@ mod tests {
qs[31..].fill(0x44);
let src = make_block_with_qs(1.0, 0.5, 7, 3, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "4-run block should compress");
assert_eq!(rle.rle_len(), 4);
assert!(rle.is_rle(), "multi-run block should compress");
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
@@ -1053,26 +1070,6 @@ mod tests {
assert_close(got[32], 3.0, 1e-5);
}
#[test]
fn roundtrip_128_singleton_pairs_matches_original() {
// All-distinct bytes → 128 pairs, 0% coverage.
// encode at threshold 0.0 → RLE; dequantize must match baseline.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = (i as u8).wrapping_mul(3).wrapping_add(7);
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 128);
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
dequantize_block_q4k_rle(&rle, &mut got);
dequantize_block_q4k(&src, &mut expected);
assert_slices_close(&got, &expected, 1e-5);
}
// =========================================================================
// matmul_q4k_rle_fp16
// =========================================================================