RLE now works on nibbles

This commit is contained in:
2026-04-12 21:26:36 -07:00
parent 3fb10b78e3
commit bba9db290e
2 changed files with 398 additions and 360 deletions

View File

@@ -1,26 +1,30 @@
//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//! Measures three operations across three weight distributions, encoded with
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
//! runs to use RLE mode).
//!
//! | Group | What is timed |
//! |--------------|--------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//! | Group | What is timed |
//! |--------------|-----------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation across all four paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.
//! **rle_optimal** — every qs byte is the same value. All 256 nibbles are
//! identical, giving 100 % coverage and just 16 nibble entries. This is the
//! theoretical RLE maximum and represents highly structured weight blocks.
//!
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
s
}
/// Return `count` blocks whose qs bytes are pseudo-random.
/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
///
/// With uniformly distributed bytes, consecutive bytes match with probability
/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value,
/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
/// so `encode` will always select **raw mode** (IS_RLE = 0).
/// Adjacent nibbles match with probability 1/16, giving each block roughly
/// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to
/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
let scales = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
/// Return `count` blocks where every qs byte is the same value.
///
/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1).
/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
/// avoiding degenerate cache-warm effects across the batch.
/// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
/// with exactly 16 entries (256 nibbles / 16 per entry).
/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
let scales = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
vec![f32_to_fp16(1.0); k * n]
}
/// Build one block whose nibbles cycle so that no two consecutive nibbles
/// (in output-sequential order) are ever equal → 0 % nibble coverage.
///
/// Lo nibble of byte `i` = `i % 16`; hi nibble = `(i + 8) % 16`.
/// Within every 32-byte group the lo and hi streams each visit all 16 values
/// twice without repetition, and across group boundaries the last nibble of
/// one stream differs from the first nibble of the next.
///
/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**.
fn zero_coverage_block() -> BlockQ4K {
    let mut qs = [0u8; QK_K / 2];
    for (i, byte) in qs.iter_mut().enumerate() {
        let lo_nibble = (i % 16) as u8;
        let hi_nibble = ((i + 8) % 16) as u8;
        *byte = (hi_nibble << 4) | lo_nibble;
    }
    BlockQ4K {
        d: f32_to_fp16(0.01),
        dmin: f32_to_fp16(0.001),
        scales: make_scales(7, 2),
        qs,
    }
}
// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
/// Number of blocks encoded per iteration in `bench_encode`.
const ENCODE_BATCH: usize = 512;
/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
///
/// Both distributions perform the same O(128) run-length scan. The only
/// divergence is at the output stage:
/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes.
/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE.
/// Both distributions perform the same O(256) nibble scan. The output differs:
/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
fn bench_encode(c: &mut Criterion) {
let uniform = uniform_blocks(ENCODE_BATCH);
let rle_opt = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("uniform", |b| {
b.iter(|| {
for blk in &uniform {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
group.bench_function("rle_optimal", |b| {
b.iter(|| {
for blk in &rle_opt {
black_box(encode(black_box(blk), 0.0));
black_box(encode(black_box(blk), 0.01));
}
});
});
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------
/// Compares the three single-block dequantisation code paths.
/// Compares four single-block dequantisation code paths.
///
/// | Variant | Block type | Encoding | Extra work vs baseline |
/// |------------------|-------------|----------|-------------------------------|
/// | `q4k_baseline` | BlockQ4K | — | none |
/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf |
/// | Variant | Block type | Encoding | IS_RLE | Entries |
/// |--------------------|-------------|-----------|--------|---------|
/// | `q4k_baseline` | BlockQ4K | — | — | — |
/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
///
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
/// stays in raw mode at any positive threshold.
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
/// representative of actual trained Q4_K weight blocks.
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
///
/// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) {
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
let q4k_zero_cov = zero_coverage_block();
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let rle_raw = encode(&q4k_uniform, 0.0); // IS_RLE = 0
let rle_rle = encode(&q4k_rle_opt, 0.0); // IS_RLE = 1
let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
// Confirm the fixtures ended up in the right encoding modes.
assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode");
assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode");
assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
let mut group = c.benchmark_group("dequantize");
// Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
group.bench_function("q4k_baseline", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k(black_box(&q4k_uniform), &mut out);
dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
black_box(out)
});
});
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
});
});
group.bench_function("rle_rle_mode", |b| {
group.bench_function("rle_sparse", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_rle), &mut out);
dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
black_box(out)
});
});
group.bench_function("rle_dense", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
black_box(out)
});
});
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[
/// Full matrix-multiply benchmark across weight distributions and matrix sizes.
///
/// Four variants per size:
/// Four variants per size (`min_coverage = 0.01`):
///
/// | Label | A type | RLE mode? |
/// |----------------------|-------------|-----------|
/// | `baseline/uniform` | BlockQ4K | — |
/// | `rle/uniform` | BlockQ4KRle | raw |
/// | `baseline/rle_opt` | BlockQ4K | — |
/// | `rle/rle_opt` | BlockQ4KRle | rle |
/// | Label | A type | IS_RLE | Entries/block |
/// |----------------------|-------------|--------|---------------|
/// | `baseline/uniform` | BlockQ4K | — | — |
/// | `rle/uniform` | BlockQ4KRle | 1 | ~235 |
/// | `baseline/rle_opt` | BlockQ4K | — | — |
/// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 |
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, allowing fair cross-size comparison.
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {
// Build all four A variants and the shared B matrix for this config.
let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr);
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr);
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.0)).collect();
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
let b = fp16_ones(k, n);

View File

@@ -1,26 +1,29 @@
//! RLE-optional Q4_K super-block encoding.
//!
//! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that
//! optionally compresses the 128-byte weight payload using **byte-level
//! optionally compresses the 128-byte weight payload using **nibble-level
//! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`]
//! field indicates which mode is active:
//!
//! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of `(value, count)` byte-pairs |
//! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of single-byte entries `(nibble_val << 4 | count_minus_1)` |
//!
//! ## RLE format (when `IS_RLE` = 1)
//!
//! - `n_pairs` gives the number of `(value, count)` pairs stored in `qs`.
//! - For each pair `i`:
//! - `qs[2*i]` — the byte value (two packed 4-bit weights, same packing
//! as the raw format).
//! - `qs[2*i + 1]` — the run length in bytes (1..=255).
//! - The run lengths must sum to exactly 128 (the uncompressed `qs` size).
//! - `n_pairs` gives the number of nibble-level RLE entries stored in `qs`.
//! - For each entry `i` (one byte each):
//!   - bits 7–4: nibble value (0–15)
//!   - bits 3–0: `count - 1` (0–15, meaning run length 1–16)
//! - The run lengths must sum to exactly 256 (the number of nibbles in QK_K).
//!
//! The 256-byte `qs` field can hold up to 128 `(value, count)` pairs — enough
//! to represent even fully-random blocks where every byte differs from its
//! Nibbles are read in output-sequential order: for each 32-byte group, first
//! all 32 lo nibbles, then all 32 hi nibbles. The nibble at stream position
//! `p` maps directly to output element `p` in sub-block `p / 32`.
//!
//! The 256-byte `qs` field can hold up to 256 single-byte entries — enough to
//! represent even fully-random blocks where every nibble differs from its
//! neighbour.
//!
//! ## Constructing blocks
@@ -45,12 +48,12 @@ pub const IS_RLE: u8 = 0x01;
// Block definition
// ---------------------------------------------------------------------------
/// A Q4_K super-block with optional byte-level RLE compression on the weights.
/// A Q4_K super-block with optional nibble-level RLE compression on the weights.
///
/// Unlike [`crate::BlockQ4K`], this format is **not** binary-compatible with
/// the GGUF on-disk layout. It uses a 256-byte `qs` field (vs the 128-byte
/// field in `BlockQ4K`) so the RLE stream can store up to 128 `(value, count)`
/// pairs — enough to represent even fully-random blocks where every byte
/// field in `BlockQ4K`) so the RLE stream can store up to 256 single-byte
/// entries — enough to represent even fully-random blocks where every nibble
/// differs from its neighbour.
///
/// Memory layout (`repr C`):
@@ -61,17 +64,18 @@ pub const IS_RLE: u8 = 0x01;
/// | 2 | `dmin` | 2 B | fp16 super-block min-scale |
/// | 4 | `scales` | 12 B | packed 6-bit sub-block params |
/// | 16     | `flags`   | 1 B   | bit 0 = `IS_RLE`; bits 1–7 unused   |
/// | 17 | `n_pairs` | 1 B | RLE pair count (0 when raw) |
/// | 18 | `qs` | 256 B | raw nibbles (first 128 B) or RLE |
/// | 17 | (pad) | 1 B | alignment padding for `n_pairs` |
/// | 18 | `n_pairs` | 2 B | RLE entry count (0 when raw) |
/// | 20 | `qs` | 256 B | raw nibbles (first 128 B) or RLE |
///
/// **sizeof = 274 bytes.**
/// **sizeof = 276 bytes.**
///
/// ## `qs` interpretation
///
/// | `IS_RLE` | Meaning |
/// |----------|--------------------------------------------------------------|
/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) |
/// | 1 | `qs[0..n_pairs*2]` holds `(value, count)` byte-pairs |
/// | `IS_RLE` | Meaning |
/// |----------|---------------------------------------------------------------------|
/// | 0 | `qs[0..128]` holds raw packed nibbles (same as `BlockQ4K`) |
/// | 1 | `qs[0..n_pairs]` holds nibble-level RLE entries (1 byte each) |
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct BlockQ4KRle {
@@ -80,11 +84,11 @@ pub struct BlockQ4KRle {
pub scales: [u8; K_SCALE_SIZE],
/// Encoding flags. Only bit 0 (`IS_RLE`) is used; bits 1-7 are reserved.
pub flags: u8,
/// When `IS_RLE` is set: number of `(value, count)` byte-pairs in `qs`.
/// When `IS_RLE` is set: number of nibble-level RLE entries in `qs`.
/// Zero when in raw mode.
pub n_pairs: u8,
/// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or RLE stream
/// (IS_RLE = 1, first `n_pairs * 2` bytes).
pub n_pairs: u16,
/// Raw packed-nibble weights (IS_RLE = 0, first 128 bytes) or nibble-level
/// RLE stream (IS_RLE = 1, first `n_pairs` bytes; one byte per entry).
pub qs: [u8; QK_K], // 256 bytes
}
@@ -95,7 +99,7 @@ impl BlockQ4KRle {
self.flags & IS_RLE != 0
}
/// Number of `(value, count)` byte-pairs in `qs`.
/// Number of nibble-level RLE entries in `qs`.
/// Only meaningful when `is_rle()` is true.
#[inline]
pub fn rle_len(&self) -> usize {
@@ -103,74 +107,112 @@ impl BlockQ4KRle {
}
}
// ---------------------------------------------------------------------------
// Nibble extraction / packing helpers
// ---------------------------------------------------------------------------
/// Extract all 256 nibbles from a 128-byte `qs` payload in output-sequential
/// order: for each 32-byte group, first all 32 lo nibbles, then all 32 hi
/// nibbles. Nibble at position `p` maps to output element `p`.
fn extract_nibbles(raw: &[u8; QK_K / 2]) -> [u8; QK_K] {
    let mut nibbles = [0u8; QK_K];
    // Each 32-byte group expands to 64 nibbles: lo nibbles first, then hi.
    for (g, group) in raw.chunks_exact(32).enumerate() {
        let base = g * 64;
        for (l, &byte) in group.iter().enumerate() {
            nibbles[base + l] = byte & 0x0F; // lo nibble
            nibbles[base + 32 + l] = byte >> 4; // hi nibble
        }
    }
    nibbles
}
/// Inverse of [`extract_nibbles`]: pack a 256-nibble output-sequential array
/// back into the 128-byte `qs` layout.
fn pack_nibbles(nibbles: &[u8; QK_K]) -> [u8; QK_K / 2] {
    let mut raw = [0u8; QK_K / 2];
    // Each 32-byte group packs 64 nibbles: positions base..base+32 are lo,
    // base+32..base+64 are hi.
    for (g, group) in raw.chunks_exact_mut(32).enumerate() {
        let base = g * 64;
        for (l, byte) in group.iter_mut().enumerate() {
            *byte = nibbles[base + l] | (nibbles[base + 32 + l] << 4);
        }
    }
    raw
}
// ---------------------------------------------------------------------------
// Encoding
// ---------------------------------------------------------------------------
/// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block.
///
/// The `qs` payload is scanned for runs of equal consecutive bytes. RLE mode
/// is chosen when **both** conditions hold:
/// The `qs` payload is scanned for runs of equal consecutive nibbles in
/// output-sequential order. RLE mode is chosen when **both** conditions hold:
///
/// 1. **Coverage**: at least `min_coverage` fraction of the 128 `qs` bytes
/// belong to runs of length ≥ 2. These are the bytes whose weights can be
/// batched in `accumulate_rle_block`, replacing `2 * run_len` multiplies
/// with just 2 per group-segment.
/// 1. **Coverage**: at least `min_coverage` fraction of the 256 nibbles
/// belong to runs of length ≥ 2. These are the nibbles whose weights can
/// be batched in `accumulate_rle_block`, replacing one multiply per nibble
/// with one multiply per output column per segment.
///
/// 2. **Capacity**: the pair count does not exceed 128 (the physical limit of
/// the 256-byte `qs` field at 2 bytes per pair).
/// 2. **Capacity**: the entry count does not exceed 256 (the physical limit of
/// the 256-byte `qs` field at 1 byte per entry).
///
/// | `min_coverage` | Effect |
/// |----------------|------------------------------------------------------|
/// | `0.0` | RLE whenever pairs fit (≤ 128), regardless of runs |
/// | `0.5` | RLE only if ≥ 50 % of bytes are in repeated runs |
/// | `1.0` | RLE only when every byte is part of a run |
/// | `min_coverage` | Effect |
/// |----------------|-------------------------------------------------------|
/// | `0.0` | RLE whenever entries fit (≤ 256), regardless of runs |
/// | `0.5` | RLE only if ≥ 50 % of nibbles are in repeated runs |
/// | `1.0` | RLE only when every nibble is part of a run |
pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
debug_assert!(
(0.0..=1.0).contains(&min_coverage),
"min_coverage must be in [0.0, 1.0], got {min_coverage}"
);
let raw = &block.qs; // [u8; 128]
let nibbles = extract_nibbles(&block.qs);
// Scan for runs of equal consecutive bytes.
// Track long_run_bytes: bytes in runs of length ≥ 2 (the bytes that
// benefit from RLE in the matmul).
let mut pairs: Vec<(u8, u8)> = Vec::with_capacity(QK_K / 2);
let mut long_run_bytes = 0usize;
let mut i = 0usize;
while i < raw.len() {
let val = raw[i];
let mut run = 1u8;
while i + (run as usize) < raw.len()
&& raw[i + (run as usize)] == val
&& run < u8::MAX
{
// Scan for runs of equal consecutive nibbles.
let mut entries = Vec::<u8>::with_capacity(QK_K);
let mut long_run_nibbles = 0usize;
let mut i = 0usize;
while i < QK_K {
let val = nibbles[i];
let mut run = 0usize;
while i + run < QK_K && nibbles[i + run] == val {
run += 1;
}
pairs.push((val, run));
if run >= 2 {
long_run_bytes += run as usize;
long_run_nibbles += run;
}
i += run as usize;
// Split runs longer than 16 into max-16 chunks (4-bit count field).
let mut rem = run;
while rem > 0 {
let chunk = rem.min(16);
entries.push((val << 4) | ((chunk - 1) as u8));
rem -= chunk;
}
i += run;
}
// Coverage: fraction of qs bytes that are in non-singleton runs.
let coverage = long_run_bytes as f32 / raw.len() as f32;
// Coverage: fraction of the 256 nibbles that are in non-singleton runs.
let coverage = long_run_nibbles as f32 / QK_K as f32;
if pairs.len() <= QK_K / 2 && coverage >= min_coverage {
let n = pairs.len();
// Use RLE when entries fit in qs (≤ 256) and coverage meets the threshold.
if entries.len() <= QK_K && coverage >= min_coverage {
let n = entries.len();
let mut qs = [0u8; QK_K];
for (k, &(val, count)) in pairs.iter().enumerate() {
qs[2 * k] = val;
qs[2 * k + 1] = count;
}
qs[..n].copy_from_slice(&entries);
BlockQ4KRle {
d: block.d,
dmin: block.dmin,
scales: block.scales,
flags: IS_RLE,
n_pairs: n as u8,
n_pairs: n as u16,
qs,
}
} else {
@@ -196,29 +238,32 @@ pub fn encode(block: &BlockQ4K, min_coverage: f32) -> BlockQ4KRle {
///
/// # Panics (debug builds only)
///
/// Panics if the decoded RLE nibble stream does not sum to exactly 256 nibbles.
fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
    if !block.is_rle() {
        // Raw mode: the first QK_K/2 bytes of qs hold the packed nibbles.
        return block.qs[..QK_K / 2].try_into().unwrap();
    }
    // RLE mode: expand each single-byte entry (val << 4 | count - 1) into its
    // run of nibbles, then pack back into the 128-byte layout.
    let n = block.rle_len();
    let mut nibbles = [0u8; QK_K];
    let mut pos = 0usize;
    for i in 0..n {
        let entry = block.qs[i];
        let val = entry >> 4; // bits 7-4: nibble value
        let count = (entry & 0x0F) as usize + 1; // bits 3-0: count - 1
        nibbles[pos..pos + count].fill(val);
        pos += count;
    }
    debug_assert_eq!(
        pos,
        QK_K,
        "nibble RLE lengths sum to {pos}, expected {QK_K}"
    );
    pack_nibbles(&nibbles)
}
// ---------------------------------------------------------------------------
@@ -227,9 +272,9 @@ fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
/// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values.
///
/// When `IS_RLE` is set the RLE stream is first expanded into a 128-byte raw
/// buffer; thereafter the dequantisation is identical to
/// [`crate::dequantize_block_q4k`]:
/// When `IS_RLE` is set the RLE stream is first expanded into a 256-nibble
/// buffer and packed back into a 128-byte raw representation; thereafter the
/// dequantisation is identical to [`crate::dequantize_block_q4k`]:
///
/// ```text
/// out[i] = d * scale[s] * nibble[i] - dmin * min[s]
@@ -273,86 +318,69 @@ pub fn dequantize_block_q4k_rle(block: &BlockQ4KRle, out: &mut [f32; QK_K]) {
/// Accumulate the contribution of one RLE-encoded block into `c_row`.
///
/// With nibble-level RLE and output-sequential ordering, nibble position `p`
/// maps directly to output element `p` in sub-block `p / 32`. For each entry
/// the dequantised weight `dq` is constant within each sub-block segment, so
/// the per-output-column contribution reduces to:
///
/// ```text
/// c_row[j] += dq * Σ_{l in seg} B[ki_base + pos + l, j]
/// ```
///
/// A run that crosses a 32-nibble sub-block boundary is split at the boundary;
/// each resulting segment is handled independently.
///
/// `sum_b` is a caller-provided scratch slice (length `≥ n`) reused across
/// calls to avoid repeated allocation.
fn accumulate_rle_block(
    block: &BlockQ4KRle,
    b: &[u16],
    ki_base: usize, // first B-row index for this block (= b_idx * QK_K)
    n: usize,
    c_row: &mut [f32],
    sum_b: &mut [f32], // scratch, length ≥ n
) {
    let d = fp16_to_f32(block.d);
    let dmin = fp16_to_f32(block.dmin);
    let mut nibble_pos = 0usize; // current position in the 256-nibble output stream
    for p in 0..block.rle_len() {
        let entry = block.qs[p];
        let val = (entry >> 4) as f32; // nibble value 0-15
        let run = (entry & 0x0F) as usize + 1; // count 1-16
        let mut remaining = run;
        let mut pos = nibble_pos;
        while remaining > 0 {
            // Sub-block at this position; split at sub-block boundaries
            // (every 32 nibbles) so the scale/min stays constant per segment.
            let sub_block = pos / 32; // 0..8
            let in_sb = pos % 32;
            let seg_len = remaining.min(32 - in_sb);
            let (sc, mn) = get_scale_min(sub_block, &block.scales);
            let dq = d * sc as f32 * val - dmin * mn as f32;
            // Accumulate B-column sums for this segment (stride-1 per B row —
            // cache-friendly).
            sum_b[..n].fill(0.0);
            for l in 0..seg_len {
                let b_base = (ki_base + pos + l) * n;
                for j in 0..n {
                    sum_b[j] += fp16_to_f32(b[b_base + j]);
                }
            }
            // One multiply per output column instead of one per nibble.
            for j in 0..n {
                c_row[j] += dq * sum_b[j];
            }
            pos += seg_len;
            remaining -= seg_len;
        }
        nibble_pos += run;
    }
}
@@ -361,20 +389,16 @@ fn accumulate_rle_block(
///
/// For blocks in **RLE mode** (`IS_RLE = 1`) the intermediate decompressed row
/// is eliminated entirely. [`accumulate_rle_block`] works directly over the
/// `(value, count)` pairs: within each run the dequantised weight is constant
/// across all elements in the run, so each output column `j` requires only
/// **2 multiplies per group-segment** rather than 2 per weight element:
/// nibble-level RLE entries: within each run the dequantised weight is constant
/// across all elements, so each output column `j` requires only one multiply
/// per sub-block segment rather than one per nibble:
///
/// ```text
/// c[i, j] += dq_lo * Σ B[ki_lo, j] + dq_hi * Σ B[ki_hi, j]
/// ───────────────────────────────────────────
/// summed over seg_len consecutive positions
/// c[i, j] += dq * Σ B[ki_base + pos + l, j]
/// ──────────────────────────
/// summed over seg_len nibble positions
/// ```
///
/// For a single-run block (all bytes identical) this reduces the multiply
/// count from `2 * QK_K = 512` to `2 * 4 = 8` per output column (4 groups,
/// 2 nibble levels each), while B is still read exactly once.
///
/// For blocks in **raw mode** (`IS_RLE = 0`) the block is dequantised into a
/// scratch buffer and its contribution is accumulated via a saxpy loop
/// (weight-outer, column-inner), which accesses B in row-major order.
@@ -431,8 +455,7 @@ pub fn matmul_q4k_rle_fp16(
// Scratch for raw-mode block dequantisation.
let mut block_buf = [0.0f32; QK_K];
// Scratch for RLE-mode B-column sums; allocated once and reused per segment.
let mut sum_lo = vec![0.0f32; n];
let mut sum_hi = vec![0.0f32; n];
let mut sum_b = vec![0.0f32; n];
for i in 0..m {
let c_row = &mut c[i * n..(i + 1) * n];
@@ -442,10 +465,10 @@ pub fn matmul_q4k_rle_fp16(
let ki_base = b_idx * QK_K;
if block.is_rle() {
// RLE path: accumulate directly from runs, no decompression.
// RLE path: accumulate directly from nibble runs, no decompression.
accumulate_rle_block(
block, b, ki_base, n, c_row,
&mut sum_lo, &mut sum_hi,
&mut sum_b,
);
} else {
// Raw path: dequantise once, then saxpy into c_row.
@@ -585,18 +608,17 @@ mod tests {
// =========================================================================
#[test]
fn block_q4k_rle_size_is_276_bytes() {
    // d(2) + dmin(2) + scales(12) + flags(1) + pad(1) + n_pairs(2) + qs(256) = 276 bytes.
    // The 1-byte pad aligns the u16 `n_pairs` field under repr(C).
    assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 276);
}
#[test]
fn block_q4k_rle_is_132_bytes_larger_than_block_q4k() {
    // BlockQ4K = 144 bytes, BlockQ4KRle = 276 bytes, delta = 132.
    assert_eq!(
        core::mem::size_of::<BlockQ4KRle>(),
        core::mem::size_of::<BlockQ4K>() + 132,
    );
}
@@ -633,11 +655,11 @@ mod tests {
#[test]
fn rle_len_reports_pair_count_from_n_pairs() {
for n in [0usize, 1, 7, 31, 63, 128] {
for n in [0usize, 1, 7, 31, 63, 128, 256] {
let b = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: if n > 0 { IS_RLE } else { 0 },
n_pairs: n as u8,
n_pairs: n as u16,
qs: [0; QK_K],
};
assert_eq!(b.rle_len(), n, "expected rle_len {n}");
@@ -650,30 +672,37 @@ mod tests {
#[test]
fn encode_uniform_qs_uses_rle() {
    // All identical bytes → 256 identical nibbles → RLE mode.
    let src = make_block(1.0, 0.0, 1, 0, 0x77);
    let rle = encode(&src, 0.0);
    assert!(rle.is_rle(), "uniform qs should trigger RLE mode");
}
#[test]
fn encode_uniform_qs_rle_len_is_one() {
fn encode_uniform_qs_rle_entry_count_is_sixteen() {
// 256 identical nibbles → max chunk size 16 → 16 entries.
let src = make_block(1.0, 0.0, 1, 0, 0x55);
let rle = encode(&src, 0.0);
assert_eq!(rle.rle_len(), 1);
assert_eq!(rle.rle_len(), 16, "256 identical nibbles → 16 chunks of 16");
}
#[test]
fn encode_uniform_qs_rle_entry_is_correct() {
fn encode_uniform_qs_first_entry_is_correct() {
// 0xAB: lo nibble = 0xB = 11, hi nibble = 0xA = 10.
// Output-sequential: [0xB×32, 0xA×32] × 4 groups = 8 runs of 32.
// Each run of 32 → 2 entries of 16.
// First entry: val=0xB=11, count=16 → (11<<4)|15 = 0xBF.
let src = make_block(1.0, 0.0, 1, 0, 0xAB);
let rle = encode(&src, 0.0);
assert_eq!(rle.qs[0], 0xAB, "RLE value byte should equal the repeated byte");
assert_eq!(rle.qs[1], 128, "RLE run length should be 128 bytes");
assert!(rle.is_rle());
assert_eq!(rle.qs[0], (0xBu8 << 4) | 0xFu8, "first entry: val=0xB, count=16");
assert_eq!(rle.rle_len(), 16, "8 runs of 32, each split into 2 → 16 entries");
}
#[test]
fn encode_alternating_bytes_stays_raw() {
// Alternating 0xAA / 0x55 → 128 singleton pairs, coverage = 0%.
// Alternating 0xAA / 0x55 → nibble stream alternates 10,5,10,5,...
// No two adjacent nibbles are equal → 0% nibble coverage.
// At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -686,7 +715,7 @@ mod tests {
#[test]
fn encode_raw_mode_copies_qs_verbatim() {
// Three-byte cycle of distinct values → 128 runs of 1 byte each,
// Three-byte cycle of distinct values → no adjacent nibble repeats,
// coverage = 0%. At threshold 0.01 the 0% coverage fails → raw mode.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -700,138 +729,99 @@ mod tests {
}
#[test]
fn encode_two_runs_uses_rle_and_stores_correct_pairs() {
// Two distinct runs: 64 bytes of 0x11 followed by 64 bytes of 0x22.
// → 2 pairs = 4 bytes.
fn encode_two_run_block_stores_correct_entries() {
// qs = [0x11×64, 0x22×64]: both nibbles of 0x11 are 1, of 0x22 are 2.
// Nibble stream (output-sequential): [1×128, 2×128] → 2 runs of 128.
// Each run of 128 → 8 entries of 16 → 16 entries total.
let mut qs = [0u8; QK_K / 2];
qs[..64].fill(0x11);
qs[64..].fill(0x22);
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 2);
assert_eq!(rle.qs[0], 0x11, "first pair: value");
assert_eq!(rle.qs[1], 64, "first pair: run length");
assert_eq!(rle.qs[2], 0x22, "second pair: value");
assert_eq!(rle.qs[3], 64, "second pair: run length");
// 2 runs of 128 nibbles each → 8 entries per run → 16 total entries.
assert_eq!(rle.rle_len(), 16);
// First entry: nibble val=1, count=16 → (1<<4)|15 = 0x1F.
assert_eq!(rle.qs[0], 0x1F, "first entry should be nibble=1, count=16");
// 9th entry (first for nibble 2): (2<<4)|15 = 0x2F.
assert_eq!(rle.qs[8], 0x2F, "9th entry should be nibble=2, count=16");
}
#[test]
fn encode_63_pairs_uses_rle() {
// Build 62 runs of 2 bytes each (124 bytes) + 1 run of 4 bytes = 128 bytes.
// 63 pairs × 2 = 126 bytes; 63 ≤ 128 → RLE should be chosen.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..62usize {
// Use a stride-3 sequence so consecutive values are always distinct.
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
}
// Final run: 4 bytes, value chosen to differ from the previous one.
qs[pos..].fill(0xFE);
fn encode_nibble_coverage_determines_rle_mode() {
// A block with all-same nibbles → 100% coverage → always RLE.
let src_uniform = make_block(1.0, 0.0, 1, 0, 0x77);
assert!(encode(&src_uniform, 0.0).is_rle());
assert!(encode(&src_uniform, 1.0).is_rle()); // 100% coverage meets any threshold
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "63 pairs should use RLE");
assert_eq!(rle.rle_len(), 63);
}
#[test]
fn encode_64_pairs_uses_rle_at_zero_threshold() {
// 64 runs of 2 bytes each = 128 bytes total, coverage = 100%.
// pairs (64) ≤ 128 AND 100% ≥ 0.0 → RLE mode.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..64usize {
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "64 pairs, 100% coverage, threshold 0.0 → RLE");
assert_eq!(rle.rle_len(), 64);
}
#[test]
fn encode_128_pairs_uses_rle_at_zero_threshold() {
// 128 distinct consecutive bytes = 128 singleton runs = 128 pairs.
// With old cap (64 pairs), this was always raw.
// With new cap (128 pairs), threshold 0.0 accepts it.
// Coverage = 0 % (all singletons) → threshold > 0.0 rejects it.
// Build a block whose nibble stream has 0% coverage:
// lo = (i*2+1) % 16, hi = (i*2+2) % 16
// Adjacent lo nibbles differ by 2 (mod 16); adjacent hi nibbles differ by 2.
// Group boundaries also do not align (verified analytically).
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = i as u8; // 0x00, 0x01, ..., 0x7F — all distinct, all singletons
let lo = ((i * 2 + 1) % 16) as u8;
let hi = ((i * 2 + 2) % 16) as u8;
*b = lo | (hi << 4);
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(
encode(&src, 0.0).is_rle(),
"128 pairs ≤ 128 limit AND 0% ≥ 0.0 → RLE at zero threshold"
);
assert_eq!(encode(&src, 0.0).rle_len(), 128);
assert!(
!encode(&src, 0.01).is_rle(),
"0% coverage fails any threshold > 0"
);
let src_varied = make_block_with_qs(1.0, 0.0, 1, 0, qs);
// At 0.0 threshold: RLE (≤ 256 entries always fit, 0.0 ≥ 0.0).
assert!(encode(&src_varied, 0.0).is_rle());
// At any positive threshold: 0% nibble coverage → raw.
assert!(!encode(&src_varied, 0.01).is_rle());
}
#[test]
fn encode_coverage_threshold_rejects_low_coverage_block() {
// Construct: 63 singletons + 1 run of 65 bytes = 64 pairs.
// coverage = 65/128 ≈ 50.8%.
// threshold 0.50 accepts it; threshold 0.60 rejects it.
let mut qs = [0u8; QK_K / 2];
qs[0] = 0x01;
for i in 1..63usize {
// Distinct odd bytes, none equal to 0x01 or adjacent values.
qs[i] = (i as u8).wrapping_mul(2).wrapping_add(5);
}
qs[63..].fill(0xAB); // 65-byte run; qs[62] = 62*2+5 = 129 → wraps to 0x81 ≠ 0xAB ✓
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(
encode(&src, 0.50).is_rle(),
"50.8% coverage should meet 50% threshold"
);
assert!(
!encode(&src, 0.60).is_rle(),
"50.8% coverage should fail 60% threshold"
);
fn encode_nibble_max_count_is_sixteen() {
// A run of 256 identical nibbles should be stored as 16 entries of 16 nibbles.
// 0x33: lo nibble = 3, hi nibble = 3 → all 256 nibbles are 3.
let src = make_block(1.0, 0.0, 1, 0, 0x33);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 16);
// Each entry: nibble=3, count=16 → (3<<4)|15 = 0x3F.
assert!(rle.qs[..16].iter().all(|&e| e == 0x3F));
}
#[test]
fn encode_coverage_zero_threshold_always_uses_rle_when_pairs_fit() {
// Any block whose runs produce ≤ 128 pairs uses RLE at threshold 0.0,
// regardless of how many singletons it contains.
// Use the 63-pair block from encode_63_pairs_uses_rle.
fn encode_256_nibbles_fits_in_qs_at_zero_threshold() {
// Worst case: all 256 nibbles are singletons.
// Construct lo = (i*2+1) % 16, hi = (i*2+2) % 16: gives 0% coverage.
let mut qs = [0u8; QK_K / 2];
let mut pos = 0usize;
for run in 0..62usize {
let v = (run as u8).wrapping_mul(3).wrapping_add(1);
qs[pos] = v;
qs[pos + 1] = v;
pos += 2;
for (i, b) in qs.iter_mut().enumerate() {
let lo = ((i * 2 + 1) % 16) as u8;
let hi = ((i * 2 + 2) % 16) as u8;
*b = lo | (hi << 4);
}
qs[pos..].fill(0xFE);
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(encode(&src, 0.0).is_rle());
let rle = encode(&src, 0.0);
// At threshold 0.0 the block should be RLE (256 entries ≤ 256 capacity,
// and 0.0 ≥ 0.0). The dequantised output must match the baseline.
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
dequantize_block_q4k_rle(&rle, &mut got);
dequantize_block_q4k(&src, &mut expected);
assert_slices_close(&got, &expected, 1e-5);
}
#[test]
fn encode_coverage_one_threshold_requires_total_coverage() {
// A block with even one singleton byte fails the 100% threshold.
// Build: 1 singleton + 1 run of 127 bytes = 2 pairs, coverage = 127/128 ≈ 99.2%.
fn encode_nibble_coverage_threshold_controls_rle_selection() {
// Build a block with moderate nibble-level coverage.
// First 32 bytes all 0x77 (both nibbles 7) → group 0 in output-sequential
// order is entirely 7 (64 nibbles in one run). The remaining bytes
// cycle through values with no adjacent nibble repeats.
// Coverage ≈ 64/256 = 25% (plus a few boundary matches, ~26-27%).
let mut qs = [0u8; QK_K / 2];
qs[0] = 0x01; // singleton (value distinct from rest)
qs[1..].fill(0x02); // 127-byte run
qs[..32].fill(0x77);
for i in 32..128usize {
qs[i] = ((i % 15 + 1) as u8) | (((i + 8) % 16) as u8) << 4;
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
assert!(!encode(&src, 1.0).is_rle(), "99.2% coverage should fail 100% threshold");
assert!(encode(&src, 0.99).is_rle(), "99.2% coverage should meet 99% threshold");
// Coverage is roughly 2527%, which is between 0.20 and 0.30.
assert!(encode(&src, 0.20).is_rle(), "~26% coverage meets 20% threshold");
assert!(encode(&src, 0.24).is_rle(), "~26% coverage meets 24% threshold");
assert!(!encode(&src, 0.30).is_rle(), "~26% coverage does not meet 30% threshold");
}
#[test]
@@ -859,30 +849,58 @@ mod tests {
}
#[test]
fn decode_qs_rle_expands_two_pair_stream() {
// Hand-craft an RLE block: [0xAA × 64, 0xBB × 64].
fn decode_qs_rle_expands_two_run_stream() {
// [0xAA × 64 bytes, 0xBB × 64 bytes] in nibble-level RLE.
// 0xAA: lo=hi=0xA=10. 0xBB: lo=hi=0xB=11.
// Output-sequential: 128 nibbles of 10 (from 64 bytes of 0xAA),
// then 128 nibbles of 11 (from 64 bytes of 0xBB).
// Each run of 128 → 8 entries of 16: 0xAF × 8, then 0xBF × 8.
let mut qs = [0u8; QK_K];
qs[0] = 0xAA; qs[1] = 64;
qs[2] = 0xBB; qs[3] = 64;
for i in 0..8 { qs[i] = 0xAF; } // val=0xA, count-1=0xF → 16 nibbles each
for i in 8..16 { qs[i] = 0xBF; }
let rle = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: IS_RLE, n_pairs: 2, qs,
flags: IS_RLE, n_pairs: 16, qs,
};
let expanded = decode_qs(&rle);
// First 64 bytes should be 0xAA, last 64 bytes should be 0xBB.
assert!(expanded[..64].iter().all(|&b| b == 0xAA), "first 64 bytes must be 0xAA");
assert!(expanded[64..].iter().all(|&b| b == 0xBB), "last 64 bytes must be 0xBB");
}
#[test]
fn decode_qs_rle_single_run_covers_all() {
fn decode_qs_rle_single_byte_value_covers_all() {
// 128 bytes of 0xCD: lo nibble = 0xD = 13, hi nibble = 0xC = 12.
// Nibble stream in output-sequential order:
// [0xD×32, 0xC×32] × 4 groups = 8 runs of 32.
// Each run of 32 → 2 entries of 16.
// Entries in pairs: DF, DF, CF, CF, DF, DF, CF, CF, ...
// i.e. (i/2) % 2 == 0 → 0xDF, else 0xCF.
let mut qs = [0u8; QK_K];
qs[0] = 0xCD; qs[1] = 128; // one run of 128 bytes
for i in 0..16 {
qs[i] = if (i / 2) % 2 == 0 { 0xDF } else { 0xCF };
}
let rle = BlockQ4KRle {
d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
flags: IS_RLE, n_pairs: 1, qs,
flags: IS_RLE, n_pairs: 16, qs,
};
let expanded = decode_qs(&rle);
assert!(expanded.iter().all(|&b| b == 0xCD));
assert!(expanded.iter().all(|&b| b == 0xCD), "all bytes should be 0xCD");
}
// =========================================================================
// extract_nibbles / pack_nibbles round-trip
// =========================================================================
#[test]
fn extract_and_pack_nibbles_round_trip() {
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = (i.wrapping_mul(37).wrapping_add(13) & 0xFF) as u8;
}
let nibbles = extract_nibbles(&qs);
let repacked = pack_nibbles(&nibbles);
assert_eq!(repacked, qs, "pack(extract(qs)) must equal qs");
}
// =========================================================================
@@ -972,7 +990,7 @@ mod tests {
#[test]
fn roundtrip_raw_mode_matches_original() {
// Alternating bytes → 128 singleton pairs, coverage = 0%.
// Alternating bytes → no adjacent nibble repeats, coverage = 0%.
// Use threshold 0.01 to force raw mode (0% < 0.01).
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
@@ -1007,7 +1025,7 @@ mod tests {
#[test]
fn roundtrip_many_short_runs_matches_original() {
// Four distinct runs of varying lengths → still compresses.
// Four distinct byte runs → multiple nibble runs (RLE still compresses).
let mut qs = [0u8; QK_K / 2];
qs[..10].fill(0x11);
qs[10..30].fill(0x22);
@@ -1015,8 +1033,7 @@ mod tests {
qs[31..].fill(0x44);
let src = make_block_with_qs(1.0, 0.5, 7, 3, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle(), "4-run block should compress");
assert_eq!(rle.rle_len(), 4);
assert!(rle.is_rle(), "multi-run block should compress");
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
@@ -1053,26 +1070,6 @@ mod tests {
assert_close(got[32], 3.0, 1e-5);
}
#[test]
fn roundtrip_128_singleton_pairs_matches_original() {
// All-distinct bytes → 128 pairs, 0% coverage.
// encode at threshold 0.0 → RLE; dequantize must match baseline.
let mut qs = [0u8; QK_K / 2];
for (i, b) in qs.iter_mut().enumerate() {
*b = (i as u8).wrapping_mul(3).wrapping_add(7);
}
let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
let rle = encode(&src, 0.0);
assert!(rle.is_rle());
assert_eq!(rle.rle_len(), 128);
let mut got = [0.0f32; QK_K];
let mut expected = [0.0f32; QK_K];
dequantize_block_q4k_rle(&rle, &mut got);
dequantize_block_q4k(&src, &mut expected);
assert_slices_close(&got, &expected, 1e-5);
}
// =========================================================================
// matmul_q4k_rle_fp16
// =========================================================================