//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//!
//! | Group        | What is timed                                    |
//! |--------------|--------------------------------------------------|
//! | `encode`     | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths  |
//! | `matmul`     | Full A×B multiply at three matrix sizes          |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
    dequantize_block_q4k, matmul_q4k_fp16,
    rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle},
    BlockQ4K, K_SCALE_SIZE, QK_K,
};

// ---------------------------------------------------------------------------
// Minimal 64-bit LCG — no external dependencies needed
// ---------------------------------------------------------------------------

/// Deterministic pseudo-random generator using Knuth / PCG constants.
struct Lcg(u64); impl Lcg { fn new(seed: u64) -> Self { Self(seed) } fn next_u8(&mut self) -> u8 { self.0 = self .0 .wrapping_mul(6_364_136_223_846_793_005) .wrapping_add(1_442_695_040_888_963_407); (self.0 >> 33) as u8 } } // --------------------------------------------------------------------------- // Fixture helpers // --------------------------------------------------------------------------- /// Lossily encode a finite, non-subnormal f32 to its fp16 bit pattern. /// /// Only used for block header fields (d, dmin); values must lie within the /// fp16 normal range [~6.1e-5, 65504]. No overflow / underflow checks. fn f32_to_fp16(f: f32) -> u16 { if f == 0.0 { return 0; } let bits = f.to_bits(); let sign = ((bits >> 31) as u16) << 15; let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15; let mantissa = (bits & 0x007F_FFFF) >> 13; sign | ((exp as u16) << 10) | mantissa as u16 } /// Build a 12-byte `scales` array where all 8 sub-blocks share the same /// `scale` and `min` (both must be < 16, matching the test helper in lib.rs). fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] { let mut s = [0u8; K_SCALE_SIZE]; for j in 0..4 { s[j] = scale; s[j + 4] = min; } for j in 8..12 { s[j] = (scale & 0x0F) | ((min & 0x0F) << 4); } s } /// Return `count` blocks whose qs bytes are pseudo-random. /// /// With uniformly distributed bytes, consecutive bytes match with probability /// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value, /// count) pairs would need ~256 bytes — more than the 128-byte raw payload — /// so `encode` will always select **raw mode** (IS_RLE = 0). fn uniform_blocks(count: usize) -> Vec { let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234); let scales = make_scales(7, 2); let d = f32_to_fp16(0.01); let dmin = f32_to_fp16(0.001); (0..count) .map(|_| { let mut qs = [0u8; QK_K / 2]; for b in qs.iter_mut() { *b = rng.next_u8(); } BlockQ4K { d, dmin, scales, qs } }) .collect() } /// Return `count` blocks where every qs byte is the same value. 
/// /// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes /// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1). /// Each block uses a fresh pseudo-random byte so no two blocks are identical, /// avoiding degenerate cache-warm effects across the batch. fn rle_optimal_blocks(count: usize) -> Vec { let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0); let scales = make_scales(7, 2); let d = f32_to_fp16(0.01); let dmin = f32_to_fp16(0.001); (0..count) .map(|_| { let byte = rng.next_u8(); BlockQ4K { d, dmin, scales, qs: [byte; QK_K / 2] } }) .collect() } /// Build a K×N FP16 matrix (raw u16 bits) where every element is 1.0. fn fp16_ones(k: usize, n: usize) -> Vec { vec![f32_to_fp16(1.0); k * n] } // --------------------------------------------------------------------------- // Group 1 — encode // --------------------------------------------------------------------------- /// Number of blocks encoded per iteration in `bench_encode`. const ENCODE_BATCH: usize = 512; /// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output. /// /// Both distributions perform the same O(128) run-length scan. The only /// divergence is at the output stage: /// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes. /// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE. fn bench_encode(c: &mut Criterion) { let uniform = uniform_blocks(ENCODE_BATCH); let rle_opt = rle_optimal_blocks(ENCODE_BATCH); let mut group = c.benchmark_group("encode"); // Throughput = blocks encoded per second. 
group.throughput(Throughput::Elements(ENCODE_BATCH as u64)); group.bench_function("uniform", |b| { b.iter(|| { for blk in &uniform { black_box(encode(black_box(blk))); } }); }); group.bench_function("rle_optimal", |b| { b.iter(|| { for blk in &rle_opt { black_box(encode(black_box(blk))); } }); }); group.finish(); } // --------------------------------------------------------------------------- // Group 2 — dequantize (single block) // --------------------------------------------------------------------------- /// Compares the three single-block dequantisation code paths. /// /// | Variant | Block type | Encoding | Extra work vs baseline | /// |------------------|-------------|----------|-------------------------------| /// | `q4k_baseline` | BlockQ4K | — | none | /// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) | /// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf | /// /// Throughput is the number of dequantised weights produced per second. fn bench_dequantize(c: &mut Criterion) { let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap(); let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap(); let rle_raw = encode(&q4k_uniform); // IS_RLE = 0 let rle_rle = encode(&q4k_rle_opt); // IS_RLE = 1 // Confirm the fixtures ended up in the right encoding modes. assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode"); assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode"); let mut group = c.benchmark_group("dequantize"); // Throughput = QK_K (256) weights dequantised per second. 
group.throughput(Throughput::Elements(QK_K as u64)); group.bench_function("q4k_baseline", |b| { b.iter(|| { let mut out = [0.0f32; QK_K]; dequantize_block_q4k(black_box(&q4k_uniform), &mut out); black_box(out) }); }); group.bench_function("rle_raw_mode", |b| { b.iter(|| { let mut out = [0.0f32; QK_K]; dequantize_block_q4k_rle(black_box(&rle_raw), &mut out); black_box(out) }); }); group.bench_function("rle_rle_mode", |b| { b.iter(|| { let mut out = [0.0f32; QK_K]; dequantize_block_q4k_rle(black_box(&rle_rle), &mut out); black_box(out) }); }); group.finish(); } // --------------------------------------------------------------------------- // Group 3 — matmul // --------------------------------------------------------------------------- /// Matrix size configurations as (M rows, blocks-per-row, N output cols). /// /// The shared dimension K = blocks_per_row × QK_K. /// /// | Label | A shape | B shape | total MACs | /// |--------|------------|-------------|------------| /// | tiny | 4 × 256 | 256 × 32 | 32 768 | /// | medium | 16 × 1024 | 1024 × 64 | 1 048 576 | /// | large | 64 × 2048 | 2048 × 128 |16 777 216 | const CONFIGS: &[(usize, usize, usize)] = &[ ( 4, 1, 32), // tiny (16, 4, 64), // medium (64, 8, 128), // large ]; /// Full matrix-multiply benchmark across weight distributions and matrix sizes. /// /// Four variants per size: /// /// | Label | A type | RLE mode? | /// |----------------------|-------------|-----------| /// | `baseline/uniform` | BlockQ4K | — | /// | `rle/uniform` | BlockQ4KRle | raw | /// | `baseline/rle_opt` | BlockQ4K | — | /// | `rle/rle_opt` | BlockQ4KRle | rle | /// /// Throughput is reported as multiply-accumulate operations (M × K × N) per /// second, allowing fair cross-size comparison. /// /// The A and B matrices are pre-built outside `iter()` so fixture construction /// is not timed. Output Vec allocation/deallocation is included because it is /// an inherent part of the current API's real-world cost. 
fn bench_matmul(c: &mut Criterion) { let mut group = c.benchmark_group("matmul"); for &(m, bpr, n) in CONFIGS { let k = bpr * QK_K; let label = format!("{m}x{k}x{n}"); let macs = (m * k * n) as u64; // Build all four A variants and the shared B matrix for this config. let a_q4k_u: Vec = uniform_blocks(m * bpr); let a_rle_u: Vec = a_q4k_u.iter().map(encode).collect(); let a_q4k_r: Vec = rle_optimal_blocks(m * bpr); let a_rle_r: Vec = a_q4k_r.iter().map(encode).collect(); let b = fp16_ones(k, n); // Set throughput for all four benchmarks at this matrix size. group.throughput(Throughput::Elements(macs)); group.bench_function(format!("baseline/uniform/{label}"), |bench| { bench.iter(|| matmul_q4k_fp16( black_box(&a_q4k_u), black_box(&b), m, k, n, )); }); group.bench_function(format!("rle/uniform/{label}"), |bench| { bench.iter(|| matmul_q4k_rle_fp16( black_box(&a_rle_u), black_box(&b), m, k, n, )); }); group.bench_function(format!("baseline/rle_opt/{label}"), |bench| { bench.iter(|| matmul_q4k_fp16( black_box(&a_q4k_r), black_box(&b), m, k, n, )); }); group.bench_function(format!("rle/rle_opt/{label}"), |bench| { bench.iter(|| matmul_q4k_rle_fp16( black_box(&a_rle_r), black_box(&b), m, k, n, )); }); } group.finish(); } // --------------------------------------------------------------------------- // Registration // --------------------------------------------------------------------------- criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul); criterion_main!(benches);