From 4ca68c7f94d3437a7541fe4310eeefe85343b5ee Mon Sep 17 00:00:00 2001 From: charles Date: Sun, 12 Apr 2026 15:40:19 -0700 Subject: [PATCH] added benchmark --- Cargo.lock | 554 ++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 7 + benches/matmul.rs | 316 ++++++++++++++++++++++++++ 3 files changed, 877 insertions(+) create mode 100644 benches/matmul.rs diff --git a/Cargo.lock b/Cargo.lock index 2b7cb4d..2911142 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,560 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.184" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + [[package]] name = "matrix-testing" version = "0.1.0" +dependencies = [ + "criterion", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index e48d2ed..fd00172 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,10 @@ version = "0.1.0" edition = "2024" [dependencies] + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "matmul" +harness = false diff --git a/benches/matmul.rs b/benches/matmul.rs new file mode 100644 index 0000000..ef12a1e --- /dev/null +++ b/benches/matmul.rs @@ -0,0 +1,316 @@ +//! # Benchmark: BlockQ4K vs BlockQ4KRle +//! +//! Measures three operations across two weight distributions: +//! +//! | Group | What is timed | +//! |--------------|--------------------------------------------------| +//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks | +//! | `dequantize` | Single-block dequantisation for all three paths | +//! | `matmul` | Full A×B multiply at three matrix sizes | +//! +//! ## Weight distributions +//! +//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG). +//! Consecutive bytes almost never repeat, so each block produces ~128 +//! single-byte runs. At 2 bytes per pair that would require ~256 bytes, +//! which exceeds the 128-byte raw payload, so `encode` always keeps these +//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical +//! unstructured LLM weight matrices. +//! +//! **rle_optimal** — every byte in a block's qs field is the same value. +//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 — +//! and sets IS_RLE = 1. This is the theoretical compression maximum, and +//! is representative of highly sparse or dead-neuron weight matrices. + +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use matrix_testing::{ + dequantize_block_q4k, matmul_q4k_fp16, + rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle}, + BlockQ4K, K_SCALE_SIZE, QK_K, +}; + +// --------------------------------------------------------------------------- +// Minimal 64-bit LCG — no external dependencies needed +// --------------------------------------------------------------------------- + +/// Deterministic pseudo-random generator using Knuth / PCG constants. +struct Lcg(u64); + +impl Lcg { + fn new(seed: u64) -> Self { + Self(seed) + } + + fn next_u8(&mut self) -> u8 { + self.0 = self + .0 + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1_442_695_040_888_963_407); + (self.0 >> 33) as u8 + } +} + +// --------------------------------------------------------------------------- +// Fixture helpers +// --------------------------------------------------------------------------- + +/// Lossily encode a finite, non-subnormal f32 to its fp16 bit pattern. +/// +/// Only used for block header fields (d, dmin); values must lie within the +/// fp16 normal range [~6.1e-5, 65504]. No overflow / underflow checks. +fn f32_to_fp16(f: f32) -> u16 { + if f == 0.0 { + return 0; + } + let bits = f.to_bits(); + let sign = ((bits >> 31) as u16) << 15; + let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15; + let mantissa = (bits & 0x007F_FFFF) >> 13; + sign | ((exp as u16) << 10) | mantissa as u16 +} + +/// Build a 12-byte `scales` array where all 8 sub-blocks share the same +/// `scale` and `min` (both must be < 16, matching the test helper in lib.rs). +fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] { + let mut s = [0u8; K_SCALE_SIZE]; + for j in 0..4 { + s[j] = scale; + s[j + 4] = min; + } + for j in 8..12 { + s[j] = (scale & 0x0F) | ((min & 0x0F) << 4); + } + s +} + +/// Return `count` blocks whose qs bytes are pseudo-random. +/// +/// With uniformly distributed bytes, consecutive bytes match with probability +/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value, +/// count) pairs would need ~256 bytes — more than the 128-byte raw payload — +/// so `encode` will always select **raw mode** (IS_RLE = 0). +fn uniform_blocks(count: usize) -> Vec { + let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234); + let scales = make_scales(7, 2); + let d = f32_to_fp16(0.01); + let dmin = f32_to_fp16(0.001); + (0..count) + .map(|_| { + let mut qs = [0u8; QK_K / 2]; + for b in qs.iter_mut() { + *b = rng.next_u8(); + } + BlockQ4K { d, dmin, scales, qs } + }) + .collect() +} + +/// Return `count` blocks where every qs byte is the same value. +/// +/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes +/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1). +/// Each block uses a fresh pseudo-random byte so no two blocks are identical, +/// avoiding degenerate cache-warm effects across the batch. +fn rle_optimal_blocks(count: usize) -> Vec { + let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0); + let scales = make_scales(7, 2); + let d = f32_to_fp16(0.01); + let dmin = f32_to_fp16(0.001); + (0..count) + .map(|_| { + let byte = rng.next_u8(); + BlockQ4K { d, dmin, scales, qs: [byte; QK_K / 2] } + }) + .collect() +} + +/// Build a K×N FP16 matrix (raw u16 bits) where every element is 1.0. +fn fp16_ones(k: usize, n: usize) -> Vec { + vec![f32_to_fp16(1.0); k * n] +} + +// --------------------------------------------------------------------------- +// Group 1 — encode +// --------------------------------------------------------------------------- + +/// Number of blocks encoded per iteration in `bench_encode`. +const ENCODE_BATCH: usize = 512; + +/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output. +/// +/// Both distributions perform the same O(128) run-length scan. The only +/// divergence is at the output stage: +/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes. +/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE. +fn bench_encode(c: &mut Criterion) { + let uniform = uniform_blocks(ENCODE_BATCH); + let rle_opt = rle_optimal_blocks(ENCODE_BATCH); + + let mut group = c.benchmark_group("encode"); + // Throughput = blocks encoded per second. + group.throughput(Throughput::Elements(ENCODE_BATCH as u64)); + + group.bench_function("uniform", |b| { + b.iter(|| { + for blk in &uniform { + black_box(encode(black_box(blk))); + } + }); + }); + + group.bench_function("rle_optimal", |b| { + b.iter(|| { + for blk in &rle_opt { + black_box(encode(black_box(blk))); + } + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Group 2 — dequantize (single block) +// --------------------------------------------------------------------------- + +/// Compares the three single-block dequantisation code paths. +/// +/// | Variant | Block type | Encoding | Extra work vs baseline | +/// |------------------|-------------|----------|-------------------------------| +/// | `q4k_baseline` | BlockQ4K | — | none | +/// | `rle_raw_mode` | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) | +/// | `rle_rle_mode` | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf | +/// +/// Throughput is the number of dequantised weights produced per second. +fn bench_dequantize(c: &mut Criterion) { + let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap(); + let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap(); + + let rle_raw = encode(&q4k_uniform); // IS_RLE = 0 + let rle_rle = encode(&q4k_rle_opt); // IS_RLE = 1 + + // Confirm the fixtures ended up in the right encoding modes. + assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode"); + assert!(rle_rle.is_rle(), "rle-optimal block should encode to rle mode"); + + let mut group = c.benchmark_group("dequantize"); + // Throughput = QK_K (256) weights dequantised per second. + group.throughput(Throughput::Elements(QK_K as u64)); + + group.bench_function("q4k_baseline", |b| { + b.iter(|| { + let mut out = [0.0f32; QK_K]; + dequantize_block_q4k(black_box(&q4k_uniform), &mut out); + black_box(out) + }); + }); + + group.bench_function("rle_raw_mode", |b| { + b.iter(|| { + let mut out = [0.0f32; QK_K]; + dequantize_block_q4k_rle(black_box(&rle_raw), &mut out); + black_box(out) + }); + }); + + group.bench_function("rle_rle_mode", |b| { + b.iter(|| { + let mut out = [0.0f32; QK_K]; + dequantize_block_q4k_rle(black_box(&rle_rle), &mut out); + black_box(out) + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Group 3 — matmul +// --------------------------------------------------------------------------- + +/// Matrix size configurations as (M rows, blocks-per-row, N output cols). +/// +/// The shared dimension K = blocks_per_row × QK_K. +/// +/// | Label | A shape | B shape | total MACs | +/// |--------|------------|-------------|------------| +/// | tiny | 4 × 256 | 256 × 32 | 32 768 | +/// | medium | 16 × 1024 | 1024 × 64 | 1 048 576 | +/// | large | 64 × 2048 | 2048 × 128 |16 777 216 | +const CONFIGS: &[(usize, usize, usize)] = &[ + ( 4, 1, 32), // tiny + (16, 4, 64), // medium + (64, 8, 128), // large +]; + +/// Full matrix-multiply benchmark across weight distributions and matrix sizes. +/// +/// Four variants per size: +/// +/// | Label | A type | RLE mode? | +/// |----------------------|-------------|-----------| +/// | `baseline/uniform` | BlockQ4K | — | +/// | `rle/uniform` | BlockQ4KRle | raw | +/// | `baseline/rle_opt` | BlockQ4K | — | +/// | `rle/rle_opt` | BlockQ4KRle | rle | +/// +/// Throughput is reported as multiply-accumulate operations (M × K × N) per +/// second, allowing fair cross-size comparison. +/// +/// The A and B matrices are pre-built outside `iter()` so fixture construction +/// is not timed. Output Vec allocation/deallocation is included because it is +/// an inherent part of the current API's real-world cost. +fn bench_matmul(c: &mut Criterion) { + let mut group = c.benchmark_group("matmul"); + + for &(m, bpr, n) in CONFIGS { + let k = bpr * QK_K; + let label = format!("{m}x{k}x{n}"); + let macs = (m * k * n) as u64; + + // Build all four A variants and the shared B matrix for this config. + let a_q4k_u: Vec = uniform_blocks(m * bpr); + let a_rle_u: Vec = a_q4k_u.iter().map(encode).collect(); + + let a_q4k_r: Vec = rle_optimal_blocks(m * bpr); + let a_rle_r: Vec = a_q4k_r.iter().map(encode).collect(); + + let b = fp16_ones(k, n); + + // Set throughput for all four benchmarks at this matrix size. + group.throughput(Throughput::Elements(macs)); + + group.bench_function(format!("baseline/uniform/{label}"), |bench| { + bench.iter(|| matmul_q4k_fp16( + black_box(&a_q4k_u), black_box(&b), m, k, n, + )); + }); + + group.bench_function(format!("rle/uniform/{label}"), |bench| { + bench.iter(|| matmul_q4k_rle_fp16( + black_box(&a_rle_u), black_box(&b), m, k, n, + )); + }); + + group.bench_function(format!("baseline/rle_opt/{label}"), |bench| { + bench.iter(|| matmul_q4k_fp16( + black_box(&a_q4k_r), black_box(&b), m, k, n, + )); + }); + + group.bench_function(format!("rle/rle_opt/{label}"), |bench| { + bench.iter(|| matmul_q4k_rle_fp16( + black_box(&a_rle_r), black_box(&b), m, k, n, + )); + }); + } + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Registration +// --------------------------------------------------------------------------- + +criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul); +criterion_main!(benches);