From 992097b4e06a0b3851e9c1557975da0d3d22c053 Mon Sep 17 00:00:00 2001
From: charles
Date: Sun, 12 Apr 2026 19:40:58 -0700
Subject: [PATCH] Create gguf_matmul.rs

---
 src/bin/gguf_matmul.rs | 457 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 457 insertions(+)
 create mode 100644 src/bin/gguf_matmul.rs

diff --git a/src/bin/gguf_matmul.rs b/src/bin/gguf_matmul.rs
new file mode 100644
index 0000000..e2da916
--- /dev/null
+++ b/src/bin/gguf_matmul.rs
@@ -0,0 +1,457 @@
+//! Read one Q4_K layer from a GGUF model file, multiply it against a random
+//! FP16 activation matrix, and compare the baseline and RLE implementations.
+//!
+//! # Usage
+//!
+//! ```text
+//! cargo run --release --bin gguf_matmul -- <model.gguf> [n]
+//! ```
+//!
+//! `n` is the number of activation columns (token count / batch size).
+//! Defaults to 1 (single-token inference).
+//!
+//! # GGUF layout (v2 / v3)
+//!
+//! ```text
+//! ┌─────────────────────────────────────────────────────┐
+//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv u64 │
+//! ├─────────────────────────────────────────────────────┤
+//! │ metadata key-value pairs (variable length) │
+//! ├─────────────────────────────────────────────────────┤
+//! │ tensor info records (variable length) │
+//! ├─────────────────────────────────────────────────────┤
+//! │ padding to `alignment` boundary (default 32 bytes) │
+//! ├─────────────────────────────────────────────────────┤
+//! │ tensor data (concatenated, each individually padded)│
+//! └─────────────────────────────────────────────────────┘
+//! ```
+//!
+//! Each Q4_K block is 144 bytes:
+//! `d(2) + dmin(2) + scales(12) + qs(128)` — identical to our `BlockQ4K`.
+ +use std::{ + env, + error::Error, + fs::File, + io::{self, BufReader, Read, Seek, SeekFrom}, + time::{Duration, Instant}, +}; + +use matrix_testing::{ + matmul_q4k_fp16, BlockQ4K, K_SCALE_SIZE, QK_K, + rle::{encode, matmul_q4k_rle_fp16, BlockQ4KRle}, +}; + +// --------------------------------------------------------------------------- +// GGUF constants +// --------------------------------------------------------------------------- + +/// File magic: bytes b"GGUF" interpreted as a little-endian u32. +const GGUF_MAGIC: u32 = 0x4655_4747; + +/// Default tensor data alignment when not overridden by `general.alignment`. +const GGUF_DEFAULT_ALIGNMENT: u64 = 32; + +/// GGML tensor type code for Q4_K (matches ggml.h `GGML_TYPE_Q4_K`). +const GGML_TYPE_Q4_K: u32 = 12; + +/// Size in bytes of one Q4_K block: d(2) + dmin(2) + scales(12) + qs(128). +const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144 + +// GGUF metadata value type tags (gguf-spec §3.2). +const GTYPE_U8: u32 = 0; +const GTYPE_I8: u32 = 1; +const GTYPE_U16: u32 = 2; +const GTYPE_I16: u32 = 3; +const GTYPE_U32: u32 = 4; +const GTYPE_I32: u32 = 5; +const GTYPE_F32: u32 = 6; +const GTYPE_BOOL: u32 = 7; +const GTYPE_STR: u32 = 8; +const GTYPE_ARR: u32 = 9; +const GTYPE_U64: u32 = 10; +const GTYPE_I64: u32 = 11; +const GTYPE_F64: u32 = 12; + +// --------------------------------------------------------------------------- +// Primitive binary readers (little-endian, no deps) +// --------------------------------------------------------------------------- + +fn read_u8(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 1]; + r.read_exact(&mut b)?; + Ok(b[0]) +} + +fn read_u16(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 2]; + r.read_exact(&mut b)?; + Ok(u16::from_le_bytes(b)) +} + +fn read_u32(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 4]; + r.read_exact(&mut b)?; + Ok(u32::from_le_bytes(b)) +} + +fn read_u64(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 8]; + 
r.read_exact(&mut b)?; + Ok(u64::from_le_bytes(b)) +} + +/// Read a GGUF length-prefixed UTF-8 string. +fn read_str(r: &mut impl Read) -> io::Result { + let len = read_u64(r)? as usize; + let mut buf = vec![0u8; len]; + r.read_exact(&mut buf)?; + Ok(String::from_utf8_lossy(&buf).into_owned()) +} + +/// Skip one GGUF metadata value of the given type tag without storing it. +fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> { + match tag { + GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL => { read_u8(r)?; } + GTYPE_U16 | GTYPE_I16 => { read_u16(r)?; } + GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => { read_u32(r)?; } + GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => { read_u64(r)?; } + GTYPE_STR => { read_str(r)?; } + GTYPE_ARR => { + let elem_tag = read_u32(r)?; + let count = read_u64(r)?; + for _ in 0..count { + skip_value(r, elem_tag)?; + } + } + t => return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown GGUF value type {t}"), + )), + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// Tensor info +// --------------------------------------------------------------------------- + +struct TensorInfo { + name: String, + /// Dimensions in GGML order: dims[0] is the innermost (fastest-varying). + /// For a 2-D weight matrix: dims[0] = K (in-features), dims[1] = M (out-features). + dims: Vec, + dtype: u32, + /// Byte offset of this tensor's data measured from the start of the data + /// section (i.e. add `data_start` to get the absolute file offset). + offset: u64, +} + +impl TensorInfo { + fn n_elements(&self) -> u64 { + self.dims.iter().product() + } + + fn data_bytes(&self) -> u64 { + debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K); + (self.n_elements() / QK_K as u64) * BLOCK_BYTES as u64 + } + + /// Return (m, k) matrix dimensions. + /// dims[0] = K (column / inner dim), dims[1] = M (row / outer dim). 
+ fn matrix_dims(&self) -> (usize, usize) { + assert_eq!(self.dims.len(), 2, "expected 2-D tensor"); + let k = self.dims[0] as usize; + let m = self.dims[1] as usize; + (m, k) + } +} + +// --------------------------------------------------------------------------- +// GGUF header parser +// --------------------------------------------------------------------------- + +/// Parse the GGUF file header and return `(tensor_infos, data_start_offset)`. +/// +/// `data_start_offset` is the absolute byte position where tensor data begins. +fn parse_header(path: &str) -> Result<(Vec, u64), Box> { + let mut r = BufReader::new(File::open(path)?); + + // Magic + version + let magic = read_u32(&mut r)?; + if magic != GGUF_MAGIC { + return Err(format!( + "not a GGUF file (expected magic {GGUF_MAGIC:#010x}, got {magic:#010x})" + ).into()); + } + let version = read_u32(&mut r)?; + if !(2..=3).contains(&version) { + eprintln!("warning: unexpected GGUF version {version} — proceeding anyway"); + } + + let n_tensors = read_u64(&mut r)? as usize; + let n_metadata = read_u64(&mut r)?; + + // Scan metadata KV pairs; capture `general.alignment` if present. + let mut alignment = GGUF_DEFAULT_ALIGNMENT; + for _ in 0..n_metadata { + let key = read_str(&mut r)?; + let tag = read_u32(&mut r)?; + if key == "general.alignment" && tag == GTYPE_U32 { + alignment = read_u32(&mut r)? as u64; + } else { + skip_value(&mut r, tag)?; + } + } + + // Tensor info records. + let mut tensors = Vec::with_capacity(n_tensors); + for _ in 0..n_tensors { + let name = read_str(&mut r)?; + let n_dims = read_u32(&mut r)? as usize; + let dims: Vec = (0..n_dims) + .map(|_| read_u64(&mut r)) + .collect::>()?; + let dtype = read_u32(&mut r)?; + let offset = read_u64(&mut r)?; + tensors.push(TensorInfo { name, dims, dtype, offset }); + } + + // Data starts at the next `alignment`-byte boundary after the header. 
+ let header_end = r.stream_position()?; + let data_start = header_end.div_ceil(alignment) * alignment; + + Ok((tensors, data_start)) +} + +// --------------------------------------------------------------------------- +// Block loader +// --------------------------------------------------------------------------- + +/// Seek to the tensor's data and read its Q4_K blocks into a Vec. +fn load_blocks( + r: &mut (impl Read + Seek), + data_start: u64, + tensor: &TensorInfo, +) -> io::Result> { + r.seek(SeekFrom::Start(data_start + tensor.offset))?; + + let n_blocks = (tensor.n_elements() / QK_K as u64) as usize; + let mut blocks = Vec::with_capacity(n_blocks); + let mut buf = [0u8; BLOCK_BYTES]; + + for _ in 0..n_blocks { + r.read_exact(&mut buf)?; + // Parse field by field — safe, no transmute. + // Layout: d(0..2) dmin(2..4) scales(4..16) qs(16..144) + blocks.push(BlockQ4K { + d: u16::from_le_bytes([buf[0], buf[1]]), + dmin: u16::from_le_bytes([buf[2], buf[3]]), + scales: buf[4..16].try_into().unwrap(), + qs: buf[16..BLOCK_BYTES].try_into().unwrap(), + }); + } + + Ok(blocks) +} + +// --------------------------------------------------------------------------- +// Random FP16 activation matrix (no external rand dep) +// --------------------------------------------------------------------------- + +/// Minimal 64-bit LCG (Knuth / PCG constants). +struct Lcg(u64); + +impl Lcg { + fn new(seed: u64) -> Self { Self(seed) } + + /// Return the next pseudo-random f32 in (−0.05, +0.05). + /// This is a plausible scale for normalised transformer activations. + fn next_f32(&mut self) -> f32 { + self.0 = self.0 + .wrapping_mul(6_364_136_223_846_793_005) + .wrapping_add(1_442_695_040_888_963_407); + // Map high 32 bits to [0, 1) then shift to (−0.05, +0.05). + (self.0 >> 32) as f32 / 4_294_967_296.0 * 0.10 - 0.05 + } +} + +/// Lossily convert a finite f32 in the fp16 normal range to fp16 bits. 
+fn f32_to_fp16(f: f32) -> u16 { + if f == 0.0 { return 0; } + let bits = f.to_bits(); + let sign = ((bits >> 31) as u16) << 15; + let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15; + let mantissa = (bits & 0x007F_FFFF) >> 13; + if exp <= 0 { return sign; } // underflow → signed zero + if exp >= 31 { return sign | 0x7C00; } // overflow → signed infinity + sign | ((exp as u16) << 10) | mantissa as u16 +} + +/// Build a K × N FP16 matrix filled with pseudo-random activations. +fn random_fp16_matrix(k: usize, n: usize, seed: u64) -> Vec { + let mut lcg = Lcg::new(seed); + (0..k * n).map(|_| f32_to_fp16(lcg.next_f32())).collect() +} + +// --------------------------------------------------------------------------- +// Timing helper +// --------------------------------------------------------------------------- + +/// Run `f` `trials` times and return the result of the last run plus the +/// minimum elapsed time across all trials (best-of-N timing). +fn bench T>(trials: usize, mut f: F) -> (T, Duration) { + let mut best = Duration::MAX; + let mut last = None; + for _ in 0..trials { + let t = Instant::now(); + let r = f(); + let dt = t.elapsed(); + if dt < best { best = dt; } + last = Some(r); + } + (last.unwrap(), best) +} + +fn mflops(m: usize, k: usize, n: usize, dur: Duration) -> f64 { + 2.0 * m as f64 * k as f64 * n as f64 / dur.as_secs_f64() / 1e6 +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + if args.len() < 2 { + eprintln!("usage: {} [n_cols=1] [trials=3]", args[0]); + std::process::exit(1); + } + let path = &args[1]; + let n = args.get(2).and_then(|s| s.parse::().ok()).unwrap_or(1); + let trials = args.get(3).and_then(|s| s.parse::().ok()).unwrap_or(3); + + // ── Parse header ──────────────────────────────────────────────────────── + println!("Parsing {path} …"); 
+ let (tensors, data_start) = parse_header(path)?; + + // List all tensor names and types for context. + let n_q4k = tensors.iter().filter(|t| t.dtype == GGML_TYPE_Q4_K).count(); + println!(" {} tensors total, {} are Q4_K", tensors.len(), n_q4k); + + // ── Select the first suitable 2-D Q4_K tensor ─────────────────────────── + // "Suitable" means 2-D and K divisible by QK_K (required for our matmul). + let tensor = tensors + .iter() + .find(|t| { + t.dtype == GGML_TYPE_Q4_K + && t.dims.len() == 2 + && t.dims[0] % QK_K as u64 == 0 + }) + .ok_or("no suitable 2-D Q4_K tensor found in this GGUF file")?; + + let (m, k) = tensor.matrix_dims(); + let n_blocks = m * (k / QK_K); + let data_mib = tensor.data_bytes() as f64 / (1u64 << 20) as f64; + + println!(); + println!("┌─ Tensor ───────────────────────────────────────"); + println!("│ name : {}", tensor.name); + println!("│ shape : {m} rows × {k} cols"); + println!("│ blocks : {n_blocks} ({data_mib:.1} MiB on disk)"); + println!("│ activation: {k} × {n} (random FP16)"); + println!("│ trials : {trials} (best-of reported)"); + println!("└────────────────────────────────────────────────"); + + // ── Load blocks ───────────────────────────────────────────────────────── + print!("\nLoading blocks … "); + let t0 = Instant::now(); + let mut file = BufReader::new(File::open(path)?); + let blocks = load_blocks(&mut file, data_start, tensor)?; + println!("{:.3} s ({} blocks × {} B)", t0.elapsed().as_secs_f64(), n_blocks, BLOCK_BYTES); + + // ── Build random activation matrix [K × N] ─────────────────────────────── + let b_fp16 = random_fp16_matrix(k, n, 0xDEAD_BEEF_CAFE_1234); + + // ── Baseline matmul (best of `trials`) ────────────────────────────────── + let (c_base, t_base) = bench(trials, || { + matmul_q4k_fp16(&blocks, &b_fp16, m, k, n) + }); + println!( + "Baseline: {:.3} s {:.0} MFLOP/s", + t_base.as_secs_f64(), + mflops(m, k, n, t_base), + ); + + // ── RLE encode (best of `trials`) ──────────────────────────────────────── 
+ let (rle_blocks, t_enc) = bench(trials, || -> Vec { + blocks.iter().map(encode).collect() + }); + + for block in &rle_blocks { + println!("Got value {:?}", block); + for pair in block.qs { + println!("top {} bottom {}", (pair >> 4), (pair & 0b1111)); + } + break; + } + + let n_rle = rle_blocks.iter().filter(|b| b.is_rle()).count(); + let n_raw = n_blocks - n_rle; + let avg_pairs = if n_rle > 0 { + rle_blocks.iter() + .filter(|b| b.is_rle()) + .map(|b| b.rle_len() as f64) + .sum::() / n_rle as f64 + } else { 0.0 }; + + println!( + "Encode : {:.3} s RLE {n_rle}/{n_blocks} blocks ({:.1}%), \ + raw {n_raw}/{n_blocks} ({:.1}%), avg {avg_pairs:.1} pairs/RLE block", + t_enc.as_secs_f64(), + 100.0 * n_rle as f64 / n_blocks as f64, + 100.0 * n_raw as f64 / n_blocks as f64, + ); + + // ── RLE matmul (best of `trials`) ──────────────────────────────────────── + let (c_rle, t_rle) = bench(trials, || { + matmul_q4k_rle_fp16(&rle_blocks, &b_fp16, m, k, n) + }); + println!( + "RLE : {:.3} s {:.0} MFLOP/s", + t_rle.as_secs_f64(), + mflops(m, k, n, t_rle), + ); + + // ── Correctness check ──────────────────────────────────────────────────── + let max_err = c_base + .iter() + .zip(&c_rle) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + + let ok = max_err < 1e-2; + println!( + "\nMax |baseline − rle| = {max_err:.2e} {}", + if ok { "✓" } else { "✗ (unexpectedly large — check block layout)" } + ); + + // ── Summary ────────────────────────────────────────────────────────────── + println!(); + println!("Speedup (matmul only): {:.2}×", t_base.as_secs_f64() / t_rle.as_secs_f64()); + println!("Speedup (matmul + encode once): {:.2}×", + t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64()); + + // Show a small slice of the output so it's clear something real happened. + let show = n.min(4); + print!("First {show} output(s) of row 0: "); + for j in 0..show { + print!("{:.4} ", c_base[j]); + } + println!(); + + if !ok { + std::process::exit(1); + } + Ok(()) +}