//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//!
//! | Group        | What is timed                                    |
//! |--------------|--------------------------------------------------|
//! | `encode`     | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths  |
//! | `matmul`     | Full A×B multiply at three matrix sizes          |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs.  At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0).  This is representative of typical
//! unstructured LLM weight matrices.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1.  This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
    dequantize_block_q4k, matmul_q4k_fp16,
    rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle},
    BlockQ4K, K_SCALE_SIZE, QK_K,
};

// ---------------------------------------------------------------------------
// Minimal 64-bit LCG — no external dependencies needed
// ---------------------------------------------------------------------------

/// Tiny deterministic byte source: a 64-bit linear congruential generator.
///
/// The wrapped `u64` is the full generator state.  Two generators created
/// from the same seed yield the same byte sequence, which keeps benchmark
/// fixtures reproducible from run to run.
struct Lcg(u64);

impl Lcg {
    /// Multiplier applied to the state on every step.
    const MULTIPLIER: u64 = 6_364_136_223_846_793_005;
    /// Increment added to the state on every step.
    const INCREMENT: u64 = 1_442_695_040_888_963_407;

    /// Start a generator whose initial state is exactly `seed`.
    fn new(seed: u64) -> Self {
        Lcg(seed)
    }

    /// Advance the state by one step and return bits 33..=40 of the new state.
    ///
    /// All arithmetic wraps modulo 2^64.
    fn next_u8(&mut self) -> u8 {
        let Lcg(state) = self;
        let advanced = state
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        *state = advanced;
        (advanced >> 33) as u8
    }
}

// ---------------------------------------------------------------------------
// Fixture helpers
// ---------------------------------------------------------------------------

/// Lossily encode an f32 to its fp16 (IEEE 754 binary16) bit pattern.
///
/// Only used for block header fields (d, dmin) and the FP16 `B` matrix.
///
/// * Values in the fp16 normal range [~6.1e-5, 65504] are converted by
///   re-biasing the exponent and **truncating** the mantissa to 10 bits
///   (no rounding).
/// * Values too small for an fp16 normal (including zeros and f32
///   subnormals) are flushed to a zero that keeps the input's sign.
/// * Values too large for fp16, and infinities, saturate to a signed
///   infinity.
/// * NaN maps to a quiet fp16 NaN.
///
/// Without the range checks, an out-of-range exponent would wrap when cast
/// to `u16` and spill into the sign bit, silently producing an unrelated
/// value.
fn f32_to_fp16(f: f32) -> u16 {
    const FP16_INFINITY: u16 = 0x7C00;
    const FP16_QUIET_NAN: u16 = 0x7E00;

    let bits = f.to_bits();
    let sign = ((bits >> 31) as u16) << 15;

    if f.is_nan() {
        return sign | FP16_QUIET_NAN;
    }

    // Re-bias the exponent from f32 (bias 127) to fp16 (bias 15).
    let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15;
    if exp <= 0 {
        // Below the smallest fp16 normal: flush to signed zero.
        return sign;
    }
    if exp >= 0x1F {
        // Above the largest finite fp16 (or already infinite).
        return sign | FP16_INFINITY;
    }

    // Keep the top 10 of the 23 mantissa bits; the rest are dropped.
    let mantissa = ((bits & 0x007F_FFFF) >> 13) as u16;
    sign | ((exp as u16) << 10) | mantissa
}

/// Produce a `scales` array in which one `scale` / `min` pair is repeated.
///
/// Layout written by this helper:
///
/// * bytes 0..4  — `scale`, stored as given
/// * bytes 4..8  — `min`, stored as given
/// * bytes 8..12 — low nibble of `scale` in bits 0..=3, low nibble of `min`
///   in bits 4..=7
///
/// Any remaining bytes stay zero.  Both arguments are expected to be < 16
/// (matching the test helper in lib.rs); the first eight bytes are not
/// masked.
fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
    let packed_nibbles = (scale & 0x0F) | ((min & 0x0F) << 4);

    let mut packed = [0u8; K_SCALE_SIZE];
    packed[..4].fill(scale);
    packed[4..8].fill(min);
    packed[8..12].fill(packed_nibbles);
    packed
}

/// Build `count` blocks whose `qs` payload is filled from the LCG.
///
/// Every block shares the same header (`d`, `dmin`, `scales`); only the
/// `qs` bytes differ, drawn sequentially from one fixed-seed generator so the
/// fixture is identical on every run.
///
/// With roughly uniform bytes, neighbouring bytes match with probability
/// about 1/256, giving ~128 runs per block.  As (value, count) pairs that
/// would take ~256 bytes — more than the raw payload — so `encode` is
/// expected to keep these blocks in **raw mode** (IS_RLE = 0).
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
    // Header fields common to every generated block.
    let d      = f32_to_fp16(0.01);
    let dmin   = f32_to_fp16(0.001);
    let scales = make_scales(7, 2);

    let mut rng    = Lcg::new(0xDEAD_BEEF_CAFE_1234);
    let mut blocks = Vec::with_capacity(count);
    for _ in 0..count {
        let mut qs = [0u8; QK_K / 2];
        for byte in &mut qs {
            *byte = rng.next_u8();
        }
        blocks.push(BlockQ4K { d, dmin, scales, qs });
    }
    blocks
}

/// Build `count` blocks in which all `qs` bytes of a block are equal.
///
/// A constant byte array is a single run, so it can be described by one
/// (value, count) pair; `encode` is expected to pick **RLE mode**
/// (IS_RLE = 1) for these blocks.  The fill byte is drawn from a fixed-seed
/// LCG once per block, so blocks generally differ from one another while
/// the fixture stays reproducible.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
    // Header fields common to every generated block.
    let d      = f32_to_fp16(0.01);
    let dmin   = f32_to_fp16(0.001);
    let scales = make_scales(7, 2);

    let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
    std::iter::repeat_with(|| rng.next_u8())
        .take(count)
        .map(|fill| BlockQ4K { d, dmin, scales, qs: [fill; QK_K / 2] })
        .collect()
}

/// Allocate a `k * n`-element buffer of FP16 bit patterns, all encoding 1.0.
///
/// Used as the K×N `B` operand of the matmul benchmarks.
fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
    let one = f32_to_fp16(1.0);
    let len = k * n;
    vec![one; len]
}

// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------

/// Number of blocks encoded per iteration in `bench_encode`.
///
/// Also passed to `Throughput::Elements` for the `encode` group, so the
/// reported throughput is in blocks per second.
const ENCODE_BATCH: usize = 512;

/// Benchmark group `encode`: BlockQ4K → BlockQ4KRle over a batch of blocks.
///
/// Two benchmarks are registered, `uniform` and `rle_optimal`, each encoding
/// all `ENCODE_BATCH` blocks of its fixture once per iteration.  Fixtures
/// are built before timing starts.
///
/// Expected behaviour of `encode` for the two fixtures:
/// * **uniform**     — too many runs for RLE, so the raw 128 bytes are kept.
/// * **rle_optimal** — a single run, so one pair is written and IS_RLE set.
fn bench_encode(c: &mut Criterion) {
    // (benchmark id, input batch), in registration order.
    let fixtures = [
        ("uniform",     uniform_blocks(ENCODE_BATCH)),
        ("rle_optimal", rle_optimal_blocks(ENCODE_BATCH)),
    ];

    let mut group = c.benchmark_group("encode");
    // One element = one encoded block.
    group.throughput(Throughput::Elements(ENCODE_BATCH as u64));

    for (name, blocks) in &fixtures {
        group.bench_function(*name, |bencher| {
            bencher.iter(|| {
                for blk in blocks.iter() {
                    black_box(encode(black_box(blk)));
                }
            });
        });
    }

    group.finish();
}

// ---------------------------------------------------------------------------
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------

/// Benchmark group `dequantize`: one block through each decode path.
///
/// | Benchmark id     | Input        | Decoder                    |
/// |------------------|--------------|----------------------------|
/// | `q4k_baseline`   | BlockQ4K     | `dequantize_block_q4k`     |
/// | `rle_raw_mode`   | BlockQ4KRle, IS_RLE=0 | `dequantize_block_q4k_rle` |
/// | `rle_rle_mode`   | BlockQ4KRle, IS_RLE=1 | `dequantize_block_q4k_rle` |
///
/// The raw-mode input is the encoding of the same uniform block used for the
/// baseline; the RLE-mode input is the encoding of a constant-byte block.
/// Both encodings are checked with `is_rle()` before any timing happens, so a
/// fixture that lands in the wrong mode aborts the run.
///
/// Each iteration zero-initialises a fresh `QK_K`-element output array,
/// decodes into it, and hands it to `black_box`.  Throughput is reported in
/// dequantised weights per second.
fn bench_dequantize(c: &mut Criterion) {
    let plain_block    = uniform_blocks(1).pop().unwrap();
    let constant_block = rle_optimal_blocks(1).pop().unwrap();

    let raw_encoded = encode(&plain_block);    // expected IS_RLE = 0
    let rle_encoded = encode(&constant_block); // expected IS_RLE = 1

    // Fail fast if either fixture ended up in the wrong encoding mode.
    assert!(!raw_encoded.is_rle(), "uniform block should encode to raw mode");
    assert!(rle_encoded.is_rle(), "rle-optimal block should encode to rle mode");

    let mut group = c.benchmark_group("dequantize");
    // One element = one dequantised weight; each call produces QK_K of them.
    group.throughput(Throughput::Elements(QK_K as u64));

    group.bench_function("q4k_baseline", |bencher| {
        bencher.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k(black_box(&plain_block), &mut weights);
            black_box(weights)
        });
    });

    group.bench_function("rle_raw_mode", |bencher| {
        bencher.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k_rle(black_box(&raw_encoded), &mut weights);
            black_box(weights)
        });
    });

    group.bench_function("rle_rle_mode", |bencher| {
        bencher.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k_rle(black_box(&rle_encoded), &mut weights);
            black_box(weights)
        });
    });

    group.finish();
}

// ---------------------------------------------------------------------------
// Group 3 — matmul
// ---------------------------------------------------------------------------

/// Matrix size configurations as (M rows, blocks-per-row, N output cols).
///
/// `bench_matmul` destructures each entry as `(m, bpr, n)`.  The shared
/// dimension is K = blocks_per_row × QK_K, and the A fixture holds
/// M × blocks_per_row blocks.
///
/// The shapes below assume QK_K = 256 (total MACs = M × K × N):
///
/// | Label  | A shape    | B shape     | total MACs |
/// |--------|------------|-------------|------------|
/// | tiny   | 4 × 256    | 256 × 32    |    32 768  |
/// | medium | 16 × 1024  | 1024 × 64   | 1 048 576  |
/// | large  | 64 × 2048  | 2048 × 128  |16 777 216  |
const CONFIGS: &[(usize, usize, usize)] = &[
    ( 4, 1,  32), // tiny
    (16, 4,  64), // medium
    (64, 8, 128), // large
];

/// Benchmark group `matmul`: full A×B multiply for every entry in `CONFIGS`.
///
/// For each size, four benchmarks are registered in this order, with ids of
/// the form `<path>/<distribution>/<M>x<K>x<N>`:
///
/// | Id prefix            | A type      | Multiply function      |
/// |----------------------|-------------|------------------------|
/// | `baseline/uniform`   | BlockQ4K    | `matmul_q4k_fp16`      |
/// | `rle/uniform`        | BlockQ4KRle | `matmul_q4k_rle_fp16`  |
/// | `baseline/rle_opt`   | BlockQ4K    | `matmul_q4k_fp16`      |
/// | `rle/rle_opt`        | BlockQ4KRle | `matmul_q4k_rle_fp16`  |
///
/// The `rle/*` inputs are the `encode`d form of the matching `baseline/*`
/// inputs, and B is an all-ones FP16 matrix shared by all four.
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second so that different sizes can be compared.
///
/// All operands are built outside `iter()`, so fixture construction is not
/// timed.  The multiply's return value is produced inside the timed closure,
/// so any cost of creating and dropping it is part of the measurement.
fn bench_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("matmul");

    for &(m, bpr, n) in CONFIGS {
        let k          = bpr * QK_K;
        let size_label = format!("{m}x{k}x{n}");
        let block_cnt  = m * bpr;

        // A operands: plain and RLE-container form, per distribution.
        let uniform_q4k: Vec<BlockQ4K>    = uniform_blocks(block_cnt);
        let uniform_rle: Vec<BlockQ4KRle> = uniform_q4k.iter().map(encode).collect();
        let rle_opt_q4k: Vec<BlockQ4K>    = rle_optimal_blocks(block_cnt);
        let rle_opt_rle: Vec<BlockQ4KRle> = rle_opt_q4k.iter().map(encode).collect();

        // B operand shared by every variant at this size.
        let b = fp16_ones(k, n);

        // Applies to all four benchmarks registered below.
        group.throughput(Throughput::Elements((m * k * n) as u64));

        let distributions = [
            ("uniform", &uniform_q4k, &uniform_rle),
            ("rle_opt", &rle_opt_q4k, &rle_opt_rle),
        ];

        for (dist, a_q4k, a_rle) in distributions {
            group.bench_function(format!("baseline/{dist}/{size_label}"), |bench| {
                bench.iter(|| matmul_q4k_fp16(black_box(a_q4k), black_box(&b), m, k, n));
            });

            group.bench_function(format!("rle/{dist}/{size_label}"), |bench| {
                bench.iter(|| matmul_q4k_rle_fp16(black_box(a_rle), black_box(&b), m, k, n));
            });
        }
    }

    group.finish();
}

// ---------------------------------------------------------------------------
// Registration
// ---------------------------------------------------------------------------

// Bundle the three benchmark functions into the `benches` group, then hand
// that group to Criterion's generated entry point.
criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul);
criterion_main!(benches);