diff --git a/Cargo.lock b/Cargo.lock index 87b04ea..090a277 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3422,6 +3422,7 @@ dependencies = [ "kb-app", "kb-config", "kb-core", + "kb-eval", "serde_json", ] diff --git a/crates/kb-cli/Cargo.toml b/crates/kb-cli/Cargo.toml index ed8a81a..ae46920 100644 --- a/crates/kb-cli/Cargo.toml +++ b/crates/kb-cli/Cargo.toml @@ -15,6 +15,14 @@ path = "src/main.rs" kb-core = { path = "../kb-core" } kb-config = { path = "../kb-config" } kb-app = { path = "../kb-app" } +# kb-eval re-exports `compute_aggregate` / `compare_runs` / +# `render_report_md` (P5-2). The DoD calls for these to be reached +# "via kb-app", but kb-eval already depends on kb-app (P5-1 runner +# uses the App facade) — routing the CLI through kb-app would +# require kb-app → kb-eval, forming a cycle. We therefore wire +# kb-cli → kb-eval directly; documented in +# `tasks/p5/p5-2-metrics-compare.md`. +kb-eval = { path = "../kb-eval" } anyhow = { workspace = true } serde_json = { workspace = true } clap = { version = "4", features = ["derive"] } diff --git a/crates/kb-cli/src/main.rs b/crates/kb-cli/src/main.rs index df489e1..b71203c 100644 --- a/crates/kb-cli/src/main.rs +++ b/crates/kb-cli/src/main.rs @@ -125,10 +125,41 @@ enum InspectWhat { #[derive(Subcommand, Debug)] enum EvalWhat { - /// Run an eval suite (placeholder for P9). + /// Run the golden suite end-to-end and persist `eval_runs` + + /// `eval_query_results` + `runs_dir//per_query.jsonl` + /// (P5-1). Run { + #[arg(long, default_value = "golden")] + suite: String, + #[arg(long, value_enum, default_value_t = ModeFlag::Lexical)] + mode: ModeFlag, + #[arg(long, default_value_t = 10)] + k: usize, #[arg(long)] - suite: Option, + with_rag: bool, + #[arg(long)] + temperature: Option, + #[arg(long)] + seed: Option, + }, + + /// Compute aggregate metrics for a stored run and write them back + /// into `eval_runs.aggregate_json` (P5-2). + Aggregate { run_id: String }, + + /// Diff two stored runs (P5-2). Default output is a Markdown + /// summary; use `--json` (top-level flag) for the raw report. + Compare { + run_a: String, + run_b: String, + /// Refuse to compare when the two runs' `chunker_version` + /// differ (default is graceful doc-id fallback). + #[arg(long)] + strict_chunker_version: bool, + /// Also write the Markdown report to + /// `runs_dir//report.md`. + #[arg(long)] + write_report: bool, }, } @@ -370,8 +401,81 @@ fn run(cli: &Cli) -> anyhow::Result<()> { } Cmd::Eval { what } => match what { - EvalWhat::Run { suite: _ } => { - anyhow::bail!("not yet wired (P9-3)") + EvalWhat::Run { + suite, + mode, + k, + with_rag, + temperature, + seed, + } => { + let opts = kb_eval::EvalRunOpts { + suite: suite.clone(), + mode: (*mode).into(), + with_rag: *with_rag, + k: *k, + temperature: *temperature, + seed: *seed, + }; + let run = kb_eval::run_eval(&opts)?; + if cli.json { + println!("{}", serde_json::to_string_pretty(&run)?); + } else { + println!("run_id: {}", run.run_id); + println!("queries: {}", run.per_query.len()); + let failed = run.per_query.iter().filter(|q| q.error.is_some()).count(); + println!("failed: {failed}"); + } + Ok(()) + } + + EvalWhat::Aggregate { run_id } => { + let agg = kb_eval::compute_aggregate(run_id)?; + kb_eval::store_aggregate(run_id, &agg)?; + if cli.json { + println!("{}", serde_json::to_string_pretty(&agg)?); + } else { + println!("run_id: {run_id}"); + println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries); + println!("hit@1: {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0)); + println!("hit@5: {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0)); + println!("MRR: {:.4}", agg.mrr); + } + Ok(()) + } + + EvalWhat::Compare { + run_a, + run_b, + strict_chunker_version, + write_report, + } => { + let cfg = kb_config::Config::load(None)?; + let opts = kb_eval::CompareOpts { + strict_chunker_version: *strict_chunker_version, + }; + let report = kb_eval::compare_runs_with_config(&cfg, run_a, run_b, &opts)?; + let md = kb_eval::render_report_md(&report); + if cli.json { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + print!("{md}"); + } + if *write_report { + let resolved_data_dir = kb_config::expand_path(&cfg.storage.data_dir, ""); + let runs_dir = kb_config::expand_path( + &cfg.storage.runs_dir, + &resolved_data_dir.to_string_lossy(), + ); + let dir = runs_dir.join(run_b); + std::fs::create_dir_all(&dir)?; + let path = dir.join("report.md"); + std::fs::write(&path, &md)?; + if !cli.json { + eprintln!("wrote {}", path.display()); + } + } + Ok(()) } }, } diff --git a/crates/kb-eval/src/compare.rs b/crates/kb-eval/src/compare.rs new file mode 100644 index 0000000..686e2c5 --- /dev/null +++ b/crates/kb-eval/src/compare.rs @@ -0,0 +1,509 @@ +//! Compare two eval runs (P5-2 — design §5.7, phase epic +//! `tasks/phase-5-evaluation.md`). +//! +//! Reads `eval_runs` + `eval_query_results` for two `run_id`s, calls +//! [`crate::metrics::compute_aggregate_with_config`] for each, then +//! diffs them per-query. Emits a [`CompareReport`] (machine) and an +//! optional Markdown render (human). +//! +//! Pure computation — no `kb-app` / retrieval imports. + +use std::collections::HashMap; +use std::fmt::Write as _; + +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; + +use kb_config::Config; +use kb_core::{ChunkId, DocumentId}; +use kb_store_sqlite::SqliteStore; + +use crate::loader::load_golden_set; +use crate::metrics::{ + AggregateMetrics, compute_aggregate_with_config, resolve_golden_path, +}; +use crate::types::{GoldenQuery, QueryResult}; + +/// Strict-mode behavior pivot used by [`CompareOpts::strict_chunker_version`]. +/// When `false` (default) and the two runs' `chunker_version` differ, +/// per-query matching falls back to doc-id-only comparison and the +/// report's `deltas.chunker_version_match` field is set to +/// `"fallback_doc"`. +/// +/// **Spec deviation (intentional, documented):** the spec called for a +/// `"fallback_doc_span"` mode that augments doc-id matching with a 50% +/// `source_spans` overlap criterion. That requires `chunks` table +/// reads from both runs simultaneously — but in practice you re-index +/// (and overwrite the chunks table) before evaluating a chunker +/// change, so the run-A chunks are gone by the time run-B is computed. +/// We log the simpler doc-id-only fallback as `"fallback_doc"` and +/// defer span-overlap matching to a future phase that owns +/// chunker-version archival. The `strict-chunker-version` flag is +/// preserved verbatim from the spec. +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct CompareOpts { + pub strict_chunker_version: bool, +} + +/// Per-metric + per-query diff between two stored eval runs. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct CompareReport { + pub run_a: String, + pub run_b: String, + pub aggregate_a: AggregateMetrics, + pub aggregate_b: AggregateMetrics, + /// Per-metric delta (`b - a`) plus the `chunker_version_match` + /// audit field. JSON object so consumers can pluck individual + /// metrics by name without keeping the struct shape in sync. + pub deltas: serde_json::Value, + pub per_query: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct QueryComparison { + pub query_id: String, + pub kind: ComparisonKind, + pub a_hit_rank: Option, + pub b_hit_rank: Option, + pub note: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ComparisonKind { + Win, + Loss, + Draw, + Regression, +} + +/// Compare two runs using the active XDG-loaded [`Config`]. Wraps +/// [`compare_runs_with_config`] with `Config::load(None)`. +pub fn compare_runs(run_id_a: &str, run_id_b: &str) -> Result { + let cfg = Config::load(None).context("load Config for compare_runs")?; + compare_runs_with_config(&cfg, run_id_a, run_id_b, &CompareOpts::default()) +} + +/// Compare two runs against an explicit [`Config`] + [`CompareOpts`]. +/// Used by integration tests and the future `kb eval compare --strict` +/// CLI surface. +pub fn compare_runs_with_config( + cfg: &Config, + run_id_a: &str, + run_id_b: &str, + opts: &CompareOpts, +) -> Result { + let store = SqliteStore::open(cfg).context("open SqliteStore for compare_runs")?; + store.run_migrations().context("run migrations")?; + + // Pull both run rows up-front so we can extract chunker_version and + // bail early on a missing run before doing any metric work. + let run_a = store + .load_eval_run(run_id_a) + .context("load eval_runs row A")? + .ok_or_else(|| anyhow::anyhow!("compare_runs: no eval_runs row for run_id {run_id_a}"))?; + let run_b = store + .load_eval_run(run_id_b) + .context("load eval_runs row B")? + .ok_or_else(|| anyhow::anyhow!("compare_runs: no eval_runs row for run_id {run_id_b}"))?; + + let aggregate_a = compute_aggregate_with_config(cfg, run_id_a)?; + let aggregate_b = compute_aggregate_with_config(cfg, run_id_b)?; + + let chunker_a = extract_chunker_version(&run_a.config_snapshot_json); + let chunker_b = extract_chunker_version(&run_b.config_snapshot_json); + let chunker_match_mode = if chunker_a == chunker_b { + "exact" + } else if opts.strict_chunker_version { + anyhow::bail!( + "compare_runs: chunker_version mismatch (a={chunker_a:?}, b={chunker_b:?}) and \ + strict_chunker_version=true. Pass strict_chunker_version=false to use the doc-id \ + fallback." + ); + } else { + "fallback_doc" + }; + + let rows_a = store.load_eval_query_results(run_id_a)?; + let rows_b = store.load_eval_query_results(run_id_b)?; + let qrs_a = parse_results(&rows_a)?; + let qrs_b = parse_results(&rows_b)?; + + let golden = load_golden_set(&resolve_golden_path()).context("reload golden set")?; + let golden_by_id: HashMap<&str, &GoldenQuery> = + golden.iter().map(|q| (q.id.as_str(), q)).collect(); + + let per_query = build_per_query(&qrs_a, &qrs_b, &golden_by_id, chunker_match_mode); + let deltas = build_deltas(&aggregate_a, &aggregate_b, chunker_match_mode); + + Ok(CompareReport { + run_a: run_id_a.to_owned(), + run_b: run_id_b.to_owned(), + aggregate_a, + aggregate_b, + deltas, + per_query, + }) +} + +/// Render a Markdown summary of `report`. Output is for human eyes +/// (saved to `runs_dir//report.md` by callers that want it) — +/// not a wire schema. Stable enough for snapshot tests. +pub fn render_report_md(report: &CompareReport) -> String { + let mut out = String::new(); + let _ = writeln!(out, "# Eval compare: `{}` vs `{}`", report.run_a, report.run_b); + let _ = writeln!(out); + let _ = writeln!(out, "## Aggregate deltas"); + let _ = writeln!(out); + let _ = writeln!(out, "| metric | a | b | Δ (b - a) |"); + let _ = writeln!(out, "|---|---|---|---|"); + let a = &report.aggregate_a; + let b = &report.aggregate_b; + for k in crate::metrics::TOP_K_VARIANTS { + let _ = writeln!( + out, + "| hit@{k} | {} | {} | {} |", + fmt(a.hit_at_k.get(k).copied().unwrap_or(f32::NAN)), + fmt(b.hit_at_k.get(k).copied().unwrap_or(f32::NAN)), + fmt_delta( + a.hit_at_k.get(k).copied().unwrap_or(f32::NAN), + b.hit_at_k.get(k).copied().unwrap_or(f32::NAN), + ), + ); + } + let _ = writeln!(out, "| MRR | {} | {} | {} |", fmt(a.mrr), fmt(b.mrr), fmt_delta(a.mrr, b.mrr)); + for k in crate::metrics::TOP_K_VARIANTS { + let _ = writeln!( + out, + "| recall@{k}_doc | {} | {} | {} |", + fmt(a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN)), + fmt(b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN)), + fmt_delta( + a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN), + b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN), + ), + ); + } + let _ = writeln!( + out, + "| citation_coverage | {} | {} | {} |", + fmt(a.citation_coverage), + fmt(b.citation_coverage), + fmt_delta(a.citation_coverage, b.citation_coverage), + ); + let _ = writeln!( + out, + "| groundedness | {} | {} | {} |", + fmt(a.groundedness), + fmt(b.groundedness), + fmt_delta(a.groundedness, b.groundedness), + ); + let _ = writeln!( + out, + "| empty_result_rate | {} | {} | {} |", + fmt(a.empty_result_rate), + fmt(b.empty_result_rate), + fmt_delta(a.empty_result_rate, b.empty_result_rate), + ); + let _ = writeln!( + out, + "| refusal_correctness | {} | {} | {} |", + fmt(a.refusal_correctness), + fmt(b.refusal_correctness), + fmt_delta(a.refusal_correctness, b.refusal_correctness), + ); + let _ = writeln!(out); + let _ = writeln!( + out, + "chunker_version_match: `{}`", + report + .deltas + .get("chunker_version_match") + .and_then(|v| v.as_str()) + .unwrap_or("?") + ); + let _ = writeln!(out); + + let wins: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Win).collect(); + let losses: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Loss).collect(); + let regressions: Vec<_> = report + .per_query + .iter() + .filter(|c| c.kind == ComparisonKind::Regression) + .collect(); + + let _ = writeln!( + out, + "## Wins ({}) / Losses ({}) / Regressions ({})", + wins.len(), + losses.len(), + regressions.len() + ); + let _ = writeln!(out); + let _ = writeln!(out, "| query_id | kind | rank_a | rank_b | note |"); + let _ = writeln!(out, "|---|---|---|---|---|"); + for c in &report.per_query { + let _ = writeln!( + out, + "| {} | {} | {} | {} | {} |", + c.query_id, + comparison_kind_label(c.kind), + c.a_hit_rank.map(|r| r.to_string()).unwrap_or_else(|| "—".into()), + c.b_hit_rank.map(|r| r.to_string()).unwrap_or_else(|| "—".into()), + c.note.as_deref().unwrap_or(""), + ); + } + out +} + +fn comparison_kind_label(k: ComparisonKind) -> &'static str { + match k { + ComparisonKind::Win => "win", + ComparisonKind::Loss => "loss", + ComparisonKind::Draw => "draw", + ComparisonKind::Regression => "regression", + } +} + +fn fmt(v: f32) -> String { + if v.is_nan() { + "—".into() + } else { + format!("{v:.4}") + } +} + +fn fmt_delta(a: f32, b: f32) -> String { + if a.is_nan() || b.is_nan() { + return "—".into(); + } + let d = b - a; + if d >= 0.0 { + format!("+{d:.4}") + } else { + format!("{d:.4}") + } +} + +/// Pull `chunker_version` out of a `config_snapshot_json` payload. The +/// runner writes `{"chunker_version": "", ...}`; missing or +/// malformed → `None`. Two `None`s compare as equal and route through +/// the "exact" matcher, but only the runner writes these snapshots +/// and it always emits `chunker_version` — so `None == None` can only +/// arise from a hand-edited DB or a pre-P5-1 fixture, both of which +/// are out-of-scope failure modes that the strict-mode flag covers. +fn extract_chunker_version(snapshot_json: &str) -> Option { + let v: serde_json::Value = serde_json::from_str(snapshot_json).ok()?; + v.get("chunker_version") + .and_then(|x| x.as_str()) + .map(|s| s.to_owned()) +} + +fn parse_results( + rows: &[kb_store_sqlite::EvalQueryResultRecord], +) -> Result> { + let mut out = HashMap::with_capacity(rows.len()); + for row in rows { + let qr: QueryResult = serde_json::from_str(&row.result_json) + .with_context(|| format!("parse result_json for {}", row.query_id))?; + out.insert(row.query_id.clone(), qr); + } + Ok(out) +} + +/// Find the top-ranked hit in `qr` whose `chunk_id` is in `expected` +/// (exact mode) or whose `doc_id` is in `expected_docs` (fallback). +fn first_hit_rank( + qr: &QueryResult, + expected_chunks: &[ChunkId], + expected_docs: &[DocumentId], + fallback_doc_only: bool, +) -> Option { + if !fallback_doc_only && !expected_chunks.is_empty() { + let exp: std::collections::HashSet<&ChunkId> = expected_chunks.iter().collect(); + return qr + .hits_top_k + .iter() + .filter(|h| exp.contains(&h.chunk_id)) + .map(|h| h.rank) + .min(); + } + if expected_docs.is_empty() { + return None; + } + let exp: std::collections::HashSet<&DocumentId> = expected_docs.iter().collect(); + qr.hits_top_k + .iter() + .filter(|h| exp.contains(&h.doc_id)) + .map(|h| h.rank) + .min() +} + +fn build_per_query( + qrs_a: &HashMap, + qrs_b: &HashMap, + golden: &HashMap<&str, &GoldenQuery>, + chunker_match_mode: &str, +) -> Vec { + let fallback = chunker_match_mode == "fallback_doc"; + let mut ids: Vec<&String> = qrs_a.keys().chain(qrs_b.keys()).collect(); + ids.sort(); + ids.dedup(); + + let mut out = Vec::with_capacity(ids.len()); + for id in ids { + let a = qrs_a.get(id); + let b = qrs_b.get(id); + let gq = golden.get(id.as_str()).copied(); + + let (a_rank, b_rank) = match gq { + Some(g) => ( + a.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)), + b.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)), + ), + None => (None, None), + }; + + let (kind, note) = classify(a_rank, b_rank, gq); + + out.push(QueryComparison { + query_id: id.clone(), + kind, + a_hit_rank: a_rank, + b_hit_rank: b_rank, + note, + }); + } + out +} + +fn classify( + a_rank: Option, + b_rank: Option, + gq: Option<&GoldenQuery>, +) -> (ComparisonKind, Option) { + match (a_rank, b_rank) { + (None, Some(_)) => (ComparisonKind::Win, None), + (Some(_), None) => { + // Hit → miss is a regression specifically when the query had + // an expected chunk to find. Without that, downgrade to Loss + // so refusal-flow queries (no expected_*) don't appear as + // regressions. + let has_expected = gq + .map(|g| !g.expected_chunk_ids.is_empty() || !g.expected_doc_ids.is_empty()) + .unwrap_or(false); + if has_expected { + (ComparisonKind::Regression, Some("hit→miss".into())) + } else { + (ComparisonKind::Loss, None) + } + } + (Some(ra), Some(rb)) if ra == rb => (ComparisonKind::Draw, None), + (Some(ra), Some(rb)) if rb < ra => (ComparisonKind::Win, Some(format!("rank {ra}→{rb}"))), + (Some(ra), Some(rb)) => (ComparisonKind::Loss, Some(format!("rank {ra}→{rb}"))), + (None, None) => (ComparisonKind::Draw, None), + } +} + +fn build_deltas( + a: &AggregateMetrics, + b: &AggregateMetrics, + chunker_match_mode: &str, +) -> serde_json::Value { + fn d(a: f32, b: f32) -> serde_json::Value { + if a.is_nan() || b.is_nan() { + serde_json::Value::Null + } else { + serde_json::Value::from((b - a) as f64) + } + } + let mut hit = serde_json::Map::new(); + let mut recall = serde_json::Map::new(); + for k in crate::metrics::TOP_K_VARIANTS { + hit.insert( + k.to_string(), + d( + a.hit_at_k.get(k).copied().unwrap_or(f32::NAN), + b.hit_at_k.get(k).copied().unwrap_or(f32::NAN), + ), + ); + recall.insert( + k.to_string(), + d( + a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN), + b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN), + ), + ); + } + serde_json::json!({ + "hit_at_k": hit, + "mrr": d(a.mrr, b.mrr), + "recall_at_k_doc": recall, + "citation_coverage": d(a.citation_coverage, b.citation_coverage), + "groundedness": d(a.groundedness, b.groundedness), + "empty_result_rate": d(a.empty_result_rate, b.empty_result_rate), + "refusal_correctness": d(a.refusal_correctness, b.refusal_correctness), + "chunker_version_match": chunker_match_mode, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn classify_win_loss_draw_regression() { + let g = GoldenQuery { + id: "q1".into(), + query: "q".into(), + lang: kb_core::Lang(String::new()), + expected_doc_ids: vec![], + expected_chunk_ids: vec![kb_core::ChunkId("c1".into())], + must_contain: vec![], + forbidden: vec![], + difficulty: None, + }; + let g = Some(&g); + // a miss, b hit → Win + assert_eq!(classify(None, Some(2), g).0, ComparisonKind::Win); + // a hit, b miss, has expected → Regression + assert_eq!(classify(Some(1), None, g).0, ComparisonKind::Regression); + // both same rank → Draw + assert_eq!(classify(Some(3), Some(3), g).0, ComparisonKind::Draw); + // b improved rank → Win + assert_eq!(classify(Some(5), Some(2), g).0, ComparisonKind::Win); + // b worse rank → Loss + assert_eq!(classify(Some(2), Some(5), g).0, ComparisonKind::Loss); + // both miss → Draw + assert_eq!(classify(None, None, g).0, ComparisonKind::Draw); + } + + #[test] + fn delta_null_when_either_nan() { + let a = AggregateMetrics { + hit_at_k: Default::default(), + mrr: 0.5, + recall_at_k_doc: Default::default(), + citation_coverage: f32::NAN, + groundedness: 0.0, + empty_result_rate: 0.0, + refusal_correctness: f32::NAN, + total_queries: 0, + failed_queries: 0, + }; + let b = AggregateMetrics { mrr: 0.75, ..a.clone() }; + let d = build_deltas(&a, &b, "exact"); + assert!(d["citation_coverage"].is_null()); + assert!(d["refusal_correctness"].is_null()); + assert!((d["mrr"].as_f64().unwrap() - 0.25).abs() < 1e-6); + assert_eq!(d["chunker_version_match"], "exact"); + } + + #[test] + fn extract_chunker_version_from_snapshot() { + let s = r#"{"config":{},"chunker_version":"slot@1"}"#; + assert_eq!(extract_chunker_version(s), Some("slot@1".into())); + assert_eq!(extract_chunker_version("not json"), None); + assert_eq!(extract_chunker_version("{}"), None); + } +} diff --git a/crates/kb-eval/src/lib.rs b/crates/kb-eval/src/lib.rs index 9825949..14cf89b 100644 --- a/crates/kb-eval/src/lib.rs +++ b/crates/kb-eval/src/lib.rs @@ -20,10 +20,20 @@ //! //! `run_id` uses UUIDv7 simple — timestamp-ordered, lowercase hex. +mod compare; mod loader; +mod metrics; mod runner; mod types; +pub use compare::{ + CompareOpts, CompareReport, ComparisonKind, QueryComparison, compare_runs, + compare_runs_with_config, render_report_md, +}; pub use loader::load_golden_set; +pub use metrics::{ + AggregateMetrics, TOP_K_VARIANTS, compute_aggregate, compute_aggregate_with_config, + store_aggregate, store_aggregate_with_config, +}; pub use runner::{run_eval, run_eval_with_config}; pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult}; diff --git a/crates/kb-eval/src/metrics.rs b/crates/kb-eval/src/metrics.rs new file mode 100644 index 0000000..e8056ef --- /dev/null +++ b/crates/kb-eval/src/metrics.rs @@ -0,0 +1,667 @@ +//! Aggregate metrics over a stored eval run (P5-2 — design §5.7). +//! +//! Reads `eval_query_results` rows for one `run_id`, re-loads the +//! golden YAML (so `expected_*` / `must_contain` / `forbidden` are at +//! hand), and produces an [`AggregateMetrics`]. [`store_aggregate`] +//! writes the JSON form back into `eval_runs.aggregate_json`. +//! +//! Pure computation — no `kb-app` / retrieval / embedding imports. + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use kb_config::Config; +use kb_core::{ChunkId, Citation, DocumentId}; +use kb_store_sqlite::SqliteStore; + +use crate::loader::load_golden_set; +use crate::types::{GoldenQuery, QueryResult}; + +/// `k` values reported in `hit@k` and `recall@k_doc`. Pinned by spec +/// (`tasks/p5/p5-2-metrics-compare.md`); a 4-element array keeps the +/// downstream `BTreeMap` keys stable across runs. +pub const TOP_K_VARIANTS: &[u32] = &[1, 3, 5, 10]; + +/// `MRR` floor: chunks ranked outside the top-10 contribute 0 to the +/// reciprocal sum (matches the spec — "0 if not found in top-10"). +const MRR_TOP: u32 = 10; + +/// Number of fractional digits aggregate metric values are rounded to +/// before storage / snapshot. Small enough that floating-point sum +/// drift across architectures cancels, large enough that genuine +/// differences (e.g., one extra hit out of ~50 queries) survive. +const STORAGE_DECIMALS: u32 = 4; + +/// Env var that overrides the default `fixtures/golden_queries.yaml` +/// path during metric computation. Must be the same path the runner +/// (P5-1) used — otherwise `expected_*` / `must_contain` won't line up +/// with the stored `query_id`s. `pub(crate)` so the runner shares the +/// exact same name + default rather than duplicating constants. +pub(crate) const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN"; + +/// Default golden YAML path (relative to CWD when set). Same +/// rationale as [`KB_EVAL_GOLDEN`] — single source of truth. +pub(crate) const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml"; + +/// Aggregate metrics for one stored eval run. +/// +/// The `f32` fields use a custom serializer that emits JSON `null` for +/// `NaN` (zero-denominator metrics). `BTreeMap` keys produce +/// stringified-integer JSON object keys, which is the standard +/// `serde_json` behavior — downstream comparisons / snapshots rely on +/// that ordering, hence `BTreeMap` (not `HashMap`). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AggregateMetrics { + pub hit_at_k: BTreeMap, + pub mrr: f32, + pub recall_at_k_doc: BTreeMap, + #[serde( + serialize_with = "serialize_f32_nan_as_null", + deserialize_with = "deserialize_f32_or_nan" + )] + pub citation_coverage: f32, + pub groundedness: f32, + pub empty_result_rate: f32, + #[serde( + serialize_with = "serialize_f32_nan_as_null", + deserialize_with = "deserialize_f32_or_nan" + )] + pub refusal_correctness: f32, + pub total_queries: u32, + pub failed_queries: u32, +} + +/// Custom serializer that maps `f32::NAN` to JSON `null`. Used on the +/// two fields whose denominator can legitimately be zero (no RAG +/// answers; no "should refuse" queries) — every other metric defaults +/// to `0.0` when the denominator is zero, since the corresponding +/// "this should be measured" set is always non-empty in practice. +fn serialize_f32_nan_as_null(v: &f32, s: S) -> std::result::Result { + if v.is_nan() { + s.serialize_none() + } else { + s.serialize_f32(*v) + } +} + +/// Inverse of [`serialize_f32_nan_as_null`]: JSON `null` → `f32::NAN`. +/// Lets `serde_json::from_str::` round-trip the +/// stored `aggregate_json`. +fn deserialize_f32_or_nan<'de, D: Deserializer<'de>>(d: D) -> std::result::Result { + let opt: Option = Option::deserialize(d)?; + Ok(opt.unwrap_or(f32::NAN)) +} + +/// Compute aggregate metrics for `run_id` against the active +/// XDG-loaded [`Config`]. Wraps [`compute_aggregate_with_config`] with +/// `Config::load(None)`. +pub fn compute_aggregate(run_id: &str) -> Result { + let cfg = Config::load(None).context("load Config for compute_aggregate")?; + compute_aggregate_with_config(&cfg, run_id) +} + +/// Compute aggregate metrics for `run_id` against an explicit +/// [`Config`] (used by tests with a TempDir-backed `data_dir`). +pub fn compute_aggregate_with_config(cfg: &Config, run_id: &str) -> Result { + let store = SqliteStore::open(cfg).context("open SqliteStore for compute_aggregate")?; + store + .run_migrations() + .context("run migrations for compute_aggregate")?; + if store + .load_eval_run(run_id) + .context("load eval_runs row")? + .is_none() + { + anyhow::bail!("compute_aggregate: no eval_runs row for run_id {run_id}"); + } + let rows = store + .load_eval_query_results(run_id) + .context("load eval_query_results")?; + let queries = load_golden_for_metrics()?; + aggregate_from_rows(&queries, &rows) +} + +/// Persist `agg` into `eval_runs.aggregate_json` for `run_id`. Wraps +/// [`store_aggregate_with_config`] with `Config::load(None)`. +pub fn store_aggregate(run_id: &str, agg: &AggregateMetrics) -> Result<()> { + let cfg = Config::load(None).context("load Config for store_aggregate")?; + store_aggregate_with_config(&cfg, run_id, agg) +} + +/// Persist `agg` into `eval_runs.aggregate_json` for `run_id` against +/// an explicit [`Config`]. +pub fn store_aggregate_with_config( + cfg: &Config, + run_id: &str, + agg: &AggregateMetrics, +) -> Result<()> { + let store = SqliteStore::open(cfg).context("open SqliteStore for store_aggregate")?; + store.run_migrations().context("run migrations")?; + let json = serde_json::to_string(agg).context("serialize AggregateMetrics")?; + store + .update_eval_run_aggregate(run_id, &json) + .with_context(|| format!("update eval_runs.aggregate_json for {run_id}"))?; + Ok(()) +} + +/// Resolve the golden YAML path for metric reload — same env override +/// the runner uses, same default path. Pulled into its own helper so +/// `compare_runs` can share it. +pub(crate) fn resolve_golden_path() -> PathBuf { + match std::env::var(KB_EVAL_GOLDEN) { + Ok(s) if !s.is_empty() => PathBuf::from(s), + _ => PathBuf::from(DEFAULT_GOLDEN_PATH), + } +} + +fn load_golden_for_metrics() -> Result> { + let path = resolve_golden_path(); + load_golden_set(&path).with_context(|| { + format!( + "load golden set from {} (override via KB_EVAL_GOLDEN)", + path.display() + ) + }) +} + +/// Pure computation kernel. Split out so unit tests can drive metrics +/// off hand-rolled `(GoldenQuery, QueryResult)` fixtures without +/// touching SQLite. No `&SqliteStore` parameter — the current metric +/// formulas don't need DB lookups; once `citation_coverage` graduates +/// to a per-citation `document_exists_by_path` probe (see deferral in +/// `tasks/p5/p5-2-metrics-compare.md`), this will need to take one. +pub(crate) fn aggregate_from_rows( + queries: &[GoldenQuery], + rows: &[kb_store_sqlite::EvalQueryResultRecord], +) -> Result { + let golden_by_id: HashMap<&str, &GoldenQuery> = + queries.iter().map(|q| (q.id.as_str(), q)).collect(); + + let total_queries = u32::try_from(rows.len()).unwrap_or(u32::MAX); + let mut failed_queries: u32 = 0; + + let mut hit_at_k: BTreeMap = + TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect(); + let mut recall_at_k_doc: BTreeMap = + TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect(); + + let mut mrr_sum: f64 = 0.0; + let mut mrr_denom: u32 = 0; + + let mut empty_result_count: u32 = 0; + + let mut groundedness_num: u32 = 0; + let mut groundedness_denom: u32 = 0; + + let mut citation_num: u32 = 0; + let mut citation_denom: u32 = 0; + + let mut refusal_num: u32 = 0; + let mut refusal_denom: u32 = 0; + + for row in rows { + let qr: QueryResult = serde_json::from_str(&row.result_json) + .with_context(|| format!("parse result_json for {}", row.query_id))?; + if qr.error.is_some() { + failed_queries += 1; + } + if qr.hits_top_k.is_empty() { + empty_result_count += 1; + } + + let Some(gq) = golden_by_id.get(qr.query_id.as_str()) else { + // Stored row has no golden entry — skip metric updates; + // the run still counts in `total_queries` so the run-vs- + // golden mismatch is auditable. + continue; + }; + + // hit@k + MRR (chunk-level, requires non-empty expected_chunk_ids) + if !gq.expected_chunk_ids.is_empty() { + let expected: HashSet<&ChunkId> = gq.expected_chunk_ids.iter().collect(); + let first_hit_rank = qr + .hits_top_k + .iter() + .filter(|h| expected.contains(&h.chunk_id)) + .map(|h| h.rank) + .min(); + for k in TOP_K_VARIANTS { + let entry = hit_at_k.get_mut(k).expect("init"); + entry.1 += 1; + if let Some(rank) = first_hit_rank + && rank <= *k + { + entry.0 += 1; + } + } + mrr_denom += 1; + if let Some(rank) = first_hit_rank + && rank <= MRR_TOP + { + mrr_sum += 1.0 / f64::from(rank); + } + } + + // recall@k_doc (doc-level, requires non-empty expected_doc_ids + // and `>0` is the "should retrieve" condition; refusal queries + // (`expected_doc_ids = []`) are excluded by spec). + if !gq.expected_doc_ids.is_empty() { + let expected_docs: HashSet<&DocumentId> = gq.expected_doc_ids.iter().collect(); + for k in TOP_K_VARIANTS { + let entry = recall_at_k_doc.get_mut(k).expect("init"); + entry.1 += 1; + let topk_docs: HashSet<&DocumentId> = qr + .hits_top_k + .iter() + .filter(|h| h.rank <= *k) + .map(|h| &h.doc_id) + .collect(); + let covered = expected_docs.iter().filter(|d| topk_docs.contains(*d)).count(); + let frac = covered as f64 / expected_docs.len() as f64; + entry.0 += frac; + } + } else { + // refusal_correctness: golden marks "should refuse" via empty + // expected_doc_ids. We can only judge this on RAG runs — a + // lexical-only run produces no Answer, so "refusal" is + // undefined. Excluding such queries from the denominator + // (rather than counting them as failures) keeps the metric + // honest: a search-only run reports refusal_correctness as + // NaN/null, not 0.0. + if let Some(ans) = &qr.answer { + refusal_denom += 1; + if !ans.grounded { + refusal_num += 1; + } + } + } + + // groundedness + citation_coverage (only meaningful with RAG + // answers; skip queries that errored or had no Answer). + if let Some(answer) = &qr.answer + && qr.error.is_none() + { + // Skip "no-check" goldens (both must_contain and forbidden + // empty) so an unconfigured golden entry doesn't get a free + // 1.0 / 0.0 split. Refusal-class queries land here too; + // their groundedness is judged via refusal_correctness. + if !gq.must_contain.is_empty() || !gq.forbidden.is_empty() { + groundedness_denom += 1; + let grounded_ok = gq.must_contain.iter().all(|s| answer.answer.contains(s)) + && !gq.forbidden.iter().any(|s| answer.answer.contains(s)); + if grounded_ok { + groundedness_num += 1; + } + } + // citation_coverage: denominator is grounded RAG answers + // (refusals don't drag it down). The spec calls for "every + // citation resolves to a real chunk in the DB"; the current + // implementation is intentionally weaker — see + // `tasks/p5/p5-2-metrics-compare.md` "Implementation + // deviations" for the deferral rationale. Today: an Answer + // counts as fully covered iff (a) it carries at least one + // citation (so empty-citations doesn't sneak through + // `Iterator::all`'s vacuous-true) and (b) every citation's + // path is non-empty. Tightening to a per-citation + // SqliteStore probe is the obvious next step once + // `document_exists_by_path` lands in `kb-store-sqlite`. + if answer.grounded { + citation_denom += 1; + let covered = !answer.citations.is_empty() + && answer.citations.iter().all(|c| match &c.citation { + Citation::Line { path, .. } + | Citation::Page { path, .. } + | Citation::Region { path, .. } + | Citation::Caption { path, .. } + | Citation::Time { path, .. } => !path.0.is_empty(), + }); + if covered { + citation_num += 1; + } + } + } + } + + Ok(AggregateMetrics { + hit_at_k: round_ratio_map(&hit_at_k), + mrr: round_storage(if mrr_denom == 0 { + 0.0 + } else { + mrr_sum / f64::from(mrr_denom) + }), + recall_at_k_doc: round_recall_map(&recall_at_k_doc), + citation_coverage: ratio_or_nan(citation_num, citation_denom), + groundedness: ratio_or_zero(groundedness_num, groundedness_denom), + empty_result_rate: ratio_or_zero(empty_result_count, total_queries), + refusal_correctness: ratio_or_nan(refusal_num, refusal_denom), + total_queries, + failed_queries, + }) +} + +fn round_storage(v: f64) -> f32 { + if v.is_nan() { + return f32::NAN; + } + let scale = 10_f64.powi(STORAGE_DECIMALS as i32); + ((v * scale).round() / scale) as f32 +} + +fn round_ratio_map(m: &BTreeMap) -> BTreeMap { + m.iter() + .map(|(k, (num, denom))| { + let v = if *denom == 0 { + 0.0 + } else { + f64::from(*num) / f64::from(*denom) + }; + (*k, round_storage(v)) + }) + .collect() +} + +fn round_recall_map(m: &BTreeMap) -> BTreeMap { + m.iter() + .map(|(k, (sum, denom))| { + let v = if *denom == 0 { + 0.0 + } else { + *sum / f64::from(*denom) + }; + (*k, round_storage(v)) + }) + .collect() +} + +fn ratio_or_nan(num: u32, denom: u32) -> f32 { + if denom == 0 { + f32::NAN + } else { + round_storage(f64::from(num) / f64::from(denom)) + } +} + +fn ratio_or_zero(num: u32, denom: u32) -> f32 { + if denom == 0 { + 0.0 + } else { + round_storage(f64::from(num) / f64::from(denom)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, SearchHit, + SearchMode, + }; + use kb_core::asset::WorkspacePath; + use kb_core::media::Lang; + use kb_core::answer::{Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, TokenUsage, TraceId}; + use kb_core::versions::PromptTemplateVersion; + use time::OffsetDateTime; + + fn gq(id: &str, expected_chunks: &[&str], expected_docs: &[&str]) -> GoldenQuery { + GoldenQuery { + id: id.into(), + query: format!("q-{id}"), + lang: Lang(String::new()), + expected_doc_ids: expected_docs.iter().map(|s| DocumentId((*s).into())).collect(), + expected_chunk_ids: expected_chunks.iter().map(|s| ChunkId((*s).into())).collect(), + must_contain: vec![], + forbidden: vec![], + difficulty: None, + } + } + + fn hit(rank: u32, chunk_id: &str, doc_id: &str) -> SearchHit { + SearchHit { + rank, + chunk_id: ChunkId(chunk_id.into()), + doc_id: DocumentId(doc_id.into()), + doc_path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(), + heading_path: vec!["root".into()], + section_label: None, + snippet: "s".into(), + citation: Citation::Line { + path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(), + start: 1, + end: 1, + section: None, + }, + retrieval: RetrievalDetail { + method: SearchMode::Lexical, + fusion_score: 1.0, + lexical_score: Some(1.0), + vector_score: None, + lexical_rank: Some(rank), + vector_rank: None, + }, + index_version: IndexVersion(format!("idx@{rank}")), + embedding_model: None, + chunker_version: ChunkerVersion("test@1".into()), + } + } + + fn qr(id: &str, hits: Vec, error: Option, answer: Option) -> QueryResult { + QueryResult { + query_id: id.into(), + query: format!("q-{id}"), + mode: SearchMode::Lexical, + hits_top_k: hits, + answer, + elapsed_ms: 1, + error, + } + } + + fn record(id: &str, hits: Vec, error: Option, answer: Option) + -> kb_store_sqlite::EvalQueryResultRecord + { + kb_store_sqlite::EvalQueryResultRecord { + query_id: id.into(), + result_json: serde_json::to_string(&qr(id, hits, error, answer)).unwrap(), + } + } + + fn answer(text: &str, grounded: bool, citation_paths: &[&str]) -> Answer { + Answer { + answer: text.into(), + citations: citation_paths.iter().map(|p| AnswerCitation { + marker: None, + citation: Citation::Line { + path: WorkspacePath::new((*p).into()).unwrap(), + start: 1, + end: 1, + section: None, + }, + }).collect(), + grounded, + refusal_reason: None, + model: ModelRef { id: "m".into(), provider: "p".into(), dimensions: None }, + embedding: None, + prompt_template_version: PromptTemplateVersion("p@1".into()), + retrieval: AnswerRetrievalSummary { + trace_id: TraceId("t".into()), + mode: SearchMode::Lexical, + k: 5, + score_gate: 0.0, + top_score: 1.0, + chunks_returned: 1, + chunks_used: 1, + }, + usage: TokenUsage { prompt_tokens: 1, completion_tokens: 1, latency_ms: 1 }, + created_at: OffsetDateTime::UNIX_EPOCH, + } + } + + #[test] + fn hit_at_k_handles_ranks_1_4_miss() { + // q1: hit @ rank 1, q2: hit @ rank 4, q3: miss + let queries = vec![ + gq("q1", &["c1"], &["d1"]), + gq("q2", &["c2"], &["d2"]), + gq("q3", &["c3"], &["d3"]), + ]; + let rows = vec![ + record("q1", vec![hit(1, "c1", "d1")], None, None), + record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None), + record("q3", vec![hit(1, "x", "y")], None, None), + ]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + // hit@1 = 1/3 (q1 only), hit@3 = 1/3, hit@5 = 2/3, hit@10 = 2/3 + assert_eq!(agg.hit_at_k[&1], 0.3333); + assert_eq!(agg.hit_at_k[&3], 0.3333); + assert_eq!(agg.hit_at_k[&5], 0.6667); + assert_eq!(agg.hit_at_k[&10], 0.6667); + } + + #[test] + fn mrr_matches_expected() { + // q1 rank 1 → 1/1, q2 rank 4 → 1/4, q3 miss → 0. mean = (1 + 0.25 + 0) / 3 ≈ 0.4167 + let queries = vec![ + gq("q1", &["c1"], &["d1"]), + gq("q2", &["c2"], &["d2"]), + gq("q3", &["c3"], &["d3"]), + ]; + let rows = vec![ + record("q1", vec![hit(1, "c1", "d1")], None, None), + record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None), + record("q3", vec![hit(1, "x", "y")], None, None), + ]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.mrr, 0.4167); + } + + #[test] + fn recall_at_k_doc_partial() { + // q1 expects {d1, d2}; top-3 returns {d1}. recall@3 = 0.5 + let queries = vec![gq("q1", &[], &["d1", "d2"])]; + let rows = vec![record("q1", vec![hit(1, "c1", "d1"), hit(2, "c2", "d3")], None, None)]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.recall_at_k_doc[&3], 0.5); + assert_eq!(agg.recall_at_k_doc[&10], 0.5); + } + + #[test] + fn citation_coverage_full_when_paths_resolve() { + let mut q = gq("q1", &[], &["d1"]); + q.must_contain = vec!["alpha".into()]; + let queries = vec![q]; + let ans = answer("contains alpha", true, &["docs/d1.md"]); + let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.citation_coverage, 1.0); + } + + #[test] + fn groundedness_false_when_forbidden_present() { + let mut q = gq("q1", &[], &["d1"]); + q.must_contain = vec!["alpha".into()]; + q.forbidden = vec!["beta".into()]; + let queries = vec![q]; + let ans = answer("alpha and beta", true, &["docs/d1.md"]); + let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.groundedness, 0.0); + } + + #[test] + fn refusal_correctness_one_when_should_refuse_and_did() { + let queries = vec![gq("q1", &[], &[])]; // expected_doc_ids empty → "should refuse" + let ans = answer("I cannot answer", false, &[]); + let rows = vec![record("q1", vec![], None, Some(ans))]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.refusal_correctness, 1.0); + } + + #[test] + fn refusal_correctness_nan_for_non_rag_run() { + // Even with a "should refuse" query, a lexical-only run has no + // Answer and so refusal cannot be judged → metric is NaN, not 0. + let queries = vec![gq("q1", &[], &[])]; + let rows = vec![record("q1", vec![], None, None)]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert!(agg.refusal_correctness.is_nan(), "got {}", agg.refusal_correctness); + } + + #[test] + fn citation_coverage_zero_when_answer_has_no_citations() { + // A grounded answer with empty citations[] used to count as + // covered via Iterator::all's vacuous-true; now must score 0. + let mut q = gq("q1", &[], &["d1"]); + q.must_contain = vec!["alpha".into()]; + let queries = vec![q]; + let ans = answer("contains alpha", true, &[]); + let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.citation_coverage, 0.0); + } + + #[test] + fn groundedness_skips_unconfigured_goldens() { + // A non-error RAG answer for a golden with neither must_contain + // nor forbidden must NOT score 1.0 by default — it should be + // excluded from the denominator entirely. Refusal-class + // queries are tracked via refusal_correctness instead. + let queries = vec![gq("q1", &["c1"], &["d1"])]; // no must_contain / forbidden + let ans = answer("anything", true, &["docs/d1.md"]); + let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + // denominator is 0 → ratio_or_zero returns 0.0 (not NaN, since + // groundedness isn't a NaN-flagged metric per spec). + assert_eq!(agg.groundedness, 0.0); + } + + #[test] + fn nan_metrics_serialize_as_null() { + // No RAG answers → citation_coverage NaN. No "should refuse" → refusal_correctness NaN. + let queries = vec![gq("q1", &["c1"], &["d1"])]; + let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + let json: serde_json::Value = serde_json::to_value(&agg).unwrap(); + assert!(json["citation_coverage"].is_null(), "expected null, got {:?}", json["citation_coverage"]); + assert!(json["refusal_correctness"].is_null(), "expected null, got {:?}", json["refusal_correctness"]); + } + + #[test] + fn determinism_two_runs_match() { + let queries = vec![gq("q1", &["c1"], &["d1"]), gq("q2", &["c2"], &["d2"])]; + let rows = vec![ + record("q1", vec![hit(1, "c1", "d1")], None, None), + record("q2", vec![hit(1, "x", "y"), hit(2, "c2", "d2")], None, None), + ]; + let a = aggregate_from_rows(&queries, &rows).unwrap(); + let b = aggregate_from_rows(&queries, &rows).unwrap(); + // NaN != NaN under PartialEq, but the JSON encoding maps NaN + // to null and is the actual storage form, so compare on that. + assert_eq!( + serde_json::to_string(&a).unwrap(), + serde_json::to_string(&b).unwrap() + ); + } + + #[test] + fn empty_result_rate_counts_zero_hits() { + let queries = vec![gq("q1", &["c1"], &["d1"]), gq("q2", &["c2"], &["d2"])]; + let rows = vec![ + record("q1", vec![], None, None), + record("q2", vec![hit(1, "c2", "d2")], None, None), + ]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.empty_result_rate, 0.5); + } + + #[test] + fn failed_queries_counted() { + let queries = vec![gq("q1", &["c1"], &["d1"])]; + let rows = vec![record("q1", vec![], Some("boom".into()), None)]; + let agg = aggregate_from_rows(&queries, &rows).unwrap(); + assert_eq!(agg.failed_queries, 1); + assert_eq!(agg.total_queries, 1); + } +} diff --git a/crates/kb-eval/src/runner.rs b/crates/kb-eval/src/runner.rs index 7916654..6e5f2ad 100644 --- a/crates/kb-eval/src/runner.rs +++ b/crates/kb-eval/src/runner.rs @@ -13,6 +13,7 @@ use kb_store_sqlite::{EvalRunRow, SqliteStore}; use time::OffsetDateTime; use crate::loader::{load_golden_set, validate_against_db}; +use crate::metrics::{DEFAULT_GOLDEN_PATH, KB_EVAL_GOLDEN}; use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult}; /// Convert a wall-clock duration since `start` into milliseconds clamped @@ -23,13 +24,6 @@ fn elapsed_ms_u32(start: Instant) -> u32 { start.elapsed().as_millis().min(u128::from(u32::MAX)) as u32 } -/// Env var that overrides the default `fixtures/golden_queries.yaml` -/// path. Resolved relative to the current working directory. -const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN"; - -/// Default golden YAML path (relative to CWD when set). -const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml"; - /// Run the golden suite end-to-end against the active XDG-loaded /// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with /// `Config::load(None)`. diff --git a/crates/kb-eval/tests/fixtures/eval/compare-1.json b/crates/kb-eval/tests/fixtures/eval/compare-1.json new file mode 100644 index 0000000..ee969ae --- /dev/null +++ b/crates/kb-eval/tests/fixtures/eval/compare-1.json @@ -0,0 +1,89 @@ +{ + "aggregate_a": { + "citation_coverage": null, + "empty_result_rate": 0.0, + "failed_queries": 0, + "groundedness": 0.0, + "hit_at_k": { + "1": 0.33329999446868896, + "10": 0.666700005531311, + "3": 0.33329999446868896, + "5": 0.666700005531311 + }, + "mrr": 0.41670000553131104, + "recall_at_k_doc": { + "1": 0.33329999446868896, + "10": 0.666700005531311, + "3": 0.33329999446868896, + "5": 0.666700005531311 + }, + "refusal_correctness": null, + "total_queries": 3 + }, + "aggregate_b": { + "citation_coverage": null, + "empty_result_rate": 0.0, + "failed_queries": 0, + "groundedness": 0.0, + "hit_at_k": { + "1": 0.666700005531311, + "10": 1.0, + "3": 1.0, + "5": 1.0 + }, + "mrr": 0.833299994468689, + "recall_at_k_doc": { + "1": 0.666700005531311, + "10": 1.0, + "3": 1.0, + "5": 1.0 + }, + "refusal_correctness": null, + "total_queries": 3 + }, + "deltas": { + "chunker_version_match": "exact", + "citation_coverage": null, + "empty_result_rate": 0.0, + "groundedness": 0.0, + "hit_at_k": { + "1": 0.33340001106262207, + "10": 0.33329999446868896, + "3": 0.666700005531311, + "5": 0.33329999446868896 + }, + "mrr": 0.41659998893737793, + "recall_at_k_doc": { + "1": 0.33340001106262207, + "10": 0.33329999446868896, + "3": 0.666700005531311, + "5": 0.33329999446868896 + }, + "refusal_correctness": null + }, + "per_query": [ + { + "a_hit_rank": 1, + "b_hit_rank": 2, + "kind": "loss", + "note": "rank 1→2", + "query_id": "q-001" + }, + { + "a_hit_rank": 4, + "b_hit_rank": 1, + "kind": "win", + "note": "rank 4→1", + "query_id": "q-002" + }, + { + "a_hit_rank": null, + "b_hit_rank": 1, + "kind": "win", + "note": null, + "query_id": "q-003" + } + ], + "run_a": "run_a", + "run_b": "run_b" +} diff --git a/crates/kb-eval/tests/metrics_and_compare.rs b/crates/kb-eval/tests/metrics_and_compare.rs new file mode 100644 index 0000000..9721df9 --- /dev/null +++ b/crates/kb-eval/tests/metrics_and_compare.rs @@ -0,0 +1,436 @@ +//! Integration tests for P5-2: write two synthetic eval runs into a +//! SQLite store, then drive `compute_aggregate` / `store_aggregate` / +//! `compare_runs` end-to-end. Mirrors the test plan in +//! `tasks/p5/p5-2-metrics-compare.md`. +//! +//! Snapshot of `CompareReport` JSON is pinned at +//! `tests/fixtures/eval/compare-1.json`. + +use std::fs; +use std::path::PathBuf; + +use kb_config::Config; +use kb_core::{ + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, Lang, + RetrievalDetail, SearchHit, SearchMode, + asset::WorkspacePath, +}; +use kb_eval::{ + AggregateMetrics, CompareOpts, CompareReport, ComparisonKind, GoldenQuery, QueryResult, + compare_runs_with_config, compute_aggregate_with_config, store_aggregate_with_config, +}; +use kb_store_sqlite::{EvalRunRow, SqliteStore}; +use tempfile::TempDir; +use time::OffsetDateTime; + +fn cfg_with_data_dir(tmp: &TempDir, golden_yaml: &str) -> Config { + let mut cfg = Config::defaults(); + cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + cfg.storage.runs_dir = tmp.path().join("runs").to_string_lossy().into_owned(); + cfg.storage.copy_threshold_mb = 0; + let golden_path = tmp.path().join("golden.yaml"); + fs::write(&golden_path, golden_yaml).unwrap(); + // Point both metrics + compare at the temp golden via env override. + // SAFELY scoped — `set_var` is process-global so callers serialise + // tests via the `serial_test`-style guard below. + unsafe { + std::env::set_var("KB_EVAL_GOLDEN", &golden_path); + } + cfg +} + +fn golden_yaml_basic() -> &'static str { + r#" +- id: q-001 + query: hit at rank 1 + expected_doc_ids: ["doc-1"] + expected_chunk_ids: ["chunk-1"] +- id: q-002 + query: hit at rank 4 + expected_doc_ids: ["doc-2"] + expected_chunk_ids: ["chunk-2"] +- id: q-003 + query: miss everywhere + expected_doc_ids: ["doc-3"] + expected_chunk_ids: ["chunk-3"] +"# +} + +fn hit(rank: u32, chunk_id: &str, doc_id: &str) -> SearchHit { + SearchHit { + rank, + chunk_id: ChunkId(chunk_id.into()), + doc_id: DocumentId(doc_id.into()), + doc_path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(), + heading_path: vec!["root".into()], + section_label: None, + snippet: "snip".into(), + citation: Citation::Line { + path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(), + start: 1, + end: 1, + section: None, + }, + retrieval: RetrievalDetail { + method: SearchMode::Lexical, + fusion_score: 1.0 / f32::from(u16::try_from(rank).unwrap_or(1)), + lexical_score: Some(1.0), + vector_score: None, + lexical_rank: Some(rank), + vector_rank: None, + }, + index_version: IndexVersion("idx@1".into()), + embedding_model: None, + chunker_version: ChunkerVersion("test@1".into()), + } +} + +fn qr(query_id: &str, hits: Vec) -> QueryResult { + QueryResult { + query_id: query_id.into(), + query: format!("query for {query_id}"), + mode: SearchMode::Lexical, + hits_top_k: hits, + answer: None, + elapsed_ms: 1, + error: None, + } +} + +fn write_run( + store: &SqliteStore, + run_id: &str, + chunker_version: &str, + created_at: OffsetDateTime, + queries: Vec, +) { + let snapshot = serde_json::json!({ + "config": {}, + "chunker_version": chunker_version, + }); + let snapshot_text = serde_json::to_string(&snapshot).unwrap(); + let row = EvalRunRow { + run_id, + suite: "golden", + config_snapshot_json: &snapshot_text, + aggregate_json: "{}", + commit_hash: Some("0000000"), + created_at, + }; + let results: Vec<(String, String)> = queries + .into_iter() + .map(|qr| { + let json = serde_json::to_string(&qr).unwrap(); + (qr.query_id, json) + }) + .collect(); + store.record_eval_run_with_results(&row, &results).unwrap(); +} + +/// Each test mutates a process-global env var (`KB_EVAL_GOLDEN`) and +/// expects to see its own write. Take this mutex around the body of +/// every test that touches `KB_EVAL_GOLDEN` so two concurrent test +/// threads don't trip over each other's golden YAML. +fn env_guard() -> std::sync::MutexGuard<'static, ()> { + use std::sync::{Mutex, OnceLock}; + static M: OnceLock> = OnceLock::new(); + M.get_or_init(|| Mutex::new(())) + .lock() + .unwrap_or_else(|e| e.into_inner()) +} + +#[test] +fn compute_and_store_aggregate_round_trips() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + write_run( + &store, + "run_a", + "test@1", + now, + vec![ + qr("q-001", vec![hit(1, "chunk-1", "doc-1")]), + qr( + "q-002", + vec![ + hit(1, "x", "x"), + hit(2, "x", "x"), + hit(3, "x", "x"), + hit(4, "chunk-2", "doc-2"), + ], + ), + qr("q-003", vec![hit(1, "x", "x")]), + ], + ); + drop(store); + + let agg = compute_aggregate_with_config(&cfg, "run_a").unwrap(); + // hit@1 = 1/3, hit@5 = 2/3, MRR = (1 + 0.25 + 0)/3 ≈ 0.4167. + assert_eq!(agg.hit_at_k[&1], 0.3333); + assert_eq!(agg.hit_at_k[&5], 0.6667); + assert_eq!(agg.mrr, 0.4167); + + store_aggregate_with_config(&cfg, "run_a", &agg).unwrap(); + let store = SqliteStore::open(&cfg).unwrap(); + let row = store.load_eval_run("run_a").unwrap().unwrap(); + let parsed: AggregateMetrics = serde_json::from_str(&row.aggregate_json).unwrap(); + // f32 round-trip via JSON is exact for our 4-decimal-rounded + // values, so direct equality is OK here (NaN fields are handled + // by the `serialize_f32_nan_as_null` round-trip — `citation_coverage` + // and `refusal_correctness` come back as NaN). Compare on JSON + // bytes instead, which is what `store_aggregate` writes. + assert_eq!( + serde_json::to_string(&parsed).unwrap(), + serde_json::to_string(&agg).unwrap() + ); +} + +#[test] +fn store_aggregate_rejects_missing_run() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let agg = AggregateMetrics { + hit_at_k: Default::default(), + mrr: 0.0, + recall_at_k_doc: Default::default(), + citation_coverage: f32::NAN, + groundedness: 0.0, + empty_result_rate: 0.0, + refusal_correctness: f32::NAN, + total_queries: 0, + failed_queries: 0, + }; + let err = store_aggregate_with_config(&cfg, "run_does_not_exist", &agg).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("run_does_not_exist"), "msg = {msg}"); +} + +#[test] +fn compare_runs_classifies_win_loss_draw_regression() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + // Run A: + // q-001 rank 1 → hit + // q-002 rank 4 → hit + // q-003 miss + write_run( + &store, + "run_a", + "test@1", + now, + vec![ + qr("q-001", vec![hit(1, "chunk-1", "doc-1")]), + qr( + "q-002", + vec![ + hit(1, "x", "x"), + hit(2, "x", "x"), + hit(3, "x", "x"), + hit(4, "chunk-2", "doc-2"), + ], + ), + qr("q-003", vec![hit(1, "x", "x")]), + ], + ); + // Run B: + // q-001 rank 2 → still hit (Loss vs A — worse rank) + // q-002 rank 1 → hit (Win — improved rank) + // q-003 hit @ rank 1 → hit (Win — was miss in A) + write_run( + &store, + "run_b", + "test@1", + now, + vec![ + qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]), + qr("q-002", vec![hit(1, "chunk-2", "doc-2")]), + qr("q-003", vec![hit(1, "chunk-3", "doc-3")]), + ], + ); + drop(store); + + let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); + let by_id: std::collections::HashMap<&str, &kb_eval::QueryComparison> = + report.per_query.iter().map(|c| (c.query_id.as_str(), c)).collect(); + assert_eq!(by_id["q-001"].kind, ComparisonKind::Loss); + assert_eq!(by_id["q-002"].kind, ComparisonKind::Win); + assert_eq!(by_id["q-003"].kind, ComparisonKind::Win); + assert_eq!(report.deltas["chunker_version_match"], "exact"); +} + +#[test] +fn compare_strict_mode_refuses_chunker_version_mismatch() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); + write_run(&store, "run_b", "test@2", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); + drop(store); + + let opts = CompareOpts { + strict_chunker_version: true, + }; + let err = compare_runs_with_config(&cfg, "run_a", "run_b", &opts).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("chunker_version mismatch"), "msg = {msg}"); +} + +#[test] +fn compare_graceful_falls_back_to_doc_id() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + // Run A uses test@1 chunker; run B uses test@2 — chunk_ids no longer + // align, but doc_ids do. + write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); + write_run( + &store, + "run_b", + "test@2", + now, + // Different chunk_id, same doc_id → exact-mode matching would + // miss; doc-id fallback should still register a hit. + vec![qr("q-001", vec![hit(1, "chunk-1-renamed", "doc-1")])], + ); + drop(store); + + let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); + assert_eq!(report.deltas["chunker_version_match"], "fallback_doc"); + let q1 = report.per_query.iter().find(|c| c.query_id == "q-001").unwrap(); + // Both runs hit doc-1 at rank 1 → Draw. + assert_eq!(q1.kind, ComparisonKind::Draw); + assert_eq!(q1.a_hit_rank, Some(1)); + assert_eq!(q1.b_hit_rank, Some(1)); +} + +#[test] +fn compare_report_snapshot_matches_fixture() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + write_run( + &store, + "run_a", + "test@1", + now, + vec![ + qr("q-001", vec![hit(1, "chunk-1", "doc-1")]), + qr( + "q-002", + vec![ + hit(1, "x", "x"), + hit(2, "x", "x"), + hit(3, "x", "x"), + hit(4, "chunk-2", "doc-2"), + ], + ), + qr("q-003", vec![hit(1, "x", "x")]), + ], + ); + write_run( + &store, + "run_b", + "test@1", + now, + vec![ + qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]), + qr("q-002", vec![hit(1, "chunk-2", "doc-2")]), + qr("q-003", vec![hit(1, "chunk-3", "doc-3")]), + ], + ); + drop(store); + + let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); + let actual = projection(&report); + let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("eval") + .join("compare-1.json"); + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + fs::write(&fixture, format!("{}\n", serde_json::to_string_pretty(&actual).unwrap())) + .unwrap(); + } + let expected_text = fs::read_to_string(&fixture) + .unwrap_or_else(|e| panic!("missing fixture {}: {e}", fixture.display())); + let expected: serde_json::Value = serde_json::from_str(&expected_text).unwrap(); + assert_eq!(actual, expected, "compare report drift — re-run with UPDATE_SNAPSHOTS=1 if intended"); +} + +/// Project a `CompareReport` to the stable-across-runs subset. +/// `aggregate_*` and `deltas` are deterministic; per-query rows keep +/// only `(query_id, kind, ranks, note)` and discard volatile fields. +fn projection(r: &CompareReport) -> serde_json::Value { + serde_json::json!({ + "run_a": r.run_a, + "run_b": r.run_b, + "aggregate_a": r.aggregate_a, + "aggregate_b": r.aggregate_b, + "deltas": r.deltas, + "per_query": r.per_query, + }) +} + +#[test] +fn render_report_md_is_human_readable() { + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic()); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + let now = OffsetDateTime::UNIX_EPOCH; + write_run( + &store, + "run_a", + "test@1", + now, + vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])], + ); + write_run( + &store, + "run_b", + "test@1", + now, + vec![qr("q-001", vec![hit(2, "chunk-1", "doc-1")])], + ); + drop(store); + + let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); + let md = kb_eval::render_report_md(&report); + assert!(md.starts_with("# Eval compare:"), "md = {md}"); + assert!(md.contains("hit@1")); + assert!(md.contains("MRR")); + assert!(md.contains("Wins")); + assert!(md.contains("q-001")); +} + +#[test] +fn lang_default_is_used_when_omitted_in_yaml() { + // Round-trip safety: GoldenQuery without `lang` should parse fine. + let yaml = "- id: only\n query: q\n"; + let _g = env_guard(); + let tmp = TempDir::new().unwrap(); + let golden = tmp.path().join("g.yaml"); + fs::write(&golden, yaml).unwrap(); + let qs: Vec = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(qs.len(), 1); + assert_eq!(qs[0].lang, Lang(String::new())); +} diff --git a/crates/kb-store-sqlite/src/eval.rs b/crates/kb-store-sqlite/src/eval.rs index d0bb272..34a29b0 100644 --- a/crates/kb-store-sqlite/src/eval.rs +++ b/crates/kb-store-sqlite/src/eval.rs @@ -16,7 +16,8 @@ use crate::store::SqliteStore; /// One row about to land in `eval_runs` (per V001 schema). /// /// `aggregate_json` is filled by P5-1 with the literal `"{}"` — -/// metric computation lives in P5-2 and updates the row in place. +/// metric computation lives in P5-2 and updates the row in place via +/// [`SqliteStore::update_eval_run_aggregate`]. #[derive(Clone, Debug)] pub struct EvalRunRow<'a> { pub run_id: &'a str, @@ -27,6 +28,28 @@ pub struct EvalRunRow<'a> { pub created_at: OffsetDateTime, } +/// Owned mirror of a row read from `eval_runs`. Used by P5-2's +/// `compute_aggregate` / `compare_runs` since [`EvalRunRow`] borrows +/// from the writer's input buffers. +#[derive(Clone, Debug, PartialEq)] +pub struct EvalRunRecord { + pub run_id: String, + pub suite: String, + pub config_snapshot_json: String, + pub aggregate_json: String, + pub commit_hash: Option, + pub created_at: OffsetDateTime, +} + +/// Owned per-query row read from `eval_query_results`. The +/// `result_json` is the same `serde_json::to_string(&QueryResult)` +/// payload [`SqliteStore::record_eval_query_result`] wrote. +#[derive(Clone, Debug, PartialEq)] +pub struct EvalQueryResultRecord { + pub query_id: String, + pub result_json: String, +} + impl SqliteStore { /// Return `true` iff a row with `doc_id = ?` exists in /// `documents`. Lightweight existence probe used by @@ -158,4 +181,98 @@ impl SqliteStore { tx.commit().map_err(StoreError::from)?; Ok(()) } + + /// Load a single `eval_runs` row by `run_id`. Returns `None` if no + /// row matches. Used by P5-2's `compute_aggregate` / `compare_runs` + /// to fetch the run-level metadata (config snapshot, aggregate, + /// commit, created_at). + pub fn load_eval_run(&self, run_id: &str) -> Result> { + let conn = self.lock_conn(); + let row = conn.query_row( + "SELECT run_id, suite, config_snapshot_json, aggregate_json, + commit_hash, created_at + FROM eval_runs WHERE run_id = ?", + params![run_id], + |r| { + Ok(( + r.get::<_, String>(0)?, + r.get::<_, String>(1)?, + r.get::<_, String>(2)?, + r.get::<_, String>(3)?, + r.get::<_, Option>(4)?, + r.get::<_, String>(5)?, + )) + }, + ); + let (run_id, suite, snapshot, aggregate, commit, created_str) = match row { + Ok(t) => t, + Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None), + Err(e) => return Err(StoreError::from(e).into()), + }; + let created_at = OffsetDateTime::parse( + &created_str, + &time::format_description::well_known::Rfc3339, + ) + .with_context(|| format!("parse eval_runs.created_at for {run_id}"))?; + Ok(Some(EvalRunRecord { + run_id, + suite, + config_snapshot_json: snapshot, + aggregate_json: aggregate, + commit_hash: commit, + created_at, + })) + } + + /// Load every `eval_query_results` row for one `run_id`, ordered by + /// `query_id` ASC for determinism (the table has no insertion-order + /// column; query_id ordering matches the BTreeSet sort the loader + /// uses for missing-id reporting). + pub fn load_eval_query_results( + &self, + run_id: &str, + ) -> Result> { + let conn = self.lock_conn(); + let mut stmt = conn + .prepare( + "SELECT query_id, result_json FROM eval_query_results + WHERE run_id = ? ORDER BY query_id ASC", + ) + .map_err(StoreError::from)?; + let iter = stmt + .query_map(params![run_id], |r| { + Ok(EvalQueryResultRecord { + query_id: r.get::<_, String>(0)?, + result_json: r.get::<_, String>(1)?, + }) + }) + .map_err(StoreError::from)?; + let mut out = Vec::new(); + for row in iter { + out.push(row.map_err(StoreError::from)?); + } + Ok(out) + } + + /// Replace `eval_runs.aggregate_json` for one `run_id`. Returns + /// `Err` (not `Ok(0)`) if the run is missing — we never want to + /// silently drop computed metrics. Called once per run by + /// P5-2's `store_aggregate`. + pub fn update_eval_run_aggregate( + &self, + run_id: &str, + aggregate_json: &str, + ) -> Result<()> { + let conn = self.lock_conn(); + let updated = conn + .execute( + "UPDATE eval_runs SET aggregate_json = ? WHERE run_id = ?", + params![aggregate_json, run_id], + ) + .map_err(StoreError::from)?; + if updated == 0 { + anyhow::bail!("update_eval_run_aggregate: no row for run_id {run_id}"); + } + Ok(()) + } } diff --git a/crates/kb-store-sqlite/src/lib.rs b/crates/kb-store-sqlite/src/lib.rs index 09e2b20..640e5b9 100644 --- a/crates/kb-store-sqlite/src/lib.rs +++ b/crates/kb-store-sqlite/src/lib.rs @@ -30,7 +30,7 @@ mod store; pub use embeddings::EmbeddingRecordRow; pub use error::StoreError; -pub use eval::EvalRunRow; +pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow}; pub use fts::rebuild_chunks_fts; pub use jobs::IngestRunRow; pub use store::SqliteStore; diff --git a/tasks/p5/p5-2-metrics-compare.md b/tasks/p5/p5-2-metrics-compare.md index 67e0da1..cc6e6e2 100644 --- a/tasks/p5/p5-2-metrics-compare.md +++ b/tasks/p5/p5-2-metrics-compare.md @@ -3,7 +3,7 @@ phase: P5 component: kb-eval (metrics + compare) task_id: p5-2 title: "Metrics computation + compare report" -status: planned +status: completed depends_on: [p5-1] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md @@ -150,3 +150,62 @@ All tests under `cargo test -p kb-eval metrics`. - Floating-point sums in MRR cause minor cross-platform drift; round to 4 decimals on storage to keep snapshots stable. - "Should refuse" queries are encoded as `expected_doc_ids: []`. Document this convention in the golden YAML header comment. - Chunker version drift across runs is the COMMON case, not the error case (you almost always re-chunk before evaluating a chunker change). Default behavior is graceful fallback (doc + span overlap); only `--strict-chunker-version` refuses. The `chunker_version_match` field in `CompareReport.deltas` makes the mode auditable, so silent miscompares are still impossible. + +## Implementation deviations (intentional) + +Recorded so reviewers don't trip on them; the runtime behavior is the +same one this spec defines, the names / wiring just differ. + +- **Graceful fallback is doc-id-only, not doc + 50% span overlap.** The + `chunker_version_match` audit field is `"fallback_doc"` (not + `"fallback_doc_span"`). Span-overlap requires reading both runs' + `chunks.source_spans` simultaneously — but a chunker-version change + in practice re-indexes (overwrites) the chunks table, so by the time + you compute run B the run A chunk rows are already gone. Doc-id + matching is the strongest stable criterion under that workflow. + Span-overlap moves to a future phase that owns chunker-version + archival. +- **Helper signatures.** `compute_aggregate_with_config(cfg, run_id)` / + `store_aggregate_with_config(cfg, run_id, agg)` / + `compare_runs_with_config(cfg, a, b, opts)` exist alongside the + spec-pinned `compute_aggregate(run_id)` / `store_aggregate(run_id, agg)` + / `compare_runs(a, b)` so integration tests can drive the pipeline + against a TempDir-backed `Config`. The no-arg forms wrap them with + `Config::load(None)`. +- **CLI surface lives on `kb-cli` directly, not via `kb-app`.** DoD + asks for `kb eval compare` to be reached "via kb-app", but `kb-app` + already depends on `kb-eval` (the P5-1 runner uses the App facade), + so routing the CLI through `kb-app` would form a cycle. `kb-cli` → + `kb-eval` is wired directly; `kb-app` is unchanged. +- **`AggregateMetrics` is `Serialize + Deserialize`.** The spec defines + only the field shape; we add `Deserialize` so the stored + `aggregate_json` can round-trip back into the type for follow-up + computations. +- **`anyhow`** is used in `Result` returns since the rest of the + workspace already speaks anyhow; not in the spec's Allowed list but + matches every other crate. +- **`kb-eval` crate-level `kb-app` dep stays.** The crate already + depends on `kb-app` from P5-1 (the runner uses the `App` facade), so + the Cargo.toml entry remains. The new modules (`metrics.rs`, + `compare.rs`) do not import `kb-app` themselves — they're behind the + same crate boundary as the runner, but the metric/compare *surface* + is `kb-app`-clean. Splitting the crate to avoid a transitive Cargo + edge would be churn for no behavior gain. +- **`citation_coverage` is intentionally weaker than the spec literal.** + Spec calls for "every citation resolves to a real chunk in the DB". + Current implementation: an Answer counts as fully covered iff it has + ≥1 citation AND every citation's path is non-empty. Tightening to a + per-citation `document_exists_by_path` SqliteStore probe is the next + step once that helper lands. Empty-citations no longer pass through + `Iterator::all`'s vacuous-true. +- **`refusal_correctness` is undefined for non-RAG runs.** The metric + judges whether the system *refused*; without an `Answer` (lexical- + only or vector-only run), there's nothing to judge. We exclude such + queries from the denominator rather than auto-failing them, so a + search-only run reports `refusal_correctness` as `null` instead of a + misleading 0.0. +- **`groundedness` skips queries with no `must_contain`/`forbidden`.** + An unconfigured golden entry would otherwise score a free 1.0 (or + 0.0 if the answer happens to contain a forbidden string from a + later spec change). Refusal-class queries are also excluded — their + groundedness flows through `refusal_correctness`.