Merge pull request 'feat(p5-2): kb-eval metrics + compare — AggregateMetrics, CompareReport, kb eval CLI' (#28) from feat/p5-2-metrics-compare into main
Reviewed-on: altair823-org/kb#28
This commit was merged in pull request #28.
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -3422,6 +3422,7 @@ dependencies = [
|
||||
"kb-app",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"kb-eval",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
|
||||
@@ -15,6 +15,14 @@ path = "src/main.rs"
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
kb-app = { path = "../kb-app" }
|
||||
# kb-eval re-exports `compute_aggregate` / `compare_runs` /
|
||||
# `render_report_md` (P5-2). The DoD calls for these to be reached
|
||||
# "via kb-app", but kb-eval already depends on kb-app (P5-1 runner
|
||||
# uses the App facade) — routing the CLI through kb-app would
|
||||
# require kb-app → kb-eval, forming a cycle. We therefore wire
|
||||
# kb-cli → kb-eval directly; documented in
|
||||
# `tasks/p5/p5-2-metrics-compare.md`.
|
||||
kb-eval = { path = "../kb-eval" }
|
||||
anyhow = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
|
||||
@@ -125,10 +125,41 @@ enum InspectWhat {
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum EvalWhat {
|
||||
/// Run an eval suite (placeholder for P9).
|
||||
/// Run the golden suite end-to-end and persist `eval_runs` +
|
||||
/// `eval_query_results` + `runs_dir/<run_id>/per_query.jsonl`
|
||||
/// (P5-1).
|
||||
Run {
|
||||
#[arg(long, default_value = "golden")]
|
||||
suite: String,
|
||||
#[arg(long, value_enum, default_value_t = ModeFlag::Lexical)]
|
||||
mode: ModeFlag,
|
||||
#[arg(long, default_value_t = 10)]
|
||||
k: usize,
|
||||
#[arg(long)]
|
||||
suite: Option<String>,
|
||||
with_rag: bool,
|
||||
#[arg(long)]
|
||||
temperature: Option<f32>,
|
||||
#[arg(long)]
|
||||
seed: Option<u64>,
|
||||
},
|
||||
|
||||
/// Compute aggregate metrics for a stored run and write them back
|
||||
/// into `eval_runs.aggregate_json` (P5-2).
|
||||
Aggregate { run_id: String },
|
||||
|
||||
/// Diff two stored runs (P5-2). Default output is a Markdown
|
||||
/// summary; use `--json` (top-level flag) for the raw report.
|
||||
Compare {
|
||||
run_a: String,
|
||||
run_b: String,
|
||||
/// Refuse to compare when the two runs' `chunker_version`
|
||||
/// differ (default is graceful doc-id fallback).
|
||||
#[arg(long)]
|
||||
strict_chunker_version: bool,
|
||||
/// Also write the Markdown report to
|
||||
/// `runs_dir/<run_b>/report.md`.
|
||||
#[arg(long)]
|
||||
write_report: bool,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -370,8 +401,81 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
Cmd::Eval { what } => match what {
|
||||
EvalWhat::Run { suite: _ } => {
|
||||
anyhow::bail!("not yet wired (P9-3)")
|
||||
EvalWhat::Run {
|
||||
suite,
|
||||
mode,
|
||||
k,
|
||||
with_rag,
|
||||
temperature,
|
||||
seed,
|
||||
} => {
|
||||
let opts = kb_eval::EvalRunOpts {
|
||||
suite: suite.clone(),
|
||||
mode: (*mode).into(),
|
||||
with_rag: *with_rag,
|
||||
k: *k,
|
||||
temperature: *temperature,
|
||||
seed: *seed,
|
||||
};
|
||||
let run = kb_eval::run_eval(&opts)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string_pretty(&run)?);
|
||||
} else {
|
||||
println!("run_id: {}", run.run_id);
|
||||
println!("queries: {}", run.per_query.len());
|
||||
let failed = run.per_query.iter().filter(|q| q.error.is_some()).count();
|
||||
println!("failed: {failed}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
EvalWhat::Aggregate { run_id } => {
|
||||
let agg = kb_eval::compute_aggregate(run_id)?;
|
||||
kb_eval::store_aggregate(run_id, &agg)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string_pretty(&agg)?);
|
||||
} else {
|
||||
println!("run_id: {run_id}");
|
||||
println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries);
|
||||
println!("hit@1: {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0));
|
||||
println!("hit@5: {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0));
|
||||
println!("MRR: {:.4}", agg.mrr);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
EvalWhat::Compare {
|
||||
run_a,
|
||||
run_b,
|
||||
strict_chunker_version,
|
||||
write_report,
|
||||
} => {
|
||||
let cfg = kb_config::Config::load(None)?;
|
||||
let opts = kb_eval::CompareOpts {
|
||||
strict_chunker_version: *strict_chunker_version,
|
||||
};
|
||||
let report = kb_eval::compare_runs_with_config(&cfg, run_a, run_b, &opts)?;
|
||||
let md = kb_eval::render_report_md(&report);
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string_pretty(&report)?);
|
||||
} else {
|
||||
print!("{md}");
|
||||
}
|
||||
if *write_report {
|
||||
let resolved_data_dir = kb_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let runs_dir = kb_config::expand_path(
|
||||
&cfg.storage.runs_dir,
|
||||
&resolved_data_dir.to_string_lossy(),
|
||||
);
|
||||
let dir = runs_dir.join(run_b);
|
||||
std::fs::create_dir_all(&dir)?;
|
||||
let path = dir.join("report.md");
|
||||
std::fs::write(&path, &md)?;
|
||||
if !cli.json {
|
||||
eprintln!("wrote {}", path.display());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
509
crates/kb-eval/src/compare.rs
Normal file
509
crates/kb-eval/src/compare.rs
Normal file
@@ -0,0 +1,509 @@
|
||||
//! Compare two eval runs (P5-2 — design §5.7, phase epic
|
||||
//! `tasks/phase-5-evaluation.md`).
|
||||
//!
|
||||
//! Reads `eval_runs` + `eval_query_results` for two `run_id`s, calls
|
||||
//! [`crate::metrics::compute_aggregate_with_config`] for each, then
|
||||
//! diffs them per-query. Emits a [`CompareReport`] (machine) and an
|
||||
//! optional Markdown render (human).
|
||||
//!
|
||||
//! Pure computation — no `kb-app` / retrieval imports.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Write as _;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{ChunkId, DocumentId};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
|
||||
use crate::loader::load_golden_set;
|
||||
use crate::metrics::{
|
||||
AggregateMetrics, compute_aggregate_with_config, resolve_golden_path,
|
||||
};
|
||||
use crate::types::{GoldenQuery, QueryResult};
|
||||
|
||||
/// Strict-mode behavior pivot used by [`CompareOpts::strict_chunker_version`].
|
||||
/// When `false` (default) and the two runs' `chunker_version` differ,
|
||||
/// per-query matching falls back to doc-id-only comparison and the
|
||||
/// report's `deltas.chunker_version_match` field is set to
|
||||
/// `"fallback_doc"`.
|
||||
///
|
||||
/// **Spec deviation (intentional, documented):** the spec called for a
|
||||
/// `"fallback_doc_span"` mode that augments doc-id matching with a 50%
|
||||
/// `source_spans` overlap criterion. That requires `chunks` table
|
||||
/// reads from both runs simultaneously — but in practice you re-index
|
||||
/// (and overwrite the chunks table) before evaluating a chunker
|
||||
/// change, so the run-A chunks are gone by the time run-B is computed.
|
||||
/// We log the simpler doc-id-only fallback as `"fallback_doc"` and
|
||||
/// defer span-overlap matching to a future phase that owns
|
||||
/// chunker-version archival. The `strict-chunker-version` flag is
|
||||
/// preserved verbatim from the spec.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CompareOpts {
|
||||
pub strict_chunker_version: bool,
|
||||
}
|
||||
|
||||
/// Per-metric + per-query diff between two stored eval runs.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CompareReport {
|
||||
pub run_a: String,
|
||||
pub run_b: String,
|
||||
pub aggregate_a: AggregateMetrics,
|
||||
pub aggregate_b: AggregateMetrics,
|
||||
/// Per-metric delta (`b - a`) plus the `chunker_version_match`
|
||||
/// audit field. JSON object so consumers can pluck individual
|
||||
/// metrics by name without keeping the struct shape in sync.
|
||||
pub deltas: serde_json::Value,
|
||||
pub per_query: Vec<QueryComparison>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct QueryComparison {
|
||||
pub query_id: String,
|
||||
pub kind: ComparisonKind,
|
||||
pub a_hit_rank: Option<u32>,
|
||||
pub b_hit_rank: Option<u32>,
|
||||
pub note: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ComparisonKind {
|
||||
Win,
|
||||
Loss,
|
||||
Draw,
|
||||
Regression,
|
||||
}
|
||||
|
||||
/// Compare two runs using the active XDG-loaded [`Config`]. Wraps
|
||||
/// [`compare_runs_with_config`] with `Config::load(None)`.
|
||||
pub fn compare_runs(run_id_a: &str, run_id_b: &str) -> Result<CompareReport> {
|
||||
let cfg = Config::load(None).context("load Config for compare_runs")?;
|
||||
compare_runs_with_config(&cfg, run_id_a, run_id_b, &CompareOpts::default())
|
||||
}
|
||||
|
||||
/// Compare two runs against an explicit [`Config`] + [`CompareOpts`].
|
||||
/// Used by integration tests and the future `kb eval compare --strict`
|
||||
/// CLI surface.
|
||||
pub fn compare_runs_with_config(
|
||||
cfg: &Config,
|
||||
run_id_a: &str,
|
||||
run_id_b: &str,
|
||||
opts: &CompareOpts,
|
||||
) -> Result<CompareReport> {
|
||||
let store = SqliteStore::open(cfg).context("open SqliteStore for compare_runs")?;
|
||||
store.run_migrations().context("run migrations")?;
|
||||
|
||||
// Pull both run rows up-front so we can extract chunker_version and
|
||||
// bail early on a missing run before doing any metric work.
|
||||
let run_a = store
|
||||
.load_eval_run(run_id_a)
|
||||
.context("load eval_runs row A")?
|
||||
.ok_or_else(|| anyhow::anyhow!("compare_runs: no eval_runs row for run_id {run_id_a}"))?;
|
||||
let run_b = store
|
||||
.load_eval_run(run_id_b)
|
||||
.context("load eval_runs row B")?
|
||||
.ok_or_else(|| anyhow::anyhow!("compare_runs: no eval_runs row for run_id {run_id_b}"))?;
|
||||
|
||||
let aggregate_a = compute_aggregate_with_config(cfg, run_id_a)?;
|
||||
let aggregate_b = compute_aggregate_with_config(cfg, run_id_b)?;
|
||||
|
||||
let chunker_a = extract_chunker_version(&run_a.config_snapshot_json);
|
||||
let chunker_b = extract_chunker_version(&run_b.config_snapshot_json);
|
||||
let chunker_match_mode = if chunker_a == chunker_b {
|
||||
"exact"
|
||||
} else if opts.strict_chunker_version {
|
||||
anyhow::bail!(
|
||||
"compare_runs: chunker_version mismatch (a={chunker_a:?}, b={chunker_b:?}) and \
|
||||
strict_chunker_version=true. Pass strict_chunker_version=false to use the doc-id \
|
||||
fallback."
|
||||
);
|
||||
} else {
|
||||
"fallback_doc"
|
||||
};
|
||||
|
||||
let rows_a = store.load_eval_query_results(run_id_a)?;
|
||||
let rows_b = store.load_eval_query_results(run_id_b)?;
|
||||
let qrs_a = parse_results(&rows_a)?;
|
||||
let qrs_b = parse_results(&rows_b)?;
|
||||
|
||||
let golden = load_golden_set(&resolve_golden_path()).context("reload golden set")?;
|
||||
let golden_by_id: HashMap<&str, &GoldenQuery> =
|
||||
golden.iter().map(|q| (q.id.as_str(), q)).collect();
|
||||
|
||||
let per_query = build_per_query(&qrs_a, &qrs_b, &golden_by_id, chunker_match_mode);
|
||||
let deltas = build_deltas(&aggregate_a, &aggregate_b, chunker_match_mode);
|
||||
|
||||
Ok(CompareReport {
|
||||
run_a: run_id_a.to_owned(),
|
||||
run_b: run_id_b.to_owned(),
|
||||
aggregate_a,
|
||||
aggregate_b,
|
||||
deltas,
|
||||
per_query,
|
||||
})
|
||||
}
|
||||
|
||||
/// Render a Markdown summary of `report`. Output is for human eyes
|
||||
/// (saved to `runs_dir/<run_b>/report.md` by callers that want it) —
|
||||
/// not a wire schema. Stable enough for snapshot tests.
|
||||
pub fn render_report_md(report: &CompareReport) -> String {
|
||||
let mut out = String::new();
|
||||
let _ = writeln!(out, "# Eval compare: `{}` vs `{}`", report.run_a, report.run_b);
|
||||
let _ = writeln!(out);
|
||||
let _ = writeln!(out, "## Aggregate deltas");
|
||||
let _ = writeln!(out);
|
||||
let _ = writeln!(out, "| metric | a | b | Δ (b - a) |");
|
||||
let _ = writeln!(out, "|---|---|---|---|");
|
||||
let a = &report.aggregate_a;
|
||||
let b = &report.aggregate_b;
|
||||
for k in crate::metrics::TOP_K_VARIANTS {
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| hit@{k} | {} | {} | {} |",
|
||||
fmt(a.hit_at_k.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt(b.hit_at_k.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt_delta(
|
||||
a.hit_at_k.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.hit_at_k.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
}
|
||||
let _ = writeln!(out, "| MRR | {} | {} | {} |", fmt(a.mrr), fmt(b.mrr), fmt_delta(a.mrr, b.mrr));
|
||||
for k in crate::metrics::TOP_K_VARIANTS {
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| recall@{k}_doc | {} | {} | {} |",
|
||||
fmt(a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt(b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt_delta(
|
||||
a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
}
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| citation_coverage | {} | {} | {} |",
|
||||
fmt(a.citation_coverage),
|
||||
fmt(b.citation_coverage),
|
||||
fmt_delta(a.citation_coverage, b.citation_coverage),
|
||||
);
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| groundedness | {} | {} | {} |",
|
||||
fmt(a.groundedness),
|
||||
fmt(b.groundedness),
|
||||
fmt_delta(a.groundedness, b.groundedness),
|
||||
);
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| empty_result_rate | {} | {} | {} |",
|
||||
fmt(a.empty_result_rate),
|
||||
fmt(b.empty_result_rate),
|
||||
fmt_delta(a.empty_result_rate, b.empty_result_rate),
|
||||
);
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| refusal_correctness | {} | {} | {} |",
|
||||
fmt(a.refusal_correctness),
|
||||
fmt(b.refusal_correctness),
|
||||
fmt_delta(a.refusal_correctness, b.refusal_correctness),
|
||||
);
|
||||
let _ = writeln!(out);
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"chunker_version_match: `{}`",
|
||||
report
|
||||
.deltas
|
||||
.get("chunker_version_match")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("?")
|
||||
);
|
||||
let _ = writeln!(out);
|
||||
|
||||
let wins: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Win).collect();
|
||||
let losses: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Loss).collect();
|
||||
let regressions: Vec<_> = report
|
||||
.per_query
|
||||
.iter()
|
||||
.filter(|c| c.kind == ComparisonKind::Regression)
|
||||
.collect();
|
||||
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"## Wins ({}) / Losses ({}) / Regressions ({})",
|
||||
wins.len(),
|
||||
losses.len(),
|
||||
regressions.len()
|
||||
);
|
||||
let _ = writeln!(out);
|
||||
let _ = writeln!(out, "| query_id | kind | rank_a | rank_b | note |");
|
||||
let _ = writeln!(out, "|---|---|---|---|---|");
|
||||
for c in &report.per_query {
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| {} | {} | {} | {} | {} |",
|
||||
c.query_id,
|
||||
comparison_kind_label(c.kind),
|
||||
c.a_hit_rank.map(|r| r.to_string()).unwrap_or_else(|| "—".into()),
|
||||
c.b_hit_rank.map(|r| r.to_string()).unwrap_or_else(|| "—".into()),
|
||||
c.note.as_deref().unwrap_or(""),
|
||||
);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn comparison_kind_label(k: ComparisonKind) -> &'static str {
|
||||
match k {
|
||||
ComparisonKind::Win => "win",
|
||||
ComparisonKind::Loss => "loss",
|
||||
ComparisonKind::Draw => "draw",
|
||||
ComparisonKind::Regression => "regression",
|
||||
}
|
||||
}
|
||||
|
||||
fn fmt(v: f32) -> String {
|
||||
if v.is_nan() {
|
||||
"—".into()
|
||||
} else {
|
||||
format!("{v:.4}")
|
||||
}
|
||||
}
|
||||
|
||||
fn fmt_delta(a: f32, b: f32) -> String {
|
||||
if a.is_nan() || b.is_nan() {
|
||||
return "—".into();
|
||||
}
|
||||
let d = b - a;
|
||||
if d >= 0.0 {
|
||||
format!("+{d:.4}")
|
||||
} else {
|
||||
format!("{d:.4}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Pull `chunker_version` out of a `config_snapshot_json` payload. The
|
||||
/// runner writes `{"chunker_version": "<id>", ...}`; missing or
|
||||
/// malformed → `None`. Two `None`s compare as equal and route through
|
||||
/// the "exact" matcher, but only the runner writes these snapshots
|
||||
/// and it always emits `chunker_version` — so `None == None` can only
|
||||
/// arise from a hand-edited DB or a pre-P5-1 fixture, both of which
|
||||
/// are out-of-scope failure modes that the strict-mode flag covers.
|
||||
fn extract_chunker_version(snapshot_json: &str) -> Option<String> {
|
||||
let v: serde_json::Value = serde_json::from_str(snapshot_json).ok()?;
|
||||
v.get("chunker_version")
|
||||
.and_then(|x| x.as_str())
|
||||
.map(|s| s.to_owned())
|
||||
}
|
||||
|
||||
fn parse_results(
|
||||
rows: &[kb_store_sqlite::EvalQueryResultRecord],
|
||||
) -> Result<HashMap<String, QueryResult>> {
|
||||
let mut out = HashMap::with_capacity(rows.len());
|
||||
for row in rows {
|
||||
let qr: QueryResult = serde_json::from_str(&row.result_json)
|
||||
.with_context(|| format!("parse result_json for {}", row.query_id))?;
|
||||
out.insert(row.query_id.clone(), qr);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Find the top-ranked hit in `qr` whose `chunk_id` is in `expected`
|
||||
/// (exact mode) or whose `doc_id` is in `expected_docs` (fallback).
|
||||
fn first_hit_rank(
|
||||
qr: &QueryResult,
|
||||
expected_chunks: &[ChunkId],
|
||||
expected_docs: &[DocumentId],
|
||||
fallback_doc_only: bool,
|
||||
) -> Option<u32> {
|
||||
if !fallback_doc_only && !expected_chunks.is_empty() {
|
||||
let exp: std::collections::HashSet<&ChunkId> = expected_chunks.iter().collect();
|
||||
return qr
|
||||
.hits_top_k
|
||||
.iter()
|
||||
.filter(|h| exp.contains(&h.chunk_id))
|
||||
.map(|h| h.rank)
|
||||
.min();
|
||||
}
|
||||
if expected_docs.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let exp: std::collections::HashSet<&DocumentId> = expected_docs.iter().collect();
|
||||
qr.hits_top_k
|
||||
.iter()
|
||||
.filter(|h| exp.contains(&h.doc_id))
|
||||
.map(|h| h.rank)
|
||||
.min()
|
||||
}
|
||||
|
||||
fn build_per_query(
|
||||
qrs_a: &HashMap<String, QueryResult>,
|
||||
qrs_b: &HashMap<String, QueryResult>,
|
||||
golden: &HashMap<&str, &GoldenQuery>,
|
||||
chunker_match_mode: &str,
|
||||
) -> Vec<QueryComparison> {
|
||||
let fallback = chunker_match_mode == "fallback_doc";
|
||||
let mut ids: Vec<&String> = qrs_a.keys().chain(qrs_b.keys()).collect();
|
||||
ids.sort();
|
||||
ids.dedup();
|
||||
|
||||
let mut out = Vec::with_capacity(ids.len());
|
||||
for id in ids {
|
||||
let a = qrs_a.get(id);
|
||||
let b = qrs_b.get(id);
|
||||
let gq = golden.get(id.as_str()).copied();
|
||||
|
||||
let (a_rank, b_rank) = match gq {
|
||||
Some(g) => (
|
||||
a.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)),
|
||||
b.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)),
|
||||
),
|
||||
None => (None, None),
|
||||
};
|
||||
|
||||
let (kind, note) = classify(a_rank, b_rank, gq);
|
||||
|
||||
out.push(QueryComparison {
|
||||
query_id: id.clone(),
|
||||
kind,
|
||||
a_hit_rank: a_rank,
|
||||
b_hit_rank: b_rank,
|
||||
note,
|
||||
});
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn classify(
|
||||
a_rank: Option<u32>,
|
||||
b_rank: Option<u32>,
|
||||
gq: Option<&GoldenQuery>,
|
||||
) -> (ComparisonKind, Option<String>) {
|
||||
match (a_rank, b_rank) {
|
||||
(None, Some(_)) => (ComparisonKind::Win, None),
|
||||
(Some(_), None) => {
|
||||
// Hit → miss is a regression specifically when the query had
|
||||
// an expected chunk to find. Without that, downgrade to Loss
|
||||
// so refusal-flow queries (no expected_*) don't appear as
|
||||
// regressions.
|
||||
let has_expected = gq
|
||||
.map(|g| !g.expected_chunk_ids.is_empty() || !g.expected_doc_ids.is_empty())
|
||||
.unwrap_or(false);
|
||||
if has_expected {
|
||||
(ComparisonKind::Regression, Some("hit→miss".into()))
|
||||
} else {
|
||||
(ComparisonKind::Loss, None)
|
||||
}
|
||||
}
|
||||
(Some(ra), Some(rb)) if ra == rb => (ComparisonKind::Draw, None),
|
||||
(Some(ra), Some(rb)) if rb < ra => (ComparisonKind::Win, Some(format!("rank {ra}→{rb}"))),
|
||||
(Some(ra), Some(rb)) => (ComparisonKind::Loss, Some(format!("rank {ra}→{rb}"))),
|
||||
(None, None) => (ComparisonKind::Draw, None),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_deltas(
|
||||
a: &AggregateMetrics,
|
||||
b: &AggregateMetrics,
|
||||
chunker_match_mode: &str,
|
||||
) -> serde_json::Value {
|
||||
fn d(a: f32, b: f32) -> serde_json::Value {
|
||||
if a.is_nan() || b.is_nan() {
|
||||
serde_json::Value::Null
|
||||
} else {
|
||||
serde_json::Value::from((b - a) as f64)
|
||||
}
|
||||
}
|
||||
let mut hit = serde_json::Map::new();
|
||||
let mut recall = serde_json::Map::new();
|
||||
for k in crate::metrics::TOP_K_VARIANTS {
|
||||
hit.insert(
|
||||
k.to_string(),
|
||||
d(
|
||||
a.hit_at_k.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.hit_at_k.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
recall.insert(
|
||||
k.to_string(),
|
||||
d(
|
||||
a.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
}
|
||||
serde_json::json!({
|
||||
"hit_at_k": hit,
|
||||
"mrr": d(a.mrr, b.mrr),
|
||||
"recall_at_k_doc": recall,
|
||||
"citation_coverage": d(a.citation_coverage, b.citation_coverage),
|
||||
"groundedness": d(a.groundedness, b.groundedness),
|
||||
"empty_result_rate": d(a.empty_result_rate, b.empty_result_rate),
|
||||
"refusal_correctness": d(a.refusal_correctness, b.refusal_correctness),
|
||||
"chunker_version_match": chunker_match_mode,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn classify_win_loss_draw_regression() {
|
||||
let g = GoldenQuery {
|
||||
id: "q1".into(),
|
||||
query: "q".into(),
|
||||
lang: kb_core::Lang(String::new()),
|
||||
expected_doc_ids: vec![],
|
||||
expected_chunk_ids: vec![kb_core::ChunkId("c1".into())],
|
||||
must_contain: vec![],
|
||||
forbidden: vec![],
|
||||
difficulty: None,
|
||||
};
|
||||
let g = Some(&g);
|
||||
// a miss, b hit → Win
|
||||
assert_eq!(classify(None, Some(2), g).0, ComparisonKind::Win);
|
||||
// a hit, b miss, has expected → Regression
|
||||
assert_eq!(classify(Some(1), None, g).0, ComparisonKind::Regression);
|
||||
// both same rank → Draw
|
||||
assert_eq!(classify(Some(3), Some(3), g).0, ComparisonKind::Draw);
|
||||
// b improved rank → Win
|
||||
assert_eq!(classify(Some(5), Some(2), g).0, ComparisonKind::Win);
|
||||
// b worse rank → Loss
|
||||
assert_eq!(classify(Some(2), Some(5), g).0, ComparisonKind::Loss);
|
||||
// both miss → Draw
|
||||
assert_eq!(classify(None, None, g).0, ComparisonKind::Draw);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delta_null_when_either_nan() {
|
||||
let a = AggregateMetrics {
|
||||
hit_at_k: Default::default(),
|
||||
mrr: 0.5,
|
||||
recall_at_k_doc: Default::default(),
|
||||
citation_coverage: f32::NAN,
|
||||
groundedness: 0.0,
|
||||
empty_result_rate: 0.0,
|
||||
refusal_correctness: f32::NAN,
|
||||
total_queries: 0,
|
||||
failed_queries: 0,
|
||||
};
|
||||
let b = AggregateMetrics { mrr: 0.75, ..a.clone() };
|
||||
let d = build_deltas(&a, &b, "exact");
|
||||
assert!(d["citation_coverage"].is_null());
|
||||
assert!(d["refusal_correctness"].is_null());
|
||||
assert!((d["mrr"].as_f64().unwrap() - 0.25).abs() < 1e-6);
|
||||
assert_eq!(d["chunker_version_match"], "exact");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_chunker_version_from_snapshot() {
|
||||
let s = r#"{"config":{},"chunker_version":"slot@1"}"#;
|
||||
assert_eq!(extract_chunker_version(s), Some("slot@1".into()));
|
||||
assert_eq!(extract_chunker_version("not json"), None);
|
||||
assert_eq!(extract_chunker_version("{}"), None);
|
||||
}
|
||||
}
|
||||
@@ -20,10 +20,20 @@
|
||||
//!
|
||||
//! `run_id` uses UUIDv7 simple — timestamp-ordered, lowercase hex.
|
||||
|
||||
mod compare;
|
||||
mod loader;
|
||||
mod metrics;
|
||||
mod runner;
|
||||
mod types;
|
||||
|
||||
pub use compare::{
|
||||
CompareOpts, CompareReport, ComparisonKind, QueryComparison, compare_runs,
|
||||
compare_runs_with_config, render_report_md,
|
||||
};
|
||||
pub use loader::load_golden_set;
|
||||
pub use metrics::{
|
||||
AggregateMetrics, TOP_K_VARIANTS, compute_aggregate, compute_aggregate_with_config,
|
||||
store_aggregate, store_aggregate_with_config,
|
||||
};
|
||||
pub use runner::{run_eval, run_eval_with_config};
|
||||
pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
|
||||
|
||||
667
crates/kb-eval/src/metrics.rs
Normal file
667
crates/kb-eval/src/metrics.rs
Normal file
@@ -0,0 +1,667 @@
|
||||
//! Aggregate metrics over a stored eval run (P5-2 — design §5.7).
|
||||
//!
|
||||
//! Reads `eval_query_results` rows for one `run_id`, re-loads the
|
||||
//! golden YAML (so `expected_*` / `must_contain` / `forbidden` are at
|
||||
//! hand), and produces an [`AggregateMetrics`]. [`store_aggregate`]
|
||||
//! writes the JSON form back into `eval_runs.aggregate_json`.
|
||||
//!
|
||||
//! Pure computation — no `kb-app` / retrieval / embedding imports.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{ChunkId, Citation, DocumentId};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
|
||||
use crate::loader::load_golden_set;
|
||||
use crate::types::{GoldenQuery, QueryResult};
|
||||
|
||||
/// `k` values reported in `hit@k` and `recall@k_doc`. Pinned by spec
|
||||
/// (`tasks/p5/p5-2-metrics-compare.md`); a 4-element array keeps the
|
||||
/// downstream `BTreeMap<u32, f32>` keys stable across runs.
|
||||
pub const TOP_K_VARIANTS: &[u32] = &[1, 3, 5, 10];
|
||||
|
||||
/// `MRR` floor: chunks ranked outside the top-10 contribute 0 to the
|
||||
/// reciprocal sum (matches the spec — "0 if not found in top-10").
|
||||
const MRR_TOP: u32 = 10;
|
||||
|
||||
/// Number of fractional digits aggregate metric values are rounded to
|
||||
/// before storage / snapshot. Small enough that floating-point sum
|
||||
/// drift across architectures cancels, large enough that genuine
|
||||
/// differences (e.g., one extra hit out of ~50 queries) survive.
|
||||
const STORAGE_DECIMALS: u32 = 4;
|
||||
|
||||
/// Env var that overrides the default `fixtures/golden_queries.yaml`
|
||||
/// path during metric computation. Must be the same path the runner
|
||||
/// (P5-1) used — otherwise `expected_*` / `must_contain` won't line up
|
||||
/// with the stored `query_id`s. `pub(crate)` so the runner shares the
|
||||
/// exact same name + default rather than duplicating constants.
|
||||
pub(crate) const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN";
|
||||
|
||||
/// Default golden YAML path (relative to CWD when set). Same
|
||||
/// rationale as [`KB_EVAL_GOLDEN`] — single source of truth.
|
||||
pub(crate) const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml";
|
||||
|
||||
/// Aggregate metrics for one stored eval run.
|
||||
///
|
||||
/// The `f32` fields use a custom serializer that emits JSON `null` for
|
||||
/// `NaN` (zero-denominator metrics). `BTreeMap<u32, f32>` keys produce
|
||||
/// stringified-integer JSON object keys, which is the standard
|
||||
/// `serde_json` behavior — downstream comparisons / snapshots rely on
|
||||
/// that ordering, hence `BTreeMap` (not `HashMap`).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AggregateMetrics {
|
||||
pub hit_at_k: BTreeMap<u32, f32>,
|
||||
pub mrr: f32,
|
||||
pub recall_at_k_doc: BTreeMap<u32, f32>,
|
||||
#[serde(
|
||||
serialize_with = "serialize_f32_nan_as_null",
|
||||
deserialize_with = "deserialize_f32_or_nan"
|
||||
)]
|
||||
pub citation_coverage: f32,
|
||||
pub groundedness: f32,
|
||||
pub empty_result_rate: f32,
|
||||
#[serde(
|
||||
serialize_with = "serialize_f32_nan_as_null",
|
||||
deserialize_with = "deserialize_f32_or_nan"
|
||||
)]
|
||||
pub refusal_correctness: f32,
|
||||
pub total_queries: u32,
|
||||
pub failed_queries: u32,
|
||||
}
|
||||
|
||||
/// Custom serializer that maps `f32::NAN` to JSON `null`. Used on the
|
||||
/// two fields whose denominator can legitimately be zero (no RAG
|
||||
/// answers; no "should refuse" queries) — every other metric defaults
|
||||
/// to `0.0` when the denominator is zero, since the corresponding
|
||||
/// "this should be measured" set is always non-empty in practice.
|
||||
fn serialize_f32_nan_as_null<S: Serializer>(v: &f32, s: S) -> std::result::Result<S::Ok, S::Error> {
|
||||
if v.is_nan() {
|
||||
s.serialize_none()
|
||||
} else {
|
||||
s.serialize_f32(*v)
|
||||
}
|
||||
}
|
||||
|
||||
/// Inverse of [`serialize_f32_nan_as_null`]: JSON `null` → `f32::NAN`.
|
||||
/// Lets `serde_json::from_str::<AggregateMetrics>` round-trip the
|
||||
/// stored `aggregate_json`.
|
||||
fn deserialize_f32_or_nan<'de, D: Deserializer<'de>>(d: D) -> std::result::Result<f32, D::Error> {
|
||||
let opt: Option<f32> = Option::deserialize(d)?;
|
||||
Ok(opt.unwrap_or(f32::NAN))
|
||||
}
|
||||
|
||||
/// Compute aggregate metrics for `run_id` against the active
|
||||
/// XDG-loaded [`Config`]. Wraps [`compute_aggregate_with_config`] with
|
||||
/// `Config::load(None)`.
|
||||
pub fn compute_aggregate(run_id: &str) -> Result<AggregateMetrics> {
|
||||
let cfg = Config::load(None).context("load Config for compute_aggregate")?;
|
||||
compute_aggregate_with_config(&cfg, run_id)
|
||||
}
|
||||
|
||||
/// Compute aggregate metrics for `run_id` against an explicit
|
||||
/// [`Config`] (used by tests with a TempDir-backed `data_dir`).
|
||||
pub fn compute_aggregate_with_config(cfg: &Config, run_id: &str) -> Result<AggregateMetrics> {
|
||||
let store = SqliteStore::open(cfg).context("open SqliteStore for compute_aggregate")?;
|
||||
store
|
||||
.run_migrations()
|
||||
.context("run migrations for compute_aggregate")?;
|
||||
if store
|
||||
.load_eval_run(run_id)
|
||||
.context("load eval_runs row")?
|
||||
.is_none()
|
||||
{
|
||||
anyhow::bail!("compute_aggregate: no eval_runs row for run_id {run_id}");
|
||||
}
|
||||
let rows = store
|
||||
.load_eval_query_results(run_id)
|
||||
.context("load eval_query_results")?;
|
||||
let queries = load_golden_for_metrics()?;
|
||||
aggregate_from_rows(&queries, &rows)
|
||||
}
|
||||
|
||||
/// Persist `agg` into `eval_runs.aggregate_json` for `run_id`. Wraps
|
||||
/// [`store_aggregate_with_config`] with `Config::load(None)`.
|
||||
pub fn store_aggregate(run_id: &str, agg: &AggregateMetrics) -> Result<()> {
|
||||
let cfg = Config::load(None).context("load Config for store_aggregate")?;
|
||||
store_aggregate_with_config(&cfg, run_id, agg)
|
||||
}
|
||||
|
||||
/// Persist `agg` into `eval_runs.aggregate_json` for `run_id` against
|
||||
/// an explicit [`Config`].
|
||||
pub fn store_aggregate_with_config(
|
||||
cfg: &Config,
|
||||
run_id: &str,
|
||||
agg: &AggregateMetrics,
|
||||
) -> Result<()> {
|
||||
let store = SqliteStore::open(cfg).context("open SqliteStore for store_aggregate")?;
|
||||
store.run_migrations().context("run migrations")?;
|
||||
let json = serde_json::to_string(agg).context("serialize AggregateMetrics")?;
|
||||
store
|
||||
.update_eval_run_aggregate(run_id, &json)
|
||||
.with_context(|| format!("update eval_runs.aggregate_json for {run_id}"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resolve the golden YAML path for metric reload — same env override
|
||||
/// the runner uses, same default path. Pulled into its own helper so
|
||||
/// `compare_runs` can share it.
|
||||
pub(crate) fn resolve_golden_path() -> PathBuf {
|
||||
match std::env::var(KB_EVAL_GOLDEN) {
|
||||
Ok(s) if !s.is_empty() => PathBuf::from(s),
|
||||
_ => PathBuf::from(DEFAULT_GOLDEN_PATH),
|
||||
}
|
||||
}
|
||||
|
||||
fn load_golden_for_metrics() -> Result<Vec<GoldenQuery>> {
|
||||
let path = resolve_golden_path();
|
||||
load_golden_set(&path).with_context(|| {
|
||||
format!(
|
||||
"load golden set from {} (override via KB_EVAL_GOLDEN)",
|
||||
path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Pure computation kernel. Split out so unit tests can drive metrics
|
||||
/// off hand-rolled `(GoldenQuery, QueryResult)` fixtures without
|
||||
/// touching SQLite. No `&SqliteStore` parameter — the current metric
|
||||
/// formulas don't need DB lookups; once `citation_coverage` graduates
|
||||
/// to a per-citation `document_exists_by_path` probe (see deferral in
|
||||
/// `tasks/p5/p5-2-metrics-compare.md`), this will need to take one.
|
||||
pub(crate) fn aggregate_from_rows(
|
||||
queries: &[GoldenQuery],
|
||||
rows: &[kb_store_sqlite::EvalQueryResultRecord],
|
||||
) -> Result<AggregateMetrics> {
|
||||
let golden_by_id: HashMap<&str, &GoldenQuery> =
|
||||
queries.iter().map(|q| (q.id.as_str(), q)).collect();
|
||||
|
||||
let total_queries = u32::try_from(rows.len()).unwrap_or(u32::MAX);
|
||||
let mut failed_queries: u32 = 0;
|
||||
|
||||
let mut hit_at_k: BTreeMap<u32, (u32, u32)> =
|
||||
TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
|
||||
let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
|
||||
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
|
||||
|
||||
let mut mrr_sum: f64 = 0.0;
|
||||
let mut mrr_denom: u32 = 0;
|
||||
|
||||
let mut empty_result_count: u32 = 0;
|
||||
|
||||
let mut groundedness_num: u32 = 0;
|
||||
let mut groundedness_denom: u32 = 0;
|
||||
|
||||
let mut citation_num: u32 = 0;
|
||||
let mut citation_denom: u32 = 0;
|
||||
|
||||
let mut refusal_num: u32 = 0;
|
||||
let mut refusal_denom: u32 = 0;
|
||||
|
||||
for row in rows {
|
||||
let qr: QueryResult = serde_json::from_str(&row.result_json)
|
||||
.with_context(|| format!("parse result_json for {}", row.query_id))?;
|
||||
if qr.error.is_some() {
|
||||
failed_queries += 1;
|
||||
}
|
||||
if qr.hits_top_k.is_empty() {
|
||||
empty_result_count += 1;
|
||||
}
|
||||
|
||||
let Some(gq) = golden_by_id.get(qr.query_id.as_str()) else {
|
||||
// Stored row has no golden entry — skip metric updates;
|
||||
// the run still counts in `total_queries` so the run-vs-
|
||||
// golden mismatch is auditable.
|
||||
continue;
|
||||
};
|
||||
|
||||
// hit@k + MRR (chunk-level, requires non-empty expected_chunk_ids)
|
||||
if !gq.expected_chunk_ids.is_empty() {
|
||||
let expected: HashSet<&ChunkId> = gq.expected_chunk_ids.iter().collect();
|
||||
let first_hit_rank = qr
|
||||
.hits_top_k
|
||||
.iter()
|
||||
.filter(|h| expected.contains(&h.chunk_id))
|
||||
.map(|h| h.rank)
|
||||
.min();
|
||||
for k in TOP_K_VARIANTS {
|
||||
let entry = hit_at_k.get_mut(k).expect("init");
|
||||
entry.1 += 1;
|
||||
if let Some(rank) = first_hit_rank
|
||||
&& rank <= *k
|
||||
{
|
||||
entry.0 += 1;
|
||||
}
|
||||
}
|
||||
mrr_denom += 1;
|
||||
if let Some(rank) = first_hit_rank
|
||||
&& rank <= MRR_TOP
|
||||
{
|
||||
mrr_sum += 1.0 / f64::from(rank);
|
||||
}
|
||||
}
|
||||
|
||||
// recall@k_doc (doc-level, requires non-empty expected_doc_ids
|
||||
// and `>0` is the "should retrieve" condition; refusal queries
|
||||
// (`expected_doc_ids = []`) are excluded by spec).
|
||||
if !gq.expected_doc_ids.is_empty() {
|
||||
let expected_docs: HashSet<&DocumentId> = gq.expected_doc_ids.iter().collect();
|
||||
for k in TOP_K_VARIANTS {
|
||||
let entry = recall_at_k_doc.get_mut(k).expect("init");
|
||||
entry.1 += 1;
|
||||
let topk_docs: HashSet<&DocumentId> = qr
|
||||
.hits_top_k
|
||||
.iter()
|
||||
.filter(|h| h.rank <= *k)
|
||||
.map(|h| &h.doc_id)
|
||||
.collect();
|
||||
let covered = expected_docs.iter().filter(|d| topk_docs.contains(*d)).count();
|
||||
let frac = covered as f64 / expected_docs.len() as f64;
|
||||
entry.0 += frac;
|
||||
}
|
||||
} else {
|
||||
// refusal_correctness: golden marks "should refuse" via empty
|
||||
// expected_doc_ids. We can only judge this on RAG runs — a
|
||||
// lexical-only run produces no Answer, so "refusal" is
|
||||
// undefined. Excluding such queries from the denominator
|
||||
// (rather than counting them as failures) keeps the metric
|
||||
// honest: a search-only run reports refusal_correctness as
|
||||
// NaN/null, not 0.0.
|
||||
if let Some(ans) = &qr.answer {
|
||||
refusal_denom += 1;
|
||||
if !ans.grounded {
|
||||
refusal_num += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// groundedness + citation_coverage (only meaningful with RAG
|
||||
// answers; skip queries that errored or had no Answer).
|
||||
if let Some(answer) = &qr.answer
|
||||
&& qr.error.is_none()
|
||||
{
|
||||
// Skip "no-check" goldens (both must_contain and forbidden
|
||||
// empty) so an unconfigured golden entry doesn't get a free
|
||||
// 1.0 / 0.0 split. Refusal-class queries land here too;
|
||||
// their groundedness is judged via refusal_correctness.
|
||||
if !gq.must_contain.is_empty() || !gq.forbidden.is_empty() {
|
||||
groundedness_denom += 1;
|
||||
let grounded_ok = gq.must_contain.iter().all(|s| answer.answer.contains(s))
|
||||
&& !gq.forbidden.iter().any(|s| answer.answer.contains(s));
|
||||
if grounded_ok {
|
||||
groundedness_num += 1;
|
||||
}
|
||||
}
|
||||
// citation_coverage: denominator is grounded RAG answers
|
||||
// (refusals don't drag it down). The spec calls for "every
|
||||
// citation resolves to a real chunk in the DB"; the current
|
||||
// implementation is intentionally weaker — see
|
||||
// `tasks/p5/p5-2-metrics-compare.md` "Implementation
|
||||
// deviations" for the deferral rationale. Today: an Answer
|
||||
// counts as fully covered iff (a) it carries at least one
|
||||
// citation (so empty-citations doesn't sneak through
|
||||
// `Iterator::all`'s vacuous-true) and (b) every citation's
|
||||
// path is non-empty. Tightening to a per-citation
|
||||
// SqliteStore probe is the obvious next step once
|
||||
// `document_exists_by_path` lands in `kb-store-sqlite`.
|
||||
if answer.grounded {
|
||||
citation_denom += 1;
|
||||
let covered = !answer.citations.is_empty()
|
||||
&& answer.citations.iter().all(|c| match &c.citation {
|
||||
Citation::Line { path, .. }
|
||||
| Citation::Page { path, .. }
|
||||
| Citation::Region { path, .. }
|
||||
| Citation::Caption { path, .. }
|
||||
| Citation::Time { path, .. } => !path.0.is_empty(),
|
||||
});
|
||||
if covered {
|
||||
citation_num += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(AggregateMetrics {
|
||||
hit_at_k: round_ratio_map(&hit_at_k),
|
||||
mrr: round_storage(if mrr_denom == 0 {
|
||||
0.0
|
||||
} else {
|
||||
mrr_sum / f64::from(mrr_denom)
|
||||
}),
|
||||
recall_at_k_doc: round_recall_map(&recall_at_k_doc),
|
||||
citation_coverage: ratio_or_nan(citation_num, citation_denom),
|
||||
groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
|
||||
empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
|
||||
refusal_correctness: ratio_or_nan(refusal_num, refusal_denom),
|
||||
total_queries,
|
||||
failed_queries,
|
||||
})
|
||||
}
|
||||
|
||||
fn round_storage(v: f64) -> f32 {
|
||||
if v.is_nan() {
|
||||
return f32::NAN;
|
||||
}
|
||||
let scale = 10_f64.powi(STORAGE_DECIMALS as i32);
|
||||
((v * scale).round() / scale) as f32
|
||||
}
|
||||
|
||||
fn round_ratio_map(m: &BTreeMap<u32, (u32, u32)>) -> BTreeMap<u32, f32> {
|
||||
m.iter()
|
||||
.map(|(k, (num, denom))| {
|
||||
let v = if *denom == 0 {
|
||||
0.0
|
||||
} else {
|
||||
f64::from(*num) / f64::from(*denom)
|
||||
};
|
||||
(*k, round_storage(v))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn round_recall_map(m: &BTreeMap<u32, (f64, u32)>) -> BTreeMap<u32, f32> {
|
||||
m.iter()
|
||||
.map(|(k, (sum, denom))| {
|
||||
let v = if *denom == 0 {
|
||||
0.0
|
||||
} else {
|
||||
*sum / f64::from(*denom)
|
||||
};
|
||||
(*k, round_storage(v))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn ratio_or_nan(num: u32, denom: u32) -> f32 {
|
||||
if denom == 0 {
|
||||
f32::NAN
|
||||
} else {
|
||||
round_storage(f64::from(num) / f64::from(denom))
|
||||
}
|
||||
}
|
||||
|
||||
fn ratio_or_zero(num: u32, denom: u32) -> f32 {
|
||||
if denom == 0 {
|
||||
0.0
|
||||
} else {
|
||||
round_storage(f64::from(num) / f64::from(denom))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kb_core::{
|
||||
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, SearchHit,
|
||||
SearchMode,
|
||||
};
|
||||
use kb_core::asset::WorkspacePath;
|
||||
use kb_core::media::Lang;
|
||||
use kb_core::answer::{Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, TokenUsage, TraceId};
|
||||
use kb_core::versions::PromptTemplateVersion;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn gq(id: &str, expected_chunks: &[&str], expected_docs: &[&str]) -> GoldenQuery {
|
||||
GoldenQuery {
|
||||
id: id.into(),
|
||||
query: format!("q-{id}"),
|
||||
lang: Lang(String::new()),
|
||||
expected_doc_ids: expected_docs.iter().map(|s| DocumentId((*s).into())).collect(),
|
||||
expected_chunk_ids: expected_chunks.iter().map(|s| ChunkId((*s).into())).collect(),
|
||||
must_contain: vec![],
|
||||
forbidden: vec![],
|
||||
difficulty: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn hit(rank: u32, chunk_id: &str, doc_id: &str) -> SearchHit {
|
||||
SearchHit {
|
||||
rank,
|
||||
chunk_id: ChunkId(chunk_id.into()),
|
||||
doc_id: DocumentId(doc_id.into()),
|
||||
doc_path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(),
|
||||
heading_path: vec!["root".into()],
|
||||
section_label: None,
|
||||
snippet: "s".into(),
|
||||
citation: Citation::Line {
|
||||
path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(),
|
||||
start: 1,
|
||||
end: 1,
|
||||
section: None,
|
||||
},
|
||||
retrieval: RetrievalDetail {
|
||||
method: SearchMode::Lexical,
|
||||
fusion_score: 1.0,
|
||||
lexical_score: Some(1.0),
|
||||
vector_score: None,
|
||||
lexical_rank: Some(rank),
|
||||
vector_rank: None,
|
||||
},
|
||||
index_version: IndexVersion(format!("idx@{rank}")),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion("test@1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn qr(id: &str, hits: Vec<SearchHit>, error: Option<String>, answer: Option<Answer>) -> QueryResult {
|
||||
QueryResult {
|
||||
query_id: id.into(),
|
||||
query: format!("q-{id}"),
|
||||
mode: SearchMode::Lexical,
|
||||
hits_top_k: hits,
|
||||
answer,
|
||||
elapsed_ms: 1,
|
||||
error,
|
||||
}
|
||||
}
|
||||
|
||||
fn record(id: &str, hits: Vec<SearchHit>, error: Option<String>, answer: Option<Answer>)
|
||||
-> kb_store_sqlite::EvalQueryResultRecord
|
||||
{
|
||||
kb_store_sqlite::EvalQueryResultRecord {
|
||||
query_id: id.into(),
|
||||
result_json: serde_json::to_string(&qr(id, hits, error, answer)).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn answer(text: &str, grounded: bool, citation_paths: &[&str]) -> Answer {
|
||||
Answer {
|
||||
answer: text.into(),
|
||||
citations: citation_paths.iter().map(|p| AnswerCitation {
|
||||
marker: None,
|
||||
citation: Citation::Line {
|
||||
path: WorkspacePath::new((*p).into()).unwrap(),
|
||||
start: 1,
|
||||
end: 1,
|
||||
section: None,
|
||||
},
|
||||
}).collect(),
|
||||
grounded,
|
||||
refusal_reason: None,
|
||||
model: ModelRef { id: "m".into(), provider: "p".into(), dimensions: None },
|
||||
embedding: None,
|
||||
prompt_template_version: PromptTemplateVersion("p@1".into()),
|
||||
retrieval: AnswerRetrievalSummary {
|
||||
trace_id: TraceId("t".into()),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 5,
|
||||
score_gate: 0.0,
|
||||
top_score: 1.0,
|
||||
chunks_returned: 1,
|
||||
chunks_used: 1,
|
||||
},
|
||||
usage: TokenUsage { prompt_tokens: 1, completion_tokens: 1, latency_ms: 1 },
|
||||
created_at: OffsetDateTime::UNIX_EPOCH,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hit_at_k_handles_ranks_1_4_miss() {
|
||||
// q1: hit @ rank 1, q2: hit @ rank 4, q3: miss
|
||||
let queries = vec![
|
||||
gq("q1", &["c1"], &["d1"]),
|
||||
gq("q2", &["c2"], &["d2"]),
|
||||
gq("q3", &["c3"], &["d3"]),
|
||||
];
|
||||
let rows = vec![
|
||||
record("q1", vec![hit(1, "c1", "d1")], None, None),
|
||||
record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None),
|
||||
record("q3", vec![hit(1, "x", "y")], None, None),
|
||||
];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
// hit@1 = 1/3 (q1 only), hit@3 = 1/3, hit@5 = 2/3, hit@10 = 2/3
|
||||
assert_eq!(agg.hit_at_k[&1], 0.3333);
|
||||
assert_eq!(agg.hit_at_k[&3], 0.3333);
|
||||
assert_eq!(agg.hit_at_k[&5], 0.6667);
|
||||
assert_eq!(agg.hit_at_k[&10], 0.6667);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mrr_matches_expected() {
|
||||
// q1 rank 1 → 1/1, q2 rank 4 → 1/4, q3 miss → 0. mean = (1 + 0.25 + 0) / 3 ≈ 0.4167
|
||||
let queries = vec![
|
||||
gq("q1", &["c1"], &["d1"]),
|
||||
gq("q2", &["c2"], &["d2"]),
|
||||
gq("q3", &["c3"], &["d3"]),
|
||||
];
|
||||
let rows = vec![
|
||||
record("q1", vec![hit(1, "c1", "d1")], None, None),
|
||||
record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None),
|
||||
record("q3", vec![hit(1, "x", "y")], None, None),
|
||||
];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.mrr, 0.4167);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn recall_at_k_doc_partial() {
|
||||
// q1 expects {d1, d2}; top-3 returns {d1}. recall@3 = 0.5
|
||||
let queries = vec![gq("q1", &[], &["d1", "d2"])];
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1"), hit(2, "c2", "d3")], None, None)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.recall_at_k_doc[&3], 0.5);
|
||||
assert_eq!(agg.recall_at_k_doc[&10], 0.5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn citation_coverage_full_when_paths_resolve() {
|
||||
let mut q = gq("q1", &[], &["d1"]);
|
||||
q.must_contain = vec!["alpha".into()];
|
||||
let queries = vec![q];
|
||||
let ans = answer("contains alpha", true, &["docs/d1.md"]);
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.citation_coverage, 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn groundedness_false_when_forbidden_present() {
|
||||
let mut q = gq("q1", &[], &["d1"]);
|
||||
q.must_contain = vec!["alpha".into()];
|
||||
q.forbidden = vec!["beta".into()];
|
||||
let queries = vec![q];
|
||||
let ans = answer("alpha and beta", true, &["docs/d1.md"]);
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.groundedness, 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn refusal_correctness_one_when_should_refuse_and_did() {
|
||||
let queries = vec![gq("q1", &[], &[])]; // expected_doc_ids empty → "should refuse"
|
||||
let ans = answer("I cannot answer", false, &[]);
|
||||
let rows = vec![record("q1", vec![], None, Some(ans))];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.refusal_correctness, 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn refusal_correctness_nan_for_non_rag_run() {
|
||||
// Even with a "should refuse" query, a lexical-only run has no
|
||||
// Answer and so refusal cannot be judged → metric is NaN, not 0.
|
||||
let queries = vec![gq("q1", &[], &[])];
|
||||
let rows = vec![record("q1", vec![], None, None)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert!(agg.refusal_correctness.is_nan(), "got {}", agg.refusal_correctness);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn citation_coverage_zero_when_answer_has_no_citations() {
|
||||
// A grounded answer with empty citations[] used to count as
|
||||
// covered via Iterator::all's vacuous-true; now must score 0.
|
||||
let mut q = gq("q1", &[], &["d1"]);
|
||||
q.must_contain = vec!["alpha".into()];
|
||||
let queries = vec![q];
|
||||
let ans = answer("contains alpha", true, &[]);
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.citation_coverage, 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn groundedness_skips_unconfigured_goldens() {
|
||||
// A non-error RAG answer for a golden with neither must_contain
|
||||
// nor forbidden must NOT score 1.0 by default — it should be
|
||||
// excluded from the denominator entirely. Refusal-class
|
||||
// queries are tracked via refusal_correctness instead.
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"])]; // no must_contain / forbidden
|
||||
let ans = answer("anything", true, &["docs/d1.md"]);
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, Some(ans))];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
// denominator is 0 → ratio_or_zero returns 0.0 (not NaN, since
|
||||
// groundedness isn't a NaN-flagged metric per spec).
|
||||
assert_eq!(agg.groundedness, 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn nan_metrics_serialize_as_null() {
|
||||
// No RAG answers → citation_coverage NaN. No "should refuse" → refusal_correctness NaN.
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"])];
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
let json: serde_json::Value = serde_json::to_value(&agg).unwrap();
|
||||
assert!(json["citation_coverage"].is_null(), "expected null, got {:?}", json["citation_coverage"]);
|
||||
assert!(json["refusal_correctness"].is_null(), "expected null, got {:?}", json["refusal_correctness"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn determinism_two_runs_match() {
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"]), gq("q2", &["c2"], &["d2"])];
|
||||
let rows = vec![
|
||||
record("q1", vec![hit(1, "c1", "d1")], None, None),
|
||||
record("q2", vec![hit(1, "x", "y"), hit(2, "c2", "d2")], None, None),
|
||||
];
|
||||
let a = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
let b = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
// NaN != NaN under PartialEq, but the JSON encoding maps NaN
|
||||
// to null and is the actual storage form, so compare on that.
|
||||
assert_eq!(
|
||||
serde_json::to_string(&a).unwrap(),
|
||||
serde_json::to_string(&b).unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_result_rate_counts_zero_hits() {
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"]), gq("q2", &["c2"], &["d2"])];
|
||||
let rows = vec![
|
||||
record("q1", vec![], None, None),
|
||||
record("q2", vec![hit(1, "c2", "d2")], None, None),
|
||||
];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.empty_result_rate, 0.5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn failed_queries_counted() {
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"])];
|
||||
let rows = vec![record("q1", vec![], Some("boom".into()), None)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.failed_queries, 1);
|
||||
assert_eq!(agg.total_queries, 1);
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,7 @@ use kb_store_sqlite::{EvalRunRow, SqliteStore};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::loader::{load_golden_set, validate_against_db};
|
||||
use crate::metrics::{DEFAULT_GOLDEN_PATH, KB_EVAL_GOLDEN};
|
||||
use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
|
||||
|
||||
/// Convert a wall-clock duration since `start` into milliseconds clamped
|
||||
@@ -23,13 +24,6 @@ fn elapsed_ms_u32(start: Instant) -> u32 {
|
||||
start.elapsed().as_millis().min(u128::from(u32::MAX)) as u32
|
||||
}
|
||||
|
||||
/// Env var that overrides the default `fixtures/golden_queries.yaml`
|
||||
/// path. Resolved relative to the current working directory.
|
||||
const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN";
|
||||
|
||||
/// Default golden YAML path (relative to CWD when set).
|
||||
const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml";
|
||||
|
||||
/// Run the golden suite end-to-end against the active XDG-loaded
|
||||
/// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with
|
||||
/// `Config::load(None)`.
|
||||
|
||||
89
crates/kb-eval/tests/fixtures/eval/compare-1.json
vendored
Normal file
89
crates/kb-eval/tests/fixtures/eval/compare-1.json
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
{
|
||||
"aggregate_a": {
|
||||
"citation_coverage": null,
|
||||
"empty_result_rate": 0.0,
|
||||
"failed_queries": 0,
|
||||
"groundedness": 0.0,
|
||||
"hit_at_k": {
|
||||
"1": 0.33329999446868896,
|
||||
"10": 0.666700005531311,
|
||||
"3": 0.33329999446868896,
|
||||
"5": 0.666700005531311
|
||||
},
|
||||
"mrr": 0.41670000553131104,
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.33329999446868896,
|
||||
"10": 0.666700005531311,
|
||||
"3": 0.33329999446868896,
|
||||
"5": 0.666700005531311
|
||||
},
|
||||
"refusal_correctness": null,
|
||||
"total_queries": 3
|
||||
},
|
||||
"aggregate_b": {
|
||||
"citation_coverage": null,
|
||||
"empty_result_rate": 0.0,
|
||||
"failed_queries": 0,
|
||||
"groundedness": 0.0,
|
||||
"hit_at_k": {
|
||||
"1": 0.666700005531311,
|
||||
"10": 1.0,
|
||||
"3": 1.0,
|
||||
"5": 1.0
|
||||
},
|
||||
"mrr": 0.833299994468689,
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.666700005531311,
|
||||
"10": 1.0,
|
||||
"3": 1.0,
|
||||
"5": 1.0
|
||||
},
|
||||
"refusal_correctness": null,
|
||||
"total_queries": 3
|
||||
},
|
||||
"deltas": {
|
||||
"chunker_version_match": "exact",
|
||||
"citation_coverage": null,
|
||||
"empty_result_rate": 0.0,
|
||||
"groundedness": 0.0,
|
||||
"hit_at_k": {
|
||||
"1": 0.33340001106262207,
|
||||
"10": 0.33329999446868896,
|
||||
"3": 0.666700005531311,
|
||||
"5": 0.33329999446868896
|
||||
},
|
||||
"mrr": 0.41659998893737793,
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.33340001106262207,
|
||||
"10": 0.33329999446868896,
|
||||
"3": 0.666700005531311,
|
||||
"5": 0.33329999446868896
|
||||
},
|
||||
"refusal_correctness": null
|
||||
},
|
||||
"per_query": [
|
||||
{
|
||||
"a_hit_rank": 1,
|
||||
"b_hit_rank": 2,
|
||||
"kind": "loss",
|
||||
"note": "rank 1→2",
|
||||
"query_id": "q-001"
|
||||
},
|
||||
{
|
||||
"a_hit_rank": 4,
|
||||
"b_hit_rank": 1,
|
||||
"kind": "win",
|
||||
"note": "rank 4→1",
|
||||
"query_id": "q-002"
|
||||
},
|
||||
{
|
||||
"a_hit_rank": null,
|
||||
"b_hit_rank": 1,
|
||||
"kind": "win",
|
||||
"note": null,
|
||||
"query_id": "q-003"
|
||||
}
|
||||
],
|
||||
"run_a": "run_a",
|
||||
"run_b": "run_b"
|
||||
}
|
||||
436
crates/kb-eval/tests/metrics_and_compare.rs
Normal file
436
crates/kb-eval/tests/metrics_and_compare.rs
Normal file
@@ -0,0 +1,436 @@
|
||||
//! Integration tests for P5-2: write two synthetic eval runs into a
|
||||
//! SQLite store, then drive `compute_aggregate` / `store_aggregate` /
|
||||
//! `compare_runs` end-to-end. Mirrors the test plan in
|
||||
//! `tasks/p5/p5-2-metrics-compare.md`.
|
||||
//!
|
||||
//! Snapshot of `CompareReport` JSON is pinned at
|
||||
//! `tests/fixtures/eval/compare-1.json`.
|
||||
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{
|
||||
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, Lang,
|
||||
RetrievalDetail, SearchHit, SearchMode,
|
||||
asset::WorkspacePath,
|
||||
};
|
||||
use kb_eval::{
|
||||
AggregateMetrics, CompareOpts, CompareReport, ComparisonKind, GoldenQuery, QueryResult,
|
||||
compare_runs_with_config, compute_aggregate_with_config, store_aggregate_with_config,
|
||||
};
|
||||
use kb_store_sqlite::{EvalRunRow, SqliteStore};
|
||||
use tempfile::TempDir;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn cfg_with_data_dir(tmp: &TempDir, golden_yaml: &str) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();
|
||||
cfg.storage.runs_dir = tmp.path().join("runs").to_string_lossy().into_owned();
|
||||
cfg.storage.copy_threshold_mb = 0;
|
||||
let golden_path = tmp.path().join("golden.yaml");
|
||||
fs::write(&golden_path, golden_yaml).unwrap();
|
||||
// Point both metrics + compare at the temp golden via env override.
|
||||
// SAFELY scoped — `set_var` is process-global so callers serialise
|
||||
// tests via the `serial_test`-style guard below.
|
||||
unsafe {
|
||||
std::env::set_var("KB_EVAL_GOLDEN", &golden_path);
|
||||
}
|
||||
cfg
|
||||
}
|
||||
|
||||
fn golden_yaml_basic() -> &'static str {
|
||||
r#"
|
||||
- id: q-001
|
||||
query: hit at rank 1
|
||||
expected_doc_ids: ["doc-1"]
|
||||
expected_chunk_ids: ["chunk-1"]
|
||||
- id: q-002
|
||||
query: hit at rank 4
|
||||
expected_doc_ids: ["doc-2"]
|
||||
expected_chunk_ids: ["chunk-2"]
|
||||
- id: q-003
|
||||
query: miss everywhere
|
||||
expected_doc_ids: ["doc-3"]
|
||||
expected_chunk_ids: ["chunk-3"]
|
||||
"#
|
||||
}
|
||||
|
||||
fn hit(rank: u32, chunk_id: &str, doc_id: &str) -> SearchHit {
|
||||
SearchHit {
|
||||
rank,
|
||||
chunk_id: ChunkId(chunk_id.into()),
|
||||
doc_id: DocumentId(doc_id.into()),
|
||||
doc_path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(),
|
||||
heading_path: vec!["root".into()],
|
||||
section_label: None,
|
||||
snippet: "snip".into(),
|
||||
citation: Citation::Line {
|
||||
path: WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap(),
|
||||
start: 1,
|
||||
end: 1,
|
||||
section: None,
|
||||
},
|
||||
retrieval: RetrievalDetail {
|
||||
method: SearchMode::Lexical,
|
||||
fusion_score: 1.0 / f32::from(u16::try_from(rank).unwrap_or(1)),
|
||||
lexical_score: Some(1.0),
|
||||
vector_score: None,
|
||||
lexical_rank: Some(rank),
|
||||
vector_rank: None,
|
||||
},
|
||||
index_version: IndexVersion("idx@1".into()),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion("test@1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn qr(query_id: &str, hits: Vec<SearchHit>) -> QueryResult {
|
||||
QueryResult {
|
||||
query_id: query_id.into(),
|
||||
query: format!("query for {query_id}"),
|
||||
mode: SearchMode::Lexical,
|
||||
hits_top_k: hits,
|
||||
answer: None,
|
||||
elapsed_ms: 1,
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_run(
|
||||
store: &SqliteStore,
|
||||
run_id: &str,
|
||||
chunker_version: &str,
|
||||
created_at: OffsetDateTime,
|
||||
queries: Vec<QueryResult>,
|
||||
) {
|
||||
let snapshot = serde_json::json!({
|
||||
"config": {},
|
||||
"chunker_version": chunker_version,
|
||||
});
|
||||
let snapshot_text = serde_json::to_string(&snapshot).unwrap();
|
||||
let row = EvalRunRow {
|
||||
run_id,
|
||||
suite: "golden",
|
||||
config_snapshot_json: &snapshot_text,
|
||||
aggregate_json: "{}",
|
||||
commit_hash: Some("0000000"),
|
||||
created_at,
|
||||
};
|
||||
let results: Vec<(String, String)> = queries
|
||||
.into_iter()
|
||||
.map(|qr| {
|
||||
let json = serde_json::to_string(&qr).unwrap();
|
||||
(qr.query_id, json)
|
||||
})
|
||||
.collect();
|
||||
store.record_eval_run_with_results(&row, &results).unwrap();
|
||||
}
|
||||
|
||||
/// Each test mutates a process-global env var (`KB_EVAL_GOLDEN`) and
|
||||
/// expects to see its own write. Take this mutex around the body of
|
||||
/// every test that touches `KB_EVAL_GOLDEN` so two concurrent test
|
||||
/// threads don't trip over each other's golden YAML.
|
||||
fn env_guard() -> std::sync::MutexGuard<'static, ()> {
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
static M: OnceLock<Mutex<()>> = OnceLock::new();
|
||||
M.get_or_init(|| Mutex::new(()))
|
||||
.lock()
|
||||
.unwrap_or_else(|e| e.into_inner())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_and_store_aggregate_round_trips() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
write_run(
|
||||
&store,
|
||||
"run_a",
|
||||
"test@1",
|
||||
now,
|
||||
vec![
|
||||
qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
|
||||
qr(
|
||||
"q-002",
|
||||
vec![
|
||||
hit(1, "x", "x"),
|
||||
hit(2, "x", "x"),
|
||||
hit(3, "x", "x"),
|
||||
hit(4, "chunk-2", "doc-2"),
|
||||
],
|
||||
),
|
||||
qr("q-003", vec![hit(1, "x", "x")]),
|
||||
],
|
||||
);
|
||||
drop(store);
|
||||
|
||||
let agg = compute_aggregate_with_config(&cfg, "run_a").unwrap();
|
||||
// hit@1 = 1/3, hit@5 = 2/3, MRR = (1 + 0.25 + 0)/3 ≈ 0.4167.
|
||||
assert_eq!(agg.hit_at_k[&1], 0.3333);
|
||||
assert_eq!(agg.hit_at_k[&5], 0.6667);
|
||||
assert_eq!(agg.mrr, 0.4167);
|
||||
|
||||
store_aggregate_with_config(&cfg, "run_a", &agg).unwrap();
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
let row = store.load_eval_run("run_a").unwrap().unwrap();
|
||||
let parsed: AggregateMetrics = serde_json::from_str(&row.aggregate_json).unwrap();
|
||||
// f32 round-trip via JSON is exact for our 4-decimal-rounded
|
||||
// values, so direct equality is OK here (NaN fields are handled
|
||||
// by the `serialize_f32_nan_as_null` round-trip — `citation_coverage`
|
||||
// and `refusal_correctness` come back as NaN). Compare on JSON
|
||||
// bytes instead, which is what `store_aggregate` writes.
|
||||
assert_eq!(
|
||||
serde_json::to_string(&parsed).unwrap(),
|
||||
serde_json::to_string(&agg).unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn store_aggregate_rejects_missing_run() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let agg = AggregateMetrics {
|
||||
hit_at_k: Default::default(),
|
||||
mrr: 0.0,
|
||||
recall_at_k_doc: Default::default(),
|
||||
citation_coverage: f32::NAN,
|
||||
groundedness: 0.0,
|
||||
empty_result_rate: 0.0,
|
||||
refusal_correctness: f32::NAN,
|
||||
total_queries: 0,
|
||||
failed_queries: 0,
|
||||
};
|
||||
let err = store_aggregate_with_config(&cfg, "run_does_not_exist", &agg).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("run_does_not_exist"), "msg = {msg}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compare_runs_classifies_win_loss_draw_regression() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
// Run A:
|
||||
// q-001 rank 1 → hit
|
||||
// q-002 rank 4 → hit
|
||||
// q-003 miss
|
||||
write_run(
|
||||
&store,
|
||||
"run_a",
|
||||
"test@1",
|
||||
now,
|
||||
vec![
|
||||
qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
|
||||
qr(
|
||||
"q-002",
|
||||
vec![
|
||||
hit(1, "x", "x"),
|
||||
hit(2, "x", "x"),
|
||||
hit(3, "x", "x"),
|
||||
hit(4, "chunk-2", "doc-2"),
|
||||
],
|
||||
),
|
||||
qr("q-003", vec![hit(1, "x", "x")]),
|
||||
],
|
||||
);
|
||||
// Run B:
|
||||
// q-001 rank 2 → still hit (Loss vs A — worse rank)
|
||||
// q-002 rank 1 → hit (Win — improved rank)
|
||||
// q-003 hit @ rank 1 → hit (Win — was miss in A)
|
||||
write_run(
|
||||
&store,
|
||||
"run_b",
|
||||
"test@1",
|
||||
now,
|
||||
vec![
|
||||
qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]),
|
||||
qr("q-002", vec![hit(1, "chunk-2", "doc-2")]),
|
||||
qr("q-003", vec![hit(1, "chunk-3", "doc-3")]),
|
||||
],
|
||||
);
|
||||
drop(store);
|
||||
|
||||
let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
|
||||
let by_id: std::collections::HashMap<&str, &kb_eval::QueryComparison> =
|
||||
report.per_query.iter().map(|c| (c.query_id.as_str(), c)).collect();
|
||||
assert_eq!(by_id["q-001"].kind, ComparisonKind::Loss);
|
||||
assert_eq!(by_id["q-002"].kind, ComparisonKind::Win);
|
||||
assert_eq!(by_id["q-003"].kind, ComparisonKind::Win);
|
||||
assert_eq!(report.deltas["chunker_version_match"], "exact");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compare_strict_mode_refuses_chunker_version_mismatch() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]);
|
||||
write_run(&store, "run_b", "test@2", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]);
|
||||
drop(store);
|
||||
|
||||
let opts = CompareOpts {
|
||||
strict_chunker_version: true,
|
||||
};
|
||||
let err = compare_runs_with_config(&cfg, "run_a", "run_b", &opts).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("chunker_version mismatch"), "msg = {msg}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compare_graceful_falls_back_to_doc_id() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
// Run A uses test@1 chunker; run B uses test@2 — chunk_ids no longer
|
||||
// align, but doc_ids do.
|
||||
write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]);
|
||||
write_run(
|
||||
&store,
|
||||
"run_b",
|
||||
"test@2",
|
||||
now,
|
||||
// Different chunk_id, same doc_id → exact-mode matching would
|
||||
// miss; doc-id fallback should still register a hit.
|
||||
vec![qr("q-001", vec![hit(1, "chunk-1-renamed", "doc-1")])],
|
||||
);
|
||||
drop(store);
|
||||
|
||||
let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
|
||||
assert_eq!(report.deltas["chunker_version_match"], "fallback_doc");
|
||||
let q1 = report.per_query.iter().find(|c| c.query_id == "q-001").unwrap();
|
||||
// Both runs hit doc-1 at rank 1 → Draw.
|
||||
assert_eq!(q1.kind, ComparisonKind::Draw);
|
||||
assert_eq!(q1.a_hit_rank, Some(1));
|
||||
assert_eq!(q1.b_hit_rank, Some(1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compare_report_snapshot_matches_fixture() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
write_run(
|
||||
&store,
|
||||
"run_a",
|
||||
"test@1",
|
||||
now,
|
||||
vec![
|
||||
qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
|
||||
qr(
|
||||
"q-002",
|
||||
vec![
|
||||
hit(1, "x", "x"),
|
||||
hit(2, "x", "x"),
|
||||
hit(3, "x", "x"),
|
||||
hit(4, "chunk-2", "doc-2"),
|
||||
],
|
||||
),
|
||||
qr("q-003", vec![hit(1, "x", "x")]),
|
||||
],
|
||||
);
|
||||
write_run(
|
||||
&store,
|
||||
"run_b",
|
||||
"test@1",
|
||||
now,
|
||||
vec![
|
||||
qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]),
|
||||
qr("q-002", vec![hit(1, "chunk-2", "doc-2")]),
|
||||
qr("q-003", vec![hit(1, "chunk-3", "doc-3")]),
|
||||
],
|
||||
);
|
||||
drop(store);
|
||||
|
||||
let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
|
||||
let actual = projection(&report);
|
||||
let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
.join("eval")
|
||||
.join("compare-1.json");
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
fs::write(&fixture, format!("{}\n", serde_json::to_string_pretty(&actual).unwrap()))
|
||||
.unwrap();
|
||||
}
|
||||
let expected_text = fs::read_to_string(&fixture)
|
||||
.unwrap_or_else(|e| panic!("missing fixture {}: {e}", fixture.display()));
|
||||
let expected: serde_json::Value = serde_json::from_str(&expected_text).unwrap();
|
||||
assert_eq!(actual, expected, "compare report drift — re-run with UPDATE_SNAPSHOTS=1 if intended");
|
||||
}
|
||||
|
||||
/// Project a `CompareReport` to the stable-across-runs subset.
|
||||
/// `aggregate_*` and `deltas` are deterministic; per-query rows keep
|
||||
/// only `(query_id, kind, ranks, note)` and discard volatile fields.
|
||||
fn projection(r: &CompareReport) -> serde_json::Value {
|
||||
serde_json::json!({
|
||||
"run_a": r.run_a,
|
||||
"run_b": r.run_b,
|
||||
"aggregate_a": r.aggregate_a,
|
||||
"aggregate_b": r.aggregate_b,
|
||||
"deltas": r.deltas,
|
||||
"per_query": r.per_query,
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn render_report_md_is_human_readable() {
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
|
||||
let store = SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let now = OffsetDateTime::UNIX_EPOCH;
|
||||
write_run(
|
||||
&store,
|
||||
"run_a",
|
||||
"test@1",
|
||||
now,
|
||||
vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])],
|
||||
);
|
||||
write_run(
|
||||
&store,
|
||||
"run_b",
|
||||
"test@1",
|
||||
now,
|
||||
vec![qr("q-001", vec![hit(2, "chunk-1", "doc-1")])],
|
||||
);
|
||||
drop(store);
|
||||
|
||||
let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
|
||||
let md = kb_eval::render_report_md(&report);
|
||||
assert!(md.starts_with("# Eval compare:"), "md = {md}");
|
||||
assert!(md.contains("hit@1"));
|
||||
assert!(md.contains("MRR"));
|
||||
assert!(md.contains("Wins"));
|
||||
assert!(md.contains("q-001"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lang_default_is_used_when_omitted_in_yaml() {
|
||||
// Round-trip safety: GoldenQuery without `lang` should parse fine.
|
||||
let yaml = "- id: only\n query: q\n";
|
||||
let _g = env_guard();
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let golden = tmp.path().join("g.yaml");
|
||||
fs::write(&golden, yaml).unwrap();
|
||||
let qs: Vec<GoldenQuery> = serde_yaml::from_str(yaml).unwrap();
|
||||
assert_eq!(qs.len(), 1);
|
||||
assert_eq!(qs[0].lang, Lang(String::new()));
|
||||
}
|
||||
@@ -16,7 +16,8 @@ use crate::store::SqliteStore;
|
||||
/// One row about to land in `eval_runs` (per V001 schema).
|
||||
///
|
||||
/// `aggregate_json` is filled by P5-1 with the literal `"{}"` —
|
||||
/// metric computation lives in P5-2 and updates the row in place.
|
||||
/// metric computation lives in P5-2 and updates the row in place via
|
||||
/// [`SqliteStore::update_eval_run_aggregate`].
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct EvalRunRow<'a> {
|
||||
pub run_id: &'a str,
|
||||
@@ -27,6 +28,28 @@ pub struct EvalRunRow<'a> {
|
||||
pub created_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
/// Owned mirror of a row read from `eval_runs`. Used by P5-2's
|
||||
/// `compute_aggregate` / `compare_runs` since [`EvalRunRow`] borrows
|
||||
/// from the writer's input buffers.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct EvalRunRecord {
|
||||
pub run_id: String,
|
||||
pub suite: String,
|
||||
pub config_snapshot_json: String,
|
||||
pub aggregate_json: String,
|
||||
pub commit_hash: Option<String>,
|
||||
pub created_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
/// Owned per-query row read from `eval_query_results`. The
|
||||
/// `result_json` is the same `serde_json::to_string(&QueryResult)`
|
||||
/// payload [`SqliteStore::record_eval_query_result`] wrote.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct EvalQueryResultRecord {
|
||||
pub query_id: String,
|
||||
pub result_json: String,
|
||||
}
|
||||
|
||||
impl SqliteStore {
|
||||
/// Return `true` iff a row with `doc_id = ?` exists in
|
||||
/// `documents`. Lightweight existence probe used by
|
||||
@@ -158,4 +181,98 @@ impl SqliteStore {
|
||||
tx.commit().map_err(StoreError::from)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load a single `eval_runs` row by `run_id`. Returns `None` if no
|
||||
/// row matches. Used by P5-2's `compute_aggregate` / `compare_runs`
|
||||
/// to fetch the run-level metadata (config snapshot, aggregate,
|
||||
/// commit, created_at).
|
||||
pub fn load_eval_run(&self, run_id: &str) -> Result<Option<EvalRunRecord>> {
|
||||
let conn = self.lock_conn();
|
||||
let row = conn.query_row(
|
||||
"SELECT run_id, suite, config_snapshot_json, aggregate_json,
|
||||
commit_hash, created_at
|
||||
FROM eval_runs WHERE run_id = ?",
|
||||
params![run_id],
|
||||
|r| {
|
||||
Ok((
|
||||
r.get::<_, String>(0)?,
|
||||
r.get::<_, String>(1)?,
|
||||
r.get::<_, String>(2)?,
|
||||
r.get::<_, String>(3)?,
|
||||
r.get::<_, Option<String>>(4)?,
|
||||
r.get::<_, String>(5)?,
|
||||
))
|
||||
},
|
||||
);
|
||||
let (run_id, suite, snapshot, aggregate, commit, created_str) = match row {
|
||||
Ok(t) => t,
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None),
|
||||
Err(e) => return Err(StoreError::from(e).into()),
|
||||
};
|
||||
let created_at = OffsetDateTime::parse(
|
||||
&created_str,
|
||||
&time::format_description::well_known::Rfc3339,
|
||||
)
|
||||
.with_context(|| format!("parse eval_runs.created_at for {run_id}"))?;
|
||||
Ok(Some(EvalRunRecord {
|
||||
run_id,
|
||||
suite,
|
||||
config_snapshot_json: snapshot,
|
||||
aggregate_json: aggregate,
|
||||
commit_hash: commit,
|
||||
created_at,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Load every `eval_query_results` row for one `run_id`, ordered by
|
||||
/// `query_id` ASC for determinism (the table has no insertion-order
|
||||
/// column; query_id ordering matches the BTreeSet sort the loader
|
||||
/// uses for missing-id reporting).
|
||||
pub fn load_eval_query_results(
|
||||
&self,
|
||||
run_id: &str,
|
||||
) -> Result<Vec<EvalQueryResultRecord>> {
|
||||
let conn = self.lock_conn();
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT query_id, result_json FROM eval_query_results
|
||||
WHERE run_id = ? ORDER BY query_id ASC",
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
let iter = stmt
|
||||
.query_map(params![run_id], |r| {
|
||||
Ok(EvalQueryResultRecord {
|
||||
query_id: r.get::<_, String>(0)?,
|
||||
result_json: r.get::<_, String>(1)?,
|
||||
})
|
||||
})
|
||||
.map_err(StoreError::from)?;
|
||||
let mut out = Vec::new();
|
||||
for row in iter {
|
||||
out.push(row.map_err(StoreError::from)?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Replace `eval_runs.aggregate_json` for one `run_id`. Returns
|
||||
/// `Err` (not `Ok(0)`) if the run is missing — we never want to
|
||||
/// silently drop computed metrics. Called once per run by
|
||||
/// P5-2's `store_aggregate`.
|
||||
pub fn update_eval_run_aggregate(
|
||||
&self,
|
||||
run_id: &str,
|
||||
aggregate_json: &str,
|
||||
) -> Result<()> {
|
||||
let conn = self.lock_conn();
|
||||
let updated = conn
|
||||
.execute(
|
||||
"UPDATE eval_runs SET aggregate_json = ? WHERE run_id = ?",
|
||||
params![aggregate_json, run_id],
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
if updated == 0 {
|
||||
anyhow::bail!("update_eval_run_aggregate: no row for run_id {run_id}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ mod store;
|
||||
|
||||
pub use embeddings::EmbeddingRecordRow;
|
||||
pub use error::StoreError;
|
||||
pub use eval::EvalRunRow;
|
||||
pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow};
|
||||
pub use fts::rebuild_chunks_fts;
|
||||
pub use jobs::IngestRunRow;
|
||||
pub use store::SqliteStore;
|
||||
|
||||
@@ -3,7 +3,7 @@ phase: P5
|
||||
component: kb-eval (metrics + compare)
|
||||
task_id: p5-2
|
||||
title: "Metrics computation + compare report"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p5-1]
|
||||
unblocks: []
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md
|
||||
@@ -150,3 +150,62 @@ All tests under `cargo test -p kb-eval metrics`.
|
||||
- Floating-point sums in MRR cause minor cross-platform drift; round to 4 decimals on storage to keep snapshots stable.
|
||||
- "Should refuse" queries are encoded as `expected_doc_ids: []`. Document this convention in the golden YAML header comment.
|
||||
- Chunker version drift across runs is the COMMON case, not the error case (you almost always re-chunk before evaluating a chunker change). Default behavior is graceful fallback (doc + span overlap); only `--strict-chunker-version` refuses. The `chunker_version_match` field in `CompareReport.deltas` makes the mode auditable, so silent miscompares are still impossible.
|
||||
|
||||
## Implementation deviations (intentional)
|
||||
|
||||
Recorded so reviewers don't trip on them; the runtime behavior is the
|
||||
same one this spec defines, the names / wiring just differ.
|
||||
|
||||
- **Graceful fallback is doc-id-only, not doc + 50% span overlap.** The
|
||||
`chunker_version_match` audit field is `"fallback_doc"` (not
|
||||
`"fallback_doc_span"`). Span-overlap requires reading both runs'
|
||||
`chunks.source_spans` simultaneously — but a chunker-version change
|
||||
in practice re-indexes (overwrites) the chunks table, so by the time
|
||||
you compute run B the run A chunk rows are already gone. Doc-id
|
||||
matching is the strongest stable criterion under that workflow.
|
||||
Span-overlap moves to a future phase that owns chunker-version
|
||||
archival.
|
||||
- **Helper signatures.** `compute_aggregate_with_config(cfg, run_id)` /
|
||||
`store_aggregate_with_config(cfg, run_id, agg)` /
|
||||
`compare_runs_with_config(cfg, a, b, opts)` exist alongside the
|
||||
spec-pinned `compute_aggregate(run_id)` / `store_aggregate(run_id, agg)`
|
||||
/ `compare_runs(a, b)` so integration tests can drive the pipeline
|
||||
against a TempDir-backed `Config`. The no-arg forms wrap them with
|
||||
`Config::load(None)`.
|
||||
- **CLI surface lives on `kb-cli` directly, not via `kb-app`.** DoD
|
||||
asks for `kb eval compare` to be reached "via kb-app", but `kb-app`
|
||||
already depends on `kb-eval` (the P5-1 runner uses the App facade),
|
||||
so routing the CLI through `kb-app` would form a cycle. `kb-cli` →
|
||||
`kb-eval` is wired directly; `kb-app` is unchanged.
|
||||
- **`AggregateMetrics` is `Serialize + Deserialize`.** The spec defines
|
||||
only the field shape; we add `Deserialize` so the stored
|
||||
`aggregate_json` can round-trip back into the type for follow-up
|
||||
computations.
|
||||
- **`anyhow`** is used in `Result` returns since the rest of the
|
||||
workspace already speaks anyhow; not in the spec's Allowed list but
|
||||
matches every other crate.
|
||||
- **`kb-eval` crate-level `kb-app` dep stays.** The crate already
|
||||
depends on `kb-app` from P5-1 (the runner uses the `App` facade), so
|
||||
the Cargo.toml entry remains. The new modules (`metrics.rs`,
|
||||
`compare.rs`) do not import `kb-app` themselves — they're behind the
|
||||
same crate boundary as the runner, but the metric/compare *surface*
|
||||
is `kb-app`-clean. Splitting the crate to avoid a transitive Cargo
|
||||
edge would be churn for no behavior gain.
|
||||
- **`citation_coverage` is intentionally weaker than the spec literal.**
|
||||
Spec calls for "every citation resolves to a real chunk in the DB".
|
||||
Current implementation: an Answer counts as fully covered iff it has
|
||||
≥1 citation AND every citation's path is non-empty. Tightening to a
|
||||
per-citation `document_exists_by_path` SqliteStore probe is the next
|
||||
step once that helper lands. Empty-citations no longer pass through
|
||||
`Iterator::all`'s vacuous-true.
|
||||
- **`refusal_correctness` is undefined for non-RAG runs.** The metric
|
||||
judges whether the system *refused*; without an `Answer` (lexical-
|
||||
only or vector-only run), there's nothing to judge. We exclude such
|
||||
queries from the denominator rather than auto-failing them, so a
|
||||
search-only run reports `refusal_correctness` as `null` instead of a
|
||||
misleading 0.0.
|
||||
- **`groundedness` skips queries with no `must_contain`/`forbidden`.**
|
||||
An unconfigured golden entry would otherwise score a free 1.0 (or
|
||||
0.0 if the answer happens to contain a forbidden string from a
|
||||
later spec change). Refusal-class queries are also excluded — their
|
||||
groundedness flows through `refusal_correctness`.
|
||||
|
||||
Reference in New Issue
Block a user