Files
kebab/crates/kebab-eval/tests/metrics_and_compare.rs
th-kim0823 fa4eeb5a87 feat(p10-1a-1): add SearchHit.repo / code_lang + SearchFilters.repo / code_lang
Wire two new optional fields onto SearchHit (skip_serializing_if = None)
and two Vec<String> filter fields onto SearchFilters (serde default).
Add RetrievalDetail::Default impl (manual, uses SearchMode::Hybrid as
sentinel). Patch all downstream SearchHit / SearchFilters literal
constructors with repo: None / code_lang: None / vec![] as appropriate.
Also covers Citation::Code arm in kebab-eval metrics match.
2026-05-15 15:04:23 +09:00

445 lines
15 KiB
Rust

//! Integration tests for P5-2: write two synthetic eval runs into a
//! SQLite store, then drive `compute_aggregate` / `store_aggregate` /
//! `compare_runs` end-to-end. Mirrors the test plan in
//! `tasks/p5/p5-2-metrics-compare.md`.
//!
//! Snapshot of `CompareReport` JSON is pinned at
//! `tests/fixtures/eval/compare-1.json`.
use std::fs;
use std::path::PathBuf;
use kebab_config::Config;
use kebab_core::{
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, Lang,
RetrievalDetail, SearchHit, SearchMode,
asset::WorkspacePath,
};
use kebab_eval::{
AggregateMetrics, CompareOpts, CompareReport, ComparisonKind, GoldenQuery, QueryResult,
compare_runs_with_config, compute_aggregate_with_config, store_aggregate_with_config,
};
use kebab_store_sqlite::{EvalRunRow, SqliteStore};
use tempfile::TempDir;
use time::OffsetDateTime;
/// Build a default [`Config`] whose storage paths live under `tmp`, and
/// write `golden_yaml` to `<tmp>/golden.yaml`.
///
/// Side effect: sets the process-global `KEBAB_EVAL_GOLDEN` environment
/// variable to the path of the golden file that was just written, so the
/// metrics and compare code paths pick up the temp golden set. Callers in
/// this file hold [`env_guard`] for the duration of the test.
///
/// # Panics
///
/// Panics if the golden file cannot be written.
fn cfg_with_data_dir(tmp: &TempDir, golden_yaml: &str) -> Config {
    let root = tmp.path();
    let as_string = |p: &std::path::Path| p.to_string_lossy().into_owned();

    let mut cfg = Config::defaults();
    cfg.storage.data_dir = as_string(root);
    cfg.storage.runs_dir = as_string(&root.join("runs"));
    cfg.storage.copy_threshold_mb = 0;

    let golden_file = root.join("golden.yaml");
    fs::write(&golden_file, golden_yaml).unwrap();

    // SAFETY: `set_var` mutates process-global state. Every test in this
    // file that calls this helper takes `env_guard()` first, so no other
    // test thread here reads or writes `KEBAB_EVAL_GOLDEN` concurrently.
    unsafe {
        std::env::set_var("KEBAB_EVAL_GOLDEN", &golden_file);
    }

    cfg
}
/// Golden set shared by most tests in this file: three queries, each
/// expecting exactly one document id and one chunk id.
///
/// * `q-001` expects `doc-1` / `chunk-1`
/// * `q-002` expects `doc-2` / `chunk-2`
/// * `q-003` expects `doc-3` / `chunk-3`
///
/// The keys of each entry are indented two spaces under the `- id:` line so
/// that the document is a valid YAML sequence of mappings.
fn golden_yaml_basic() -> &'static str {
    r#"
- id: q-001
  query: hit at rank 1
  expected_doc_ids: ["doc-1"]
  expected_chunk_ids: ["chunk-1"]
- id: q-002
  query: hit at rank 4
  expected_doc_ids: ["doc-2"]
  expected_chunk_ids: ["chunk-2"]
- id: q-003
  query: miss everywhere
  expected_doc_ids: ["doc-3"]
  expected_chunk_ids: ["chunk-3"]
"#
}
/// Build a synthetic, lexical-only [`SearchHit`] at `rank` for the given
/// chunk and document ids.
///
/// The document path and the line citation both point at
/// `docs/<doc_id>.md`. `fusion_score` is `1 / rank`; a rank that does not
/// fit in `u16` falls back to a divisor of 1.
///
/// # Panics
///
/// Panics if `docs/<doc_id>.md` is rejected by [`WorkspacePath::new`].
fn hit(rank: u32, chunk_id: &str, doc_id: &str) -> SearchHit {
    let md_path = || WorkspacePath::new(format!("docs/{doc_id}.md")).unwrap();

    // Go through `u16` so the integer-to-float conversion is lossless.
    let divisor = u16::try_from(rank).map_or(1.0, f32::from);

    let retrieval = RetrievalDetail {
        method: SearchMode::Lexical,
        fusion_score: 1.0 / divisor,
        lexical_score: Some(1.0),
        vector_score: None,
        lexical_rank: Some(rank),
        vector_rank: None,
    };

    SearchHit {
        rank,
        chunk_id: ChunkId(chunk_id.into()),
        doc_id: DocumentId(doc_id.into()),
        doc_path: md_path(),
        heading_path: vec!["root".into()],
        section_label: None,
        snippet: "snip".into(),
        citation: Citation::Line {
            path: md_path(),
            start: 1,
            end: 1,
            section: None,
        },
        retrieval,
        index_version: IndexVersion("idx@1".into()),
        embedding_model: None,
        chunker_version: ChunkerVersion("test@1".into()),
        // fb-32: these fixtures do not exercise staleness, so the timestamp
        // is pinned to UNIX_EPOCH with `stale = false` to keep hits
        // deterministic.
        indexed_at: OffsetDateTime::UNIX_EPOCH,
        stale: false,
        score_kind: kebab_core::ScoreKind::Rrf,
        repo: None,
        code_lang: None,
    }
}
/// Wrap `hits` in a successful lexical-mode [`QueryResult`] for `query_id`.
///
/// The query text is the fixed string `query for <query_id>`; there is no
/// answer and no error, and the elapsed time is pinned to 1 ms.
fn qr(query_id: &str, hits: Vec<SearchHit>) -> QueryResult {
    let query = format!("query for {query_id}");
    QueryResult {
        query_id: String::from(query_id),
        query,
        mode: SearchMode::Lexical,
        hits_top_k: hits,
        answer: None,
        elapsed_ms: 1,
        error: None,
    }
}
/// Persist a synthetic eval run named `run_id` (suite `golden`) together
/// with its per-query results.
///
/// The config snapshot stored with the run is
/// `{"config": {}, "chunker_version": <chunker_version>}` and the aggregate
/// is the empty object `{}`. Each `QueryResult` is stored as its JSON
/// serialisation, keyed by its `query_id`.
///
/// # Panics
///
/// Panics if serialisation fails or the store rejects the write.
fn write_run(
    store: &SqliteStore,
    run_id: &str,
    chunker_version: &str,
    created_at: OffsetDateTime,
    queries: Vec<QueryResult>,
) {
    // One `(query_id, json)` pair per query, in input order.
    let mut results: Vec<(String, String)> = Vec::with_capacity(queries.len());
    for result in queries {
        let payload = serde_json::to_string(&result).unwrap();
        results.push((result.query_id, payload));
    }

    let snapshot_text = serde_json::to_string(&serde_json::json!({
        "config": {},
        "chunker_version": chunker_version,
    }))
    .unwrap();

    let row = EvalRunRow {
        run_id,
        suite: "golden",
        config_snapshot_json: &snapshot_text,
        aggregate_json: "{}",
        commit_hash: Some("0000000"),
        created_at,
    };

    store.record_eval_run_with_results(&row, &results).unwrap();
}
/// Serialise tests that depend on the process-global `KEBAB_EVAL_GOLDEN`
/// environment variable.
///
/// Each such test writes the variable and expects to read back its own
/// value, so it must hold the returned guard for its whole body; otherwise
/// two concurrently running tests could see each other's golden YAML.
///
/// A poisoned lock (an earlier test panicked while holding it) is recovered
/// rather than propagated, so one failing test does not cascade into the
/// others.
fn env_guard() -> std::sync::MutexGuard<'static, ()> {
    use std::sync::{Mutex, PoisonError};

    static ENV_LOCK: Mutex<()> = Mutex::new(());

    ENV_LOCK.lock().unwrap_or_else(PoisonError::into_inner)
}
/// Writes one synthetic run, computes its aggregate, stores it, and checks
/// that the stored `aggregate_json` deserialises back to the same metrics.
#[test]
fn compute_and_store_aggregate_round_trips() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());

    // q-001 finds its chunk at rank 1, q-002 at rank 4, q-003 never.
    let run_a_queries = vec![
        qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
        qr(
            "q-002",
            vec![
                hit(1, "x", "x"),
                hit(2, "x", "x"),
                hit(3, "x", "x"),
                hit(4, "chunk-2", "doc-2"),
            ],
        ),
        qr("q-003", vec![hit(1, "x", "x")]),
    ];

    // Scope the store so the handle is closed before the `*_with_config`
    // helpers are called.
    {
        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
        write_run(
            &store,
            "run_a",
            "test@1",
            OffsetDateTime::UNIX_EPOCH,
            run_a_queries,
        );
    }

    let agg = compute_aggregate_with_config(&cfg, "run_a").unwrap();
    // hit@1 = 1/3, hit@5 = 2/3, MRR = (1 + 0.25 + 0)/3 ≈ 0.4167.
    assert_eq!(agg.hit_at_k[&1], 0.3333);
    assert_eq!(agg.hit_at_k[&5], 0.6667);
    assert_eq!(agg.mrr, 0.4167);

    store_aggregate_with_config(&cfg, "run_a", &agg).unwrap();

    let reopened = SqliteStore::open(&cfg).unwrap();
    let stored = reopened.load_eval_run("run_a").unwrap().unwrap();
    let reloaded: AggregateMetrics = serde_json::from_str(&stored.aggregate_json).unwrap();

    // The aggregate may carry NaN fields (`citation_coverage`,
    // `refusal_correctness`), and NaN never compares equal to itself, so
    // the two values are compared through their serialised JSON text,
    // which is also the form `store_aggregate` persists.
    let to_json = |m: &AggregateMetrics| serde_json::to_string(m).unwrap();
    assert_eq!(to_json(&reloaded), to_json(&agg));
}
/// Storing an aggregate for a run id that was never recorded must fail,
/// and the error text must name the missing run.
#[test]
fn store_aggregate_rejects_missing_run() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());

    const MISSING_RUN: &str = "run_does_not_exist";

    // An empty placeholder aggregate; its contents are irrelevant because
    // the store call is expected to fail.
    let placeholder = AggregateMetrics {
        hit_at_k: Default::default(),
        mrr: 0.0,
        recall_at_k_doc: Default::default(),
        precision_at_k_chunk: Default::default(),
        citation_coverage: f32::NAN,
        groundedness: 0.0,
        empty_result_rate: 0.0,
        refusal_correctness: f32::NAN,
        total_queries: 0,
        failed_queries: 0,
    };

    let failure = store_aggregate_with_config(&cfg, MISSING_RUN, &placeholder).unwrap_err();
    // `{:#}` renders the full error chain on one line.
    let msg = format!("{failure:#}");
    assert!(msg.contains("run_does_not_exist"), "msg = {msg}");
}
/// Two runs under the same chunker version: checks the per-query
/// classification and that the chunker-version match is reported as
/// `exact`.
#[test]
fn compare_runs_classifies_win_loss_draw_regression() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
    let epoch = OffsetDateTime::UNIX_EPOCH;

    // Run A:
    //   q-001 rank 1 → hit
    //   q-002 rank 4 → hit
    //   q-003 miss
    let run_a = vec![
        qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
        qr(
            "q-002",
            vec![
                hit(1, "x", "x"),
                hit(2, "x", "x"),
                hit(3, "x", "x"),
                hit(4, "chunk-2", "doc-2"),
            ],
        ),
        qr("q-003", vec![hit(1, "x", "x")]),
    ];

    // Run B:
    //   q-001 rank 2 → still a hit, but worse than A (Loss)
    //   q-002 rank 1 → better rank than A (Win)
    //   q-003 rank 1 → was a miss in A (Win)
    let run_b = vec![
        qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]),
        qr("q-002", vec![hit(1, "chunk-2", "doc-2")]),
        qr("q-003", vec![hit(1, "chunk-3", "doc-3")]),
    ];

    {
        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
        write_run(&store, "run_a", "test@1", epoch, run_a);
        write_run(&store, "run_b", "test@1", epoch, run_b);
    }

    let report =
        compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();

    let mut by_id: std::collections::HashMap<&str, &kebab_eval::QueryComparison> =
        std::collections::HashMap::new();
    for row in &report.per_query {
        by_id.insert(row.query_id.as_str(), row);
    }

    let expected = [
        ("q-001", ComparisonKind::Loss),
        ("q-002", ComparisonKind::Win),
        ("q-003", ComparisonKind::Win),
    ];
    for (id, kind) in &expected {
        assert_eq!(by_id[*id].kind, *kind, "query {id}");
    }

    assert_eq!(report.deltas["chunker_version_match"], "exact");
}
/// With `strict_chunker_version` enabled, comparing runs recorded under
/// different chunker versions must fail with a mismatch error.
#[test]
fn compare_strict_mode_refuses_chunker_version_mismatch() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());

    {
        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
        // Identical results; only the recorded chunker version differs.
        for (run_id, chunker) in [("run_a", "test@1"), ("run_b", "test@2")] {
            write_run(
                &store,
                run_id,
                chunker,
                OffsetDateTime::UNIX_EPOCH,
                vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])],
            );
        }
    }

    let strict = CompareOpts {
        strict_chunker_version: true,
    };
    let failure = compare_runs_with_config(&cfg, "run_a", "run_b", &strict).unwrap_err();
    let msg = format!("{failure:#}");
    assert!(msg.contains("chunker_version mismatch"), "msg = {msg}");
}
/// In the default (non-strict) mode, a chunker-version mismatch is reported
/// as `fallback_doc` and hits are matched by document id.
#[test]
fn compare_graceful_falls_back_to_doc_id() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
    let epoch = OffsetDateTime::UNIX_EPOCH;

    {
        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();

        // Run A is recorded under chunker test@1, run B under test@2, so
        // chunk ids no longer line up between them, but doc ids still do.
        let run_a = vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])];
        // Same document, different chunk id: matching on chunk id alone
        // would miss, whereas the doc-id fallback should count it as a hit.
        let run_b = vec![qr("q-001", vec![hit(1, "chunk-1-renamed", "doc-1")])];

        write_run(&store, "run_a", "test@1", epoch, run_a);
        write_run(&store, "run_b", "test@2", epoch, run_b);
    }

    let report =
        compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
    assert_eq!(report.deltas["chunker_version_match"], "fallback_doc");

    let q1 = report
        .per_query
        .iter()
        .find(|row| row.query_id == "q-001")
        .unwrap();
    // Both runs reach doc-1 at rank 1, so the comparison is a draw.
    assert_eq!(q1.kind, ComparisonKind::Draw);
    assert_eq!(q1.a_hit_rank, Some(1));
    assert_eq!(q1.b_hit_rank, Some(1));
}
/// Pins the projected `CompareReport` JSON against
/// `tests/fixtures/eval/compare-1.json`.
///
/// Setting the `UPDATE_SNAPSHOTS` environment variable (to any value)
/// rewrites the fixture from the current output before the comparison. The
/// fixture's parent directory is created if needed, so the snapshot can be
/// bootstrapped when the directory does not exist yet.
#[test]
fn compare_report_snapshot_matches_fixture() {
    let _g = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());
    let store = SqliteStore::open(&cfg).unwrap();
    store.run_migrations().unwrap();
    let now = OffsetDateTime::UNIX_EPOCH;
    write_run(
        &store,
        "run_a",
        "test@1",
        now,
        vec![
            qr("q-001", vec![hit(1, "chunk-1", "doc-1")]),
            qr(
                "q-002",
                vec![
                    hit(1, "x", "x"),
                    hit(2, "x", "x"),
                    hit(3, "x", "x"),
                    hit(4, "chunk-2", "doc-2"),
                ],
            ),
            qr("q-003", vec![hit(1, "x", "x")]),
        ],
    );
    write_run(
        &store,
        "run_b",
        "test@1",
        now,
        vec![
            qr("q-001", vec![hit(1, "x", "x"), hit(2, "chunk-1", "doc-1")]),
            qr("q-002", vec![hit(1, "chunk-2", "doc-2")]),
            qr("q-003", vec![hit(1, "chunk-3", "doc-3")]),
        ],
    );
    drop(store);

    let report =
        compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
    let actual = projection(&report);

    let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("tests")
        .join("fixtures")
        .join("eval")
        .join("compare-1.json");

    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
        // Without this, the write below panics with "No such file or
        // directory" when the fixture directory has not been created yet.
        if let Some(dir) = fixture.parent() {
            fs::create_dir_all(dir).unwrap();
        }
        fs::write(
            &fixture,
            format!("{}\n", serde_json::to_string_pretty(&actual).unwrap()),
        )
        .unwrap();
    }

    let expected_text = fs::read_to_string(&fixture)
        .unwrap_or_else(|e| panic!("missing fixture {}: {e}", fixture.display()));
    let expected: serde_json::Value = serde_json::from_str(&expected_text).unwrap();
    assert_eq!(
        actual, expected,
        "compare report drift — re-run with UPDATE_SNAPSHOTS=1 if intended"
    );
}
/// Project a `CompareReport` to the stable-across-runs subset.
/// `aggregate_*` and `deltas` are deterministic; per-query rows keep
/// only `(query_id, kind, ranks, note)` and discard volatile fields.
fn projection(r: &CompareReport) -> serde_json::Value {
serde_json::json!({
"run_a": r.run_a,
"run_b": r.run_b,
"aggregate_a": r.aggregate_a,
"aggregate_b": r.aggregate_b,
"deltas": r.deltas,
"per_query": r.per_query,
})
}
/// Smoke test for the Markdown renderer: the output starts with the
/// expected heading and mentions the key metrics and the query id.
#[test]
fn render_report_md_is_human_readable() {
    let _env = env_guard();
    let tmp = TempDir::new().unwrap();
    let cfg = cfg_with_data_dir(&tmp, golden_yaml_basic());

    {
        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
        // Same chunk for q-001 in both runs: rank 1 in run A, rank 2 in run B.
        for (run_id, rank) in [("run_a", 1), ("run_b", 2)] {
            write_run(
                &store,
                run_id,
                "test@1",
                OffsetDateTime::UNIX_EPOCH,
                vec![qr("q-001", vec![hit(rank, "chunk-1", "doc-1")])],
            );
        }
    }

    let report =
        compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap();
    let md = kebab_eval::render_report_md(&report);

    assert!(md.starts_with("# Eval compare:"), "md = {md}");
    for needle in ["hit@1", "MRR", "Wins", "q-001"] {
        assert!(md.contains(needle), "missing {needle:?} in rendered report");
    }
}
/// A golden entry that omits `lang` must still parse, with `lang`
/// deserialising to the empty `Lang`.
///
/// The YAML is parsed directly from the string, so this test touches
/// neither the filesystem nor `KEBAB_EVAL_GOLDEN` and does not need the
/// env lock.
#[test]
fn lang_default_is_used_when_omitted_in_yaml() {
    // `query` is indented two spaces so it belongs to the same mapping as
    // `id`; a one-space indent is not valid YAML here.
    let yaml = "- id: only\n  query: q\n";

    let qs: Vec<GoldenQuery> = serde_yaml::from_str(yaml).unwrap();

    assert_eq!(qs.len(), 1);
    assert_eq!(qs[0].lang, Lang(String::new()));
}