From bb0ec0469f31f2f51df10af0024185d86179ca08 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:26:21 +0900
Subject: [PATCH] feat(eval): precision_at_k_chunk metric (P@5, P@10) (fb-39)

---
Review notes (text in this section is ignored by `git am`):
- Line breaks and indentation were rebuilt from the @@ line counts. The
  whitespace of context lines is a best guess; if a hunk is rejected, retry
  with `git apply --ignore-whitespace`.
- The `BTreeMap<u32, ...>` type arguments were restored by inference from
  `f64::from(*k)` and the `(0.0_f64, 0_u32)` seeds. Confirm against the tree.
- precision_at_k_chunk_empty_expected_skipped now pairs the empty-golden
  query with a scored one, so the assertion fails if the skip is ever
  dropped (+3 lines; diffstat and the last metrics.rs hunk header updated,
  the `index` post-image hash of metrics.rs is therefore stale).

 crates/kebab-eval/src/compare.rs              |   1 +
 crates/kebab-eval/src/metrics.rs              | 136 ++++++++++++++++++
 .../tests/fixtures/eval/compare-1.json        |  12 ++
 .../kebab-eval/tests/metrics_and_compare.rs   |   1 +
 4 files changed, 150 insertions(+)

diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs
index 24ed840..4ba9bd4 100644
--- a/crates/kebab-eval/src/compare.rs
+++ b/crates/kebab-eval/src/compare.rs
@@ -484,6 +484,7 @@ mod tests {
             hit_at_k: Default::default(),
             mrr: 0.5,
             recall_at_k_doc: Default::default(),
+            precision_at_k_chunk: Default::default(),
             citation_coverage: f32::NAN,
             groundedness: 0.0,
             empty_result_rate: 0.0,
diff --git a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs
index dd1bf7d..f138845 100644
--- a/crates/kebab-eval/src/metrics.rs
+++ b/crates/kebab-eval/src/metrics.rs
@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
     pub hit_at_k: BTreeMap<u32, f32>,
     pub mrr: f32,
     pub recall_at_k_doc: BTreeMap<u32, f32>,
+    /// p9-fb-39: chunk-level precision at k. Binary relevance via
+    /// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
+    /// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
+    /// — `hits.len() < k` still divides by k, treating shortfall as
+    /// precision loss (mirrors `hit_at_k`). Queries with empty
+    /// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
+    #[serde(default)]
+    pub precision_at_k_chunk: BTreeMap<u32, f32>,
     #[serde(
         serialize_with = "serialize_f32_nan_as_null",
         deserialize_with = "deserialize_f32_or_nan"
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
         TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
     let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
         TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
+    let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
+        TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
 
     let mut mrr_sum: f64 = 0.0;
     let mut mrr_denom: u32 = 0;
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
             {
                 mrr_sum += 1.0 / f64::from(rank);
             }
+            // p9-fb-39: precision@k_chunk — count of top-k hits whose
+            // chunk_id is in `expected`, divided by k (fixed denominator).
+            for k in TOP_K_VARIANTS {
+                let hits_in_topk_relevant = qr
+                    .hits_top_k
+                    .iter()
+                    .filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
+                    .count();
+                let entry = precision_at_k_chunk.get_mut(k).expect("init");
+                entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
+                entry.1 += 1;
+            }
         }
 
         // recall@k_doc (doc-level, requires non-empty expected_doc_ids
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
             mrr_sum / f64::from(mrr_denom)
         }),
         recall_at_k_doc: round_recall_map(&recall_at_k_doc),
+        precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
         citation_coverage: ratio_or_nan(citation_num, citation_denom),
         groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
         empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
@@ -674,4 +697,117 @@ mod tests {
         assert_eq!(agg.failed_queries, 1);
         assert_eq!(agg.total_queries, 1);
     }
+
+    #[test]
+    fn precision_at_k_chunk_field_default_empty_on_old_json() {
+        // Old eval_runs.metrics_json predates fb-39 — no precision_at_k_chunk field.
+        // serde(default) yields empty BTreeMap.
+        let old = serde_json::json!({
+            "hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
+            "mrr": 0.5,
+            "recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
+            "citation_coverage": null,
+            "groundedness": 0.0,
+            "empty_result_rate": 0.0,
+            "refusal_correctness": null,
+            "total_queries": 1,
+            "failed_queries": 0
+        });
+        let parsed: AggregateMetrics =
+            serde_json::from_value(old).expect("backwards-compat deserialize");
+        assert!(parsed.precision_at_k_chunk.is_empty());
+    }
+
+    #[test]
+    fn precision_at_k_chunk_exact_match() {
+        // expected = [c1, c2, c3]. Top-5 hits: [c1@1, c2@2, c3@3, x@4, y@5].
+        // P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3.
+        let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![
+                hit(1, "c1", "d1"),
+                hit(2, "c2", "d1"),
+                hit(3, "c3", "d1"),
+                hit(4, "x", "d1"),
+                hit(5, "y", "d1"),
+            ],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
+        assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_partial_topk_divides_by_k() {
+        // expected = [c1, c2]. Hits: only [c1@1, c2@2, x@3] (3 results).
+        // P@5 = 2/5 = 0.4 (denominator is k, not hits.len()).
+        let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
+        assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_zero_relevant_in_topk() {
+        // expected = [c1]. Hits: [x@1, y@2, z@3] (none relevant).
+        // P@5 = 0/5 = 0.0.
+        let queries = vec![gq("q1", &["c1"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_empty_expected_skipped() {
+        // q1 has no expected_chunk_ids and must stay out of the denominator:
+        // P@5 is q2's 1/5 = 0.2 (it would drop to 0.1 if q1 were averaged in).
+        let queries = vec![gq("q1", &[], &["d1"]), gq("q2", &["c1"], &["d2"])];
+        let rows = vec![
+            record("q1", vec![hit(1, "c1", "d1")], None, None),
+            record("q2", vec![hit(1, "c1", "d2")], None, None),
+        ];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.2);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_two_queries_averaged() {
+        // q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
+        // q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
+        // Avg P@5 = 0.3.
+        let queries = vec![
+            gq("q1", &["c1"], &["d1"]),
+            gq("q2", &["c1", "c2"], &["d2"]),
+        ];
+        let rows = vec![
+            record(
+                "q1",
+                vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
+                None,
+                None,
+            ),
+            record(
+                "q2",
+                vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
+                None,
+                None,
+            ),
+        ];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
+    }
 }
diff --git a/crates/kebab-eval/tests/fixtures/eval/compare-1.json b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
index ee969ae..de408d2 100644
--- a/crates/kebab-eval/tests/fixtures/eval/compare-1.json
+++ b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
@@ -11,6 +11,12 @@
       "5": 0.666700005531311
     },
     "mrr": 0.41670000553131104,
+    "precision_at_k_chunk": {
+      "1": 0.33329999446868896,
+      "10": 0.06669999659061432,
+      "3": 0.11110000312328339,
+      "5": 0.13330000638961792
+    },
     "recall_at_k_doc": {
       "1": 0.33329999446868896,
       "10": 0.666700005531311,
@@ -32,6 +38,12 @@
       "5": 1.0
     },
     "mrr": 0.833299994468689,
+    "precision_at_k_chunk": {
+      "1": 0.666700005531311,
+      "10": 0.10000000149011612,
+      "3": 0.33329999446868896,
+      "5": 0.20000000298023224
+    },
     "recall_at_k_doc": {
       "1": 0.666700005531311,
       "10": 1.0,
diff --git a/crates/kebab-eval/tests/metrics_and_compare.rs b/crates/kebab-eval/tests/metrics_and_compare.rs
index 1e1b366..7cd7355 100644
--- a/crates/kebab-eval/tests/metrics_and_compare.rs
+++ b/crates/kebab-eval/tests/metrics_and_compare.rs
@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
         hit_at_k: Default::default(),
         mrr: 0.0,
         recall_at_k_doc: Default::default(),
+        precision_at_k_chunk: Default::default(),
         citation_coverage: f32::NAN,
         groundedness: 0.0,
         empty_result_rate: 0.0,