From bb0ec0469f31f2f51df10af0024185d86179ca08 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:26:21 +0900
Subject: [PATCH] feat(eval): precision_at_k_chunk metric (P@5, P@10) (fb-39)
---
crates/kebab-eval/src/compare.rs | 1 +
crates/kebab-eval/src/metrics.rs | 133 ++++++++++++++++++
.../tests/fixtures/eval/compare-1.json | 12 ++
.../kebab-eval/tests/metrics_and_compare.rs | 1 +
4 files changed, 147 insertions(+)
diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs
index 24ed840..4ba9bd4 100644
--- a/crates/kebab-eval/src/compare.rs
+++ b/crates/kebab-eval/src/compare.rs
@@ -484,6 +484,7 @@ mod tests {
hit_at_k: Default::default(),
mrr: 0.5,
recall_at_k_doc: Default::default(),
+ precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,
diff --git a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs
index dd1bf7d..f138845 100644
--- a/crates/kebab-eval/src/metrics.rs
+++ b/crates/kebab-eval/src/metrics.rs
@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
pub hit_at_k: BTreeMap<u32, f32>,
pub mrr: f32,
pub recall_at_k_doc: BTreeMap<u32, f32>,
+ /// p9-fb-39: chunk-level precision@k, keyed by k. Relevance is binary:
+ /// a hit is relevant iff its chunk_id is in the golden's
+ /// `expected_chunk_ids`. The denominator is always k, so a query with
+ /// `hits.len() < k` takes the shortfall as precision loss (mirrors
+ /// `hit_at_k`). Queries with empty `expected_chunk_ids` are skipped
+ /// (mirrors `hit_at_k_chunk`). Pre-fb-39 JSON lacks this key → empty map.
+ #[serde(default)]
+ pub precision_at_k_chunk: BTreeMap<u32, f32>,
#[serde(
serialize_with = "serialize_f32_nan_as_null",
deserialize_with = "deserialize_f32_or_nan"
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
+ let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
+ TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
let mut mrr_sum: f64 = 0.0;
let mut mrr_denom: u32 = 0;
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
{
mrr_sum += 1.0 / f64::from(rank);
}
+ // p9-fb-39: precision@k_chunk — count of top-k hits whose
+ // chunk_id is in `expected`, divided by k (fixed denominator).
+ for k in TOP_K_VARIANTS {
+ let hits_in_topk_relevant = qr
+ .hits_top_k
+ .iter()
+ .filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
+ .count();
+ let entry = precision_at_k_chunk.get_mut(k).expect("init");
+ entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
+ entry.1 += 1;
+ }
}
// recall@k_doc (doc-level, requires non-empty expected_doc_ids
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
mrr_sum / f64::from(mrr_denom)
}),
recall_at_k_doc: round_recall_map(&recall_at_k_doc),
+ precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
citation_coverage: ratio_or_nan(citation_num, citation_denom),
groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
@@ -674,4 +697,114 @@ mod tests {
assert_eq!(agg.failed_queries, 1);
assert_eq!(agg.total_queries, 1);
}
+
+ #[test]
+ fn precision_at_k_chunk_field_default_empty_on_old_json() {
+ // Backwards-compat: eval_runs.metrics_json written before fb-39 has no
+ // `precision_at_k_chunk` key; `#[serde(default)]` must yield an empty map.
+ let old = serde_json::json!({
+ "hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
+ "mrr": 0.5,
+ "recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
+ "citation_coverage": null,
+ "groundedness": 0.0,
+ "empty_result_rate": 0.0,
+ "refusal_correctness": null,
+ "total_queries": 1,
+ "failed_queries": 0
+ });
+ let parsed: AggregateMetrics =
+ serde_json::from_value(old).expect("backwards-compat deserialize");
+ assert!(parsed.precision_at_k_chunk.is_empty());
+ }
+
+ #[test]
+ fn precision_at_k_chunk_exact_match() {
+ // All three expected chunks (c1..c3) rank 1-3; x@4 and y@5 are not relevant.
+ // Fixed denominator k: P@5 = 3/5 = 0.6, and P@10 = 3/10 = 0.3 with only 5 hits.
+ let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![
+ hit(1, "c1", "d1"),
+ hit(2, "c2", "d1"),
+ hit(3, "c3", "d1"),
+ hit(4, "x", "d1"),
+ hit(5, "y", "d1"),
+ ],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_partial_topk_divides_by_k() {
+ // Only 3 hits come back ([c1@1, c2@2, x@3]) and expected = [c1, c2].
+ // The denominator stays k, not hits.len(): P@5 = 2/5 = 0.4, P@10 = 2/10 = 0.2.
+ let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_zero_relevant_in_topk() {
+ // expected = [c1] but the hits are [x@1, y@2, z@3], so none is relevant.
+ // expected is non-empty, so the query is not skipped: P@5 = 0/5 = 0.0.
+ let queries = vec![gq("q1", &["c1"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_empty_expected_skipped() {
+ // Empty expected_chunk_ids: the query is skipped, so the k=5 denominator stays
+ // 0 and round_recall_map's zero-denom path reports 0.0, as for recall_at_k_doc.
+ let queries = vec![gq("q1", &[], &["d1"])];
+ let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_two_queries_averaged() {
+ // q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
+ // q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
+ // Aggregate is the mean over both counted queries: (0.2 + 0.4) / 2 = 0.3.
+ let queries = vec![
+ gq("q1", &["c1"], &["d1"]),
+ gq("q2", &["c1", "c2"], &["d2"]),
+ ];
+ let rows = vec![
+ record(
+ "q1",
+ vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
+ None,
+ None,
+ ),
+ record(
+ "q2",
+ vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
+ None,
+ None,
+ ),
+ ];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
+ }
}
diff --git a/crates/kebab-eval/tests/fixtures/eval/compare-1.json b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
index ee969ae..de408d2 100644
--- a/crates/kebab-eval/tests/fixtures/eval/compare-1.json
+++ b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
@@ -11,6 +11,12 @@
"5": 0.666700005531311
},
"mrr": 0.41670000553131104,
+ "precision_at_k_chunk": {
+ "1": 0.33329999446868896,
+ "10": 0.06669999659061432,
+ "3": 0.11110000312328339,
+ "5": 0.13330000638961792
+ },
"recall_at_k_doc": {
"1": 0.33329999446868896,
"10": 0.666700005531311,
@@ -32,6 +38,12 @@
"5": 1.0
},
"mrr": 0.833299994468689,
+ "precision_at_k_chunk": {
+ "1": 0.666700005531311,
+ "10": 0.10000000149011612,
+ "3": 0.33329999446868896,
+ "5": 0.20000000298023224
+ },
"recall_at_k_doc": {
"1": 0.666700005531311,
"10": 1.0,
diff --git a/crates/kebab-eval/tests/metrics_and_compare.rs b/crates/kebab-eval/tests/metrics_and_compare.rs
index 1e1b366..7cd7355 100644
--- a/crates/kebab-eval/tests/metrics_and_compare.rs
+++ b/crates/kebab-eval/tests/metrics_and_compare.rs
@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
hit_at_k: Default::default(),
mrr: 0.0,
recall_at_k_doc: Default::default(),
+ precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,