feat(eval): precision_at_k_chunk metric (P@5, P@10) (fb-39)

2026-05-10 22:26:21 +09:00
parent f303c76f52
commit bb0ec0469f
4 changed files with 147 additions and 0 deletions
--- a/crates/kebab-eval/src/compare.rs
+++ b/crates/kebab-eval/src/compare.rs
@@ -484,6 +484,7 @@ mod tests {
            hit_at_k: Default::default(),
            mrr: 0.5,
            recall_at_k_doc: Default::default(),
+            precision_at_k_chunk: Default::default(),
            citation_coverage: f32::NAN,
            groundedness: 0.0,
            empty_result_rate: 0.0,
--- a/crates/kebab-eval/src/metrics.rs
+++ b/crates/kebab-eval/src/metrics.rs
@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
    pub hit_at_k: BTreeMap<u32, f32>,
    pub mrr: f32,
    pub recall_at_k_doc: BTreeMap<u32, f32>,
+    /// p9-fb-39: chunk-level precision at k. Binary relevance via
+    /// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
+    /// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
+    /// — `hits.len() < k` still divides by k, treating shortfall as
+    /// precision loss (mirrors `hit_at_k`). Queries with empty
+    /// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
+    #[serde(default)]
+    pub precision_at_k_chunk: BTreeMap<u32, f32>,
    #[serde(
        serialize_with = "serialize_f32_nan_as_null",
        deserialize_with = "deserialize_f32_or_nan"
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
        TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
    let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
        TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
+    let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
+        TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();

    let mut mrr_sum: f64 = 0.0;
    let mut mrr_denom: u32 = 0;
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
            {
                mrr_sum += 1.0 / f64::from(rank);
            }
+            // p9-fb-39: precision@k_chunk — count of top-k hits whose
+            // chunk_id is in `expected`, divided by k (fixed denominator).
+            for k in TOP_K_VARIANTS {
+                let hits_in_topk_relevant = qr
+                    .hits_top_k
+                    .iter()
+                    .filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
+                    .count();
+                let entry = precision_at_k_chunk.get_mut(k).expect("init");
+                entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
+                entry.1 += 1;
+            }
        }

        // recall@k_doc (doc-level, requires non-empty expected_doc_ids
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
            mrr_sum / f64::from(mrr_denom)
        }),
        recall_at_k_doc: round_recall_map(&recall_at_k_doc),
+        precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
        citation_coverage: ratio_or_nan(citation_num, citation_denom),
        groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
        empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
@@ -674,4 +697,114 @@ mod tests {
        assert_eq!(agg.failed_queries, 1);
        assert_eq!(agg.total_queries, 1);
    }
+
+    #[test]
+    fn precision_at_k_chunk_field_default_empty_on_old_json() {
+        // Backward compat: an eval_runs.metrics_json written before fb-39 has no
+        // `precision_at_k_chunk` key, so `#[serde(default)]` must yield an empty BTreeMap.
+        let old = serde_json::json!({
+            "hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
+            "mrr": 0.5,
+            "recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
+            "citation_coverage": null,
+            "groundedness": 0.0,
+            "empty_result_rate": 0.0,
+            "refusal_correctness": null,
+            "total_queries": 1,
+            "failed_queries": 0
+        });
+        let parsed: AggregateMetrics =
+            serde_json::from_value(old).expect("backwards-compat deserialize");
+        assert!(parsed.precision_at_k_chunk.is_empty());
+    }
+
+    #[test]
+    fn precision_at_k_chunk_exact_match() {
+        // expected = [c1, c2, c3]. Hits: [c1@1, c2@2, c3@3, x@4, y@5] (5 results).
+        // P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3 (still divided by k = 10, not by 5).
+        let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![
+                hit(1, "c1", "d1"),
+                hit(2, "c2", "d1"),
+                hit(3, "c3", "d1"),
+                hit(4, "x", "d1"),
+                hit(5, "y", "d1"),
+            ],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
+        assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_partial_topk_divides_by_k() {
+        // expected = [c1, c2]. Hits: only [c1@1, c2@2, x@3] (3 results, fewer than k).
+        // P@5 = 2/5 = 0.4, P@10 = 2/10 = 0.2 (denominator is k, not hits.len()).
+        let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
+        assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_zero_relevant_in_topk() {
+        // expected = [c1]. Hits: [x@1, y@2, z@3] — no hit's chunk_id is in `expected`.
+        // P@5 = 0/5 = 0.0.
+        let queries = vec![gq("q1", &["c1"], &["d1"])];
+        let rows = vec![record(
+            "q1",
+            vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
+            None,
+            None,
+        )];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_empty_expected_skipped() {
+        // expected_chunk_ids = [] → query skipped; the k = 5 entry still exists and is
+        // 0.0 (zero-denom path in round_recall_map). Mirrors recall_at_k_doc behavior.
+        let queries = vec![gq("q1", &[], &["d1"])];
+        let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+    }
+
+    #[test]
+    fn precision_at_k_chunk_two_queries_averaged() {
+        // q1: expected=[c1], hits=[c1@1, x@2, y@3]   → P@5 = 1/5 = 0.2
+        // q2: expected=[c1, c2], hits=[c1@1, c2@2]  → P@5 = 2/5 = 0.4
+        // Mean of the per-query values: P@5 = (0.2 + 0.4) / 2 = 0.3.
+        let queries = vec![
+            gq("q1", &["c1"], &["d1"]),
+            gq("q2", &["c1", "c2"], &["d2"]),
+        ];
+        let rows = vec![
+            record(
+                "q1",
+                vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
+                None,
+                None,
+            ),
+            record(
+                "q2",
+                vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
+                None,
+                None,
+            ),
+        ];
+        let agg = aggregate_from_rows(&queries, &rows).unwrap();
+        assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
+    }
 }
--- a/crates/kebab-eval/tests/fixtures/eval/compare-1.json
+++ b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
@@ -11,6 +11,12 @@
      "5": 0.666700005531311
    },
    "mrr": 0.41670000553131104,
+    "precision_at_k_chunk": {
+      "1": 0.33329999446868896,
+      "10": 0.06669999659061432,
+      "3": 0.11110000312328339,
+      "5": 0.13330000638961792
+    },
    "recall_at_k_doc": {
      "1": 0.33329999446868896,
      "10": 0.666700005531311,
@@ -32,6 +38,12 @@
      "5": 1.0
    },
    "mrr": 0.833299994468689,
+    "precision_at_k_chunk": {
+      "1": 0.666700005531311,
+      "10": 0.10000000149011612,
+      "3": 0.33329999446868896,
+      "5": 0.20000000298023224
+    },
    "recall_at_k_doc": {
      "1": 0.666700005531311,
      "10": 1.0,
--- a/crates/kebab-eval/tests/metrics_and_compare.rs
+++ b/crates/kebab-eval/tests/metrics_and_compare.rs
@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
        hit_at_k: Default::default(),
        mrr: 0.0,
        recall_at_k_doc: Default::default(),
+        precision_at_k_chunk: Default::default(),
        citation_coverage: f32::NAN,
        groundedness: 0.0,
        empty_result_rate: 0.0,