From cd5b1e3bfc9fe551c48a4d0bc7f29021709269e4 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:05:09 +0900
Subject: [PATCH 1/5] spec(fb-39): eval foundation design (P@k metric)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- AggregateMetrics 에 precision_at_k_chunk: BTreeMap
(P@5, P@10) 추가, binary relevance via expected_chunk_ids
- Denominator = k 고정 (hits.len() < k 도 precision 손실 간주)
- Empty expected_chunk_ids query 는 skip (hit_at_k 동일 정책)
- Lever 적용 (chunk policy / RRF / cross-encoder / embedding) 은
본 spec 범위 외 — fb-39b 이후 별도 task
- Golden set schema 무변경, shipped fixtures 헤더 주석만 강화
Co-Authored-By: Claude Opus 4.7 (1M context)
---
...6-05-10-p9-fb-39-eval-foundation-design.md | 133 ++++++++++++++++++
1 file changed, 133 insertions(+)
create mode 100644 docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md
diff --git a/docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md b/docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md
new file mode 100644
index 0000000..d9a3730
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md
@@ -0,0 +1,133 @@
+---
+title: "p9-fb-39 — Eval foundation design (P@k metric)"
+phase: P9
+component: kebab-eval + docs
+task_id: p9-fb-39
+status: design
+target_version: 0.7.0
+contract_source: ./2026-04-27-kebab-final-form-design.md
+contract_sections: [§3 chunking, §4 search, §7 RAG, §11 eval]
+date: 2026-05-10
+---
+
+# p9-fb-39 — Eval foundation (P@k metric)
+
+## Goal
+
+도그푸딩 피드백 — agent / 사용자가 "rank 5+ 부터 노이즈 섞임" 지적 (precision-at-k 저하). lever (chunk policy / RRF / score_gate / cross-encoder / embedding) 선택 전, **measurement infrastructure 먼저** 정비. 본 PR scope:
+
+- `AggregateMetrics` 에 `precision_at_k_chunk: BTreeMap<u32, f32>` 추가 (P@5, P@10).
+- chunk-level binary relevance 기반 — `expected_chunk_ids` 안 chunk 가 top-k 안 등장한 비율.
+- Golden set schema 무변경 — `expected_chunk_ids` 가 ground truth (curator 책임).
+- 문서화 강화 — `fixtures/golden_queries.yaml` 헤더 주석.
+
+Lever 적용 (chunk policy / RRF tune / cross-encoder / embedding upgrade) 은 **본 spec 범위 외** — fb-39b 이후 별도 task 로 분리. 측정 도구가 먼저 있어야 lever 효과 비교 가능.
+
+## Behavior contract
+
+### Metric definition
+
+```
+P@k_chunk(query) = |top-k hits ∩ expected_chunk_ids| / k
+```
+
+**Denominator = k 고정**. `hits.len() < k` 인 경우에도 분모는 k — top-k 부족도 precision 손실로 간주 (`hit_at_k` 와 동일 컨벤션).
+
+`expected_chunk_ids` 빈 query 는 metric 계산에서 skip (`hit_at_k_chunk` 와 동일 정책).
+
+**Aggregation**: 모든 valid query (expected_chunk_ids 비어있지 않음) 의 P@k_chunk 평균. valid query 0 건이면 분모가 0 이므로 `round_recall_map` 의 zero-denom 경로에 따라 0.0 으로 기록 (`recall_at_k_doc` 와 동일 — NaN / JSON null 아님).
+
+### Wire shape
+
+`AggregateMetrics` 신규 field:
+
+```rust
+pub struct AggregateMetrics {
+ pub hit_at_k: BTreeMap<u32, f32>,
+ pub mrr: f32,
+ pub recall_at_k_doc: BTreeMap<u32, f32>,
+ /// p9-fb-39: chunk-level precision at k. Binary relevance via
+ /// `expected_chunk_ids`. Denominator = k (fixed). Skip queries
+ /// with empty `expected_chunk_ids`.
+ #[serde(default)]
+ pub precision_at_k_chunk: BTreeMap<u32, f32>,
+ // ... 기존 필드 ...
+}
+```
+
+`#[serde(default)]` — 기존 eval_runs.metrics_json (옛 binary 가 기록한) 에 field 부재 시 empty BTreeMap 로 deserialize. backwards-compat 보장.
+
+### k values
+
+`compute_aggregate_metrics` 가 `TOP_K_VARIANTS` (1, 3, 5, 10 — 기존 `hit_at_k` / `recall_at_k_doc` 와 동일한 k 집합 재사용) 전체에 대해 계산. 이 중 P@5, P@10 이 본 task 의 주 관심 지표.
+
+## Allowed / forbidden dependencies
+
+- `kebab-eval`: 신규 dep 없음. metrics 모듈 확장만.
+- 다른 crate 무수정.
+
+`kebab-eval` 의 `metrics` / `compare` 모듈은 retrieval / embedding / LLM crate 직접 import 금지 룰 그대로 (P5 inheritance).
+
+## Public surface delta
+
+### kebab-eval::metrics
+
+```rust
+pub struct AggregateMetrics {
+ // ... 기존 ...
+ #[serde(default)]
+ pub precision_at_k_chunk: BTreeMap<u32, f32>,
+}
+```
+
+`compute_aggregate_metrics` body 안에 새 누적 BTreeMap 과 평균 계산을 추가한다. `serialize_f32_nan_as_null` 은 scalar f32 field 용 `serialize_with` 이므로 map 에는 쓰지 않고, map 값은 `recall_at_k_doc` 와 동일하게 `round_recall_map` helper 로 최종화한다.
+
+## Test plan
+
+| kind | description |
+|------|-------------|
+| unit (metrics) | `precision_at_k_chunk` empty expected → query skip → 분모 0 → metric BTreeMap entry 값 0.0 (`recall_at_k_doc` 와 동일) |
+| unit (metrics) | exact match: 5 hits, top-3 in expected → P@5 = 3/5 = 0.6 |
+| unit (metrics) | partial top-k: hits.len() = 3 < k=5, all 3 in expected → P@5 = 3/5 = 0.6 (분모 k 고정) |
+| unit (metrics) | top-k 안 expected 0건 → P@5 = 0.0 |
+| unit (metrics) | 모든 query expected 비어있음 → 모든 k 의 P@k entry 값 0.0 (zero-denom 경로 — NaN / JSON null 아님) |
+| unit (metrics) | `AggregateMetrics` serde roundtrip — precision_at_k_chunk 신규 field 보존 |
+| unit (metrics) | 옛 JSON (precision_at_k_chunk 부재) deserialize → empty BTreeMap default |
+| 통합 (eval runner) | runner end-to-end → eval_runs.metrics_json 안 precision_at_k_chunk 채워짐 |
+
+snapshot tests (기존 metrics 출력 fixture 가 있다면 갱신 — `cargo test -p kebab-eval` 수행 후 fixture diff 확인).
+
+## Implementation steps (high-level)
+
+1. `kebab-eval::metrics`: `AggregateMetrics.precision_at_k_chunk` field 추가 + 계산 로직 + 단위 테스트.
+2. snapshot tests 갱신 (있다면).
+3. `fixtures/golden_queries.yaml` 헤더 주석 강화 — `expected_chunk_ids` 채우기 가이드.
+4. README `kebab eval` 섹션 또는 design §11 eval 에 P@k 정의 한 줄 추가.
+5. tasks/INDEX.md / spec status flip.
+
+3-5 step PR. 단일 세션 내 완료 가능.
+
+## Risks / notes
+
+- **분모 = k 고정 정책**: `hits.len() < k` 인 query 가 많으면 P@k 가 항상 < 1.0. 마찬가지로 `expected_chunk_ids` 개수가 k 미만인 query 는 P@k 상한이 |expected| / k (예: expected 1 개 → P@5 ≤ 0.2). 사용자 직관과 다를 수 있음 — README/design 에 명시.
+- **frozen design vs new metric**: design §11 eval 의 metric 표 갱신 필요. frozen contract 변경 트리거 — `target_version: 0.7.0` bump 명시.
+- **lever deferral**: 본 spec contract_sections 는 §3 chunking + §4 search + §7 RAG + §11 eval 인데, 실제 본 PR 은 §11 만 건드림. lever 적용 (chunk policy / RRF / cross-encoder / embedding) 은 fb-39b 이후 별도. spec status banner 에 명시.
+- **expected_chunk_ids 비어있는 shipped golden**: 현재 `fixtures/golden_queries.yaml` 의 g001-g005 모두 expected_chunk_ids 비어있음. P@k 계산 시 모두 skip — out-of-the-box 측정값 0건. curator 가 자기 KB 로 채워야 metric 의미 가짐. 의도 — golden set 은 workspace 의존이라 shipped fixtures 는 template, 실제 측정은 user 가 채워서 한다.
+- **fb-23 incremental ingest 와 충돌 없음**: 본 PR 은 metric 만 추가. chunker_version / embedding_version 무변경.
+
+## Out of scope
+
+- Lever 적용 (chunk policy retune / RRF k tune / score_gate default ON / cross-encoder PoC / embedding model 업그레이드).
+- NDCG / MAP / 기타 ranking metric.
+- precision_at_k_doc (doc-level — recall_at_k_doc 가 이미 있음, 본 spec 은 chunk-level 만).
+- Golden set 콘텐츠 확장 (g006+ 추가) — curator 책임.
+- Synthetic golden generator (`kebab eval golden-from-corpus` 등).
+- Per-query relevance score (binary 0/1 만 — graded relevance 는 NDCG 도입 시 검토).
+
+## Documentation updates (implementation PR 동시)
+
+- `fixtures/golden_queries.yaml` — 헤더 주석에 `expected_chunk_ids` ground truth 의미 + P@k 측정 위해 채우기 권장 안내.
+- `README.md` — `kebab eval` 섹션 (있다면) 에 P@k metric 한 줄. 없으면 skip.
+- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §11 eval — metric 표에 `precision_at_k_chunk` 한 줄 추가.
+- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` — `status: open → completed`, 단 banner 에 "eval foundation only, lever 적용 deferred to fb-39b" 명시 + design/plan 링크.
+- `tasks/INDEX.md` — fb-39 행 ✅ (eval foundation only).
--
2.49.1
From f303c76f52c1527720ff2cdf978f5aa687dc9e0b Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:19:44 +0900
Subject: [PATCH 2/5] plan(fb-39): eval foundation implementation plan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
4 tasks: AggregateMetrics.precision_at_k_chunk field + serde
backwards-compat, compute aggregation in loop with 5 unit tests,
golden YAML header doc strengthening, design §11 + INDEX + status
flip.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../2026-05-10-p9-fb-39-eval-foundation.md | 418 ++++++++++++++++++
1 file changed, 418 insertions(+)
create mode 100644 docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md
diff --git a/docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md b/docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md
new file mode 100644
index 0000000..a39a2ad
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md
@@ -0,0 +1,418 @@
+# fb-39 Eval Foundation (P@k Metric) Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add chunk-level `precision_at_k_chunk` metric (P@5, P@10) to kebab-eval `AggregateMetrics`, plus golden-set ground-truth documentation strengthening — so a future fb-39b can measure whether a lever (chunk policy / RRF / cross-encoder / embedding upgrade) actually moves the rank-5+ noise needle.
+
+**Architecture:** Single new field on `AggregateMetrics`, computed inside the existing `compute_aggregate_with_config` loop using the same accumulator pattern as `recall_at_k_doc` (sum-of-per-query-ratios / denominator), serialized via the existing `round_recall_map` helper. Denominator is k (fixed), matching the `hit_at_k` convention. Skip queries with empty `expected_chunk_ids`. Golden set schema unchanged — `expected_chunk_ids` is the ground truth (curator fills per-workspace).
+
+**Tech Stack:** Rust 2024, serde, serde_yaml. No new deps.
+
+**Spec:** `docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md`
+
+---
+
+## File map
+
+**Modify:**
+- `crates/kebab-eval/src/metrics.rs` — add `precision_at_k_chunk` field on `AggregateMetrics`, init/accumulate/finalize inside `compute_aggregate_with_config`, plus unit tests.
+- `fixtures/golden_queries.yaml` — strengthen header comment about `expected_chunk_ids` being P@k ground truth.
+- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` — add `precision_at_k_chunk` to §11 eval metric table.
+- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` — flip status, link design + plan, "lever 적용 deferred to fb-39b" banner.
+- `tasks/INDEX.md` — flip fb-39 row to ✅ (eval foundation only).
+
+**Create:** none.
+
+---
+
+## Task 1: Add precision_at_k_chunk field + serde backwards-compat
+
+**Files:**
+- Modify: `crates/kebab-eval/src/metrics.rs`
+
+- [ ] **Step 1: Append failing test to `mod tests`**
+
+```rust
+#[test]
+fn precision_at_k_chunk_field_default_empty_on_old_json() {
+ // Old eval_runs.metrics_json predates fb-39 — no precision_at_k_chunk field.
+ // serde(default) should yield empty BTreeMap.
+ let old = serde_json::json!({
+ "hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
+ "mrr": 0.5,
+ "recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
+ "citation_coverage": null,
+ "groundedness": 0.0,
+ "empty_result_rate": 0.0,
+ "refusal_correctness": null,
+ "total_queries": 1,
+ "failed_queries": 0
+ });
+ let parsed: AggregateMetrics = serde_json::from_value(old).expect("backwards-compat deserialize");
+ assert!(parsed.precision_at_k_chunk.is_empty());
+}
+
+#[test]
+fn precision_at_k_chunk_field_serializes_when_populated() {
+ let mut p = BTreeMap::new();
+ p.insert(5, 0.6_f32);
+ p.insert(10, 0.3_f32);
+ let agg = AggregateMetrics {
+ hit_at_k: BTreeMap::new(),
+ mrr: 0.0,
+ recall_at_k_doc: BTreeMap::new(),
+ precision_at_k_chunk: p,
+ citation_coverage: 0.0,
+ groundedness: 0.0,
+ empty_result_rate: 0.0,
+ refusal_correctness: 0.0,
+ total_queries: 0,
+ failed_queries: 0,
+ };
+ let v = serde_json::to_value(&agg).unwrap();
+ assert_eq!(v["precision_at_k_chunk"]["5"], 0.6_f32); // compare as f32: JSON stores the widened f64 (0.6000000238…), which != 0.6_f64
+ assert_eq!(v["precision_at_k_chunk"]["10"], 0.3_f32);
+}
+```
+
+- [ ] **Step 2: Run tests — expect compile errors (field undefined)**
+
+```bash
+cargo test -p kebab-eval --lib precision_at_k_chunk
+```
+Expected: errors — `precision_at_k_chunk` field missing on `AggregateMetrics`.
+
+- [ ] **Step 3: Add field to `AggregateMetrics`**
+
+In `crates/kebab-eval/src/metrics.rs`, find `pub struct AggregateMetrics { ... }` (~line 57). Add field after `recall_at_k_doc`:
+
+```rust
+ /// p9-fb-39: chunk-level precision at k. Binary relevance via
+ /// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
+ /// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
+ /// — `hits.len() < k` still divides by k, treating shortfall as
+ /// precision loss (mirrors `hit_at_k`). Queries with empty
+ /// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
+ #[serde(default)]
+ pub precision_at_k_chunk: BTreeMap<u32, f32>,
+```
+
+The other tests in the file (e.g. `hit_at_k_handles_ranks_1_4_miss`, `recall_at_k_doc_partial`) construct `AggregateMetrics` via the public `compute_aggregate_with_config` path, not via struct literal, so the new `#[serde(default)]` field does NOT break them. Only direct struct-literal constructions need updates — search the file to confirm:
+
+```bash
+grep -n "AggregateMetrics {" crates/kebab-eval/src/metrics.rs
+```
+
+For each direct struct-literal site, add `precision_at_k_chunk: BTreeMap::new(),` to the literal.
+
+- [ ] **Step 4: Run tests — expect both new tests pass**
+
+```bash
+cargo test -p kebab-eval --lib precision_at_k_chunk
+```
+Expected: both pass.
+
+- [ ] **Step 5: Run clippy**
+
+```bash
+cargo clippy -p kebab-eval --all-targets -- -D warnings
+```
+Expected: clean.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add crates/kebab-eval/src/metrics.rs
+git commit -m "feat(eval): AggregateMetrics.precision_at_k_chunk field (fb-39)"
+```
+
+---
+
+## Task 2: Compute precision_at_k_chunk in aggregate loop
+
+**Files:**
+- Modify: `crates/kebab-eval/src/metrics.rs`
+
+- [ ] **Step 1: Append failing tests to `mod tests`**
+
+(Use the existing `make_query_result` / fixture helpers — read the top of the test module for available helpers, e.g. `mk_qr_with_chunks(query_id, chunk_ids_with_ranks)`.)
+
+```rust
+#[test]
+fn precision_at_k_chunk_exact_match() {
+ // 1 query, expected = [c1, c2, c3]. Top-5 hits: [c1@1, c2@2, c3@3, x@4, y@5].
+ // P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3.
+ let queries = vec![mk_golden(
+ "g1",
+ &[], // expected_doc_ids
+ &["c1", "c2", "c3"], // expected_chunk_ids
+ &[], // must_contain
+ &[], // forbidden
+ None, // expected_refusal
+ )];
+ let rows = vec![mk_query_row(
+ "g1",
+ &[("c1", 1), ("c2", 2), ("c3", 3), ("x", 4), ("y", 5)],
+ )];
+ let agg = compute_from_inputs(&queries, &rows);
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
+}
+
+#[test]
+fn precision_at_k_chunk_partial_topk_divides_by_k() {
+ // 1 query, expected = [c1, c2]. Top hits: only [c1@1, c2@2, x@3] (3 results total).
+ // P@5 = 2/5 = 0.4 (denominator k, not hits.len).
+ let queries = vec![mk_golden("g1", &[], &["c1", "c2"], &[], &[], None)];
+ let rows = vec![mk_query_row("g1", &[("c1", 1), ("c2", 2), ("x", 3)])];
+ let agg = compute_from_inputs(&queries, &rows);
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
+}
+
+#[test]
+fn precision_at_k_chunk_zero_relevant_in_topk() {
+ // 1 query, expected = [c1]. Top hits all unrelated.
+ // P@5 = 0/5 = 0.0.
+ let queries = vec![mk_golden("g1", &[], &["c1"], &[], &[], None)];
+ let rows = vec![mk_query_row("g1", &[("x", 1), ("y", 2), ("z", 3)])];
+ let agg = compute_from_inputs(&queries, &rows);
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.0);
+}
+
+#[test]
+fn precision_at_k_chunk_empty_expected_skipped() {
+ // 1 query, expected_chunk_ids = []. Should be skipped — denom 0 → entry value 0.0
+ // (matches `recall_at_k_doc` behavior in `round_recall_map` for zero-denom).
+ let queries = vec![mk_golden("g1", &[], &[], &[], &[], None)];
+ let rows = vec![mk_query_row("g1", &[("c1", 1)])];
+ let agg = compute_from_inputs(&queries, &rows);
+ // Mirrors recall_at_k_doc: zero-denom → 0.0 in map (not absent).
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.0);
+}
+
+#[test]
+fn precision_at_k_chunk_two_queries_averaged() {
+ // q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
+ // q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
+ // Avg P@5 = (0.2 + 0.4) / 2 = 0.3.
+ let queries = vec![
+ mk_golden("g1", &[], &["c1"], &[], &[], None),
+ mk_golden("g2", &[], &["c1", "c2"], &[], &[], None),
+ ];
+ let rows = vec![
+ mk_query_row("g1", &[("c1", 1), ("x", 2), ("y", 3)]),
+ mk_query_row("g2", &[("c1", 1), ("c2", 2)]),
+ ];
+ let agg = compute_from_inputs(&queries, &rows);
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
+}
+```
+
+The `mk_golden` / `mk_query_row` / `compute_from_inputs` helpers are existing test helpers in this file. Read the top of `mod tests` (~line 380-510) to confirm the actual helper names and signatures. If your helpers have different shapes (e.g. `mk_qr_with_chunks(id, &[(chunk, rank)])`), adapt the test calls accordingly.
+
+If those helpers don't exist, look for the pattern in the existing `hit_at_k_handles_ranks_1_4_miss` test (~line 513) and mirror it.
+
+- [ ] **Step 2: Run tests — expect failures**
+
+```bash
+cargo test -p kebab-eval --lib precision_at_k_chunk
+```
+Expected: 5 failures — `precision_at_k_chunk` map empty (only `#[serde(default)]` populates it from JSON; the compute path doesn't yet).
+
+- [ ] **Step 3: Implement aggregation in `compute_aggregate_with_config`**
+
+In `crates/kebab-eval/src/metrics.rs`, find `compute_aggregate_with_config` body. After the `recall_at_k_doc` accumulator init (~line 188-189), add:
+
+```rust
+ let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
+ TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
+```
+
+Inside the loop, after the existing `hit@k + MRR` block (~line 222-247) which already gates on `!gq.expected_chunk_ids.is_empty()`, add a sibling `for k in TOP_K_VARIANTS { ... }` that updates `precision_at_k_chunk`. Place it INSIDE the same `if !gq.expected_chunk_ids.is_empty() { ... }` block so the skip-empty policy is shared:
+
+```rust
+ // hit@k + MRR (chunk-level, requires non-empty expected_chunk_ids)
+ if !gq.expected_chunk_ids.is_empty() {
+ let expected: HashSet<&ChunkId> = gq.expected_chunk_ids.iter().collect();
+ // ... existing hit@k + MRR computation ...
+
+ // p9-fb-39: precision@k_chunk — count of top-k hits whose
+ // chunk_id is in `expected`, divided by k (fixed denominator).
+ for k in TOP_K_VARIANTS {
+ let hits_in_topk_relevant = qr
+ .hits_top_k
+ .iter()
+ .filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
+ .count();
+ let entry = precision_at_k_chunk.get_mut(k).expect("init");
+ entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
+ entry.1 += 1;
+ }
+ }
+```
+
+Then at the final `Ok(AggregateMetrics { ... })` return (~line 325-345), add:
+
+```rust
+ precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
+```
+
+(`round_recall_map` is the existing helper at line ~366; it accepts `BTreeMap<u32, (f64, u32)>` (per-k sum and denominator) and divides sum by denom, returning `BTreeMap<u32, f32>`. Same shape used by `recall_at_k_doc`.)
+
+- [ ] **Step 4: Run tests — expect all 5 pass**
+
+```bash
+cargo test -p kebab-eval --lib precision_at_k_chunk
+```
+Expected: 5 passes.
+
+- [ ] **Step 5: Run full kebab-eval suite**
+
+```bash
+cargo test -p kebab-eval
+cargo clippy -p kebab-eval --all-targets -- -D warnings
+```
+Expected: no regressions; clippy clean.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add crates/kebab-eval/src/metrics.rs
+git commit -m "feat(eval): compute precision_at_k_chunk in aggregate loop (fb-39)"
+```
+
+---
+
+## Task 3: Strengthen golden YAML header documentation
+
+**Files:**
+- Modify: `fixtures/golden_queries.yaml`
+
+- [ ] **Step 1: Read existing header**
+
+```bash
+head -20 fixtures/golden_queries.yaml
+```
+
+- [ ] **Step 2: Replace header comment**
+
+Find the existing header (the comment block above the first `- id: g001` entry). Replace with:
+
+```yaml
+# Golden query suite for `kebab eval run` (P5-1 / P5-2 / fb-39).
+#
+# Top-level: list of queries. Required fields: `id`, `query`. All
+# others are optional and default to empty / null.
+#
+# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to
+# real rows in the active workspace's SQLite store at run time. Stale
+# references make the runner bail at start. The shipped template
+# leaves them empty so the file is loadable on any fresh workspace —
+# fill them in after a `kebab ingest` to enable the metrics that
+# require ground truth (P5-2 + fb-39):
+#
+# - `expected_chunk_ids` → hit_at_k, MRR, precision_at_k_chunk (fb-39)
+# - `expected_doc_ids` → recall_at_k_doc
+#
+# `precision_at_k_chunk` (fb-39): of the top-k retrieved hits, what
+# fraction's `chunk_id` is in `expected_chunk_ids`. Denominator is k
+# (fixed) — `top-k` shortfall is treated as precision loss. Queries
+# with empty `expected_chunk_ids` are skipped from this metric.
+#
+# `must_contain` / `forbidden` drive the rule-based groundedness
+# metric (P5-2).
+```
+
+- [ ] **Step 3: Verify YAML still parses**
+
+```bash
+cargo test -p kebab-eval --test golden_loader 2>/dev/null || cargo test -p kebab-eval load_golden
+```
+
+If a loader test exists, it should still pass. If not, run a quick parse check:
+
+```bash
+cargo run --bin kebab -- eval --help 2>/dev/null || true
+```
+
+(The shipped `golden_queries.yaml` is just a fixture — the workspace test loader will read it during integration tests and fail loudly if YAML is malformed.)
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add fixtures/golden_queries.yaml
+git commit -m "docs(eval): document expected_chunk_ids as P@k ground truth (fb-39)"
+```
+
+---
+
+## Task 4: Update design doc + spec status flip + INDEX
+
+**Files:**
+- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
+- Modify: `tasks/p9/p9-fb-39-retrieval-precision-tuning.md`
+- Modify: `tasks/INDEX.md`
+
+- [ ] **Step 1: Update design §11 eval metric list**
+
+```bash
+grep -n "^## §11\|^## 11\|hit_at_k\|recall_at_k_doc\|precision" docs/superpowers/specs/2026-04-27-kebab-final-form-design.md | head -10
+```
+
+Find the §11 eval section (or wherever metrics are listed). Add a `precision_at_k_chunk` line next to `hit_at_k` / `recall_at_k_doc`:
+
+```markdown
+- `precision_at_k_chunk` (fb-39): top-k 안 chunk_id 가 `expected_chunk_ids` 에 포함된 비율. 분모 = k (fixed). `expected_chunk_ids` 빈 query 는 skip.
+```
+
+If the design doc doesn't currently list metrics inline, add a short subsection or bullet under §11 introducing it.
+
+- [ ] **Step 2: Flip task spec status**
+
+```bash
+sed -i.bak 's/^status: open$/status: completed/' tasks/p9/p9-fb-39-retrieval-precision-tuning.md
+rm tasks/p9/p9-fb-39-retrieval-precision-tuning.md.bak
+```
+
+Replace the existing `> ⏳ **백로그 only — 미구현.**` skeleton banner with:
+
+```markdown
+> ✅ **Eval foundation 부분 구현 완료.** P@k metric (P@5, P@10) 추가. 본 spec 의 lever 적용 (chunk policy / RRF / cross-encoder / embedding 업그레이드) 은 별도 task 로 분리 (fb-39b 이후).
+>
+> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md)
+> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md)
+```
+
+- [ ] **Step 3: Flip INDEX row**
+
+In `tasks/INDEX.md`, find the fb-39 row. Replace its status with `✅ 머지 (2026-05-10) — eval foundation only, lever 적용 deferred` (mirror the fb-42 row format from the previous PR for consistency).
+
+- [ ] **Step 4: Workspace test + clippy gate**
+
+```bash
+cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -10
+cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -5
+```
+
+`-j 1` REQUIRED.
+
+Expected: all green.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add docs/superpowers/specs/2026-04-27-kebab-final-form-design.md tasks/p9/p9-fb-39-retrieval-precision-tuning.md tasks/INDEX.md
+git commit -m "docs(fb-39): design §11 + spec status + INDEX (eval foundation)"
+```
+
+---
+
+## Final verification checklist
+
+- [ ] `cargo test --workspace --no-fail-fast -j 1` green
+- [ ] `cargo clippy --workspace --all-targets -- -D warnings` clean
+- [ ] `kebab eval run` (against any workspace with non-empty `expected_chunk_ids` in golden) emits `precision_at_k_chunk: {1: ..., 3: ..., 5: ..., 10: ...}` (one entry per `TOP_K_VARIANTS`) in the run's `metrics_json`
+- [ ] design §11 + INDEX + task spec status flipped
--
2.49.1
From bb0ec0469f31f2f51df10af0024185d86179ca08 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:26:21 +0900
Subject: [PATCH 3/5] feat(eval): precision_at_k_chunk metric (P@5, P@10)
(fb-39)
---
crates/kebab-eval/src/compare.rs | 1 +
crates/kebab-eval/src/metrics.rs | 133 ++++++++++++++++++
.../tests/fixtures/eval/compare-1.json | 12 ++
.../kebab-eval/tests/metrics_and_compare.rs | 1 +
4 files changed, 147 insertions(+)
diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs
index 24ed840..4ba9bd4 100644
--- a/crates/kebab-eval/src/compare.rs
+++ b/crates/kebab-eval/src/compare.rs
@@ -484,6 +484,7 @@ mod tests {
hit_at_k: Default::default(),
mrr: 0.5,
recall_at_k_doc: Default::default(),
+ precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,
diff --git a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs
index dd1bf7d..f138845 100644
--- a/crates/kebab-eval/src/metrics.rs
+++ b/crates/kebab-eval/src/metrics.rs
@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
pub hit_at_k: BTreeMap<u32, f32>,
pub mrr: f32,
pub recall_at_k_doc: BTreeMap<u32, f32>,
+ /// p9-fb-39: chunk-level precision at k. Binary relevance via
+ /// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
+ /// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
+ /// — `hits.len() < k` still divides by k, treating shortfall as
+ /// precision loss (mirrors `hit_at_k`). Queries with empty
+ /// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
+ #[serde(default)]
+ pub precision_at_k_chunk: BTreeMap<u32, f32>,
#[serde(
serialize_with = "serialize_f32_nan_as_null",
deserialize_with = "deserialize_f32_or_nan"
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
+ let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
+ TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
let mut mrr_sum: f64 = 0.0;
let mut mrr_denom: u32 = 0;
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
{
mrr_sum += 1.0 / f64::from(rank);
}
+ // p9-fb-39: precision@k_chunk — count of top-k hits whose
+ // chunk_id is in `expected`, divided by k (fixed denominator).
+ for k in TOP_K_VARIANTS {
+ let hits_in_topk_relevant = qr
+ .hits_top_k
+ .iter()
+ .filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
+ .count();
+ let entry = precision_at_k_chunk.get_mut(k).expect("init");
+ entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
+ entry.1 += 1;
+ }
}
// recall@k_doc (doc-level, requires non-empty expected_doc_ids
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
mrr_sum / f64::from(mrr_denom)
}),
recall_at_k_doc: round_recall_map(&recall_at_k_doc),
+ precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
citation_coverage: ratio_or_nan(citation_num, citation_denom),
groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
@@ -674,4 +697,114 @@ mod tests {
assert_eq!(agg.failed_queries, 1);
assert_eq!(agg.total_queries, 1);
}
+
+ #[test]
+ fn precision_at_k_chunk_field_default_empty_on_old_json() {
+ // Old eval_runs.metrics_json predates fb-39 — no precision_at_k_chunk field.
+ // serde(default) yields empty BTreeMap.
+ let old = serde_json::json!({
+ "hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
+ "mrr": 0.5,
+ "recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
+ "citation_coverage": null,
+ "groundedness": 0.0,
+ "empty_result_rate": 0.0,
+ "refusal_correctness": null,
+ "total_queries": 1,
+ "failed_queries": 0
+ });
+ let parsed: AggregateMetrics =
+ serde_json::from_value(old).expect("backwards-compat deserialize");
+ assert!(parsed.precision_at_k_chunk.is_empty());
+ }
+
+ #[test]
+ fn precision_at_k_chunk_exact_match() {
+ // expected = [c1, c2, c3]. Top-5 hits: [c1@1, c2@2, c3@3, x@4, y@5].
+ // P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3.
+ let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![
+ hit(1, "c1", "d1"),
+ hit(2, "c2", "d1"),
+ hit(3, "c3", "d1"),
+ hit(4, "x", "d1"),
+ hit(5, "y", "d1"),
+ ],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_partial_topk_divides_by_k() {
+ // expected = [c1, c2]. Hits: only [c1@1, c2@2, x@3] (3 results).
+ // P@5 = 2/5 = 0.4 (denominator is k, not hits.len()).
+ let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
+ assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_zero_relevant_in_topk() {
+ // expected = [c1]. Hits: [x@1, y@2, z@3] (none relevant).
+ // P@5 = 0/5 = 0.0.
+ let queries = vec![gq("q1", &["c1"], &["d1"])];
+ let rows = vec![record(
+ "q1",
+ vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
+ None,
+ None,
+ )];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_empty_expected_skipped() {
+ // expected_chunk_ids = []. Skipped → final BTreeMap entry value = 0.0
+ // (zero-denom path in round_recall_map). Mirrors recall_at_k_doc behavior.
+ let queries = vec![gq("q1", &[], &["d1"])];
+ let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
+ }
+
+ #[test]
+ fn precision_at_k_chunk_two_queries_averaged() {
+ // q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
+ // q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
+ // Avg P@5 = 0.3.
+ let queries = vec![
+ gq("q1", &["c1"], &["d1"]),
+ gq("q2", &["c1", "c2"], &["d2"]),
+ ];
+ let rows = vec![
+ record(
+ "q1",
+ vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
+ None,
+ None,
+ ),
+ record(
+ "q2",
+ vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
+ None,
+ None,
+ ),
+ ];
+ let agg = aggregate_from_rows(&queries, &rows).unwrap();
+ assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
+ }
}
diff --git a/crates/kebab-eval/tests/fixtures/eval/compare-1.json b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
index ee969ae..de408d2 100644
--- a/crates/kebab-eval/tests/fixtures/eval/compare-1.json
+++ b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
@@ -11,6 +11,12 @@
"5": 0.666700005531311
},
"mrr": 0.41670000553131104,
+ "precision_at_k_chunk": {
+ "1": 0.33329999446868896,
+ "10": 0.06669999659061432,
+ "3": 0.11110000312328339,
+ "5": 0.13330000638961792
+ },
"recall_at_k_doc": {
"1": 0.33329999446868896,
"10": 0.666700005531311,
@@ -32,6 +38,12 @@
"5": 1.0
},
"mrr": 0.833299994468689,
+ "precision_at_k_chunk": {
+ "1": 0.666700005531311,
+ "10": 0.10000000149011612,
+ "3": 0.33329999446868896,
+ "5": 0.20000000298023224
+ },
"recall_at_k_doc": {
"1": 0.666700005531311,
"10": 1.0,
diff --git a/crates/kebab-eval/tests/metrics_and_compare.rs b/crates/kebab-eval/tests/metrics_and_compare.rs
index 1e1b366..7cd7355 100644
--- a/crates/kebab-eval/tests/metrics_and_compare.rs
+++ b/crates/kebab-eval/tests/metrics_and_compare.rs
@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
hit_at_k: Default::default(),
mrr: 0.0,
recall_at_k_doc: Default::default(),
+ precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,
--
2.49.1
From f00fb376fe8bb9a349160c2d1e1baa876a35aa99 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:35:15 +0900
Subject: [PATCH 4/5] =?UTF-8?q?docs(fb-39):=20golden=20header=20+=20design?=
=?UTF-8?q?=20=C2=A710.3=20eval=20+=20spec=20status=20+=20INDEX?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Strengthen fixtures/golden_queries.yaml header with precision_at_k_chunk
explanation + measurement guidance. Add §10.3 Eval metrics section to
frozen design documenting retrieval metrics (hit@k, MRR, recall@k_doc,
P@k_chunk) + groundedness metrics. Flip p9-fb-39 spec status from open
→ completed (eval foundation only, lever deferral noted). Update the
tasks/INDEX.md fb-39 row to mirror fb-42 (merged, with a deferral note).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../2026-04-27-kebab-final-form-design.md | 20 +++++++++++++++++++
fixtures/golden_queries.yaml | 14 ++++++++++---
tasks/INDEX.md | 2 +-
.../p9/p9-fb-39-retrieval-precision-tuning.md | 13 +++++++-----
4 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
index a4efb74..199fcc9 100644
--- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
+++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
@@ -1510,6 +1510,26 @@ agent 가 분기). HTTP-SSE transport 는 fb-29 deferral 따라 P+. classify
모듈은 `kebab-app::error_wire` 에 single source — kebab-cli + kebab-mcp
공유.
+### 10.3 Eval metrics (fb-39)
+
+#### Retrieval metrics (ground-truth curated)
+
+`kebab eval run` 이 golden query suite (`fixtures/golden_queries.yaml`) 에 대해 메트릭 계산. 각 메트릭은 curator 가 해당 메트릭이 요구하는 필드 (`expected_chunk_ids` 또는 `expected_doc_ids`) 를 채운 경우에만 측정 가능 (shipped template 은 empty — workspace 별 자체 채움).
+
+| 메트릭 | 정의 | 조건 |
+|--------|------|------|
+| `hit_at_k` | top-k 안 expected chunk 존재 여부 (binary). P(hit@k=true) 평균 | `expected_chunk_ids` 채움 |
+| `MRR` | Mean Reciprocal Rank (첫 관련 chunk rank 역수 평균) | `expected_chunk_ids` 채움 |
+| `recall_at_k_doc` | top-k 안 expected doc 비율 (`\|top-k_docs ∩ expected_doc_ids\| / \|expected_doc_ids\|`) | `expected_doc_ids` 채움 |
+| `precision_at_k_chunk` (fb-39) | top-k 안 chunk_id 가 `expected_chunk_ids` 에 포함된 비율. 분모 = k (fixed) — `top-k` 부족도 precision 손실로 간주. 빈 `expected_chunk_ids` query 는 skip. | `expected_chunk_ids` 채움 |
+
+#### Groundedness metrics (rule-based)
+
+| 메트릭 | 정의 |
+|--------|------|
+| `must_contain` pass | answer 문자열 이 `golden.must_contain` 의 모든 substring 포함 |
+| `forbidden` pass | answer 문자열 이 `golden.forbidden` 의 substring 미포함 |
+
---
## 11. 동결 범위 / 변경 정책
diff --git a/fixtures/golden_queries.yaml b/fixtures/golden_queries.yaml
index eecf629..ec631fc 100644
--- a/fixtures/golden_queries.yaml
+++ b/fixtures/golden_queries.yaml
@@ -1,4 +1,4 @@
-# Golden query suite for `kb eval run` (P5-1 / P5-2).
+# Golden query suite for `kebab eval run` (P5-1 / P5-2 / fb-39).
#
# Top-level: list of queries. Required fields: `id`, `query`. All
# others are optional and default to empty / null.
@@ -7,8 +7,16 @@
# real rows in the active workspace's SQLite store at run time. Stale
# references make the runner bail at start. The shipped template
# leaves them empty so the file is loadable on any fresh workspace —
-# fill them in after a `kb ingest` to enable hit@k / MRR metrics
-# (P5-2).
+# fill them in after a `kebab ingest` to enable the metrics that
+# require ground truth (P5-2 + fb-39):
+#
+# - `expected_chunk_ids` → hit_at_k, MRR, precision_at_k_chunk (fb-39)
+# - `expected_doc_ids` → recall_at_k_doc
+#
+# `precision_at_k_chunk` (fb-39): the fraction of the top-k retrieved
+# hits whose `chunk_id` is in `expected_chunk_ids`. The denominator is
+# fixed at k, so returning fewer than k hits counts as precision loss.
+# Queries with empty `expected_chunk_ids` are excluded from this metric.
#
# `must_contain` / `forbidden` drive the rule-based groundedness
# metric (P5-2).
diff --git a/tasks/INDEX.md b/tasks/INDEX.md
index aa42923..912b9f0 100644
--- a/tasks/INDEX.md
+++ b/tasks/INDEX.md
@@ -129,7 +129,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
### 🎯 0.5.0 — RAG quality (cascade 동반: V00X + reindex)
- [p9-fb-38 score semantics](p9/p9-fb-38-score-semantics.md) — ✅ 머지 (2026-05-10)
- - [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ⏳ 미구현, brainstorm 필요 (embedding_version cascade)
+ - [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ✅ 머지 (2026-05-10) — eval foundation only, lever 적용 deferred
- [p9-fb-40 fact-grounded answer](p9/p9-fb-40-fact-grounded-answer.md) — ✅ 머지 (2026-05-10)
### 🎯 0.6.0 또는 P+ — reasoning
diff --git a/tasks/p9/p9-fb-39-retrieval-precision-tuning.md b/tasks/p9/p9-fb-39-retrieval-precision-tuning.md
index 724a6fb..7f9641e 100644
--- a/tasks/p9/p9-fb-39-retrieval-precision-tuning.md
+++ b/tasks/p9/p9-fb-39-retrieval-precision-tuning.md
@@ -1,20 +1,23 @@
---
phase: P9
-component: kebab-search + kebab-rag + kebab-chunk
+component: kebab-eval + docs
task_id: p9-fb-39
title: "Retrieval precision 튜닝 (rank 5+ 노이즈 완화)"
-status: open
-target_version: 0.5.0
+status: completed
+target_version: 0.7.0
depends_on: []
unblocks: []
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
-contract_sections: [§3 chunking, §4 search, §7 RAG]
+contract_sections: [§3 chunking, §4 search, §7 RAG, §10.3 eval metrics]
source_feedback: 사용자 도그푸딩 2026-05-06 — Claude Code 가 kebab CLI 사용 후 "rank 5+ 부터 노이즈 섞임" 지적. precision-at-k 가 k=5 이후 떨어짐.
---
# p9-fb-39 — Retrieval precision 튜닝
-> ⏳ **백로그 only — 미구현.** 본 spec 은 도그푸딩 피드백 skeleton. 구현 착수 전 [superpowers:brainstorming](../../docs/superpowers/) 으로 설계 단계 선행 필요. 어느 lever (chunk policy / RRF k / score gate / cross-encoder / embedding 업그레이드) 부터 손볼지, eval golden set 선행 여부 brainstorm 후 결정.
+> ✅ **Eval foundation 부분 구현 완료.** P@k metric (P@5, P@10) 추가. 본 spec 의 lever 적용 (chunk policy / RRF / cross-encoder / embedding 업그레이드) 은 별도 task 로 분리 (fb-39b 이후).
+>
+> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md)
+> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md)
## 증상 / 동기
--
2.49.1
From 5870a1de15831ec4a880f2e6fb5e580759786211 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 22:39:11 +0900
Subject: [PATCH 5/5] fix(fb-39): address PR #136 round 1 review
`kebab eval compare` now surfaces the precision_at_k_chunk delta in both
the human-readable table and the deltas JSON. Snapshot fixture
regenerated additively.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-eval/src/compare.rs | 21 +++++++++++++++++++
.../tests/fixtures/eval/compare-1.json | 6 ++++++
2 files changed, 27 insertions(+)
diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs
index 4ba9bd4..5033ca7 100644
--- a/crates/kebab-eval/src/compare.rs
+++ b/crates/kebab-eval/src/compare.rs
@@ -184,6 +184,18 @@ pub fn render_report_md(report: &CompareReport) -> String {
),
);
}
+ for k in crate::metrics::TOP_K_VARIANTS {
+ let _ = writeln!(
+ out,
+ "| precision@{k}_chunk | {} | {} | {} |",
+ fmt(a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
+ fmt(b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
+ fmt_delta(
+ a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
+ b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
+ ),
+ );
+ }
let _ = writeln!(
out,
"| citation_coverage | {} | {} | {} |",
@@ -419,6 +431,7 @@ fn build_deltas(
}
let mut hit = serde_json::Map::new();
let mut recall = serde_json::Map::new();
+ let mut precision = serde_json::Map::new();
for k in crate::metrics::TOP_K_VARIANTS {
hit.insert(
k.to_string(),
@@ -434,11 +447,19 @@ fn build_deltas(
b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
),
);
+ precision.insert(
+ k.to_string(),
+ d(
+ a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
+ b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
+ ),
+ );
}
serde_json::json!({
"hit_at_k": hit,
"mrr": d(a.mrr, b.mrr),
"recall_at_k_doc": recall,
+ "precision_at_k_chunk": precision,
"citation_coverage": d(a.citation_coverage, b.citation_coverage),
"groundedness": d(a.groundedness, b.groundedness),
"empty_result_rate": d(a.empty_result_rate, b.empty_result_rate),
diff --git a/crates/kebab-eval/tests/fixtures/eval/compare-1.json b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
index de408d2..da3f300 100644
--- a/crates/kebab-eval/tests/fixtures/eval/compare-1.json
+++ b/crates/kebab-eval/tests/fixtures/eval/compare-1.json
@@ -65,6 +65,12 @@
"5": 0.33329999446868896
},
"mrr": 0.41659998893737793,
+ "precision_at_k_chunk": {
+ "1": 0.33340001106262207,
+ "10": 0.0333000048995018,
+ "3": 0.22219999134540558,
+ "5": 0.06669999659061432
+ },
"recall_at_k_doc": {
"1": 0.33340001106262207,
"10": 0.33329999446868896,
--
2.49.1