fix(eval): 변형 일관성 리뷰 H1/M1 — pool truncation 방어 + answer 판정 정렬

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 17:39:08 +00:00
parent db4af0cc72
commit 82e02aa4fe
2 changed files with 150 additions and 9 deletions
--- a/crates/kebab-eval/src/runner.rs
+++ b/crates/kebab-eval/src/runner.rs
@@ -67,7 +67,7 @@ pub fn run_eval_with_config(cfg: &kebab_config::Config, opts: &EvalRunOpts) -> R
        .context("run migrations for run_eval")?;

    // ── 3. Build config_snapshot_json ─────────────────────────────────────
-    let config_snapshot_json = build_config_snapshot(cfg)?;
+    let config_snapshot_json = build_config_snapshot(cfg, opts.k)?;
    let config_snapshot_text =
        serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?;

@@ -215,10 +215,11 @@ fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult
 /// stable run-time property of the config alone. P5-2 may compose it
 /// from `embedding.{model,version,dimensions}` if it needs the field
 /// for compare reports.
-fn build_config_snapshot(cfg: &kebab_config::Config) -> Result<serde_json::Value> {
+fn build_config_snapshot(cfg: &kebab_config::Config, eval_k: usize) -> Result<serde_json::Value> {
    let cfg_value = serde_json::to_value(cfg).context("serialize Config")?;
    Ok(serde_json::json!({
        "config": cfg_value,
+        "eval_k": eval_k,
        "chunker_version": cfg.chunking.chunker_version,
        "embedding": {
            "model": cfg.models.embedding.model,
--- a/crates/kebab-eval/src/variant.rs
+++ b/crates/kebab-eval/src/variant.rs
@@ -74,6 +74,9 @@ pub struct VariantConsistencyReport {
    pub a_dominant_groups: u32,
    /// missing>0 && missing>mis_ranked 인 그룹 수 (쿼리 확장 처방 우선).
    pub b_dominant_groups: u32,
+    /// 관찰된 최대 rank 가 POOL_K 미만일 때 true — eval run 의 --k 가
+    /// POOL_K 보다 작아 pool 이 절단됐을 수 있음. MisRanked(A) 판정 불가.
+    pub pool_possibly_truncated: bool,
 }

 /// 저장된 run을 그룹으로 묶어 변형 일관성 리포트를 만든다.
@@ -87,9 +90,15 @@ pub fn compute_variant_consistency(
        queries.iter().map(|q| (q.id.as_str(), q)).collect();

    let mut grouped: BTreeMap<String, Vec<VariantResult>> = BTreeMap::new();
+    let mut observed_max_rank: u32 = 0;
+    let mut has_hits = false;
    for row in rows {
        let qr: QueryResult = serde_json::from_str(&row.result_json)
            .with_context(|| format!("parse result_json for {}", row.query_id))?;
+        for hit in &qr.hits_top_k {
+            has_hits = true;
+            observed_max_rank = observed_max_rank.max(hit.rank);
+        }
        let Some(gq) = golden_by_id.get(qr.query_id.as_str()) else {
            continue;
        };
@@ -97,10 +106,18 @@ pub fn compute_variant_consistency(
            continue;
        };
        let (recall_narrow, recall_pool) = recall_narrow_pool(&qr, &gq.expected_doc_ids);
-        let answer_ok = qr.answer.as_ref().map(|a| {
-            gq.must_contain.iter().all(|s| a.answer.contains(s))
-                && !gq.forbidden.iter().any(|s| a.answer.contains(s))
-        });
+        // Mirrors metrics.rs groundedness guards: skip errored rows and
+        // vacuous-true (no must_contain/forbidden configured).
+        let answer_ok = if qr.error.is_some()
+            || (gq.must_contain.is_empty() && gq.forbidden.is_empty())
+        {
+            None
+        } else {
+            qr.answer.as_ref().map(|a| {
+                gq.must_contain.iter().all(|s| a.answer.contains(s))
+                    && !gq.forbidden.iter().any(|s| a.answer.contains(s))
+            })
+        };
        let class = classify(&gq.expected_doc_ids, recall_narrow, recall_pool);
        grouped.entry(group).or_default().push(VariantResult {
            query_id: qr.query_id.clone(),
@@ -136,6 +153,7 @@ pub fn compute_variant_consistency(
        groups.iter().map(|g| g.recall_spread_narrow).sum::<f32>() / groups.len() as f32
    };

+    let pool_possibly_truncated = has_hits && observed_max_rank < POOL_K;
    Ok(VariantConsistencyReport {
        groups,
        mean_recall_spread_narrow,
@@ -143,6 +161,7 @@ pub fn compute_variant_consistency(
        total_groups,
        a_dominant_groups,
        b_dominant_groups,
+        pool_possibly_truncated,
    })
 }

@@ -165,6 +184,8 @@ fn recall_narrow_pool(qr: &QueryResult, expected: &[DocumentId]) -> (f32, f32) {
    (cover(NARROW_K), cover(POOL_K))
 }

+// Single label per query: when multiple expected docs produce mixed classes (e.g. one
+// MisRanked + one Missing), recall_pool > recall_narrow (A: MisRanked) takes priority.
 fn classify(expected: &[DocumentId], recall_narrow: f32, recall_pool: f32) -> VariantClass {
    if expected.is_empty() {
        VariantClass::NoExpected
@@ -184,6 +205,9 @@ fn rollup_group(group: String, variants: Vec<VariantResult>) -> VariantGroupRepo
        .map(|v| v.recall_narrow)
        .collect();
    let (recall_spread_narrow, worst_recall_narrow) = if measurable.is_empty() {
+        // All variants have no expected docs: spread=0/worst=NaN is intentional.
+        // This group won't match fully_consistent (NaN != 1.0) or A/B (both 0) —
+        // it's counted in total_groups but sits in a silent "limbo" bucket.
        (0.0, f32::NAN)
    } else {
        let max = measurable.iter().copied().fold(f32::MIN, f32::max);
@@ -217,8 +241,22 @@ pub fn compute_variant_consistency_with_config(
 ) -> Result<VariantConsistencyReport> {
    let store = SqliteStore::open(cfg).context("open SqliteStore for variant consistency")?;
    store.run_migrations().context("run migrations")?;
-    if store.load_eval_run(run_id).context("load eval_runs row")?.is_none() {
-        anyhow::bail!("compute_variant_consistency: no eval_runs row for run_id {run_id}");
+    let run_record = store
+        .load_eval_run(run_id)
+        .context("load eval_runs row")?
+        .ok_or_else(|| {
+            anyhow::anyhow!("compute_variant_consistency: no eval_runs row for run_id {run_id}")
+        })?;
+    let snapshot: serde_json::Value =
+        serde_json::from_str(&run_record.config_snapshot_json).unwrap_or(serde_json::Value::Null);
+    if let Some(eval_k) = snapshot["eval_k"].as_u64() {
+        let eval_k = eval_k as u32;
+        if eval_k < POOL_K {
+            anyhow::bail!(
+                "variant consistency needs the run to retrieve >= {POOL_K} candidates, \
+                 but run used k={eval_k}; re-run `kebab eval run --k {POOL_K}` (or higher)"
+            );
+        }
    }
    let rows = store
        .load_eval_query_results(run_id)
@@ -235,14 +273,22 @@ pub fn render_variants_md(rep: &VariantConsistencyReport) -> String {
    let _ = writeln!(s, "# Variant consistency\n");
    let _ = writeln!(
        s,
-        "groups={} fully_consistent={} A_dominant={} B_dominant={} mean_spread@{}={:.3}\n",
+        "groups={} fully_consistent={} A_dominant={} B_dominant={} mean_spread@{}={:.3} pool=top-{}\n",
        rep.total_groups,
        rep.fully_consistent_groups,
        rep.a_dominant_groups,
        rep.b_dominant_groups,
        NARROW_K,
        rep.mean_recall_spread_narrow,
+        POOL_K,
    );
+    if rep.pool_possibly_truncated {
+        let _ = writeln!(
+            s,
+            "WARNING: max observed rank < {POOL_K} — pool possibly truncated. \
+             MisRanked(A) diagnoses may be suppressed. Re-run `kebab eval run --k {POOL_K}` (or higher).\n"
+        );
+    }
    for g in &rep.groups {
        let ac = match g.answer_consistency {
            Some(true) => "all-ok",
@@ -387,4 +433,98 @@ mod tests {
        let rep = compute_variant_consistency(&[q], &[row("solo", vec![hit("docX", 1)])]).unwrap();
        assert_eq!(rep.total_groups, 0);
    }
+
+    fn row_with_answer( // Test helper: builds a result row whose JSON carries an answer and an optional error.
+        query_id: &str,
+        hits: Vec<kebab_core::SearchHit>,
+        answer_text: &str,
+        error: Option<&str>,
+    ) -> EvalQueryResultRecord {
+        let hits_json = serde_json::to_value(&hits).unwrap();
+        let error_json = // None becomes JSON null; Some(msg) becomes a JSON string.
+            error.map_or(serde_json::Value::Null, |e| serde_json::Value::String(e.into()));
+        let qr_json = serde_json::json!({ // Hand-written JSON that is serialized into result_json below.
+            "query_id": query_id,
+            "query": query_id, // the query id doubles as the query text in this fixture
+            "mode": "vector",
+            "hits_top_k": hits_json,
+            "answer": {
+                "answer": answer_text,
+                "citations": [],
+                "grounded": false,
+                "refusal_reason": null,
+                "model": {"id": "test-model", "provider": "test", "dimensions": null},
+                "embedding": null,
+                "prompt_template_version": "v1",
+                "retrieval": {
+                    "trace_id": "t0",
+                    "mode": "vector",
+                    "k": 10,
+                    "score_gate": 0.0,
+                    "top_score": 0.0,
+                    "chunks_returned": 0,
+                    "chunks_used": 0
+                },
+                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "latency_ms": 0},
+                "created_at": "1970-01-01T00:00:00Z"
+            },
+            "elapsed_ms": 0,
+            "error": error_json
+        });
+        EvalQueryResultRecord {
+            query_id: query_id.into(),
+            result_json: serde_json::to_string(&qr_json).unwrap(),
+        }
+    }
+
+    /// H1 regression: when the eval run used k=10, every hit rank is ≤ NARROW_K →
+    /// the pool_possibly_truncated flag must warn the user.
+    #[test]
+    fn pool_truncation_flag_when_all_hits_within_narrow_k() {
+        let queries = vec![gq("v1", "g", "docX"), gq("v2", "g", "docX")];
+        let rows = vec![
+            row("v1", vec![hit("docX", 1)]),
+            row("v2", vec![hit("other", 7)]), // rank 7 ≤ NARROW_K=10
+        ];
+        let rep = compute_variant_consistency(&queries, &rows).unwrap();
+        assert!(rep.pool_possibly_truncated, "all ranks ≤ NARROW_K must set pool_possibly_truncated");
+        // v2 misses docX, pool also has no rank>10 → classified Missing, not MisRanked
+        assert_eq!(rep.a_dominant_groups, 0);
+        assert_eq!(rep.b_dominant_groups, 1);
+    }
+
+    /// M1a: golden query with both must_contain and forbidden empty → vacuous-true guard,
+    /// answer_ok = None (even when an answer is present).
+    /// M1b: qr.error=Some → answer_ok = None even when an answer is present.
+    #[test]
+    fn answer_ok_vacuous_and_error_guarded() {
+        // M1a: gq() helper already has empty must_contain + forbidden
+        let gq_no_check = gq("v1", "g1", "docX");
+        let row_v1 = row_with_answer("v1", vec![], "any text", None);
+        let rep = compute_variant_consistency(&[gq_no_check], &[row_v1]).unwrap();
+        let v = &rep.groups[0].variants[0];
+        assert_eq!(v.answer_ok, None, "vacuous-true guard: no checks → answer_ok = None");
+        assert_eq!(rep.groups[0].answer_consistency, None);
+
+        // M1b: must_contain present but error is also set
+        let mut gq_check = gq("v2", "g2", "docY");
+        gq_check.must_contain = vec!["expected text".to_string()];
+        let row_v2 = row_with_answer("v2", vec![], "expected text", Some("llm error"));
+        let rep2 = compute_variant_consistency(&[gq_check], &[row_v2]).unwrap();
+        let v2 = &rep2.groups[0].variants[0];
+        assert_eq!(v2.answer_ok, None, "error guard: qr.error present → answer_ok = None");
+    }
+
+    /// N1 pure B: neither variant finds the expected doc even in the pool → b_dominant=1, a_dominant=0.
+    #[test]
+    fn pure_b_dominant_group() {
+        let queries = vec![gq("v1", "g", "docX"), gq("v2", "g", "docX")];
+        let rows = vec![
+            row("v1", vec![hit("other1", 1)]), // no docX → Missing (B)
+            row("v2", vec![hit("other2", 1)]), // no docX → Missing (B)
+        ];
+        let rep = compute_variant_consistency(&queries, &rows).unwrap();
+        assert_eq!(rep.b_dominant_groups, 1);
+        assert_eq!(rep.a_dominant_groups, 0);
+    }
 }