From 82e02aa4fecd3748a411169f735c3a56c525a8fd Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 29 May 2026 17:39:08 +0000 Subject: [PATCH] =?UTF-8?q?fix(eval):=20=EB=B3=80=ED=98=95=20=EC=9D=BC?= =?UTF-8?q?=EA=B4=80=EC=84=B1=20=EB=A6=AC=EB=B7=B0=20H1/M1=20=E2=80=94=20p?= =?UTF-8?q?ool=20truncation=20=EB=B0=A9=EC=96=B4=20+=20answer=20=ED=8C=90?= =?UTF-8?q?=EC=A0=95=20=EC=A0=95=EB=A0=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-eval/src/runner.rs | 5 +- crates/kebab-eval/src/variant.rs | 154 +++++++++++++++++++++++++++++-- 2 files changed, 150 insertions(+), 9 deletions(-) diff --git a/crates/kebab-eval/src/runner.rs b/crates/kebab-eval/src/runner.rs index 45a9652..e1096ef 100644 --- a/crates/kebab-eval/src/runner.rs +++ b/crates/kebab-eval/src/runner.rs @@ -67,7 +67,7 @@ pub fn run_eval_with_config(cfg: &kebab_config::Config, opts: &EvalRunOpts) -> R .context("run migrations for run_eval")?; // ── 3. Build config_snapshot_json ───────────────────────────────────── - let config_snapshot_json = build_config_snapshot(cfg)?; + let config_snapshot_json = build_config_snapshot(cfg, opts.k)?; let config_snapshot_text = serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?; @@ -215,10 +215,11 @@ fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult /// stable run-time property of the config alone. P5-2 may compose it /// from `embedding.{model,version,dimensions}` if it needs the field /// for compare reports. -fn build_config_snapshot(cfg: &kebab_config::Config) -> Result { +fn build_config_snapshot(cfg: &kebab_config::Config, eval_k: usize) -> Result { let cfg_value = serde_json::to_value(cfg).context("serialize Config")?; Ok(serde_json::json!({ "config": cfg_value, + "eval_k": eval_k, "chunker_version": cfg.chunking.chunker_version, "embedding": { "model": cfg.models.embedding.model, diff --git a/crates/kebab-eval/src/variant.rs b/crates/kebab-eval/src/variant.rs index f691459..98f13ef 100644 --- a/crates/kebab-eval/src/variant.rs +++ b/crates/kebab-eval/src/variant.rs @@ -74,6 +74,9 @@ pub struct VariantConsistencyReport { pub a_dominant_groups: u32, /// missing>0 && missing>mis_ranked 인 그룹 수 (쿼리 확장 처방 우선). pub b_dominant_groups: u32, + /// 관찰된 최대 rank 가 POOL_K 미만일 때 true — eval run 의 --k 가 + /// POOL_K 보다 작아 pool 이 절단됐을 수 있음. MisRanked(A) 판정 불가. + pub pool_possibly_truncated: bool, } /// 저장된 run을 그룹으로 묶어 변형 일관성 리포트를 만든다. @@ -87,9 +90,15 @@ pub fn compute_variant_consistency( queries.iter().map(|q| (q.id.as_str(), q)).collect(); let mut grouped: BTreeMap> = BTreeMap::new(); + let mut observed_max_rank: u32 = 0; + let mut has_hits = false; for row in rows { let qr: QueryResult = serde_json::from_str(&row.result_json) .with_context(|| format!("parse result_json for {}", row.query_id))?; + for hit in &qr.hits_top_k { + has_hits = true; + observed_max_rank = observed_max_rank.max(hit.rank); + } let Some(gq) = golden_by_id.get(qr.query_id.as_str()) else { continue; }; @@ -97,10 +106,18 @@ pub fn compute_variant_consistency( continue; }; let (recall_narrow, recall_pool) = recall_narrow_pool(&qr, &gq.expected_doc_ids); - let answer_ok = qr.answer.as_ref().map(|a| { - gq.must_contain.iter().all(|s| a.answer.contains(s)) - && !gq.forbidden.iter().any(|s| a.answer.contains(s)) - }); + // Mirrors metrics.rs groundedness guards: skip errored rows and + // vacuous-true (no must_contain/forbidden configured). + let answer_ok = if qr.error.is_some() + || (gq.must_contain.is_empty() && gq.forbidden.is_empty()) + { + None + } else { + qr.answer.as_ref().map(|a| { + gq.must_contain.iter().all(|s| a.answer.contains(s)) + && !gq.forbidden.iter().any(|s| a.answer.contains(s)) + }) + }; let class = classify(&gq.expected_doc_ids, recall_narrow, recall_pool); grouped.entry(group).or_default().push(VariantResult { query_id: qr.query_id.clone(), @@ -136,6 +153,7 @@ pub fn compute_variant_consistency( groups.iter().map(|g| g.recall_spread_narrow).sum::() / groups.len() as f32 }; + let pool_possibly_truncated = has_hits && observed_max_rank < POOL_K; Ok(VariantConsistencyReport { groups, mean_recall_spread_narrow, @@ -143,6 +161,7 @@ pub fn compute_variant_consistency( total_groups, a_dominant_groups, b_dominant_groups, + pool_possibly_truncated, }) } @@ -165,6 +184,8 @@ fn recall_narrow_pool(qr: &QueryResult, expected: &[DocumentId]) -> (f32, f32) { (cover(NARROW_K), cover(POOL_K)) } +// Single label per query: when multiple expected docs produce mixed classes (e.g. one +// MisRanked + one Missing), recall_pool > recall_narrow (A: MisRanked) takes priority. fn classify(expected: &[DocumentId], recall_narrow: f32, recall_pool: f32) -> VariantClass { if expected.is_empty() { VariantClass::NoExpected @@ -184,6 +205,9 @@ fn rollup_group(group: String, variants: Vec) -> VariantGroupRepo .map(|v| v.recall_narrow) .collect(); let (recall_spread_narrow, worst_recall_narrow) = if measurable.is_empty() { + // All variants have no expected docs: spread=0/worst=NaN is intentional. + // This group won't match fully_consistent (NaN != 1.0) or A/B (both 0) — + // it's counted in total_groups but sits in a silent "limbo" bucket. (0.0, f32::NAN) } else { let max = measurable.iter().copied().fold(f32::MIN, f32::max); @@ -217,8 +241,22 @@ pub fn compute_variant_consistency_with_config( ) -> Result { let store = SqliteStore::open(cfg).context("open SqliteStore for variant consistency")?; store.run_migrations().context("run migrations")?; - if store.load_eval_run(run_id).context("load eval_runs row")?.is_none() { - anyhow::bail!("compute_variant_consistency: no eval_runs row for run_id {run_id}"); + let run_record = store + .load_eval_run(run_id) + .context("load eval_runs row")? + .ok_or_else(|| { + anyhow::anyhow!("compute_variant_consistency: no eval_runs row for run_id {run_id}") + })?; + let snapshot: serde_json::Value = + serde_json::from_str(&run_record.config_snapshot_json).unwrap_or(serde_json::Value::Null); + if let Some(eval_k) = snapshot["eval_k"].as_u64() { + let eval_k = eval_k as u32; + if eval_k < POOL_K { + anyhow::bail!( + "variant consistency needs the run to retrieve >= {POOL_K} candidates, \ + but run used k={eval_k}; re-run `kebab eval run --k {POOL_K}` (or higher)" + ); + } } let rows = store .load_eval_query_results(run_id) @@ -235,14 +273,22 @@ pub fn render_variants_md(rep: &VariantConsistencyReport) -> String { let _ = writeln!(s, "# Variant consistency\n"); let _ = writeln!( s, - "groups={} fully_consistent={} A_dominant={} B_dominant={} mean_spread@{}={:.3}\n", + "groups={} fully_consistent={} A_dominant={} B_dominant={} mean_spread@{}={:.3} pool=top-{}\n", rep.total_groups, rep.fully_consistent_groups, rep.a_dominant_groups, rep.b_dominant_groups, NARROW_K, rep.mean_recall_spread_narrow, + POOL_K, ); + if rep.pool_possibly_truncated { + let _ = writeln!( + s, + "WARNING: max observed rank < {POOL_K} — pool possibly truncated. \ + MisRanked(A) diagnoses may be suppressed. Re-run `kebab eval run --k {POOL_K}` (or higher).\n" + ); + } for g in &rep.groups { let ac = match g.answer_consistency { Some(true) => "all-ok", @@ -387,4 +433,98 @@ mod tests { let rep = compute_variant_consistency(&[q], &[row("solo", vec![hit("docX", 1)])]).unwrap(); assert_eq!(rep.total_groups, 0); } + + fn row_with_answer( + query_id: &str, + hits: Vec, + answer_text: &str, + error: Option<&str>, + ) -> EvalQueryResultRecord { + let hits_json = serde_json::to_value(&hits).unwrap(); + let error_json = + error.map_or(serde_json::Value::Null, |e| serde_json::Value::String(e.into())); + let qr_json = serde_json::json!({ + "query_id": query_id, + "query": query_id, + "mode": "vector", + "hits_top_k": hits_json, + "answer": { + "answer": answer_text, + "citations": [], + "grounded": false, + "refusal_reason": null, + "model": {"id": "test-model", "provider": "test", "dimensions": null}, + "embedding": null, + "prompt_template_version": "v1", + "retrieval": { + "trace_id": "t0", + "mode": "vector", + "k": 10, + "score_gate": 0.0, + "top_score": 0.0, + "chunks_returned": 0, + "chunks_used": 0 + }, + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "latency_ms": 0}, + "created_at": "1970-01-01T00:00:00Z" + }, + "elapsed_ms": 0, + "error": error_json + }); + EvalQueryResultRecord { + query_id: query_id.into(), + result_json: serde_json::to_string(&qr_json).unwrap(), + } + } + + /// H1 회귀: eval k=10 으로 실행 시 모든 hit rank ≤ NARROW_K → + /// pool_possibly_truncated 플래그로 사용자에게 경고해야 한다. + #[test] + fn pool_truncation_flag_when_all_hits_within_narrow_k() { + let queries = vec![gq("v1", "g", "docX"), gq("v2", "g", "docX")]; + let rows = vec![ + row("v1", vec![hit("docX", 1)]), + row("v2", vec![hit("other", 7)]), // rank 7 ≤ NARROW_K=10 + ]; + let rep = compute_variant_consistency(&queries, &rows).unwrap(); + assert!(rep.pool_possibly_truncated, "all ranks ≤ NARROW_K must set pool_possibly_truncated"); + // v2 misses docX, pool also has no rank>10 → classified Missing, not MisRanked + assert_eq!(rep.a_dominant_groups, 0); + assert_eq!(rep.b_dominant_groups, 1); + } + + /// M1a: must_contain/forbidden 둘 다 빈 golden → vacuous-true 방지, + /// answer_ok = None (answer 있어도). + /// M1b: qr.error=Some → answer 있어도 answer_ok = None. + #[test] + fn answer_ok_vacuous_and_error_guarded() { + // M1a: gq() helper already has empty must_contain + forbidden + let gq_no_check = gq("v1", "g1", "docX"); + let row_v1 = row_with_answer("v1", vec![], "any text", None); + let rep = compute_variant_consistency(&[gq_no_check], &[row_v1]).unwrap(); + let v = &rep.groups[0].variants[0]; + assert_eq!(v.answer_ok, None, "vacuous-true guard: no checks → answer_ok = None"); + assert_eq!(rep.groups[0].answer_consistency, None); + + // M1b: must_contain present but error is also set + let mut gq_check = gq("v2", "g2", "docY"); + gq_check.must_contain = vec!["expected text".to_string()]; + let row_v2 = row_with_answer("v2", vec![], "expected text", Some("llm error")); + let rep2 = compute_variant_consistency(&[gq_check], &[row_v2]).unwrap(); + let v2 = &rep2.groups[0].variants[0]; + assert_eq!(v2.answer_ok, None, "error guard: qr.error present → answer_ok = None"); + } + + /// N1 순수 B: 두 변형 모두 pool 에서도 정답 없음 → b_dominant=1, a_dominant=0. + #[test] + fn pure_b_dominant_group() { + let queries = vec![gq("v1", "g", "docX"), gq("v2", "g", "docX")]; + let rows = vec![ + row("v1", vec![hit("other1", 1)]), // docX 없음 → Missing (B) + row("v2", vec![hit("other2", 1)]), // docX 없음 → Missing (B) + ]; + let rep = compute_variant_consistency(&queries, &rows).unwrap(); + assert_eq!(rep.b_dominant_groups, 1); + assert_eq!(rep.a_dominant_groups, 0); + } }