diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs index 6a5986f..3ab8480 100644 --- a/crates/kebab-eval/src/compare.rs +++ b/crates/kebab-eval/src/compare.rs @@ -503,6 +503,7 @@ mod tests { must_contain: vec![], forbidden: vec![], difficulty: None, + group: None, }; let g = Some(&g); // a miss, b hit → Win diff --git a/crates/kebab-eval/src/loader.rs b/crates/kebab-eval/src/loader.rs index d1b2640..a628659 100644 --- a/crates/kebab-eval/src/loader.rs +++ b/crates/kebab-eval/src/loader.rs @@ -30,6 +30,7 @@ pub fn load_golden_set(path: &Path) -> Result> { let queries: Vec = serde_yaml::from_slice(&bytes) .with_context(|| format!("parse golden YAML at {}", path.display()))?; check_unique_ids(&queries)?; + check_group_integrity(&queries)?; Ok(queries) } @@ -54,6 +55,40 @@ pub(crate) fn load_golden_set_validated( Ok(queries) } +/// Verifies that every query belonging to the same `group` shares an identical `expected_doc_ids` (compared as a set). +/// The variant-consistency metric presupposes "different phrasings that have the same expected answer", +/// so if the expected answers diverge within a group the measurement is meaningless -> an error is returned. 
+fn check_group_integrity(queries: &[GoldenQuery]) -> Result<()> { + use std::collections::BTreeMap; + // group -> (representative expected-doc set, representative query id) + let mut canonical: BTreeMap<&str, (BTreeSet, &str)> = BTreeMap::new(); + let mut offenders: BTreeSet = BTreeSet::new(); + for q in queries { + let Some(group) = q.group.as_deref() else { + continue; + }; + let docs: BTreeSet = q.expected_doc_ids.iter().map(|d| d.0.clone()).collect(); + match canonical.get(group) { + None => { + canonical.insert(group, (docs, q.id.as_str())); + } + Some((expected, _first)) if *expected != docs => { + offenders.insert(group.to_string()); + } + Some(_) => {} + } + } + if offenders.is_empty() { + Ok(()) + } else { + let list: Vec = offenders.into_iter().collect(); + Err(anyhow!( + "group(s) with divergent expected_doc_ids (same group must share one expected doc set): {}", + list.join(", ") + )) + } +} + fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> { let mut seen: HashSet<&str> = HashSet::new(); let mut dups: BTreeSet = BTreeSet::new(); @@ -194,6 +229,37 @@ mod tests { assert_eq!(qs.len(), 1); } + #[test] + fn rejects_group_with_divergent_expected_docs() { + let tmp = tempdir().unwrap(); + let yaml_path = tmp.path().join("golden.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: \"러스트 소유권\"\n group: ownership\n expected_doc_ids: [\"docA\"]\n\ + - id: g2\n query: \"rust ownership\"\n group: ownership\n expected_doc_ids: [\"docB\"]\n", + ) + .unwrap(); + let err = load_golden_set(&yaml_path).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("group"), "msg: {msg}"); + assert!(msg.contains("ownership"), "msg: {msg}"); + } + + #[test] + fn accepts_group_with_matching_expected_docs() { + let tmp = tempdir().unwrap(); + let yaml_path = tmp.path().join("golden.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: \"러스트 소유권\"\n group: ownership\n expected_doc_ids: [\"docA\"]\n\ + - id: g2\n query: \"rust ownership\"\n group: ownership\n expected_doc_ids: 
[\"docA\"]\n", + ) + .unwrap(); + let qs = load_golden_set(&yaml_path).unwrap(); + assert_eq!(qs.len(), 2); + assert_eq!(qs[0].group.as_deref(), Some("ownership")); + } + fn seed_one_chunk(store: &SqliteStore, doc_id: &str, chunk_id: &str) { let conn = store.read_conn(); let asset_id = format!("a_{doc_id}"); diff --git a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs index cdf5f0c..fba3344 100644 --- a/crates/kebab-eval/src/metrics.rs +++ b/crates/kebab-eval/src/metrics.rs @@ -456,6 +456,7 @@ mod tests { must_contain: vec![], forbidden: vec![], difficulty: None, + group: None, } } diff --git a/crates/kebab-eval/src/types.rs b/crates/kebab-eval/src/types.rs index db5e15d..0629485 100644 --- a/crates/kebab-eval/src/types.rs +++ b/crates/kebab-eval/src/types.rs @@ -26,6 +26,11 @@ pub struct GoldenQuery { pub forbidden: Vec, #[serde(default)] pub difficulty: Option, + /// Intent-group id that ties together several phrasings of the same meaning (synonyms, different vocabulary, + /// paraphrased sentences, Korean/English). All variants in one group must share an identical `expected_doc_ids` set + /// (enforced by the loader). `None` means a standalone query (existing behavior unchanged). + #[serde(default)] + pub group: Option, } fn default_lang() -> Lang {