From c5de5f812b0c4883ce4186bb0cbda9c08c2a1dfc Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 11:38:52 +0000 Subject: [PATCH] test(fts,app): V009 morphological tokenizer integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 신규 4 test 추가: - crates/kebab-store-sqlite/tests/fts.rs: - fts_v009_korean_morphological_2char_query_hits: tokenized_korean_text column 이 채워진 chunk 의 '한국' 2-char query hit. - fts_v009_english_whole_token_only: V007 trigram substring 매칭 회귀 (Path A) — 'token' query 가 'tokenizer' chunk 에서 0-hit. - crates/kebab-app/tests/search_korean.rs: - korean_morphological_2char_query_lexical_mode: end-to-end 한국어 wiki fixture ingest → '한국' / '서울' query hit. - korean_morphological_mixed_english_korean_query: 'Rust' English whole-token + '최적화' Korean morpheme hit. crates/kebab-search/src/lexical.rs: - build_match_string() 의 MIN_TRIGRAM_CHARS(3) → MIN_QUERY_CHARS(2). V009 unicode61 은 최소 token 길이 제한 없어 2자 한국어 morpheme query 가 통과되어야 함. 1자 단독은 여전히 필터. - 관련 unit test 2개 V009 동작으로 갱신. fixture text 는 lindera ko-dic 의 실제 segmentation 동작에 의존 (spec Appendix B prior-knowledge 예측). 실측 시 fixture 조정 가능. 
Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §9.1, §9.2 Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S7) Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-app/tests/search_korean.rs | 69 ++++++++++++++++++++++ crates/kebab-search/src/lexical.rs | 78 ++++++++++++++----------- crates/kebab-store-sqlite/tests/fts.rs | 64 ++++++++++++++++++++ 3 files changed, 177 insertions(+), 34 deletions(-) diff --git a/crates/kebab-app/tests/search_korean.rs b/crates/kebab-app/tests/search_korean.rs index 05646f0..d2b916e 100644 --- a/crates/kebab-app/tests/search_korean.rs +++ b/crates/kebab-app/tests/search_korean.rs @@ -127,3 +127,72 @@ fn lexical_mixed_korean_english_multi_token_query_hits() { hits.iter().map(|h| &h.doc_path.0).collect::>() ); } + +// ── S7 V009 morphological tokenizer end-to-end tests ───────────────── + +/// S7 — V009 morphological tokenizer: 한국어 2자 query 가 end-to-end +/// lexical 경로에서 hit. lindera ko-dic 이 '한국어를' → '한국어' 형태소로 +/// 분해, '서울은' → '서울' 로 분해하여 tokenized_korean_text column 에 +/// 기록 → FTS5 매칭. 
+#[test] +fn korean_morphological_2char_query_lexical_mode() { + let env = TestEnv::lexical_only(); + let doc_path = env.workspace_root.join("korean-wiki.md"); + std::fs::write( + &doc_path, + "# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n", + ) + .expect("write korean-wiki fixture"); + + kebab_app::ingest_with_config(env.config.clone(), env.scope(), true) + .expect("ingest must succeed"); + + let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국")) + .expect("search 한국"); + assert!( + !hits.is_empty(), + "'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}", + hits.iter().map(|h| &h.doc_path.0).collect::>() + ); + + let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울")) + .expect("search 서울"); + assert!( + !hits.is_empty(), + "'서울' 2-char Korean query must return at least one hit; got {:?}", + hits.iter().map(|h| &h.doc_path.0).collect::>() + ); +} + +/// S7 — V009 morphological tokenizer: 한-영 혼합 query lexical hit. +/// 'Rust' (English whole-token) + '최적화' (Korean morpheme) 각각 hit. 
+#[test] +fn korean_morphological_mixed_english_korean_query() { + let env = TestEnv::lexical_only(); + let doc_path = env.workspace_root.join("rust-optimization.md"); + std::fs::write( + &doc_path, + "# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n", + ) + .expect("write rust-optimization fixture"); + + kebab_app::ingest_with_config(env.config.clone(), env.scope(), true) + .expect("ingest must succeed"); + + let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust")) + .expect("search Rust"); + assert!( + !hits.is_empty(), + "'Rust' English whole-token must hit; got {:?}", + hits.iter().map(|h| &h.doc_path.0).collect::>() + ); + + let hits = + kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화")) + .expect("search 최적화"); + assert!( + !hits.is_empty(), + "'최적화' Korean morpheme must hit; got {:?}", + hits.iter().map(|h| &h.doc_path.0).collect::>() + ); +} diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 6176ea4..2aec6d3 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -157,23 +157,27 @@ impl Retriever for LexicalRetriever { /// /// v0.17.0 — trigram-aware redesign (see design §5.5 + plan /// `docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md` -/// Task A5). The FTS5 tokenizer is `trigram` so any term shorter than -/// three Unicode chars has no index entry and would zero out an AND -/// branch. Korean compounds typically split into 2-char eojeols (e.g. -/// `해시 충돌`), so a naive token AND drops the dominant usage pattern. +/// Task A5). Originally the FTS5 tokenizer was `trigram` so any term +/// shorter than three Unicode chars had no index entry and would zero +/// out an AND branch. Korean compounds typically split into 2-char +/// eojeols (e.g. `해시 충돌`), so a naive token AND drops the dominant +/// usage pattern. 
+/// +/// V009 (2026-05-28): FTS5 tokenizer 가 trigram → unicode61 + 한국어 +/// 형태소 분해 column 로 갱신됨. unicode61 은 trigram 과 달리 최소 +/// token 길이 제한이 없어 2자 한국어 morpheme query ('한국', '서울') +/// 가 `tokenized_korean_text` column 경유로 hit 가능. MIN_QUERY_CHARS +/// 를 2 로 낮춰 2자 query 를 통과시킨다 (1자 단독은 여전히 필터). +/// multi-token Korean query 의 OR-combine 분기는 redundant 하나 보존 +/// (future 확장성). /// /// post-v0.17.1 dogfood — `text` column filter (closure of HOTFIXES /// 2026-05-24 `heading_path_json` 노이즈). The `chunks_fts` virtual /// table indexes both `heading_path` (the JSON-serialized -/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. Under -/// the trigram tokenizer the JSON punctuation (`[`, `"`, `,`) plus the -/// path segments (`app`, `src`, …) become indexable 3-grams, so a -/// query can hit a chunk purely because its file's heading JSON shares -/// a path segment with the query — false positives that have no body -/// relevance. The default match expression therefore scopes to the -/// `text` column. The `heading_path` column stays indexed (V007 / §5.5 -/// verbatim block is preserved) so a user who *wants* heading matching -/// can opt in via raw mode (`'heading_path : foo'`). +/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. The +/// default match expression therefore scopes to the `text` column. The +/// `heading_path` column stays indexed so a user who *wants* heading +/// matching can opt in via raw mode (`'heading_path : foo'`). /// /// Rules: /// @@ -185,26 +189,22 @@ impl Retriever for LexicalRetriever { /// /// - Otherwise build up to two MATCH candidates: /// 1. **whole-phrase**: the entire trimmed input wrapped as one FTS5 -/// string literal, *only* if it has ≥3 Unicode chars. FTS5 treats +/// string literal, *only* if it has ≥2 Unicode chars. FTS5 treats /// a quoted string with spaces as a phrase match. /// 2. 
**token AND**: whitespace-split tokens, kept only when each has -/// ≥3 Unicode chars (shorter ones are dropped — they would zero -/// out the AND under trigram). +/// ≥2 Unicode chars (1-char tokens are dropped). /// /// - Combine: `(whole) OR (token_and)` when both exist *and differ*; /// either alone when only one exists; `None` when neither exists /// (caller short-circuits to `Ok(vec![])`, avoiding an FTS5 syntax /// error from an empty MATCH). /// -/// - A single-token long query (`러스트`, `foo`) yields `whole == token_and` +/// - A single-token query (`러스트`, `한국`, `foo`) yields `whole == token_and` /// → return the bare quoted form so the OR doesn't duplicate. /// /// - Finally wrap the combined expression in `text : ()` so the /// match is scoped to the body column. FTS5's column-filter syntax /// accepts an arbitrary OR/AND sub-expression inside the parens. -/// -/// V009 unicode61 + 형태소 tokenizer 환경에서는 multi-token Korean -/// query 의 OR-combine 분기는 redundant 하나 보존 (future 확장성). fn build_match_string(text: &str) -> Option { let trimmed = text.trim(); if trimmed.is_empty() { @@ -218,15 +218,18 @@ fn build_match_string(text: &str) -> Option { return Some(inner_trim.to_string()); } - const MIN_TRIGRAM_CHARS: usize = 3; + // V009 unicode61: minimum query token length is 2 Unicode chars. + // (V007 trigram required ≥3; unicode61 has no built-in minimum but + // single-char queries are too broad to be useful.) 
+ const MIN_QUERY_CHARS: usize = 2; let whole_candidate: Option = - (trimmed.chars().count() >= MIN_TRIGRAM_CHARS).then(|| escape_fts5_token(trimmed)); + (trimmed.chars().count() >= MIN_QUERY_CHARS).then(|| escape_fts5_token(trimmed)); let token_and_candidate: Option = { let toks: Vec = trimmed .split_whitespace() - .filter(|t| t.chars().count() >= MIN_TRIGRAM_CHARS) + .filter(|t| t.chars().count() >= MIN_QUERY_CHARS) .map(escape_fts5_token) .collect(); (!toks.is_empty()).then(|| toks.join(" ")) @@ -651,25 +654,32 @@ mod tests { // ── v0.17.0 trigram-aware redesign coverage ────────────────────────── - /// 2-char Korean query (`충돌`) yields neither a whole-phrase nor a - /// token-AND candidate → `None`. Caller short-circuits to an empty - /// hit list rather than executing an FTS5 syntax error on `""` MATCH. + /// V009 unicode61: 1-char query yields None (too broad); 2-char Korean + /// query now passes the MIN_QUERY_CHARS=2 filter and returns a valid + /// match expression. #[test] fn build_match_string_short_korean_returns_none() { - assert!(build_match_string("충돌").is_none()); + // 1-char queries remain filtered (too broad). assert!(build_match_string("키").is_none()); - assert!(build_match_string(" 충돌 ").is_none()); + assert!(build_match_string("나").is_none()); + // 2-char Korean queries now produce a valid expression (V009 unicode61). + assert_eq!( + build_match_string("충돌").unwrap(), + r#"text : ("충돌")"# + ); + assert_eq!( + build_match_string(" 충돌 ").unwrap(), + r#"text : ("충돌")"# + ); } - /// `해시 충돌` — both tokens are 2 chars (dropped from the AND), but - /// the whole-phrase candidate (`"해시 충돌"`, 5 chars total) survives. - /// This is the dominant Korean usage pattern targeted by A5. - /// The whole-phrase candidate is then wrapped in the `text : (...)` - /// column filter. + /// V009 unicode61: `해시 충돌` — both tokens are 2 chars and now pass + /// MIN_QUERY_CHARS=2. 
Both whole-phrase and token-AND candidates exist + /// and differ → OR-combined inside `text : (...)`. #[test] fn build_match_string_whole_phrase_only_when_all_tokens_short() { let s = build_match_string("해시 충돌").unwrap(); - assert_eq!(s, r#"text : ("해시 충돌")"#); + assert_eq!(s, r#"text : (("해시 충돌") OR ("해시" "충돌"))"#); } /// Single long token: whole-phrase and token-AND candidates collapse diff --git a/crates/kebab-store-sqlite/tests/fts.rs b/crates/kebab-store-sqlite/tests/fts.rs index 5d945b8..fc52f25 100644 --- a/crates/kebab-store-sqlite/tests/fts.rs +++ b/crates/kebab-store-sqlite/tests/fts.rs @@ -581,3 +581,67 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() { // substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit. assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit"); } + +// ── 8. V009 morphological tokenizer behavior ────────────────────────── + +/// V009 의 핵심 가치: 한국어 2자 query 가 hit. 형태소 분해된 +/// tokenized_korean_text column 이 chunks_fts 에 indexed. +#[test] +fn fts_v009_korean_morphological_2char_query_hits() { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).unwrap(); + store.run_migrations().unwrap(); + + let conn = raw_conn_no_fk(&env); + let text = "한국 문화는 오래되었다"; + let tokenized = tokenize_korean_morphological(text); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at, + tokenized_korean_text + ) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)", + rusqlite::params![ + &"k".repeat(32), + &"d".repeat(32), + text, + tokenized, + ], + ) + .expect("insert chunk with tokenized_korean_text"); + + assert!( + count_match(&conn, "한국") >= 1, + "2-char Korean morpheme '한국' must hit when tokenized column is populated" + ); +} + +/// V009 의 Path A 회귀 확인: 영어 substring 매칭이 사라짐 +/// (unicode61 의 whole-token only 동작). 
+#[test]
+fn fts_v009_english_whole_token_only() {
+    let env = common::TestEnv::new();
+    let store = SqliteStore::open(&env.config()).unwrap();
+    store.run_migrations().unwrap(); // run the store's migrations before the chunk is inserted
+
+    let conn = raw_conn_no_fk(&env); // NOTE(review): presumably a raw connection with FK enforcement off (no parent doc row is inserted here) — confirm in the helper
+    insert_chunk(
+        &conn,
+        &"e".repeat(32), // NOTE(review): presumably the chunk_id — confirm against insert_chunk's signature
+        &"d".repeat(32), // NOTE(review): presumably the doc_id — confirm against insert_chunk's signature
+        "[]",
+        "the tokenizer normalizes whitespace before matching", // contains "tokenizer" but never the standalone word "token"
+    );
+
+    assert_eq!(
+        count_match(&conn, "token"), // only a prefix of a word in the inserted text, never a word of its own
+        0,
+        "V009 unicode61: 'token' is substring of 'tokenizer', should NOT hit"
+    );
+    assert_eq!(
+        count_match(&conn, "tokenizer"), // positive control: the row is matchable, so the zero above is not vacuous
+        1,
+        "V009 unicode61: whole-token 'tokenizer' must hit"
+    );
+}