2026-05-28 14:17:18 +00:00
3 changed files with 177 additions and 34 deletions
--- a/crates/kebab-app/tests/search_korean.rs
+++ b/crates/kebab-app/tests/search_korean.rs
@@ -127,3 +127,72 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
        hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
    );
 }
+
+// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
+
+/// S7 — V009 morphological tokenizer: 2-char Korean queries hit on the
+/// end-to-end lexical path. lindera ko-dic splits '한국어를' → '한국어' and
+/// '서울은' → '서울' into the tokenized_korean_text column for FTS5.
+/// NOTE(review): '한국' presumably matches via '한국의' → '한국' — confirm.
+#[test]
+fn korean_morphological_2char_query_lexical_mode() {
+    let env = TestEnv::lexical_only();
+    let doc_path = env.workspace_root.join("korean-wiki.md");
+    std::fs::write(
+        &doc_path,
+        "# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n",
+    )
+    .expect("write korean-wiki fixture");
+
+    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
+        .expect("ingest must succeed");
+
+    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
+        .expect("search 한국");
+    assert!(
+        !hits.is_empty(),
+        "'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
+        hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
+    );
+
+    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
+        .expect("search 서울");
+    assert!(
+        !hits.is_empty(),
+        "'서울' 2-char Korean query must return at least one hit; got {:?}",
+        hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
+    );
+}
+
+/// S7 — V009 morphological tokenizer: mixed Korean/English lexical hits.
+/// 'Rust' (English whole-token) and '최적화' (Korean morpheme) each hit.
+#[test]
+fn korean_morphological_mixed_english_korean_query() {
+    let env = TestEnv::lexical_only();
+    let doc_path = env.workspace_root.join("rust-optimization.md");
+    std::fs::write(
+        &doc_path,
+        "# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n",
+    )
+    .expect("write rust-optimization fixture");
+
+    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
+        .expect("ingest must succeed");
+
+    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
+        .expect("search Rust");
+    assert!(
+        !hits.is_empty(),
+        "'Rust' English whole-token must hit; got {:?}",
+        hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
+    );
+
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
+            .expect("search 최적화");
+    assert!(
+        !hits.is_empty(),
+        "'최적화' Korean morpheme must hit; got {:?}",
+        hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
+    );
+}
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -157,23 +157,27 @@ impl Retriever for LexicalRetriever {
 ///
 /// v0.17.0 — trigram-aware redesign (see design §5.5 + plan
 /// `docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md`
-/// Task A5). The FTS5 tokenizer is `trigram` so any term shorter than
-/// three Unicode chars has no index entry and would zero out an AND
-/// branch. Korean compounds typically split into 2-char eojeols (e.g.
-/// `해시 충돌`), so a naive token AND drops the dominant usage pattern.
+/// Task A5). Originally the FTS5 tokenizer was `trigram` so any term
+/// shorter than three Unicode chars had no index entry and would zero
+/// out an AND branch. Korean compounds typically split into 2-char
+/// eojeols (e.g. `해시 충돌`), so a naive token AND drops the dominant
+/// usage pattern.
+///
+/// V009 (2026-05-28): the FTS5 tokenizer changed from trigram to
+/// unicode61 plus a Korean morpheme-split column. Unlike trigram,
+/// unicode61 has no minimum token length, so 2-char Korean morpheme
+/// queries ('한국', '서울') can hit via the `tokenized_korean_text`
+/// column. MIN_QUERY_CHARS is lowered to 2 so 2-char queries pass
+/// (1-char queries are still filtered). The OR-combine branch is now
+/// redundant for multi-token Korean queries but kept for future use.
 ///
 /// post-v0.17.1 dogfood — `text` column filter (closure of HOTFIXES
 /// 2026-05-24 `heading_path_json` 노이즈). The `chunks_fts` virtual
 /// table indexes both `heading_path` (the JSON-serialized
-/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. Under
-/// the trigram tokenizer the JSON punctuation (`[`, `"`, `,`) plus the
-/// path segments (`app`, `src`, …) become indexable 3-grams, so a
-/// query can hit a chunk purely because its file's heading JSON shares
-/// a path segment with the query — false positives that have no body
-/// relevance. The default match expression therefore scopes to the
-/// `text` column. The `heading_path` column stays indexed (V007 / §5.5
-/// verbatim block is preserved) so a user who *wants* heading matching
-/// can opt in via raw mode (`'heading_path : foo'`).
+/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. The
+/// default match expression therefore scopes to the `text` column. The
+/// `heading_path` column stays indexed so a user who *wants* heading
+/// matching can opt in via raw mode (`'heading_path : foo'`).
 ///
 /// Rules:
 ///
@@ -185,26 +189,22 @@ impl Retriever for LexicalRetriever {
 ///
 /// - Otherwise build up to two MATCH candidates:
 ///   1. **whole-phrase**: the entire trimmed input wrapped as one FTS5
-///      string literal, *only* if it has ≥3 Unicode chars. FTS5 treats
+///      string literal, *only* if it has ≥2 Unicode chars. FTS5 treats
 ///      a quoted string with spaces as a phrase match.
 ///   2. **token AND**: whitespace-split tokens, kept only when each has
-///      ≥3 Unicode chars (shorter ones are dropped — they would zero
-///      out the AND under trigram).
+///      ≥2 Unicode chars (1-char tokens are dropped).
 ///
 /// - Combine: `(whole) OR (token_and)` when both exist *and differ*;
 ///   either alone when only one exists; `None` when neither exists
 ///   (caller short-circuits to `Ok(vec![])`, avoiding an FTS5 syntax
 ///   error from an empty MATCH).
 ///
-/// - A single-token long query (`러스트`, `foo`) yields `whole == token_and`
+/// - A single-token query (`러스트`, `한국`, `foo`) yields `whole == token_and`
 ///   → return the bare quoted form so the OR doesn't duplicate.
 ///
 /// - Finally wrap the combined expression in `text : (<expr>)` so the
 ///   match is scoped to the body column. FTS5's column-filter syntax
 ///   accepts an arbitrary OR/AND sub-expression inside the parens.
-///
-/// V009 unicode61 + 형태소 tokenizer 환경에서는 multi-token Korean
-/// query 의 OR-combine 분기는 redundant 하나 보존 (future 확장성).
 fn build_match_string(text: &str) -> Option<String> {
    let trimmed = text.trim();
    if trimmed.is_empty() {
@@ -218,15 +218,18 @@ fn build_match_string(text: &str) -> Option<String> {
        return Some(inner_trim.to_string());
    }

-    const MIN_TRIGRAM_CHARS: usize = 3;
+    // V009 unicode61: minimum query token length is 2 Unicode chars.
+    // (V007 trigram required ≥3; unicode61 has no built-in minimum but
+    // single-char queries are too broad to be useful.)
+    const MIN_QUERY_CHARS: usize = 2;

    let whole_candidate: Option<String> =
-        (trimmed.chars().count() >= MIN_TRIGRAM_CHARS).then(|| escape_fts5_token(trimmed));
+        (trimmed.chars().count() >= MIN_QUERY_CHARS).then(|| escape_fts5_token(trimmed));

    let token_and_candidate: Option<String> = {
        let toks: Vec<String> = trimmed
            .split_whitespace()
-            .filter(|t| t.chars().count() >= MIN_TRIGRAM_CHARS)
+            .filter(|t| t.chars().count() >= MIN_QUERY_CHARS)
            .map(escape_fts5_token)
            .collect();
        (!toks.is_empty()).then(|| toks.join(" "))
@@ -651,25 +654,32 @@ mod tests {

    // ── v0.17.0 trigram-aware redesign coverage ──────────────────────────

-    /// 2-char Korean query (`충돌`) yields neither a whole-phrase nor a
-    /// token-AND candidate → `None`. Caller short-circuits to an empty
-    /// hit list rather than executing an FTS5 syntax error on `""` MATCH.
+    /// V009 unicode61: 1-char query yields None (too broad); 2-char Korean
+    /// query now passes the MIN_QUERY_CHARS=2 filter and returns a valid
+    /// match expression.
    #[test]
    fn build_match_string_short_korean_returns_none() {
-        assert!(build_match_string("충돌").is_none());
+        // 1-char queries remain filtered (too broad).
        assert!(build_match_string("키").is_none());
-        assert!(build_match_string(" 충돌 ").is_none());
+        assert!(build_match_string("나").is_none());
+        // 2-char Korean queries now produce a valid expression (V009 unicode61).
+        assert_eq!(
+            build_match_string("충돌").unwrap(),
+            r#"text : ("충돌")"#
+        );
+        assert_eq!(
+            build_match_string(" 충돌 ").unwrap(),
+            r#"text : ("충돌")"#
+        );
    }

-    /// `해시 충돌` — both tokens are 2 chars (dropped from the AND), but
-    /// the whole-phrase candidate (`"해시 충돌"`, 5 chars total) survives.
-    /// This is the dominant Korean usage pattern targeted by A5.
-    /// The whole-phrase candidate is then wrapped in the `text : (...)`
-    /// column filter.
+    /// V009 unicode61: `해시 충돌` — both tokens are 2 chars and now pass
+    /// MIN_QUERY_CHARS=2. Both whole-phrase and token-AND candidates exist
+    /// and differ → OR-combined inside `text : (...)`.
    #[test]
    fn build_match_string_whole_phrase_only_when_all_tokens_short() {
        let s = build_match_string("해시 충돌").unwrap();
-        assert_eq!(s, r#"text : ("해시 충돌")"#);
+        assert_eq!(s, r#"text : (("해시 충돌") OR ("해시" "충돌"))"#);
    }

    /// Single long token: whole-phrase and token-AND candidates collapse
--- a/crates/kebab-store-sqlite/tests/fts.rs
+++ b/crates/kebab-store-sqlite/tests/fts.rs
@@ -581,3 +581,67 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() {
    // substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit.
    assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit");
 }
+
+// ── 8. V009 morphological tokenizer behavior ──────────────────────────
+
+/// Core value of V009: a 2-char Korean query hits. The morpheme-split
+/// tokenized_korean_text column is indexed in chunks_fts.
+#[test]
+fn fts_v009_korean_morphological_2char_query_hits() {
+    let env = common::TestEnv::new();
+    let store = SqliteStore::open(&env.config()).unwrap();
+    store.run_migrations().unwrap();
+
+    let conn = raw_conn_no_fk(&env);
+    let text = "한국 문화는 오래되었다";
+    let tokenized = tokenize_korean_morphological(text);
+    conn.execute(
+        "INSERT INTO chunks (
+            chunk_id, doc_id, text, heading_path_json, section_label,
+            source_spans_json, token_estimate, chunker_version,
+            policy_hash, block_ids_json, created_at,
+            tokenized_korean_text
+        ) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)",
+        rusqlite::params![
+            &"k".repeat(32),
+            &"d".repeat(32),
+            text,
+            tokenized,
+        ],
+    )
+    .expect("insert chunk with tokenized_korean_text");
+
+    assert!(
+        count_match(&conn, "한국") >= 1,
+        "2-char Korean morpheme '한국' must hit when tokenized column is populated"
+    );
+}
+
+/// V009 Path A regression check: English substring matching is gone
+/// (unicode61 matches whole tokens only).
+#[test]
+fn fts_v009_english_whole_token_only() {
+    let env = common::TestEnv::new();
+    let store = SqliteStore::open(&env.config()).unwrap();
+    store.run_migrations().unwrap();
+
+    let conn = raw_conn_no_fk(&env);
+    insert_chunk(
+        &conn,
+        &"e".repeat(32),
+        &"d".repeat(32),
+        "[]",
+        "the tokenizer normalizes whitespace before matching",
+    );
+
+    assert_eq!(
+        count_match(&conn, "token"),
+        0,
+        "V009 unicode61: 'token' is substring of 'tokenizer', should NOT hit"
+    );
+    assert_eq!(
+        count_match(&conn, "tokenizer"),
+        1,
+        "V009 unicode61: whole-token 'tokenizer' must hit"
+    );
+}