feat(v0.20.1): 한국어 morphological tokenizer (V009) + N-gram supplement + eager backfill #191

Merged
altair823 merged 25 commits from feat/korean-morphological-tokenizer into main 2026-05-28 14:17:18 +00:00
3 changed files with 177 additions and 34 deletions
Showing only changes of commit c5de5f812b - Show all commits

View File

@@ -127,3 +127,72 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
);
}
// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
/// S7 — V009 morphological tokenizer: a two-character Korean query must hit
/// end-to-end through the lexical search path.
///
/// Intended pipeline (per the V009 design): lindera ko-dic decomposes
/// '한국어를' into the morpheme '한국어' and '서울은' into '서울', the result is
/// recorded in the `tokenized_korean_text` column, and FTS5 matches against it.
#[test]
fn korean_morphological_2char_query_lexical_mode() {
    let env = TestEnv::lexical_only();

    // Fixture: a small Korean document whose words carry particles, so a bare
    // 2-char query can only hit via the decomposed morphemes.
    let fixture_path = env.workspace_root.join("korean-wiki.md");
    let fixture_body = "# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n";
    std::fs::write(&fixture_path, fixture_body).expect("write korean-wiki fixture");

    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
        .expect("ingest must succeed");

    // First 2-char query.
    let hanguk_hits =
        kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
            .expect("search 한국");
    assert!(
        !hanguk_hits.is_empty(),
        "'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
        hanguk_hits
            .iter()
            .map(|hit| &hit.doc_path.0)
            .collect::<Vec<_>>()
    );

    // Second 2-char query.
    let seoul_hits =
        kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
            .expect("search 서울");
    assert!(
        !seoul_hits.is_empty(),
        "'서울' 2-char Korean query must return at least one hit; got {:?}",
        seoul_hits
            .iter()
            .map(|hit| &hit.doc_path.0)
            .collect::<Vec<_>>()
    );
}
/// S7 — V009 morphological tokenizer: mixed Korean/English content must be
/// reachable through the lexical path from either language.
///
/// 'Rust' (an English whole token) and '최적화' (a Korean morpheme) are
/// queried separately and each must produce at least one hit.
#[test]
fn korean_morphological_mixed_english_korean_query() {
    let env = TestEnv::lexical_only();

    // Fixture: one document mixing an English token with Korean text.
    let fixture_path = env.workspace_root.join("rust-optimization.md");
    let fixture_body = "# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n";
    std::fs::write(&fixture_path, fixture_body).expect("write rust-optimization fixture");

    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
        .expect("ingest must succeed");

    // English side of the mixed document.
    let english_hits =
        kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
            .expect("search Rust");
    assert!(
        !english_hits.is_empty(),
        "'Rust' English whole-token must hit; got {:?}",
        english_hits
            .iter()
            .map(|hit| &hit.doc_path.0)
            .collect::<Vec<_>>()
    );

    // Korean side of the mixed document.
    let korean_hits =
        kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
            .expect("search 최적화");
    assert!(
        !korean_hits.is_empty(),
        "'최적화' Korean morpheme must hit; got {:?}",
        korean_hits
            .iter()
            .map(|hit| &hit.doc_path.0)
            .collect::<Vec<_>>()
    );
}

View File

@@ -157,23 +157,27 @@ impl Retriever for LexicalRetriever {
///
/// v0.17.0 — trigram-aware redesign (see design §5.5 + plan
/// `docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md`
/// Task A5). The FTS5 tokenizer is `trigram` so any term shorter than
/// three Unicode chars has no index entry and would zero out an AND
/// branch. Korean compounds typically split into 2-char eojeols (e.g.
/// `해시 충돌`), so a naive token AND drops the dominant usage pattern.
/// Task A5). Originally the FTS5 tokenizer was `trigram` so any term
/// shorter than three Unicode chars had no index entry and would zero
/// out an AND branch. Korean compounds typically split into 2-char
/// eojeols (e.g. `해시 충돌`), so a naive token AND drops the dominant
/// usage pattern.
///
/// V009 (2026-05-28): FTS5 tokenizer 가 trigram → unicode61 + 한국어
/// 형태소 분해 column 로 갱신됨. unicode61 은 trigram 과 달리 최소
/// token 길이 제한이 없어 2자 한국어 morpheme query ('한국', '서울')
/// 가 `tokenized_korean_text` column 경유로 hit 가능. MIN_QUERY_CHARS
/// 를 2 로 낮춰 2자 query 를 통과시킨다 (1자 단독은 여전히 필터).
/// multi-token Korean query 의 OR-combine 분기는 redundant 하나 보존
/// (future 확장성).
///
/// post-v0.17.1 dogfood — `text` column filter (closure of HOTFIXES
/// 2026-05-24 `heading_path_json` 노이즈). The `chunks_fts` virtual
/// table indexes both `heading_path` (the JSON-serialized
/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. Under
/// the trigram tokenizer the JSON punctuation (`[`, `"`, `,`) plus the
/// path segments (`app`, `src`, …) become indexable 3-grams, so a
/// query can hit a chunk purely because its file's heading JSON shares
/// a path segment with the query — false positives that have no body
/// relevance. The default match expression therefore scopes to the
/// `text` column. The `heading_path` column stays indexed (V007 / §5.5
/// verbatim block is preserved) so a user who *wants* heading matching
/// can opt in via raw mode (`'heading_path : foo'`).
/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. The
/// default match expression therefore scopes to the `text` column. The
/// `heading_path` column stays indexed so a user who *wants* heading
/// matching can opt in via raw mode (`'heading_path : foo'`).
///
/// Rules:
///
@@ -185,26 +189,22 @@ impl Retriever for LexicalRetriever {
///
/// - Otherwise build up to two MATCH candidates:
/// 1. **whole-phrase**: the entire trimmed input wrapped as one FTS5
/// string literal, *only* if it has ≥3 Unicode chars. FTS5 treats
/// string literal, *only* if it has ≥2 Unicode chars. FTS5 treats
/// a quoted string with spaces as a phrase match.
/// 2. **token AND**: whitespace-split tokens, kept only when each has
/// ≥3 Unicode chars (shorter ones are dropped — they would zero
/// out the AND under trigram).
/// ≥2 Unicode chars (1-char tokens are dropped).
///
/// - Combine: `(whole) OR (token_and)` when both exist *and differ*;
/// either alone when only one exists; `None` when neither exists
/// (caller short-circuits to `Ok(vec![])`, avoiding an FTS5 syntax
/// error from an empty MATCH).
///
/// - A single-token long query (`러스트`, `foo`) yields `whole == token_and`
/// - A single-token query (`러스트`, `한국`, `foo`) yields `whole == token_and`
/// → return the bare quoted form so the OR doesn't duplicate.
///
/// - Finally wrap the combined expression in `text : (<expr>)` so the
/// match is scoped to the body column. FTS5's column-filter syntax
/// accepts an arbitrary OR/AND sub-expression inside the parens.
///
/// V009 unicode61 + 형태소 tokenizer 환경에서는 multi-token Korean
/// query 의 OR-combine 분기는 redundant 하나 보존 (future 확장성).
fn build_match_string(text: &str) -> Option<String> {
let trimmed = text.trim();
if trimmed.is_empty() {
@@ -218,15 +218,18 @@ fn build_match_string(text: &str) -> Option<String> {
return Some(inner_trim.to_string());
}
const MIN_TRIGRAM_CHARS: usize = 3;
// V009 unicode61: minimum query token length is 2 Unicode chars.
// (V007 trigram required ≥3; unicode61 has no built-in minimum but
// single-char queries are too broad to be useful.)
const MIN_QUERY_CHARS: usize = 2;
let whole_candidate: Option<String> =
(trimmed.chars().count() >= MIN_TRIGRAM_CHARS).then(|| escape_fts5_token(trimmed));
(trimmed.chars().count() >= MIN_QUERY_CHARS).then(|| escape_fts5_token(trimmed));
let token_and_candidate: Option<String> = {
let toks: Vec<String> = trimmed
.split_whitespace()
.filter(|t| t.chars().count() >= MIN_TRIGRAM_CHARS)
.filter(|t| t.chars().count() >= MIN_QUERY_CHARS)
.map(escape_fts5_token)
.collect();
(!toks.is_empty()).then(|| toks.join(" "))
@@ -651,25 +654,32 @@ mod tests {
// ── v0.17.0 trigram-aware redesign coverage ──────────────────────────
/// V009 unicode61: a query with no token of at least 2 Unicode chars (the
/// empty string, or a lone 1-char term) yields `None`, so the caller
/// short-circuits to an empty hit list rather than executing an FTS5 syntax
/// error on an empty MATCH. A 2-char Korean query (`충돌`) now passes the
/// `MIN_QUERY_CHARS = 2` filter and returns a valid match expression.
///
/// The test name predates V009, when 2-char queries were still rejected
/// under the trigram tokenizer's 3-char minimum.
#[test]
fn build_match_string_short_korean_returns_none() {
    // Empty and 1-char queries remain filtered (too broad).
    assert!(build_match_string("").is_none());
    assert!(build_match_string("충").is_none());
    // 2-char Korean queries now produce a valid expression (V009 unicode61);
    // surrounding whitespace is trimmed before the expression is built.
    assert_eq!(
        build_match_string("충돌").as_deref(),
        Some(r#"text : ("충돌")"#)
    );
    assert_eq!(
        build_match_string(" 충돌 ").as_deref(),
        Some(r#"text : ("충돌")"#)
    );
}
/// V009 unicode61: `해시 충돌` — both tokens are 2 chars and pass
/// `MIN_QUERY_CHARS = 2`, so the whole-phrase candidate (`"해시 충돌"`) and
/// the token-AND candidate (`"해시" "충돌"`) both exist and differ. They are
/// therefore OR-combined and wrapped in the `text : (...)` column filter.
///
/// The test name predates V009: under the trigram tokenizer the 2-char
/// tokens were dropped and only the whole-phrase candidate survived.
#[test]
fn build_match_string_whole_phrase_only_when_all_tokens_short() {
    let s = build_match_string("해시 충돌").unwrap();
    assert_eq!(s, r#"text : (("해시 충돌") OR ("해시" "충돌"))"#);
}
/// Single long token: whole-phrase and token-AND candidates collapse

View File

@@ -581,3 +581,67 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() {
// substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit.
assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit");
}
// ── 8. V009 morphological tokenizer behavior ──────────────────────────
/// Core value of V009: a two-character Korean query produces a hit. The
/// morphologically decomposed `tokenized_korean_text` column is indexed in
/// `chunks_fts`.
///
/// The test inserts a chunk row directly, supplying both the raw text and its
/// tokenized form, then checks that the 2-char morpheme '한국' matches.
#[test]
fn fts_v009_korean_morphological_2char_query_hits() {
let env = common::TestEnv::new();
// Open the store and apply migrations so the schema (including the
// `tokenized_korean_text` column inserted into below) exists.
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
// NOTE(review): presumably a raw connection with foreign-key enforcement
// disabled, so the chunk can reference a doc_id with no documents row —
// confirm against the helper.
let conn = raw_conn_no_fk(&env);
let text = "한국 문화는 오래되었다";
// Tokenized form of `text`, stored alongside it in the same row.
let tokenized = tokenize_korean_morphological(text);
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at,
tokenized_korean_text
) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)",
// Positional parameters, in order: chunk_id, doc_id, text,
// tokenized_korean_text. The ids are 32-character placeholder strings.
rusqlite::params![
&"k".repeat(32),
&"d".repeat(32),
text,
tokenized,
],
)
.expect("insert chunk with tokenized_korean_text");
// NOTE(review): `count_match` presumably returns the number of `chunks_fts`
// rows matching the query — confirm against the helper.
assert!(
count_match(&conn, "한국") >= 1,
"2-char Korean morpheme '한국' must hit when tokenized column is populated"
);
}
/// V009 Path A regression check: English substring matching is gone, because
/// the unicode61 tokenizer matches whole tokens only.
///
/// A chunk containing the word "tokenizer" must not be found by the query
/// "token", but must be found by the query "tokenizer".
#[test]
fn fts_v009_english_whole_token_only() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    // 32-character placeholder identifiers passed to the insert helper
    // (presumably chunk id then doc id — verify against `insert_chunk`).
    let chunk_id = "e".repeat(32);
    let doc_id = "d".repeat(32);
    let body = "the tokenizer normalizes whitespace before matching";
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", body);

    // A strict prefix of an indexed token must not match.
    let substring_hits = count_match(&conn, "token");
    assert_eq!(
        substring_hits, 0,
        "V009 unicode61: 'token' is substring of 'tokenizer', should NOT hit"
    );

    // The full token must match exactly the one inserted chunk.
    let whole_token_hits = count_match(&conn, "tokenizer");
    assert_eq!(
        whole_token_hits, 1,
        "V009 unicode61: whole-token 'tokenizer' must hit"
    );
}