test(fts,app): V009 morphological tokenizer integration tests
신규 4 test 추가:
- crates/kebab-store-sqlite/tests/fts.rs:
- fts_v009_korean_morphological_2char_query_hits: tokenized_korean_text
column 이 채워진 chunk 의 '한국' 2-char query hit.
- fts_v009_english_whole_token_only: V007 trigram 의 substring 매칭이
V009 에서 사라졌음을 확인 (Path A 회귀) — 'token' query 는 'tokenizer' chunk 에서 0-hit, 'tokenizer' query 는 1-hit.
- crates/kebab-app/tests/search_korean.rs:
- korean_morphological_2char_query_lexical_mode: end-to-end
한국어 wiki fixture ingest → '한국' / '서울' query hit.
- korean_morphological_mixed_english_korean_query: 'Rust' English
whole-token + '최적화' Korean morpheme hit.
crates/kebab-search/src/lexical.rs:
- build_match_string() 의 MIN_TRIGRAM_CHARS(3) → MIN_QUERY_CHARS(2).
V009 unicode61 은 최소 token 길이 제한 없어 2자 한국어 morpheme
query 가 통과되어야 함. 1자 단독은 여전히 필터.
- 관련 unit test 2개 V009 동작으로 갱신.
fixture text 는 lindera ko-dic 의 실제 segmentation 결과에 의존한다. 현재 기대값은
실측이 아니라 spec Appendix B 의 prior-knowledge 예측에 근거하므로, 실측 결과가 다르면 fixture 를 조정할 수 있다.
Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §9.1, §9.2
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S7)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -127,3 +127,72 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
|
||||
|
||||
/// S7 — V009 morphological tokenizer: a 2-character Korean query must hit
/// end-to-end on the lexical path.
|
||||
/// Writes a small Korean wiki fixture into the workspace, ingests it, then
/// runs lexical searches for `한국` and `서울` and asserts that each returns
/// at least one hit.
|
||||
/// Expected segmentation (as stated by the original note; not checked by
/// this test): lindera ko-dic splits `한국어를` → `한국어` and `서울은` →
/// `서울`, and the morphemes are recorded in the `tokenized_korean_text`
/// column that FTS5 matches against.
|
||||
/// NOTE(review): per the segmentation above, `한국어를` yields `한국어`, not
/// `한국`; the `한국` hit presumably comes from `한국의` → `한국` in the
/// second sentence — confirm against real ko-dic output.
|
||||
#[test]
|
||||
fn korean_morphological_2char_query_lexical_mode() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("korean-wiki.md");
|
||||
// Fixture: every occurrence of the queried stems carries a suffix
// (`한국어`, `한국어를`, `한국의`, `서울은`); none is a bare word.
std::fs::write(
|
||||
&doc_path,
|
||||
"# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n",
|
||||
)
|
||||
.expect("write korean-wiki fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
// First query: the 2-char stem `한국`.
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
|
||||
.expect("search 한국");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Second query: the 2-char stem `서울` (shadows the previous `hits`).
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
|
||||
.expect("search 서울");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'서울' 2-char Korean query must return at least one hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// S7 — V009 morphological tokenizer: lexical hits on a mixed
/// Korean/English document.
|
||||
/// Ingests one document containing both `Rust` and `최적화`, then asserts
/// that a lexical search for `Rust` (English whole token) and a separate
/// lexical search for `최적화` (Korean morpheme) each return at least one hit.
|
||||
#[test]
|
||||
fn korean_morphological_mixed_english_korean_query() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-optimization.md");
|
||||
// Fixture: `최적화` appears bare in the heading and with a particle
// (`최적화는`) in the body.
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n",
|
||||
)
|
||||
.expect("write rust-optimization fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
// English query.
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
|
||||
.expect("search Rust");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'Rust' English whole-token must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Korean query (shadows the previous `hits`).
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
.expect("search 최적화");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'최적화' Korean morpheme must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -157,23 +157,27 @@ impl Retriever for LexicalRetriever {
|
||||
///
|
||||
/// v0.17.0 — trigram-aware redesign (see design §5.5 + plan
|
||||
/// `docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md`
|
||||
/// Task A5). The FTS5 tokenizer is `trigram` so any term shorter than
|
||||
/// three Unicode chars has no index entry and would zero out an AND
|
||||
/// branch. Korean compounds typically split into 2-char eojeols (e.g.
|
||||
/// `해시 충돌`), so a naive token AND drops the dominant usage pattern.
|
||||
/// Task A5). Originally the FTS5 tokenizer was `trigram` so any term
|
||||
/// shorter than three Unicode chars had no index entry and would zero
|
||||
/// out an AND branch. Korean compounds typically split into 2-char
|
||||
/// eojeols (e.g. `해시 충돌`), so a naive token AND drops the dominant
|
||||
/// usage pattern.
|
||||
///
|
||||
/// V009 (2026-05-28): the FTS5 tokenizer changed from trigram to
|
||||
/// unicode61 plus a Korean morpheme-segmented column. Unlike trigram,
|
||||
/// unicode61 has no minimum token length, so 2-char Korean morpheme
|
||||
/// queries ('한국', '서울') can hit via the `tokenized_korean_text`
|
||||
/// column. MIN_QUERY_CHARS is therefore lowered to 2 so that 2-char
|
||||
/// queries pass (a lone 1-char query is still filtered). The OR-combine
|
||||
/// branch for multi-token Korean queries is now redundant but is kept
/// for future extensibility.
|
||||
///
|
||||
/// post-v0.17.1 dogfood — `text` column filter (closure of HOTFIXES
|
||||
/// 2026-05-24 `heading_path_json` 노이즈). The `chunks_fts` virtual
|
||||
/// table indexes both `heading_path` (the JSON-serialized
|
||||
/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. Under
|
||||
/// the trigram tokenizer the JSON punctuation (`[`, `"`, `,`) plus the
|
||||
/// path segments (`app`, `src`, …) become indexable 3-grams, so a
|
||||
/// query can hit a chunk purely because its file's heading JSON shares
|
||||
/// a path segment with the query — false positives that have no body
|
||||
/// relevance. The default match expression therefore scopes to the
|
||||
/// `text` column. The `heading_path` column stays indexed (V007 / §5.5
|
||||
/// verbatim block is preserved) so a user who *wants* heading matching
|
||||
/// can opt in via raw mode (`'heading_path : foo'`).
|
||||
/// `chunks.heading_path_json` per V002/V007 triggers) and `text`. The
|
||||
/// default match expression therefore scopes to the `text` column. The
|
||||
/// `heading_path` column stays indexed so a user who *wants* heading
|
||||
/// matching can opt in via raw mode (`'heading_path : foo'`).
|
||||
///
|
||||
/// Rules:
|
||||
///
|
||||
@@ -185,26 +189,22 @@ impl Retriever for LexicalRetriever {
|
||||
///
|
||||
/// - Otherwise build up to two MATCH candidates:
|
||||
/// 1. **whole-phrase**: the entire trimmed input wrapped as one FTS5
|
||||
/// string literal, *only* if it has ≥3 Unicode chars. FTS5 treats
|
||||
/// string literal, *only* if it has ≥2 Unicode chars. FTS5 treats
|
||||
/// a quoted string with spaces as a phrase match.
|
||||
/// 2. **token AND**: whitespace-split tokens, kept only when each has
|
||||
/// ≥3 Unicode chars (shorter ones are dropped — they would zero
|
||||
/// out the AND under trigram).
|
||||
/// ≥2 Unicode chars (1-char tokens are dropped).
|
||||
///
|
||||
/// - Combine: `(whole) OR (token_and)` when both exist *and differ*;
|
||||
/// either alone when only one exists; `None` when neither exists
|
||||
/// (caller short-circuits to `Ok(vec![])`, avoiding an FTS5 syntax
|
||||
/// error from an empty MATCH).
|
||||
///
|
||||
/// - A single-token long query (`러스트`, `foo`) yields `whole == token_and`
|
||||
/// - A single-token query (`러스트`, `한국`, `foo`) yields `whole == token_and`
|
||||
/// → return the bare quoted form so the OR doesn't duplicate.
|
||||
///
|
||||
/// - Finally wrap the combined expression in `text : (<expr>)` so the
|
||||
/// match is scoped to the body column. FTS5's column-filter syntax
|
||||
/// accepts an arbitrary OR/AND sub-expression inside the parens.
|
||||
///
|
||||
/// V009 unicode61 + 형태소 tokenizer 환경에서는 multi-token Korean
|
||||
/// query 의 OR-combine 분기는 redundant 하나 보존 (future 확장성).
|
||||
fn build_match_string(text: &str) -> Option<String> {
|
||||
let trimmed = text.trim();
|
||||
if trimmed.is_empty() {
|
||||
@@ -218,15 +218,18 @@ fn build_match_string(text: &str) -> Option<String> {
|
||||
return Some(inner_trim.to_string());
|
||||
}
|
||||
|
||||
const MIN_TRIGRAM_CHARS: usize = 3;
|
||||
// V009 unicode61: minimum query token length is 2 Unicode chars.
|
||||
// (V007 trigram required ≥3; unicode61 has no built-in minimum but
|
||||
// single-char queries are too broad to be useful.)
|
||||
const MIN_QUERY_CHARS: usize = 2;
|
||||
|
||||
let whole_candidate: Option<String> =
|
||||
(trimmed.chars().count() >= MIN_TRIGRAM_CHARS).then(|| escape_fts5_token(trimmed));
|
||||
(trimmed.chars().count() >= MIN_QUERY_CHARS).then(|| escape_fts5_token(trimmed));
|
||||
|
||||
let token_and_candidate: Option<String> = {
|
||||
let toks: Vec<String> = trimmed
|
||||
.split_whitespace()
|
||||
.filter(|t| t.chars().count() >= MIN_TRIGRAM_CHARS)
|
||||
.filter(|t| t.chars().count() >= MIN_QUERY_CHARS)
|
||||
.map(escape_fts5_token)
|
||||
.collect();
|
||||
(!toks.is_empty()).then(|| toks.join(" "))
|
||||
@@ -651,25 +654,32 @@ mod tests {
|
||||
|
||||
// ── v0.17.0 trigram-aware redesign coverage ──────────────────────────
|
||||
|
||||
/// Minimum-length filter of `build_match_string` under V009
/// (`MIN_QUERY_CHARS = 2`).
///
/// - A 1-char query still yields `None` (too broad to be useful), so the
///   caller can short-circuit to an empty hit list instead of running an
///   empty FTS5 `MATCH`.
/// - A 2-char Korean query (`충돌`) now passes the filter and produces a
///   match expression scoped to the `text` column; surrounding whitespace
///   is trimmed before the expression is built.
///
/// The test name predates V009 (when 2-char queries returned `None`) and is
/// kept unchanged so existing test filters keep working.
#[test]
fn build_match_string_short_korean_returns_none() {
    // 1-char queries remain filtered.
    assert!(build_match_string("키").is_none());
    assert!(build_match_string("나").is_none());

    // 2-char Korean queries produce a valid expression. `as_deref` compares
    // the `Option` directly, so a `None` shows up as a readable assertion
    // diff rather than an `unwrap` panic.
    assert_eq!(
        build_match_string("충돌").as_deref(),
        Some(r#"text : ("충돌")"#)
    );
    assert_eq!(
        build_match_string(" 충돌 ").as_deref(),
        Some(r#"text : ("충돌")"#)
    );
}
|
||||
|
||||
/// `해시 충돌` under V009 (`MIN_QUERY_CHARS = 2`): both whitespace-split
/// tokens are 2 chars long and therefore survive the token filter.
///
/// The whole-phrase candidate (`"해시 충돌"`) and the token-AND candidate
/// (`"해시" "충돌"`) both exist and differ, so they are OR-combined inside
/// the `text : (...)` column filter.
///
/// The test name predates V009 (when only the whole-phrase candidate
/// survived) and is kept unchanged so existing test filters keep working.
#[test]
fn build_match_string_whole_phrase_only_when_all_tokens_short() {
    assert_eq!(
        build_match_string("해시 충돌").as_deref(),
        Some(r#"text : (("해시 충돌") OR ("해시" "충돌"))"#)
    );
}
|
||||
|
||||
/// Single long token: whole-phrase and token-AND candidates collapse
|
||||
|
||||
@@ -581,3 +581,67 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() {
|
||||
// substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit.
|
||||
assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit");
|
||||
}
|
||||
|
||||
// ── 8. V009 morphological tokenizer behavior ──────────────────────────
|
||||
|
||||
/// Core value of V009: a 2-character Korean query hits.
///
/// Inserts one chunk row whose `tokenized_korean_text` column holds the
/// output of `tokenize_korean_morphological` for the chunk text, then
/// asserts that `count_match` finds at least one match for `한국`.
|
||||
/// The original note states that the morpheme-segmented
/// `tokenized_korean_text` column is indexed in `chunks_fts`.
///
/// NOTE(review): the raw `text` already contains `한국` as a
/// whitespace-delimited word, so this assertion may pass without the
/// tokenized column, depending on which columns `count_match` searches —
/// consider a fixture such as `한국의` where the stem is reachable only
/// after segmentation.
|
||||
#[test]
|
||||
fn fts_v009_korean_morphological_2char_query_hits() {
|
||||
let env = common::TestEnv::new();
|
||||
let store = SqliteStore::open(&env.config()).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
|
||||
// Presumably a raw connection with foreign-key enforcement off, so the
// placeholder `doc_id` below needs no parent row — confirm in the helper.
let conn = raw_conn_no_fk(&env);
|
||||
let text = "한국 문화는 오래되었다";
|
||||
let tokenized = tokenize_korean_morphological(text);
|
||||
// Insert the chunk directly, binding chunk_id, doc_id, text and the
// pre-tokenized text; the remaining columns are fixed placeholder values.
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at,
|
||||
tokenized_korean_text
|
||||
) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)",
|
||||
rusqlite::params![
|
||||
&"k".repeat(32),
|
||||
&"d".repeat(32),
|
||||
text,
|
||||
tokenized,
|
||||
],
|
||||
)
|
||||
.expect("insert chunk with tokenized_korean_text");
|
||||
|
||||
assert!(
|
||||
count_match(&conn, "한국") >= 1,
|
||||
"2-char Korean morpheme '한국' must hit when tokenized column is populated"
|
||||
);
|
||||
}
|
||||
|
||||
/// Pins the V009 "Path A" regression: English substring matching is gone.
///
/// Inserts one chunk containing the word `tokenizer` and asserts that
/// `count_match` returns 0 for the substring `token` and 1 for the whole
/// word `tokenizer`.
|
||||
/// The original note attributes this to unicode61 matching whole tokens only.
|
||||
#[test]
|
||||
fn fts_v009_english_whole_token_only() {
|
||||
let env = common::TestEnv::new();
|
||||
let store = SqliteStore::open(&env.config()).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
|
||||
let conn = raw_conn_no_fk(&env);
|
||||
// Arguments after `&conn` look like chunk id, doc id, heading-path JSON and
// body text — confirm against the `insert_chunk` helper's signature.
insert_chunk(
|
||||
&conn,
|
||||
&"e".repeat(32),
|
||||
&"d".repeat(32),
|
||||
"[]",
|
||||
"the tokenizer normalizes whitespace before matching",
|
||||
);
|
||||
|
||||
// `token` is only a prefix of the indexed word `tokenizer`.
assert_eq!(
|
||||
count_match(&conn, "token"),
|
||||
0,
|
||||
"V009 unicode61: 'token' is substring of 'tokenizer', should NOT hit"
|
||||
);
|
||||
assert_eq!(
|
||||
count_match(&conn, "tokenizer"),
|
||||
1,
|
||||
"V009 unicode61: whole-token 'tokenizer' must hit"
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user