feat(v0.20.1): 한국어 morphological tokenizer (V009) + N-gram supplement + eager backfill #191

Merged
altair823 merged 25 commits from feat/korean-morphological-tokenizer into main 2026-05-28 14:17:18 +00:00
Showing only changes of commit f94e0c4a9b - Show all commits

View File

@@ -993,8 +993,16 @@ impl App {
/// the active config. This token surfaces in `SearchHit.index_version`
/// and on snapshot tests; including the chunker version pins it to
/// the chunking policy in effect.
///
/// V009 (2026-05-28): the FTS5 tokenizer changed from trigram to unicode61
/// plus a Korean morphological-decomposition column. The
/// `fts5-v009-korean-morphological` suffix distinguishes this version from
/// the V007 baseline, so the eval runner's config snapshot and search-cache
/// invalidation pick up the change.
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
    // Tag identifying the FTS5 tokenizer generation. It is appended to the
    // chunker version so the resulting token differs from the earlier
    // chunker-only `lex:{chunker_version}` form.
    const FTS5_SCHEMA_TAG: &str = "fts5-v009-korean-morphological";

    // Single tail expression: the superseded chunker-only token is no longer
    // constructed here, so the function returns exactly one value.
    IndexVersion(format!(
        "lex:{}:{}",
        config.chunking.chunker_version, FTS5_SCHEMA_TAG
    ))
}
/// p9-fb-37: stand-in for the vector retriever in the trace path when