feat(chunk): integrate lindera korean morphological tokenizer

V009 의 tokenized_korean_text column 에 들어갈 morpheme sequence 를 lindera ko-dic 으로 분해. chunk builder pipeline 의 chunk 생성 직후 시점에서 호출 → chunk struct 의 field 에 pre-fill → store 의 put_chunks 가 단일 transaction 안에서 INSERT. - crates/kebab-core/src/chunk.rs: Chunk struct 에 tokenized_korean_text: Option<String> field 추가 (#[serde(default)]). - crates/kebab-chunk/src/lib.rs: tokenize_korean_morphological() helper + OnceLock 캐싱 + fallback (None) 정책. - crates/kebab-chunk/Cargo.toml: lindera features = ["embed-ko-dic"] 추가 (DictionaryKind::KoDic 활성화에 필요). - 모든 chunker (tier2_shared, md_heading_v1, pdf_page_v1, 9개 code AST v1): Chunk 리터럴에 tokenized_korean_text pre-fill. - crates/kebab-store-sqlite/src/documents.rs::put_chunks: INSERT SQL column list + placeholder + binding 갱신 (12번째 column). - crates/kebab-chunk/tests/tokenize_korean.rs: 단위 테스트 2개. lindera 3.0.7 API 정정: load_dictionary_from_kind → load_embedded_dictionary, Token.text → Token.surface. Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §6.2 Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S3) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 10:22:15 +00:00
parent 597d8b70ad
commit b134ae9dd5
20 changed files with 80 additions and 3 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5392,6 +5392,7 @@ dependencies = [
 "daachorse",
 "kanaria",
 "lindera-dictionary",
+ "lindera-ko-dic",
 "log",
 "once_cell",
 "percent-encoding",
--- a/crates/kebab-chunk/Cargo.toml
+++ b/crates/kebab-chunk/Cargo.toml
@@ -14,7 +14,7 @@ blake3                     = { workspace = true }
 anyhow                     = { workspace = true }
 tracing                    = { workspace = true }
 serde_yaml                 = { workspace = true }
-lindera        = { workspace = true }
+lindera        = { workspace = true, features = ["embed-ko-dic"] }
 lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] }

 [dev-dependencies]
--- a/crates/kebab-chunk/src/code_c_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_c_ast_v1.rs
@@ -145,6 +145,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_cpp_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_go_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_go_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_java_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_java_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_js_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_js_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_python_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_python_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_rust_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_rust_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/code_ts_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs
@@ -147,6 +147,7 @@ fn make_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids: block_ids.to_vec(),
+        tokenized_korean_text: None,
        text,
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/src/lib.rs
+++ b/crates/kebab-chunk/src/lib.rs
@@ -47,3 +47,45 @@ pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
 pub use manifest_file_v1::ManifestFileV1Chunker;
 pub use md_heading_v1::MdHeadingV1Chunker;
 pub use pdf_page_v1::PdfPageV1Chunker;
+
+// ── Korean morphological tokenizer ───────────────────────────────────────────
+
+use lindera::dictionary::{load_embedded_dictionary, DictionaryKind};
+use lindera::mode::Mode;
+use lindera::segmenter::Segmenter;
+use lindera::tokenizer::Tokenizer;
+
+static KOREAN_TOKENIZER: std::sync::OnceLock<Option<Tokenizer>> = std::sync::OnceLock::new();
+
+/// Splits Korean chunk text into morphemes with the embedded lindera ko-dic dictionary
+/// and returns the token surfaces joined by single spaces. Chunkers use the result to
+/// pre-fill `Chunk.tokenized_korean_text`.
+///
+/// Returns `None` when `text` is blank, the dictionary fails to load, tokenization
+/// fails, or the joined output is empty; callers treat `None` as a NULL fallback.
+/// The tokenizer is initialized once through a `OnceLock`, so a failed dictionary load
+/// is cached and every later call keeps returning `None`.
+pub fn tokenize_korean_morphological(text: &str) -> Option<String> {
+    if text.trim().is_empty() {
+        return None;
+    }
+    let tokenizer = KOREAN_TOKENIZER
+        .get_or_init(|| match load_embedded_dictionary(DictionaryKind::KoDic) {
+            Ok(dict) => Some(Tokenizer::new(Segmenter::new(Mode::Normal, dict, None))),
+            Err(e) => {
+                tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: dict load failed: {e}");
+                None
+            }
+        })
+        .as_ref()?;
+    let tokens = match tokenizer.tokenize(text) {
+        Ok(tokens) => tokens,
+        Err(e) => {
+            // Still best-effort (None), but log it so failures are not silently dropped.
+            tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: tokenize failed: {e}");
+            return None;
+        }
+    };
+    let joined = tokens.iter().map(|t| t.surface.as_ref()).collect::<Vec<&str>>().join(" ");
+    (!joined.is_empty()).then_some(joined)
+}
--- a/crates/kebab-chunk/src/md_heading_v1.rs
+++ b/crates/kebab-chunk/src/md_heading_v1.rs
@@ -332,6 +332,7 @@ fn build_chunk(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids,
+        tokenized_korean_text: crate::tokenize_korean_morphological(&text),
        text,
        heading_path,
        source_spans,
--- a/crates/kebab-chunk/src/pdf_page_v1.rs
+++ b/crates/kebab-chunk/src/pdf_page_v1.rs
@@ -170,6 +170,7 @@ impl Chunker for PdfPageV1Chunker {
                    chunk_id,
                    doc_id: DocumentId(doc.doc_id.0.clone()),
                    block_ids,
+                    tokenized_korean_text: crate::tokenize_korean_morphological(&slice),
                    text: slice,
                    heading_path: Vec::new(),
                    source_spans: vec![span],
--- a/crates/kebab-chunk/src/tier2_shared.rs
+++ b/crates/kebab-chunk/src/tier2_shared.rs
@@ -189,6 +189,7 @@ fn build_chunk_from_span(
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids,
+        tokenized_korean_text: crate::tokenize_korean_morphological(text),
        text: text.to_string(),
        heading_path: Vec::new(),
        source_spans: vec![span],
--- a/crates/kebab-chunk/tests/tokenize_korean.rs
+++ b/crates/kebab-chunk/tests/tokenize_korean.rs
@@ -0,0 +1,12 @@
+#[test]
+fn tokenize_korean_morphological_splits_2char_word() {
+    let joined = kebab_chunk::tokenize_korean_morphological("한국 문화는 오래되었다").unwrap();
+    let tokens = joined.split_whitespace().collect::<Vec<&str>>();
+    assert!(tokens.iter().any(|tok| *tok == "한국"), "tokens = {tokens:?}");
+}
+
+#[test]
+fn tokenize_korean_morphological_empty_returns_none() {
+    assert_eq!(kebab_chunk::tokenize_korean_morphological(""), None);
+    assert_eq!(kebab_chunk::tokenize_korean_morphological("   "), None);
+}
--- a/crates/kebab-core/src/chunk.rs
+++ b/crates/kebab-core/src/chunk.rs
@@ -23,4 +23,9 @@ pub struct Chunk {
    pub token_estimate: usize,
    pub chunker_version: ChunkerVersion,
    pub policy_hash: String,
+    /// 한국어 형태소 분해된 token 시퀀스 (공백 join). lindera ko-dic
+    /// 으로 chunker 가 pre-fill. None 시 raw text 만 FTS5 index.
+    /// Bug #8 (한국어 2자 query) 해결을 위한 V009 cascade.
+    #[serde(default)]
+    pub tokenized_korean_text: Option<String>,
 }
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -105,8 +105,9 @@ impl kebab_core::DocumentStore for SqliteStore {
                "INSERT INTO chunks (
                    chunk_id, doc_id, text, heading_path_json,
                    section_label, source_spans_json, token_estimate,
-                    chunker_version, policy_hash, block_ids_json, created_at
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                    chunker_version, policy_hash, block_ids_json, created_at,
+                    tokenized_korean_text
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            )
            .map_err(StoreError::from)?;
        for chunk in chunks {
@@ -134,6 +135,7 @@ impl kebab_core::DocumentStore for SqliteStore {
                chunk.policy_hash,
                block_ids,
                now,
+                chunk.tokenized_korean_text.as_deref(),
            ])
            .map_err(StoreError::from)?;
        }
@@ -247,6 +249,7 @@ impl kebab_core::DocumentStore for SqliteStore {
            token_estimate: row.token_estimate as usize,
            chunker_version: kebab_core::ChunkerVersion(row.chunker_version),
            policy_hash: row.policy_hash,
+            tokenized_korean_text: None,
        }))
    }

--- a/crates/kebab-store-sqlite/tests/idempotency.rs
+++ b/crates/kebab-store-sqlite/tests/idempotency.rs
@@ -97,6 +97,7 @@ fn make_chunks(doc_id: &DocumentId) -> Vec<Chunk> {
        token_estimate: 5,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
        policy_hash: "deadbeefdeadbeef".into(),
+        tokenized_korean_text: None,
    }]
 }

--- a/crates/kebab-tui/tests/inspect.rs
+++ b/crates/kebab-tui/tests/inspect.rs
@@ -113,6 +113,7 @@ fn make_chunk() -> Chunk {
        token_estimate: 12,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
        policy_hash: "deadbeefdeadbeef".into(),
+        tokenized_korean_text: None,
    }
 }