diff --git a/Cargo.lock b/Cargo.lock index cc81324..b2fad5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5392,6 +5392,7 @@ dependencies = [ "daachorse", "kanaria", "lindera-dictionary", + "lindera-ko-dic", "log", "once_cell", "percent-encoding", diff --git a/crates/kebab-chunk/Cargo.toml b/crates/kebab-chunk/Cargo.toml index c02c7b0..cdfc8d9 100644 --- a/crates/kebab-chunk/Cargo.toml +++ b/crates/kebab-chunk/Cargo.toml @@ -14,7 +14,7 @@ blake3 = { workspace = true } anyhow = { workspace = true } tracing = { workspace = true } serde_yaml = { workspace = true } -lindera = { workspace = true } +lindera = { workspace = true, features = ["embed-ko-dic"] } lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] } [dev-dependencies] diff --git a/crates/kebab-chunk/src/code_c_ast_v1.rs b/crates/kebab-chunk/src/code_c_ast_v1.rs index 4e97059..6a4daad 100644 --- a/crates/kebab-chunk/src/code_c_ast_v1.rs +++ b/crates/kebab-chunk/src/code_c_ast_v1.rs @@ -145,6 +145,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_cpp_ast_v1.rs b/crates/kebab-chunk/src/code_cpp_ast_v1.rs index 942eb8e..6bf458f 100644 --- a/crates/kebab-chunk/src/code_cpp_ast_v1.rs +++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_go_ast_v1.rs b/crates/kebab-chunk/src/code_go_ast_v1.rs index e9d8b76..5ebf0a4 100644 --- a/crates/kebab-chunk/src/code_go_ast_v1.rs +++ b/crates/kebab-chunk/src/code_go_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_java_ast_v1.rs b/crates/kebab-chunk/src/code_java_ast_v1.rs index 0f47540..acbc3a3 100644 --- a/crates/kebab-chunk/src/code_java_ast_v1.rs +++ b/crates/kebab-chunk/src/code_java_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_js_ast_v1.rs b/crates/kebab-chunk/src/code_js_ast_v1.rs index ae0bc2e..1707831 100644 --- a/crates/kebab-chunk/src/code_js_ast_v1.rs +++ b/crates/kebab-chunk/src/code_js_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs index c992699..93ca430 100644 --- a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs +++ b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_python_ast_v1.rs b/crates/kebab-chunk/src/code_python_ast_v1.rs index 246a3e0..00e14dc 100644 --- a/crates/kebab-chunk/src/code_python_ast_v1.rs +++ b/crates/kebab-chunk/src/code_python_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_rust_ast_v1.rs b/crates/kebab-chunk/src/code_rust_ast_v1.rs index 83dcda3..a513f3f 100644 --- a/crates/kebab-chunk/src/code_rust_ast_v1.rs +++ b/crates/kebab-chunk/src/code_rust_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/code_ts_ast_v1.rs b/crates/kebab-chunk/src/code_ts_ast_v1.rs index e76af55..33e5932 100644 --- a/crates/kebab-chunk/src/code_ts_ast_v1.rs +++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs @@ -147,6 +147,7 @@ fn make_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids: block_ids.to_vec(), + tokenized_korean_text: None, text, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index e34de55..36225d6 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -47,3 +47,45 @@ pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; pub use manifest_file_v1::ManifestFileV1Chunker; pub use md_heading_v1::MdHeadingV1Chunker; pub use pdf_page_v1::PdfPageV1Chunker; + +// ── Korean morphological tokenizer ─────────────────────────────────────────── + +use lindera::dictionary::{load_embedded_dictionary, DictionaryKind}; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::tokenizer::Tokenizer; + +static KOREAN_TOKENIZER: std::sync::OnceLock> = std::sync::OnceLock::new(); + +/// 한국어 chunk text 를 lindera ko-dic 으로 형태소 분해해 공백 join 한 결과를 반환. +/// chunker 들이 `Chunk.tokenized_korean_text` pre-fill 에 사용. +/// 분석 실패 시 None — 호출자는 NULL fallback 처리. +/// Tokenizer 는 OnceLock 으로 1회 초기화; dict load 실패 시 영구 None. +pub fn tokenize_korean_morphological(text: &str) -> Option { + if text.trim().is_empty() { + return None; + } + let tokenizer = KOREAN_TOKENIZER.get_or_init(|| { + let dict = match load_embedded_dictionary(DictionaryKind::KoDic) { + Ok(d) => d, + Err(e) => { + tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: dict load failed: {e}"); + return None; + } + }; + let segmenter = Segmenter::new(Mode::Normal, dict, None); + Some(Tokenizer::new(segmenter)) + }); + let tokenizer = tokenizer.as_ref()?; + let tokens = tokenizer.tokenize(text).ok()?; + let joined = tokens + .iter() + .map(|t| t.surface.as_ref()) + .collect::>() + .join(" "); + if joined.is_empty() { + None + } else { + Some(joined) + } +} diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs index 1bac96c..0265d1f 100644 --- a/crates/kebab-chunk/src/md_heading_v1.rs +++ b/crates/kebab-chunk/src/md_heading_v1.rs @@ -332,6 +332,7 @@ fn build_chunk( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids, + tokenized_korean_text: crate::tokenize_korean_morphological(&text), text, heading_path, source_spans, diff --git a/crates/kebab-chunk/src/pdf_page_v1.rs b/crates/kebab-chunk/src/pdf_page_v1.rs index 246e336..e615163 100644 --- a/crates/kebab-chunk/src/pdf_page_v1.rs +++ b/crates/kebab-chunk/src/pdf_page_v1.rs @@ -170,6 +170,7 @@ impl Chunker for PdfPageV1Chunker { chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids, + tokenized_korean_text: crate::tokenize_korean_morphological(&slice), text: slice, heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs index e3dfd14..8f67d79 100644 --- a/crates/kebab-chunk/src/tier2_shared.rs +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -189,6 +189,7 @@ fn build_chunk_from_span( chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids, + tokenized_korean_text: crate::tokenize_korean_morphological(text), text: text.to_string(), heading_path: Vec::new(), source_spans: vec![span], diff --git a/crates/kebab-chunk/tests/tokenize_korean.rs b/crates/kebab-chunk/tests/tokenize_korean.rs new file mode 100644 index 0000000..fb584e0 --- /dev/null +++ b/crates/kebab-chunk/tests/tokenize_korean.rs @@ -0,0 +1,12 @@ +#[test] +fn tokenize_korean_morphological_splits_2char_word() { + let out = kebab_chunk::tokenize_korean_morphological("한국 문화는 오래되었다").unwrap(); + let tokens: Vec<&str> = out.split_whitespace().collect(); + assert!(tokens.contains(&"한국"), "tokens = {tokens:?}"); +} + +#[test] +fn tokenize_korean_morphological_empty_returns_none() { + assert!(kebab_chunk::tokenize_korean_morphological("").is_none()); + assert!(kebab_chunk::tokenize_korean_morphological(" ").is_none()); +} diff --git a/crates/kebab-core/src/chunk.rs b/crates/kebab-core/src/chunk.rs index 5c0db0f..10dce5f 100644 --- a/crates/kebab-core/src/chunk.rs +++ b/crates/kebab-core/src/chunk.rs @@ -23,4 +23,9 @@ pub struct Chunk { pub token_estimate: usize, pub chunker_version: ChunkerVersion, pub policy_hash: String, + /// 한국어 형태소 분해된 token 시퀀스 (공백 join). lindera ko-dic + /// 으로 chunker 가 pre-fill. None 시 raw text 만 FTS5 index. + /// Bug #8 (한국어 2자 query) 해결을 위한 V009 cascade. + #[serde(default)] + pub tokenized_korean_text: Option, } diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index e09745e..70ebcda 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -105,8 +105,9 @@ impl kebab_core::DocumentStore for SqliteStore { "INSERT INTO chunks ( chunk_id, doc_id, text, heading_path_json, section_label, source_spans_json, token_estimate, - chunker_version, policy_hash, block_ids_json, created_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + chunker_version, policy_hash, block_ids_json, created_at, + tokenized_korean_text + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ) .map_err(StoreError::from)?; for chunk in chunks { @@ -134,6 +135,7 @@ impl kebab_core::DocumentStore for SqliteStore { chunk.policy_hash, block_ids, now, + chunk.tokenized_korean_text.as_deref(), ]) .map_err(StoreError::from)?; } @@ -247,6 +249,7 @@ impl kebab_core::DocumentStore for SqliteStore { token_estimate: row.token_estimate as usize, chunker_version: kebab_core::ChunkerVersion(row.chunker_version), policy_hash: row.policy_hash, + tokenized_korean_text: None, })) } diff --git a/crates/kebab-store-sqlite/tests/idempotency.rs b/crates/kebab-store-sqlite/tests/idempotency.rs index faa2bd6..1171c0a 100644 --- a/crates/kebab-store-sqlite/tests/idempotency.rs +++ b/crates/kebab-store-sqlite/tests/idempotency.rs @@ -97,6 +97,7 @@ fn make_chunks(doc_id: &DocumentId) -> Vec { token_estimate: 5, chunker_version: ChunkerVersion("md-heading-v1".into()), policy_hash: "deadbeefdeadbeef".into(), + tokenized_korean_text: None, }] } diff --git a/crates/kebab-tui/tests/inspect.rs b/crates/kebab-tui/tests/inspect.rs index 842c512..4e0525f 100644 --- a/crates/kebab-tui/tests/inspect.rs +++ b/crates/kebab-tui/tests/inspect.rs @@ -113,6 +113,7 @@ fn make_chunk() -> Chunk { token_estimate: 12, chunker_version: ChunkerVersion("md-heading-v1".into()), policy_hash: "deadbeefdeadbeef".into(), + tokenized_korean_text: None, } }