V009 의 tokenized_korean_text column 에 들어갈 morpheme sequence 를 lindera ko-dic 으로 분해. chunk builder pipeline 의 chunk 생성 직후 시점에서 호출 → chunk struct 의 field 에 pre-fill → store 의 put_chunks 가 단일 transaction 안에서 INSERT. - crates/kebab-core/src/chunk.rs: Chunk struct 에 tokenized_korean_text: Option<String> field 추가 (#[serde(default)]). - crates/kebab-chunk/src/lib.rs: tokenize_korean_morphological() helper + OnceLock 캐싱 + fallback (None) 정책. - crates/kebab-chunk/Cargo.toml: lindera features = ["embed-ko-dic"] 추가 (DictionaryKind::KoDic 활성화에 필요). - 모든 chunker (tier2_shared, md_heading_v1, pdf_page_v1, 9개 code AST v1): Chunk 리터럴에 tokenized_korean_text pre-fill. - crates/kebab-store-sqlite/src/documents.rs::put_chunks: INSERT SQL column list + placeholder + binding 갱신 (12번째 column). - crates/kebab-chunk/tests/tokenize_korean.rs: 단위 테스트 2개. lindera 3.0.7 API 정정: load_dictionary_from_kind → load_embedded_dictionary, Token.text → Token.surface. Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §6.2 Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S3) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
34 lines
1.3 KiB
TOML
34 lines
1.3 KiB
TOML
[package]
|
|
name = "kebab-chunk"
|
|
version = { workspace = true }
|
|
edition = { workspace = true }
|
|
rust-version = { workspace = true }
|
|
license = { workspace = true }
|
|
repository = { workspace = true }
|
|
description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chunk batches (§3.5, §4.2, §7.2)"
|
|
|
|
[dependencies]
|
|
kebab-core = { path = "../kebab-core" }
|
|
serde_json_canonicalizer = "0.3"
|
|
blake3 = { workspace = true }
|
|
anyhow = { workspace = true }
|
|
tracing = { workspace = true }
|
|
serde_yaml = { workspace = true }
|
|
lindera = { workspace = true, features = ["embed-ko-dic"] }
|
|
lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] }
|
|
|
|
[dev-dependencies]
|
|
# kebab-parse-md / kebab-parse-code are dev-only — used by the snapshot integration
|
|
# tests to build a CanonicalDocument from fixture files. kebab-parse-md absorbed
|
|
# kb-normalize in v0.19.0 (HOTFIXES.md 2026-05-26). Forbidden as regular deps
|
|
# per design §8 (chunker consumes CanonicalDocument from kebab-core only);
|
|
# `cargo tree -p kebab-chunk -e normal --depth 1` (normal edges only, excludes dev-deps)
|
|
# confirms this.
|
|
kebab-parse-md = { path = "../kebab-parse-md" }
|
|
kebab-parse-code = { path = "../kebab-parse-code" }
|
|
serde_json = { workspace = true }
|
|
time = { workspace = true }
|
|
|
|
[lints]
|
|
workspace = true
|