diff --git a/Cargo.lock b/Cargo.lock index 0ce2a30..fec852a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -539,6 +539,23 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "kb-chunk" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "kb-config", + "kb-core", + "kb-normalize", + "kb-parse-md", + "serde", + "serde_json", + "serde_json_canonicalizer", + "time", + "tracing", +] + [[package]] name = "kb-cli" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index b5d4b57..b09dfed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "crates/kb-source-fs", "crates/kb-parse-md", "crates/kb-normalize", + "crates/kb-chunk", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-chunk/Cargo.toml b/crates/kb-chunk/Cargo.toml new file mode 100644 index 0000000..035a43d --- /dev/null +++ b/crates/kb-chunk/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "kb-chunk" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chunk batches (§3.5, §4.2, §7.2)" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +serde = { workspace = true } +serde_json_canonicalizer = "0.3" +blake3 = { workspace = true } +anyhow = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration +# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as +# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core +# only); `cargo tree -p kb-chunk --depth 1 -e normal` (normal edges only; plain +# `cargo tree` also lists dev-deps) confirms this. 
+kb-parse-md = { path = "../kb-parse-md" } +kb-normalize = { path = "../kb-normalize" } +serde_json = { workspace = true } +time = { workspace = true } diff --git a/crates/kb-chunk/src/lib.rs b/crates/kb-chunk/src/lib.rs new file mode 100644 index 0000000..4cff5f1 --- /dev/null +++ b/crates/kb-chunk/src/lib.rs @@ -0,0 +1,20 @@ +//! `kb-chunk` — chunkers that emit [`kb_core::Chunk`] batches. +//! +//! Per design §3.5 (Chunk), §4.2 (chunk_id recipe), §7.2 (`Chunker` +//! trait), §0 Q3/§14 (chunking priority). +//! +//! Public surface: +//! +//! * [`MdHeadingV1Chunker`] — heading-aware chunker for Markdown +//! `CanonicalDocument`s, emitting `chunker_version = "md-heading-v1"`. +//! +//! Behavior contract is enumerated on [`MdHeadingV1Chunker`]. +//! +//! This crate must NOT depend on any parser implementation +//! (`kb-parse-md`, `kb-parse-pdf`, …), the document/vector store, the +//! embedder, the retriever, the LLM, the RAG layer, or the UI layers. +//! It consumes `CanonicalDocument` purely through `kb-core` types. + +mod md_heading_v1; + +pub use md_heading_v1::MdHeadingV1Chunker; diff --git a/crates/kb-chunk/src/md_heading_v1.rs b/crates/kb-chunk/src/md_heading_v1.rs new file mode 100644 index 0000000..c589949 --- /dev/null +++ b/crates/kb-chunk/src/md_heading_v1.rs @@ -0,0 +1,134 @@ +//! `md-heading-v1` — heading-aware Markdown chunker. + +use kb_core::{ + CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, +}; + +/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label +/// invalidates every downstream embedding record (design §9), so any change +/// must ship with a documented migration plan. +const VERSION_LABEL: &str = "md-heading-v1"; + +/// Heading-aware Markdown chunker. +/// +/// Implements [`kb_core::Chunker`] for Markdown-derived +/// [`CanonicalDocument`]s. +/// +/// **Behavior contract** (design §0 / §14, in priority order): +/// +/// 1. **Heading boundary first.** Chunks never span a `Block::Heading`. 
+/// The Heading block itself starts a new chunk and is included in that +/// chunk's `block_ids` so heading text is retrievable. +/// 2. **Never split a code block.** A `Block::Code` always lives in a +/// single chunk even when it exceeds `target_tokens`. +/// 3. **Tables stay in one chunk.** A `Block::Table` is emitted as a +/// single chunk regardless of size — the row-split refinement is +/// deferred per the P1-5 task spec. +/// 4. **Long sections split by paragraph.** Within a heading section +/// the chunker accumulates blocks until adding the next would exceed +/// `target_tokens`; it then emits the chunk and seeds the next chunk +/// with the previous chunk's tail blocks contributing roughly +/// `overlap_tokens` of content (paragraph-level overlap). +/// 5. **`heading_path` propagates.** Each chunk's `heading_path` is the +/// `heading_path` of its first contributing non-Heading block, or the +/// Heading block itself if that is the first. +/// 6. **`source_spans` merge.** A chunk lists every contributing block's +/// `source_span` in document order. +/// 7. **Version + policy hash recorded.** Each chunk records +/// `chunker_version = "md-heading-v1"`. The current `policy_hash` is +/// folded into the `chunk_id` recipe (design §4.2) so changing +/// `target_tokens` / `overlap_tokens` produces fresh chunk IDs. +/// +/// `ImageRef` and `AudioRef` blocks are emitted as their own chunks so +/// future image/audio search can locate them. Their `text` is the alt / +/// caption preview (empty string if unavailable) and `token_estimate = 0`. +/// +/// **Token-estimate proxy.** Until a real tokenizer is wired in (P3), the +/// estimator counts UTF-8 bytes and divides by [`BYTES_PER_TOKEN`]. The +/// constant is deliberately small (3) so the proxy *over*-estimates token +/// count — chunks sized against this proxy are guaranteed to fit in any +/// real BPE tokenizer's budget for English (~4 bytes/token) or Korean +/// (~3 bytes/token under E5/M-BERT). 
See `BYTES_PER_TOKEN` for rationale. +#[derive(Clone, Copy, Debug, Default)] +pub struct MdHeadingV1Chunker; + +/// Bytes-per-token proxy. We over-estimate (smaller divisor → larger +/// token count) so that real tokenizers downstream never see a chunk +/// exceeding their budget. English averages ~4 bytes/token under BPE, +/// Korean averages ~3 bytes/token under E5; picking 3 covers both. +const BYTES_PER_TOKEN: usize = 3; + +/// Maximum hex characters of `blake3(canonical_json(policy))` retained +/// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is +/// far beyond enough to disambiguate the handful of policy variants a +/// single workspace will see. +const POLICY_HASH_HEX_LEN: usize = 16; + +impl Chunker for MdHeadingV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + _doc: &CanonicalDocument, + _policy: &ChunkPolicy, + ) -> anyhow::Result<Vec<Chunk>> { + Ok(Vec::new()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn chunker_version_is_md_heading_v1() { + assert_eq!( + MdHeadingV1Chunker.chunker_version(), + ChunkerVersion(VERSION_LABEL.to_string()) + ); + } + + #[test] + fn policy_hash_is_deterministic_and_16_hex() { + let policy = ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: true, + chunker_version: ChunkerVersion(VERSION_LABEL.to_string()), + }; + let h1 = MdHeadingV1Chunker.policy_hash(&policy); + let h2 = MdHeadingV1Chunker.policy_hash(&policy); + assert_eq!(h1, h2); + assert_eq!(h1.len(), POLICY_HASH_HEX_LEN); + assert!(h1.bytes().all(|b| b.is_ascii_hexdigit())); + } + + #[test] + fn 
policy_hash_differs_when_policy_differs() { + let p1 = ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: true, + chunker_version: ChunkerVersion(VERSION_LABEL.to_string()), + }; + let p2 = ChunkPolicy { + target_tokens: 500, + overlap_tokens: 0, // <-- only this differs + respect_markdown_headings: true, + chunker_version: ChunkerVersion(VERSION_LABEL.to_string()), + }; + assert_ne!( + MdHeadingV1Chunker.policy_hash(&p1), + MdHeadingV1Chunker.policy_hash(&p2) + ); + } +}