p1-5: scaffold kb-chunk crate with MdHeadingV1Chunker skeleton

Adds the new workspace member with the bare Chunker impl shape:
chunker_version() returns "md-heading-v1"; policy_hash() blake3-hashes
canonical JSON of ChunkPolicy and truncates to 16 hex chars; chunk()
is an empty stub the next commits fill in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 16:27:42 +00:00
parent 4665910370
commit 8142449eb7
5 changed files with 200 additions and 0 deletions

17
Cargo.lock generated
View File

@@ -539,6 +539,23 @@ dependencies = [
"tracing-subscriber",
]
[[package]]
name = "kb-chunk"
version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"kb-config",
"kb-core",
"kb-normalize",
"kb-parse-md",
"serde",
"serde_json",
"serde_json_canonicalizer",
"time",
"tracing",
]
[[package]]
name = "kb-cli"
version = "0.1.0"

View File

@@ -7,6 +7,7 @@ members = [
"crates/kb-source-fs",
"crates/kb-parse-md",
"crates/kb-normalize",
"crates/kb-chunk",
"crates/kb-app",
"crates/kb-cli",
]

View File

@@ -0,0 +1,28 @@
[package]
name = "kb-chunk"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chunk batches (§3.5, §4.2, §7.2)"
[dependencies]
kb-core = { path = "../kb-core" }
kb-config = { path = "../kb-config" }
serde = { workspace = true }
serde_json_canonicalizer = "0.3"
blake3 = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }
[dev-dependencies]
# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration
# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as
# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core
# only); `cargo tree -p kb-chunk --depth 1 -e normal` confirms this (the
# default edge set of `cargo tree` includes dev-deps, so `-e normal` is needed).
kb-parse-md = { path = "../kb-parse-md" }
kb-normalize = { path = "../kb-normalize" }
serde_json = { workspace = true }
time = { workspace = true }

View File

@@ -0,0 +1,20 @@
//! `kb-chunk` — chunkers that emit [`kb_core::Chunk`] batches.
//!
//! Per design §3.5 (Chunk), §4.2 (chunk_id recipe), §7.2 (`Chunker`
//! trait), §0 Q3/§14 (chunking priority).
//!
//! Public surface:
//!
//! * [`MdHeadingV1Chunker`] — heading-aware chunker for Markdown
//! `CanonicalDocument`s, emitting `chunker_version = "md-heading-v1"`.
//!
//! Behavior contract is enumerated on [`MdHeadingV1Chunker`].
//!
//! This crate must NOT depend on any parser implementation
//! (`kb-parse-md`, `kb-parse-pdf`, …), the document/vector store, the
//! embedder, the retriever, the LLM, the RAG layer, or the UI layers.
//! It consumes `CanonicalDocument` purely through `kb-core` types.
mod md_heading_v1;
pub use md_heading_v1::MdHeadingV1Chunker;

View File

@@ -0,0 +1,134 @@
//! `md-heading-v1` — heading-aware Markdown chunker.
use kb_core::{
CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion,
};
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
/// invalidates every downstream embedding record (design §9), so any change
/// must ship with a documented migration plan.
///
/// This is the string wrapped in `ChunkerVersion` by
/// `MdHeadingV1Chunker::chunker_version`.
const VERSION_LABEL: &str = "md-heading-v1";
/// Heading-aware Markdown chunker.
///
/// Implements [`kb_core::Chunker`] for Markdown-derived
/// [`CanonicalDocument`]s.
///
/// NOTE: in this revision `chunk()` is a stub that returns an empty `Vec`;
/// the contract below describes the intended behavior, not what the current
/// implementation does.
///
/// **Behavior contract** (design §0 / §14, in priority order):
///
/// 1. **Heading boundary first.** Chunks never span a `Block::Heading`.
/// The Heading block itself starts a new chunk and is included in that
/// chunk's `block_ids` so heading text is retrievable.
/// 2. **Never split a code block.** A `Block::Code` always lives in a
/// single chunk even when it exceeds `target_tokens`.
/// 3. **Tables stay in one chunk.** A `Block::Table` is emitted as a
/// single chunk regardless of size — the row-split refinement is
/// deferred per the P1-5 task spec.
/// 4. **Long sections split by paragraph.** Within a heading section
/// the chunker accumulates blocks until adding the next would exceed
/// `target_tokens`; it then emits the chunk and seeds the next chunk
/// with the previous chunk's tail blocks contributing roughly
/// `overlap_tokens` of content (paragraph-level overlap).
/// 5. **`heading_path` propagates.** Each chunk's `heading_path` is the
/// `heading_path` of its first contributing non-Heading block, or the
/// Heading block itself if that is the first.
/// 6. **`source_spans` merge.** A chunk lists every contributing block's
/// `source_span` in document order.
/// 7. **Version + policy hash recorded.** Each chunk records
/// `chunker_version = "md-heading-v1"`. The current `policy_hash` is
/// folded into the `chunk_id` recipe (design §4.2) so changing
/// `target_tokens` / `overlap_tokens` produces fresh chunk IDs.
///
/// `ImageRef` and `AudioRef` blocks are emitted as their own chunks so
/// future image/audio search can locate them. Their `text` is the alt /
/// caption preview (empty string if unavailable) and `token_estimate = 0`.
///
/// **Token-estimate proxy.** Until a real tokenizer is wired in (P3), the
/// estimator is meant to count UTF-8 bytes and divide by `BYTES_PER_TOKEN`.
/// The constant is deliberately small (3) so the proxy errs toward
/// *over*-estimating token count: for English (~4 bytes/token under BPE)
/// chunks sized against it should stay under a real tokenizer's budget.
/// NOTE(review): for Korean (~3 bytes/token under E5/M-BERT) the divisor
/// equals the quoted average, so the proxy is roughly break-even there, not
/// a strict upper bound — treat the fit as a heuristic, not a guarantee.
/// See `BYTES_PER_TOKEN` for rationale.
#[derive(Clone, Copy, Debug, Default)]
pub struct MdHeadingV1Chunker;
/// Bytes-per-token proxy. We bias toward over-estimating (smaller divisor →
/// larger token count) so that real tokenizers downstream are unlikely to
/// see a chunk exceeding their budget. English averages ~4 bytes/token under
/// BPE, Korean averages ~3 bytes/token under E5; picking 3 leaves headroom
/// for English and is roughly break-even for Korean.
///
/// NOTE(review): these are averages, so this is a heuristic rather than a
/// hard bound. The constant is not yet referenced by any code visible in
/// this revision (the chunking body is still a stub).
const BYTES_PER_TOKEN: usize = 3;
/// Number of leading hex characters of `blake3(canonical_json(policy))`
/// retained in `policy_hash`. 16 hex chars = 64 bits of policy entropy,
/// which is far beyond enough to disambiguate the handful of policy
/// variants a single workspace will see.
///
/// Must not exceed the hex length of the digest (64 characters for
/// BLAKE3's default 32-byte output) for the truncation to be meaningful.
const POLICY_HASH_HEX_LEN: usize = 16;
impl Chunker for MdHeadingV1Chunker {
    /// Returns the fixed version label (`VERSION_LABEL`) wrapped in a
    /// [`ChunkerVersion`].
    fn chunker_version(&self) -> ChunkerVersion {
        ChunkerVersion(String::from(VERSION_LABEL))
    }

    /// Derives a short, stable fingerprint of `policy`.
    ///
    /// The policy is serialized to canonical JSON, hashed with BLAKE3, and
    /// the hex digest is cut down to its first `POLICY_HASH_HEX_LEN`
    /// characters.
    ///
    /// # Panics
    ///
    /// Panics if canonical JSON serialization of the policy fails.
    fn policy_hash(&self, policy: &ChunkPolicy) -> String {
        let canonical = serde_json_canonicalizer::to_vec(policy)
            .expect("canonical JSON serialization of ChunkPolicy must not fail");
        let digest = blake3::hash(&canonical);
        // The hex digest is pure ASCII, so truncating at a byte offset is
        // always on a character boundary.
        let mut fingerprint = digest.to_hex().to_string();
        fingerprint.truncate(POLICY_HASH_HEX_LEN);
        fingerprint
    }

    /// Skeleton implementation: ignores both arguments and yields no chunks.
    fn chunk(
        &self,
        _doc: &CanonicalDocument,
        _policy: &ChunkPolicy,
    ) -> anyhow::Result<Vec<Chunk>> {
        let chunks: Vec<Chunk> = Vec::new();
        Ok(chunks)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline policy shared by the tests below; individual tests override
    /// single fields via struct-update syntax.
    fn baseline_policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: true,
            chunker_version: ChunkerVersion(VERSION_LABEL.to_string()),
        }
    }

    /// The chunker must report exactly the `md-heading-v1` label.
    #[test]
    fn chunker_version_is_md_heading_v1() {
        let expected = ChunkerVersion(VERSION_LABEL.to_string());
        let actual = MdHeadingV1Chunker.chunker_version();
        assert_eq!(actual, expected);
    }

    /// Hashing the same policy twice yields the same 16-char hex string.
    #[test]
    fn policy_hash_is_deterministic_and_16_hex() {
        let policy = baseline_policy();
        let chunker = MdHeadingV1Chunker;

        let first = chunker.policy_hash(&policy);
        let second = chunker.policy_hash(&policy);

        assert_eq!(first, second);
        assert_eq!(first.len(), POLICY_HASH_HEX_LEN);
        assert!(first.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Changing a single policy field must change the hash.
    #[test]
    fn policy_hash_differs_when_policy_differs() {
        let with_overlap = baseline_policy();
        let without_overlap = ChunkPolicy {
            overlap_tokens: 0, // <-- only this differs
            ..baseline_policy()
        };

        let chunker = MdHeadingV1Chunker;
        assert_ne!(
            chunker.policy_hash(&with_overlap),
            chunker.policy_hash(&without_overlap)
        );
    }
}