diff --git a/Cargo.lock b/Cargo.lock index 0ce2a30..11bdbb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -539,6 +539,21 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "kb-chunk" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "kb-core", + "kb-normalize", + "kb-parse-md", + "serde_json", + "serde_json_canonicalizer", + "time", + "tracing", +] + [[package]] name = "kb-cli" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index b5d4b57..b09dfed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "crates/kb-source-fs", "crates/kb-parse-md", "crates/kb-normalize", + "crates/kb-chunk", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-chunk/Cargo.toml b/crates/kb-chunk/Cargo.toml new file mode 100644 index 0000000..2238643 --- /dev/null +++ b/crates/kb-chunk/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "kb-chunk" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chunk batches (§3.5, §4.2, §7.2)" + +[dependencies] +kb-core = { path = "../kb-core" } +serde_json_canonicalizer = "0.3" +blake3 = { workspace = true } +anyhow = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration +# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as +# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core +# only); `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps) +# confirms this. 
+kb-parse-md = { path = "../kb-parse-md" } +kb-normalize = { path = "../kb-normalize" } +serde_json = { workspace = true } +time = { workspace = true } diff --git a/crates/kb-chunk/src/lib.rs b/crates/kb-chunk/src/lib.rs new file mode 100644 index 0000000..4cff5f1 --- /dev/null +++ b/crates/kb-chunk/src/lib.rs @@ -0,0 +1,20 @@ +//! `kb-chunk` — chunkers that emit [`kb_core::Chunk`] batches. +//! +//! Per design §3.5 (Chunk), §4.2 (chunk_id recipe), §7.2 (`Chunker` +//! trait), §0 Q3/§14 (chunking priority). +//! +//! Public surface: +//! +//! * [`MdHeadingV1Chunker`] — heading-aware chunker for Markdown +//! `CanonicalDocument`s, emitting `chunker_version = "md-heading-v1"`. +//! +//! Behavior contract is enumerated on [`MdHeadingV1Chunker`]. +//! +//! This crate must NOT depend on any parser implementation +//! (`kb-parse-md`, `kb-parse-pdf`, …), the document/vector store, the +//! embedder, the retriever, the LLM, the RAG layer, or the UI layers. +//! It consumes `CanonicalDocument` purely through `kb-core` types. + +mod md_heading_v1; + +pub use md_heading_v1::MdHeadingV1Chunker; diff --git a/crates/kb-chunk/src/md_heading_v1.rs b/crates/kb-chunk/src/md_heading_v1.rs new file mode 100644 index 0000000..cf4c82a --- /dev/null +++ b/crates/kb-chunk/src/md_heading_v1.rs @@ -0,0 +1,834 @@ +//! `md-heading-v1` — heading-aware Markdown chunker. + +use kb_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, + ChunkerVersion, DocumentId, SourceSpan, id_for_chunk, +}; + +/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label +/// invalidates every downstream embedding record (design §9), so any change +/// must ship with a documented migration plan. +const VERSION_LABEL: &str = "md-heading-v1"; + +/// Bytes-per-token proxy. We over-estimate (smaller divisor → larger +/// token count) so that real tokenizers downstream never see a chunk +/// exceeding their budget. 
English averages ~4 bytes/token under BPE,
+/// Korean averages ~3 bytes/token under E5; picking 3 covers both.
+const BYTES_PER_TOKEN: usize = 3;
+
+/// Maximum hex characters of `blake3(canonical_json(policy))` retained
+/// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is
+/// far beyond enough to disambiguate the handful of policy variants a
+/// single workspace will see.
+const POLICY_HASH_HEX_LEN: usize = 16;
+
+/// Heading-aware Markdown chunker.
+///
+/// Implements [`kb_core::Chunker`] for Markdown-derived
+/// [`CanonicalDocument`]s.
+///
+/// **Behavior contract** (design §0 / §14, in priority order):
+///
+/// 1. **Heading boundary first.** Chunks never span a `Block::Heading`.
+///    The Heading block itself starts a new chunk and is included in that
+///    chunk's `block_ids` so heading text is retrievable.
+/// 2. **Never split a code block.** A `Block::Code` always lives in a
+///    single chunk even when it exceeds `target_tokens`.
+/// 3. **Tables stay in one chunk.** A `Block::Table` is emitted as a
+///    single chunk regardless of size — the row-split refinement is
+///    deferred per the P1-5 task spec.
+/// 4. **Long sections split by paragraph.** Within a heading section
+///    the chunker accumulates blocks until adding the next would exceed
+///    `target_tokens`; it then emits the chunk and seeds the next chunk
+///    with the previous chunk's tail blocks contributing roughly
+///    `overlap_tokens` of content (paragraph-level overlap).
+/// 5. **`heading_path` propagates.** Each chunk's `heading_path` is the
+///    `heading_path` of its first contributing non-Heading block, or —
+///    when the chunk leads with (or contains only) a Heading — the
+///    parent path **plus the heading's own text** so heading-only or
+///    heading-led chunks never lose their citation context.
+/// 6. **`source_spans` merge.** A chunk lists every contributing block's
+///    `source_span` in document order.
+/// 7. **Version + policy hash recorded.** Each chunk records
+///    `chunker_version = "md-heading-v1"`. The current `policy_hash` is
+///    folded into the `chunk_id` recipe (design §4.2) so changing
+///    `target_tokens` / `overlap_tokens` produces fresh chunk IDs.
+///
+/// `ImageRef` and `AudioRef` blocks are emitted as their own chunks so
+/// future image/audio search can locate them. Their `text` is the alt /
+/// caption preview (empty string if unavailable) and `token_estimate = 0`.
+///
+/// **Token-estimate proxy.** Until a real tokenizer is wired in (P3), the
+/// estimator counts UTF-8 bytes and divides by [`BYTES_PER_TOKEN`]. The
+/// constant is deliberately small (3) so the proxy *over*-estimates token
+/// count for English (~4 bytes/token) and Korean (~3 bytes/token under
+/// E5/M-BERT). This is a sizing heuristic, not a hard cap: atomic
+/// `Block::Code` / `Block::Table` chunks (items 2–3) and a single
+/// oversized paragraph can still exceed `target_tokens`. See
+/// [`BYTES_PER_TOKEN`] for rationale.
+///
+/// **`policy.respect_markdown_headings`.** This field flows into
+/// `policy_hash` (so flipping it yields fresh chunk IDs), but the
+/// chunker variant `md-heading-v1` unconditionally treats headings as
+/// boundaries by design — the `md-heading-v1` name is the contract. To
+/// disable heading awareness, ship a different `chunker_version`; none
+/// is shipped in P1-5.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct MdHeadingV1Chunker;
+
+impl Chunker for MdHeadingV1Chunker {
+    /// Version label stamped on every chunk this chunker emits.
+    fn chunker_version(&self) -> ChunkerVersion {
+        ChunkerVersion(VERSION_LABEL.to_string())
+    }
+
+    /// Compute the policy hash folded into `chunk_id` per design §4.2.
+    ///
+    /// Returns the first [`POLICY_HASH_HEX_LEN`] hex characters of the
+    /// blake3 digest of the policy's canonical JSON.
+    ///
+    /// # Panics
+    ///
+    /// Panics if canonical JSON serialization of `ChunkPolicy` fails.
+    /// This is unreachable in practice — `ChunkPolicy` is composed of
+    /// owned primitives (`usize`, `bool`, owned `String`) and
+    /// `serde_json_canonicalizer::to_vec` only fails on
+    /// non-serializable values such as non-finite floats or maps with
+    /// non-string keys, neither of which can be constructed via
+    /// `ChunkPolicy`'s public surface. The `expect` is preserved as a
+    /// future-proofing guard against drift if `ChunkPolicy` ever gains
+    /// a field with such a property.
+    fn policy_hash(&self, policy: &ChunkPolicy) -> String {
+        let bytes = serde_json_canonicalizer::to_vec(policy)
+            .expect("canonical JSON serialization of ChunkPolicy must not fail");
+        // `to_hex` yields a fixed 64-char ASCII string, so slicing the
+        // first POLICY_HASH_HEX_LEN bytes is always in range and on a
+        // char boundary. Slice before allocating the owned `String`.
+        let hex = blake3::hash(&bytes).to_hex();
+        hex.as_str()[..POLICY_HASH_HEX_LEN].to_owned()
+    }
+
+    /// Split `doc` into chunks following the behavior contract on
+    /// [`MdHeadingV1Chunker`].
+    ///
+    /// Blocks are visited once, in document order. Chunks are returned
+    /// in emission order, which follows document order.
+    ///
+    /// # Errors
+    ///
+    /// The current implementation has no fallible step and always
+    /// returns `Ok`; the `Result` is part of the [`Chunker`] trait
+    /// signature.
+    fn chunk(
+        &self,
+        doc: &CanonicalDocument,
+        policy: &ChunkPolicy,
+    ) -> anyhow::Result<Vec<Chunk>> {
+        let policy_hash = self.policy_hash(policy);
+        let chunker_version = self.chunker_version();
+        let mut out: Vec<Chunk> = Vec::new();
+
+        // Running accumulator: the paragraphs/lists/quotes (and the
+        // optional leading heading) that will be glued into the next
+        // emitted chunk.
+        let mut acc = ChunkAcc::default();
+
+        for block in &doc.blocks {
+            match block {
+                Block::Heading(_) => {
+                    // §0/§14 priority 1: heading is a hard boundary.
+                    // Flush whatever has accumulated, then seed a new
+                    // accumulator that owns this heading.
+                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+                    acc.push_block(block);
+                }
+                Block::Code(_)
+                | Block::Table(_)
+                | Block::ImageRef(_)
+                | Block::AudioRef(_) => {
+                    // Atomic blocks: flush the running accumulator,
+                    // then emit the block as its own chunk.
+                    // * Code never splits (priority 2); tables stay
+                    //   single (priority 3).
+                    // * ImageRef / AudioRef are independent searchable
+                    //   artifacts; token_estimate=0 is enforced inside
+                    //   `build_chunk` for these kinds.
+                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+                    let mut single = ChunkAcc::default();
+                    single.push_block(block);
+                    flush(&mut single, doc, &chunker_version, &policy_hash, &mut out);
+                }
+                Block::Paragraph(_) | Block::List(_) | Block::Quote(_) => {
+                    // Soft-split candidates. If adding this block would
+                    // exceed target_tokens (and we already have at least
+                    // one non-heading block in the accumulator), emit
+                    // the current chunk and seed the next one with
+                    // overlap from the prior tail.
+                    let next_tokens = estimate_block_tokens(block);
+                    // Note: `acc.text_tokens` already includes any
+                    // overlap seed carried over from the prior chunk.
+                    // `collect_overlap_seed` clamps its budget to
+                    // target/2, but it always takes the last tail block
+                    // even when that block alone is over budget, so a
+                    // seed made of one oversized paragraph can still be
+                    // larger than target/2.
+                    let would_exceed = acc.text_tokens + next_tokens
+                        > policy.target_tokens
+                        && acc.has_non_heading_content();
+                    if would_exceed {
+                        let overlap_seed = collect_overlap_seed(
+                            &acc,
+                            policy.overlap_tokens,
+                            policy.target_tokens,
+                        );
+                        flush(
+                            &mut acc,
+                            doc,
+                            &chunker_version,
+                            &policy_hash,
+                            &mut out,
+                        );
+                        // Seed next accumulator with the prior chunk's
+                        // tail blocks (paragraph-level overlap). The
+                        // heading is *not* re-included here — it lives
+                        // on the prior chunk. The follow-on chunk's
+                        // heading_path is taken from the first seeded
+                        // block (which carries the same path, as it sat
+                        // under the same heading).
+                        for b in overlap_seed {
+                            acc.push_block(b);
+                        }
+                    }
+                    acc.push_block(block);
+                }
+            }
+        }
+        // Emit whatever is left after the last block.
+        flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+
+        tracing::debug!(
+            target: "kb-chunk",
+            doc_id = %doc.doc_id,
+            chunks = out.len(),
+            "md-heading-v1 chunked",
+        );
+
+        Ok(out)
+    }
+}
+
+/// Internal accumulator: borrowed blocks (lifetime-bound to the
+/// `CanonicalDocument`) plus the running token estimate of their text.
+#[derive(Default)]
+struct ChunkAcc<'a> {
+    blocks: Vec<&'a Block>,
+    text_tokens: usize,
+}
+
+impl<'a> ChunkAcc<'a> {
+    /// Append `b` and add its per-block token estimate to the running
+    /// total.
+    fn push_block(&mut self, b: &'a Block) {
+        self.text_tokens += estimate_block_tokens(b);
+        self.blocks.push(b);
+    }
+
+    /// True when no block has been accumulated.
+    fn is_empty(&self) -> bool {
+        self.blocks.is_empty()
+    }
+
+    /// True if any non-heading block sits in the accumulator. Used to
+    /// avoid splitting a chunk that contains only its leading heading
+    /// (which would emit a heading-only chunk before any prose).
+    fn has_non_heading_content(&self) -> bool {
+        self.blocks.iter().any(|b| !matches!(b, Block::Heading(_)))
+    }
+}
+
+/// Drain `acc` into a fresh `Chunk` and push to `out`. No-op when empty.
+///
+/// After the call `acc` holds no blocks and its token total is reset
+/// to 0, so it can be reused for the next chunk.
+fn flush(
+    acc: &mut ChunkAcc<'_>,
+    doc: &CanonicalDocument,
+    chunker_version: &ChunkerVersion,
+    policy_hash: &str,
+    out: &mut Vec<Chunk>,
+) {
+    if acc.is_empty() {
+        return;
+    }
+    let blocks = std::mem::take(&mut acc.blocks);
+    acc.text_tokens = 0;
+    out.push(build_chunk(doc, &blocks, chunker_version, policy_hash));
+}
+
+/// Collect the trailing blocks of `acc` (in document order) whose
+/// combined token estimate fits under the seed budget. The heading
+/// block (if it leads the accumulator) is excluded from the seed —
+/// re-emitting the heading would conflate it with the next chunk's
+/// own heading_path provenance.
+///
+/// The seed budget is clamped to `min(overlap_tokens, target_tokens / 2)`.
+/// Without the clamp, an `overlap_tokens >= target_tokens` policy
+/// degenerates into 1-block-per-chunk: the seed already exceeds budget
+/// before any new content lands, so the very next paragraph trips the
+/// `would_exceed` flush. Halving the target leaves at least target/2
+/// worth of room for fresh content whenever the seed fits its budget.
+///
+/// Caveat: when the budget is non-zero, the last non-heading block is
+/// always taken, even if it alone is larger than the budget, so the
+/// returned seed can exceed `target_tokens / 2` when the prior chunk
+/// ends in an oversized block. A budget of 0 yields an empty seed.
+fn collect_overlap_seed<'a>(
+    acc: &ChunkAcc<'a>,
+    overlap_tokens: usize,
+    target_tokens: usize,
+) -> Vec<&'a Block> {
+    let seed_budget = overlap_tokens.min(target_tokens / 2);
+    if seed_budget == 0 {
+        return Vec::new();
+    }
+    let mut taken = Vec::new();
+    let mut budget = seed_budget;
+    // Walk from the tail backwards; `taken` is reversed at the end to
+    // restore document order.
+    for b in acc.blocks.iter().rev() {
+        if matches!(b, Block::Heading(_)) {
+            // Don't propagate the heading itself into the next chunk;
+            // its `heading_path` carries naturally on the next blocks
+            // (kb-normalize stamps every block under a heading with
+            // that heading's path).
+            continue;
+        }
+        let est = estimate_block_tokens(b);
+        // Stop at the first block that no longer fits — unless nothing
+        // has been taken yet, in which case the block is kept anyway.
+        if est > budget && !taken.is_empty() {
+            break;
+        }
+        taken.push(*b);
+        budget = budget.saturating_sub(est);
+        if budget == 0 {
+            break;
+        }
+    }
+    taken.reverse();
+    taken
+}
+
+/// Construct a `Chunk` from a non-empty contiguous slice of blocks.
+///
+/// `blocks` must hold at least one block: the first element is indexed
+/// directly, so an empty slice panics (the `debug_assert!` only names
+/// the violated precondition in debug builds). `flush`, the only caller
+/// in this module, returns early on an empty accumulator.
+fn build_chunk(
+    doc: &CanonicalDocument,
+    blocks: &[&Block],
+    chunker_version: &ChunkerVersion,
+    policy_hash: &str,
+) -> Chunk {
+    debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
+
+    let block_ids: Vec<BlockId> =
+        blocks.iter().map(|b| common(b).block_id.clone()).collect();
+    let source_spans: Vec<SourceSpan> =
+        blocks.iter().map(|b| common(b).source_span.clone()).collect();
+
+    // heading_path: pick the first non-Heading block's heading_path
+    // (which already includes every parent heading per kb-normalize).
+    // When the FIRST block is a Heading — either a heading-only chunk,
+    // or a chunk that leads with `# H1` immediately followed by another
+    // Heading or atomic block — the Heading block's own
+    // `common.heading_path` records only its *parents* (kb-normalize
+    // does not include a heading inside its own path). We synthesize
+    // the leading heading into the path so the citation context is not
+    // lost on patterns like `# Alpha\n## Beta\n...`.
+    let heading_path = match blocks[0] {
+        Block::Heading(h) => {
+            let mut path = h.common.heading_path.clone();
+            path.push(h.text.clone());
+            path
+        }
+        _ => common(blocks[0]).heading_path.clone(),
+    };
+
+    // Text rendering: simple double-newline join of each block's
+    // contribution. We deliberately pick a stable, low-fidelity
+    // representation — embedding-quality rewrites land in P3.
+    let text = blocks
+        .iter()
+        .map(|b| render_block_text(b))
+        .collect::<Vec<_>>()
+        .join("\n\n");
+
+    // Chunks made up solely of ImageRef / AudioRef blocks report a zero
+    // token estimate even when they carry alt/caption text.
+    let is_image_or_audio_only = blocks
+        .iter()
+        .all(|b| matches!(b, Block::ImageRef(_) | Block::AudioRef(_)));
+
+    let token_estimate = if is_image_or_audio_only {
+        0
+    } else {
+        // Token estimate is bytes / BYTES_PER_TOKEN, rounded up so the
+        // proxy never under-counts. The byte count includes the
+        // "\n\n" separators inserted by the join above.
+        text.len().div_ceil(BYTES_PER_TOKEN)
+    };
+
+    let chunk_id = id_for_chunk(
+        &doc.doc_id,
+        chunker_version,
+        &block_ids,
+        policy_hash,
+    );
+
+    Chunk {
+        chunk_id,
+        doc_id: DocumentId(doc.doc_id.0.clone()),
+        block_ids,
+        text,
+        heading_path,
+        source_spans,
+        token_estimate,
+        chunker_version: chunker_version.clone(),
+    }
+}
+
+/// Render a block's contribution to a chunk's `text`. The rendering is
+/// deliberately minimal — embedding-time normalization is a P3 concern.
+///
+/// Per-variant output:
+///
+/// * `Heading` / `Paragraph` / `Quote` — the block's `text`.
+/// * `List` — item texts joined with `\n`.
+/// * `Code` — the raw `code` string.
+/// * `Table` — the header row, then each data row on its own line,
+///   cells joined with `" | "`.
+/// * `ImageRef` — `alt`, falling back to the caption text, else empty.
+/// * `AudioRef` — always the empty string.
+fn render_block_text(b: &Block) -> String {
+    match b {
+        Block::Heading(h) => h.text.clone(),
+        Block::Paragraph(p) | Block::Quote(p) => p.text.clone(),
+        Block::List(l) => l
+            .items
+            .iter()
+            .map(|it| it.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n"),
+        Block::Code(c) => c.code.clone(),
+        Block::Table(t) => {
+            // Headers row joined with " | ", then each row likewise.
+            let mut s = t.headers.join(" | ");
+            for row in &t.rows {
+                s.push('\n');
+                s.push_str(&row.join(" | "));
+            }
+            s
+        }
+        // ImageRef text portion = alt (per task spec). Fall back to
+        // model caption text if alt is empty.
+        Block::ImageRef(i) => {
+            if !i.alt.is_empty() {
+                i.alt.clone()
+            } else {
+                i.caption
+                    .as_ref()
+                    .map(|c| c.text.clone())
+                    .unwrap_or_default()
+            }
+        }
+        // AudioRef has no caption preview yet (transcript joins land
+        // in P8). Empty string per task spec.
+        Block::AudioRef(_) => String::new(),
+    }
+}
+
+/// Estimate a single block's token count: rendered byte length divided
+/// by [`BYTES_PER_TOKEN`], rounded up.
+fn estimate_block_tokens(b: &Block) -> usize {
+    match b {
+        // ImageRef / AudioRef contribute 0 — they are independent
+        // chunks and never participate in size accounting.
+        Block::ImageRef(_) | Block::AudioRef(_) => 0,
+        _ => render_block_text(b).len().div_ceil(BYTES_PER_TOKEN),
+    }
+}
+
+/// Borrow the `CommonBlock` of any [`Block`] variant.
+fn common(b: &Block) -> &kb_core::CommonBlock { + match b { + Block::Heading(h) => &h.common, + Block::Paragraph(t) | Block::Quote(t) => &t.common, + Block::List(l) => &l.common, + Block::Code(c) => &c.common, + Block::Table(t) => &t.common, + Block::ImageRef(i) => &i.common, + Block::AudioRef(a) => &a.common, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ + AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, + Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel, + WorkspacePath, id_for_block, + }; + use time::OffsetDateTime; + + fn make_doc(blocks: Vec) -> CanonicalDocument { + CanonicalDocument { + doc_id: kb_core::DocumentId("d".repeat(32)), + source_asset_id: AssetId("a".repeat(32)), + workspace_path: WorkspacePath::new("notes/test.md".into()).unwrap(), + title: "Test".into(), + lang: Lang("en".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + }, + provenance: Provenance { events: vec![] }, + parser_version: kb_core::ParserVersion("test-parser-0".into()), + schema_version: 1, + doc_version: 1, + } + } + + fn doc_id() -> kb_core::DocumentId { + kb_core::DocumentId("d".repeat(32)) + } + + fn span(start: u32, end: u32) -> SourceSpan { + SourceSpan::Line { start, end } + } + + fn common_for( + kind: &str, + heading_path: &[String], + ordinal: u32, + s: SourceSpan, + ) -> CommonBlock { + CommonBlock { + block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s), + heading_path: heading_path.to_vec(), + source_span: s, + } + } + + fn heading(level: u8, text: &str, ordinal: u32, line: u32) -> Block { + Block::Heading(HeadingBlock { + common: common_for("heading", &[], ordinal, span(line, line)), + level, + text: 
text.into(), + }) + } + + /// Heading variant that carries a parent path — kb-normalize stamps + /// every block under `# Alpha` with `heading_path = []` for the H1 + /// itself but `["Alpha"]` for the H2 that follows. Tests covering + /// the heading-only chunk path (I2) need that asymmetry. + fn heading_with_parents( + level: u8, + text: &str, + parents: &[&str], + ordinal: u32, + line: u32, + ) -> Block { + let hp: Vec = parents.iter().map(|s| (*s).into()).collect(); + Block::Heading(HeadingBlock { + common: common_for("heading", &hp, ordinal, span(line, line)), + level, + text: text.into(), + }) + } + + fn paragraph( + text: &str, + heading_path: &[&str], + ordinal: u32, + line: u32, + ) -> Block { + let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); + Block::Paragraph(TextBlock { + common: common_for("paragraph", &hp, ordinal, span(line, line)), + text: text.into(), + inlines: vec![], + }) + } + + fn code_block( + code: &str, + heading_path: &[&str], + ordinal: u32, + s: SourceSpan, + ) -> Block { + let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); + Block::Code(CodeBlock { + common: common_for("code", &hp, ordinal, s), + lang: Some("rust".into()), + code: code.into(), + }) + } + + fn table( + headers: Vec<&str>, + rows: Vec>, + heading_path: &[&str], + ordinal: u32, + s: SourceSpan, + ) -> Block { + let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); + Block::Table(TableBlock { + common: common_for("table", &hp, ordinal, s), + headers: headers.into_iter().map(String::from).collect(), + rows: rows + .into_iter() + .map(|r| r.into_iter().map(String::from).collect()) + .collect(), + }) + } + + fn image_ref( + alt: &str, + heading_path: &[&str], + ordinal: u32, + line: u32, + ) -> Block { + let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); + Block::ImageRef(ImageRefBlock { + common: common_for("imageref", &hp, ordinal, span(line, line)), + asset_id: None, + src: "img.png".into(), + alt: alt.into(), + 
ocr: None, + caption: None, + }) + } + + fn default_policy(target: usize, overlap: usize) -> ChunkPolicy { + ChunkPolicy { + target_tokens: target, + overlap_tokens: overlap, + respect_markdown_headings: true, + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } + } + + #[test] + fn chunker_version_is_md_heading_v1() { + assert_eq!( + MdHeadingV1Chunker.chunker_version(), + ChunkerVersion(VERSION_LABEL.to_string()) + ); + } + + #[test] + fn policy_hash_is_deterministic_and_16_hex() { + let p = default_policy(500, 80); + let h1 = MdHeadingV1Chunker.policy_hash(&p); + let h2 = MdHeadingV1Chunker.policy_hash(&p); + assert_eq!(h1, h2); + assert_eq!(h1.len(), POLICY_HASH_HEX_LEN); + assert!(h1.bytes().all(|b| b.is_ascii_hexdigit())); + } + + #[test] + fn policy_hash_differs_when_policy_differs() { + let p1 = default_policy(500, 80); + let p2 = default_policy(500, 0); + assert_ne!( + MdHeadingV1Chunker.policy_hash(&p1), + MdHeadingV1Chunker.policy_hash(&p2) + ); + } + + /// Heading boundary respected: two H2 sections produce separate + /// chunks; no chunk's block_ids straddle the H2→H2 boundary. + #[test] + fn heading_boundary_respected() { + let blocks = vec![ + heading(2, "First", 0, 1), + paragraph("body of first", &["First"], 0, 2), + heading(2, "Second", 1, 3), + paragraph("body of second", &["Second"], 0, 4), + ]; + let doc = make_doc(blocks); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(10_000, 0)) + .unwrap(); + assert_eq!(chunks.len(), 2); + // First chunk = (heading "First", paragraph) + assert_eq!(chunks[0].block_ids.len(), 2); + // Second chunk = (heading "Second", paragraph) + assert_eq!(chunks[1].block_ids.len(), 2); + // heading_path on chunk 0 belongs to "First" section. + assert_eq!(chunks[0].heading_path, vec!["First".to_string()]); + assert_eq!(chunks[1].heading_path, vec!["Second".to_string()]); + } + + /// A code block of ~800 tokens (≈2400 bytes) stays in a single + /// chunk even when target=500. 
+ #[test] + fn code_block_never_splits() { + // 2400 bytes ≈ 800 tokens at BYTES_PER_TOKEN=3. + let big = "x".repeat(2400); + let blocks = vec![code_block(&big, &[], 0, span(1, 50))]; + let doc = make_doc(blocks); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].block_ids.len(), 1); + assert!(chunks[0].token_estimate > 500); + } + + /// A table of size < 2× target stays in a single chunk. + #[test] + fn table_stays_single_chunk_when_small() { + let t = table( + vec!["a", "b", "c"], + vec![vec!["1", "2", "3"], vec!["4", "5", "6"]], + &[], + 0, + span(1, 4), + ); + let blocks = vec![t]; + let doc = make_doc(blocks); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].block_ids.len(), 1); + } + + /// A long sequence of paragraphs splits at target_tokens with + /// overlap_tokens worth of seeded paragraph from the prior chunk. + #[test] + fn long_section_splits_with_overlap() { + // Each paragraph is 60 bytes ≈ 20 tokens. target=50, overlap=20 + // → after ~3 paragraphs we hit the target; the next chunk + // starts seeded with one paragraph from the prior tail. + let mut bs = vec![heading(2, "Long", 0, 1)]; + for i in 0..6u32 { + bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2)); + } + let doc = make_doc(bs); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(50, 20)) + .unwrap(); + assert!( + chunks.len() >= 2, + "expected ≥2 chunks, got {}: {chunks:#?}", + chunks.len() + ); + // Every chunk lives under the same heading_path "Long". + for c in &chunks { + assert_eq!(c.heading_path, vec!["Long".to_string()]); + } + // Overlap propagates: the last block_id of chunk N appears in + // chunk N+1's block_ids (paragraph-level overlap rule). 
+ for w in chunks.windows(2) { + let prev_tail = w[0].block_ids.last().unwrap(); + assert!( + w[1].block_ids.contains(prev_tail), + "chunk N+1 must seed from chunk N's tail; \ + prev_tail={prev_tail:?}, next ids={:?}", + w[1].block_ids + ); + } + } + + /// ImageRef → own chunk, token_estimate=0. + #[test] + fn image_ref_emits_own_chunk_zero_tokens() { + let blocks = vec![ + heading(2, "With image", 0, 1), + paragraph("intro", &["With image"], 0, 2), + image_ref("a cat", &["With image"], 0, 3), + paragraph("after", &["With image"], 1, 4), + ]; + let doc = make_doc(blocks); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(10_000, 0)) + .unwrap(); + // Expect: (heading + intro), (image), (after). The image must + // be its own chunk and carry token_estimate=0. + assert!(chunks.len() >= 3, "unexpected chunk count: {chunks:#?}"); + let img_chunk = chunks + .iter() + .find(|c| c.text == "a cat") + .expect("image chunk present"); + assert_eq!(img_chunk.token_estimate, 0); + assert_eq!(img_chunk.block_ids.len(), 1); + } + + /// Identical input + identical policy → identical chunk_ids over + /// 1000 iterations. 
+ #[test] + fn deterministic_chunk_ids_1000() { + let blocks = vec![ + heading(2, "Det", 0, 1), + paragraph("body 1", &["Det"], 0, 2), + paragraph("body 2", &["Det"], 1, 3), + heading(2, "Det 2", 1, 4), + paragraph("body 3", &["Det 2"], 0, 5), + ]; + let doc = make_doc(blocks); + let policy = default_policy(50, 10); + let baseline: Vec = MdHeadingV1Chunker + .chunk(&doc, &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..1000 { + let again: Vec = MdHeadingV1Chunker + .chunk(&doc, &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } + } + + /// I2 regression: when a Heading is followed immediately by another + /// Heading or atomic block (no intervening prose), the resulting + /// heading-only / heading-led chunk must carry the heading text in + /// its own `heading_path`. Pattern: `# Alpha`, `## Beta`, code. + /// + /// Before the fix, chunk[0] (Heading-only "Alpha") would have + /// `heading_path = []` because `kb-normalize` does not stamp a + /// heading inside its own path; the chunker fell back to the + /// heading's parent path. After the fix it is `["Alpha"]`. + /// + /// `chunk_id` recipe (`doc_id, chunker_version, block_ids, + /// policy_hash`) does NOT include `heading_path`, so this fix does + /// NOT shift chunk_ids — only `heading_path` fields. + #[test] + fn heading_only_chunk_carries_self_in_path() { + // # Alpha (H1, no parents) + // ## Beta (H2, parent = ["Alpha"]) + // ```rust ... ``` (code, heading_path = ["Alpha", "Beta"]) + let blocks = vec![ + heading_with_parents(1, "Alpha", &[], 0, 1), + heading_with_parents(2, "Beta", &["Alpha"], 0, 2), + code_block("fn x() {}", &["Alpha", "Beta"], 0, span(3, 3)), + ]; + let doc = make_doc(blocks); + let chunks = MdHeadingV1Chunker + .chunk(&doc, &default_policy(10_000, 0)) + .unwrap(); + // Three chunks: Heading-only Alpha, Heading-only Beta, code. 
+ assert_eq!(chunks.len(), 3, "got {chunks:#?}"); + assert_eq!(chunks[0].heading_path, vec!["Alpha".to_string()]); + assert_eq!( + chunks[1].heading_path, + vec!["Alpha".to_string(), "Beta".to_string()] + ); + assert_eq!( + chunks[2].heading_path, + vec!["Alpha".to_string(), "Beta".to_string()] + ); + } + + /// I3 regression: a pathological policy with + /// `overlap_tokens >= target_tokens` must NOT degenerate into + /// 1-block-per-chunk. The seed budget is clamped to `target/2`, + /// guaranteeing every flushed chunk has space for fresh content. + #[test] + fn overlap_clamped_when_overlap_exceeds_target() { + // 5 paragraphs of ~20 tokens each (60 bytes / 3 BPT). + // target = 50, overlap = 200 (4× target → would trip flush + // immediately without clamp). + let mut bs = vec![heading_with_parents(2, "Long", &[], 0, 1)]; + for i in 0..5u32 { + bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2)); + } + let doc = make_doc(bs); + let policy = ChunkPolicy { + target_tokens: 50, + overlap_tokens: 200, + respect_markdown_headings: true, + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + }; + let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).unwrap(); + // Without the clamp, every chunk after the first would have + // exactly 1 paragraph (because seed alone already exceeds + // target and acc.has_non_heading_content() is true the moment + // any seed lands). With the clamp, follow-on chunks must hold + // at least the seed paragraph + the new paragraph = ≥2 blocks. + for (i, c) in chunks.iter().enumerate() { + // The very first chunk includes the heading + first para + // (no seed), so it is also ≥2. Subsequent chunks must be + // seed+new ≥ 2. 
+ assert!( + c.block_ids.len() >= 2, + "chunk {i} degenerated to {} block(s); pathology not \ + prevented: {chunks:#?}", + c.block_ids.len() + ); + } + } +} diff --git a/crates/kb-chunk/tests/long_section_snapshot.rs b/crates/kb-chunk/tests/long_section_snapshot.rs new file mode 100644 index 0000000..3148300 --- /dev/null +++ b/crates/kb-chunk/tests/long_section_snapshot.rs @@ -0,0 +1,177 @@ +//! Snapshot test pinning the `Vec` JSON for the +//! `fixtures/markdown/long-section.md` fixture. +//! +//! This is an integration test. `kb-parse-md` and `kb-normalize` are +//! dev-dep only — `cargo tree -p kb-chunk --depth 1` (default scope, +//! excludes dev-deps) confirms they are not regular deps. The §8 +//! module-boundary rule is preserved. +//! +//! The chunker output is fully deterministic given fixed inputs, so we +//! pin the entire `Vec` JSON. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kb_chunk::MdHeadingV1Chunker; +use kb_core::{ + AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType, + ParserVersion, RawAsset, SourceUri, WorkspacePath, +}; +use kb_normalize::build_canonical_document; +use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("fixtures") + .join("markdown") +} + +fn fixed_asset(workspace_path: &str) -> RawAsset { + let wp = WorkspacePath::new(workspace_path.into()).unwrap(); + RawAsset { + asset_id: AssetId("a".repeat(32)), + source_uri: SourceUri::File(PathBuf::from("/tmp/long-section.md")), + workspace_path: wp, + media_type: MediaType::Markdown, + byte_len: 0, + checksum: Checksum("0".repeat(64)), + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/long-section.md"), + sha: Checksum("0".repeat(64)), + }, + } 
+} + +#[test] +fn long_section_chunks_snapshot() { + let dir = fixtures_dir(); + let bytes = std::fs::read(dir.join("long-section.md")).expect("fixture readable"); + + let asset = fixed_asset("notes/long-section.md"); + let hints = BodyHints { + first_h1: Some("Alpha".into()), + fs_ctime: asset.discovered_at, + fs_mtime: asset.discovered_at, + fallback_lang: Some("en".into()), + }; + let (metadata, fm_span, _fm_warns) = + parse_frontmatter(&bytes, &hints).expect("frontmatter parses"); + let body_offset_lines: u32 = match fm_span { + Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1, + None => 1, + }; + let (blocks, parse_warns) = + parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); + + // Pin parser_version so doc_id / block_ids are reproducible. + let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into()); + let mut metadata = metadata; + metadata.aliases.sort(); + metadata.tags.sort(); + + let doc = + build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns) + .expect("build_canonical_document"); + + // Pin policy so policy_hash and chunk_ids are reproducible. 
+ let policy = ChunkPolicy { + target_tokens: 200, + overlap_tokens: 40, + respect_markdown_headings: true, + chunker_version: ChunkerVersion("md-heading-v1".into()), + }; + + let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let baseline_path = dir.join("long-section.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = + serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "long-section chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. 
///
/// Each of the five runs rebuilds the document from the raw fixture bytes
/// (frontmatter parse, block parse, canonical-document build, chunking) and
/// the resulting chunk ids are compared against the first run. The body
/// offset passed to `parse_blocks` is derived from the frontmatter span in
/// the same way as in the snapshot test, so both tests exercise one pipeline.
#[test]
fn long_section_chunks_are_deterministic() {
    let dir = fixtures_dir();
    let bytes = std::fs::read(dir.join("long-section.md")).expect("fixture readable");

    let asset = fixed_asset("notes/long-section.md");
    let hints = BodyHints {
        first_h1: Some("Alpha".into()),
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
    };

    let policy = ChunkPolicy {
        target_tokens: 200,
        overlap_tokens: 40,
        respect_markdown_headings: true,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
    };
    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());

    // Chunk ids from the first run; every later run must reproduce them.
    let mut baseline: Option<Vec<_>> = None;
    for run in 0..5 {
        let (mut metadata, fm_span, _fm_warns) =
            parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
        // Number of newlines up to the end of the frontmatter span, plus
        // one; 1 when the fixture has no frontmatter.
        let body_offset_lines: u32 = match fm_span {
            Some(span) => {
                let newlines = bytes[..span.end].iter().filter(|b| **b == b'\n').count();
                u32::try_from(newlines).expect("frontmatter line count fits in u32") + 1
            }
            None => 1,
        };
        let (blocks, parse_warns) =
            parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
        metadata.aliases.sort();
        metadata.tags.sort();
        let doc = build_canonical_document(
            &asset,
            metadata,
            blocks,
            &parser_version,
            parse_warns,
        )
        .expect("build_canonical_document");
        let ids: Vec<_> = MdHeadingV1Chunker
            .chunk(&doc, &policy)
            .expect("chunk")
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        // Guard against a vacuous pass: identical empty outputs prove nothing.
        assert!(
            !ids.is_empty(),
            "run {run} produced no chunks; determinism check would be vacuous"
        );
        match &baseline {
            None => baseline = Some(ids),
            Some(prev) => assert_eq!(prev, &ids, "chunk_ids diverged on run {run}"),
        }
    }
}
"line", + "start": 3 + }, + { + "end": 5, + "kind": "line", + "start": 5 + } + ], + "text": "Alpha\n\nAlpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading.\n\nAlpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream.", + "token_estimate": 155 + }, + { + "block_ids": [ + "839080233875e832d37ba80d4b9ef97a", + "1390fa96500a55669123383889c472c4" + ], + "chunk_id": "661a4e5ae606d4327eee70bd4e346b52", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Alpha", + "Alpha Sub" + ], + "source_spans": [ + { + "end": 7, + "kind": "line", + "start": 7 + }, + { + "end": 9, + "kind": "line", + "start": 9 + } + ], + "text": "Alpha Sub\n\nSome prose under the alpha sub-heading. The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading.", + "token_estimate": 52 + }, + { + "block_ids": [ + "7e923dfac89c5d8a31879418ec194026" + ], + "chunk_id": "c8b0f5d9405fa8c36eb70dd9005a29dc", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Alpha", + "Alpha Sub" + ], + "source_spans": [ + { + "end": 53, + "kind": "line", + "start": 11 + } + ], + "text": "// A code block long enough to easily clear any reasonable target_tokens\n// so the never-split-code-block rule is exercised by this fixture. 
The\n// rest of the function body is intentional filler: line after line of\n// content that, were the chunker permitted to split it, would exceed\n// the target threshold and force a break in the middle of the snippet.\nfn long_code_example_one() {\n let mut numbers = Vec::new();\n for i in 0..10 {\n numbers.push(i * 2);\n }\n let mut total = 0_i64;\n for n in &numbers {\n total += *n as i64;\n }\n println!(\"total = {total}\");\n}\n\nfn long_code_example_two() {\n let words = [\"alpha\", \"beta\", \"gamma\", \"delta\", \"epsilon\"];\n for w in words.iter() {\n if w.starts_with('a') {\n println!(\"starts with a: {w}\");\n } else if w.starts_with('b') {\n println!(\"starts with b: {w}\");\n } else if w.starts_with('g') {\n println!(\"starts with g: {w}\");\n } else {\n println!(\"other: {w}\");\n }\n }\n}\n\nfn long_code_example_three() {\n let mut buf = String::new();\n for ch in \"lorem ipsum dolor sit amet\".chars() {\n if ch.is_ascii_alphabetic() {\n buf.push(ch.to_ascii_uppercase());\n }\n }\n println!(\"buf = {buf}\");\n}", + "token_estimate": 427 + }, + { + "block_ids": [ + "53e0b44f880cca19d9f0ff99d917f4f6", + "8f794bb2314006e07fb7650ad28d2bb9" + ], + "chunk_id": "3a01e78c14f3d2e3737d9b0b1411a535", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 55, + "kind": "line", + "start": 55 + }, + { + "end": 57, + "kind": "line", + "start": 57 + } + ], + "text": "Beta\n\nBeta paragraph one. 
The beta section opens with an introductory paragraph that sets up the table appearing further down.", + "token_estimate": 42 + }, + { + "block_ids": [ + "dc1a3da1f6c0de0cc0ecaf93deb3ed30" + ], + "chunk_id": "6acd3b817583ebfd2f6639db2c47b4f0", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 64, + "kind": "line", + "start": 59 + } + ], + "text": "name | kind | note\none | small | first row\ntwo | medium | second row\nthree | large | third row\nfour | huge | fourth row", + "token_estimate": 40 + }, + { + "block_ids": [ + "8b8ba26ffe0e34d4a33c26ce0d302654" + ], + "chunk_id": "f79e267b7e498702e1bd35d2a373e5c5", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 66, + "kind": "line", + "start": 66 + } + ], + "text": "Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma.", + "token_estimate": 48 + }, + { + "block_ids": [ + "a5bb8d0a4f33ef9276f287c6b2876864", + "6358dda59f10540018ef85d776ee2ec2", + "1ee4ebef26433d6d6b585d7bd6497028" + ], + "chunk_id": "880fa807ed5aac2c31b76de8294ed270", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Gamma" + ], + "source_spans": [ + { + "end": 68, + "kind": "line", + "start": 68 + }, + { + "end": 70, + "kind": "line", + "start": 70 + }, + { + "end": 72, + "kind": "line", + "start": 72 + } + ], + "text": "Gamma\n\nGamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further.\n\nGamma paragraph two. 
We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.", + "token_estimate": 157 + }, + { + "block_ids": [ + "1ee4ebef26433d6d6b585d7bd6497028", + "38db826bf29bd64a90a698926d94d83e" + ], + "chunk_id": "6584ae54bbf25ea275ee380648eb3ccb", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Gamma" + ], + "source_spans": [ + { + "end": 72, + "kind": "line", + "start": 72 + }, + { + "end": 74, + "kind": "line", + "start": 74 + } + ], + "text": "Gamma paragraph two. We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.\n\nGamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.", + "token_estimate": 153 + } +] diff --git a/fixtures/markdown/long-section.md b/fixtures/markdown/long-section.md new file mode 100644 index 0000000..07d672b --- /dev/null +++ b/fixtures/markdown/long-section.md @@ -0,0 +1,74 @@ +# Alpha + +Alpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading. + +Alpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream. + +## Alpha Sub + +Some prose under the alpha sub-heading. 
The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading. + +```rust +// A code block long enough to easily clear any reasonable target_tokens +// so the never-split-code-block rule is exercised by this fixture. The +// rest of the function body is intentional filler: line after line of +// content that, were the chunker permitted to split it, would exceed +// the target threshold and force a break in the middle of the snippet. +fn long_code_example_one() { + let mut numbers = Vec::new(); + for i in 0..10 { + numbers.push(i * 2); + } + let mut total = 0_i64; + for n in &numbers { + total += *n as i64; + } + println!("total = {total}"); +} + +fn long_code_example_two() { + let words = ["alpha", "beta", "gamma", "delta", "epsilon"]; + for w in words.iter() { + if w.starts_with('a') { + println!("starts with a: {w}"); + } else if w.starts_with('b') { + println!("starts with b: {w}"); + } else if w.starts_with('g') { + println!("starts with g: {w}"); + } else { + println!("other: {w}"); + } + } +} + +fn long_code_example_three() { + let mut buf = String::new(); + for ch in "lorem ipsum dolor sit amet".chars() { + if ch.is_ascii_alphabetic() { + buf.push(ch.to_ascii_uppercase()); + } + } + println!("buf = {buf}"); +} +``` + +# Beta + +Beta paragraph one. The beta section opens with an introductory paragraph that sets up the table appearing further down. + +| name | kind | note | +|-------|--------|--------------| +| one | small | first row | +| two | medium | second row | +| three | large | third row | +| four | huge | fourth row | + +Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma. + +# Gamma + +Gamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further. + +Gamma paragraph two. 
We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail. + +Gamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.