//! `md-heading-v1` — heading-aware Markdown chunker. use kebab_core::{ Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, SourceSpan, id_for_chunk, }; /// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label /// invalidates every downstream embedding record (design §9), so any change /// must ship with a documented migration plan. const VERSION_LABEL: &str = "md-heading-v1"; /// Bytes-per-token proxy. We over-estimate (smaller divisor → larger /// token count) so that real tokenizers downstream never see a chunk /// exceeding their budget. English averages ~4 bytes/token under BPE, /// Korean averages ~3 bytes/token under E5; picking 3 covers both. const BYTES_PER_TOKEN: usize = 3; /// Maximum hex characters of `blake3(canonical_json(policy))` retained /// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is /// far beyond enough to disambiguate the handful of policy variants a /// single workspace will see. const POLICY_HASH_HEX_LEN: usize = 16; /// Heading-aware Markdown chunker. /// /// Implements [`kebab_core::Chunker`] for Markdown-derived /// [`CanonicalDocument`]s. /// /// **Behavior contract** (design §0 / §14, in priority order): /// /// 1. **Heading boundary first.** Chunks never span a `Block::Heading`. /// The Heading block itself starts a new chunk and is included in that /// chunk's `block_ids` so heading text is retrievable. /// 2. **Never split a code block.** A `Block::Code` always lives in a /// single chunk even when it exceeds `target_tokens`. /// 3. **Tables stay in one chunk.** A `Block::Table` is emitted as a /// single chunk regardless of size — the row-split refinement is /// deferred per the P1-5 task spec. /// 4. **Long sections split by paragraph.** Within a heading section /// the chunker accumulates blocks until adding the next would exceed /// `target_tokens`; it then emits the chunk and seeds the next chunk /// with the previous chunk's tail blocks contributing roughly /// `overlap_tokens` of content (paragraph-level overlap). /// 5. **`heading_path` propagates.** Each chunk's `heading_path` is the /// `heading_path` of its first contributing non-Heading block, or — /// when the chunk leads with (or contains only) a Heading — the /// parent path **plus the heading's own text** so heading-only or /// heading-led chunks never lose their citation context. /// 6. **`source_spans` merge.** A chunk lists every contributing block's /// `source_span` in document order. /// 7. **Version + policy hash recorded.** Each chunk records /// `chunker_version = "md-heading-v1"`. The current `policy_hash` is /// folded into the `chunk_id` recipe (design §4.2) so changing /// `target_tokens` / `overlap_tokens` produces fresh chunk IDs. /// /// `ImageRef` and `AudioRef` blocks are emitted as their own chunks so /// future image/audio search can locate them. Their `text` is the alt / /// caption preview (empty string if unavailable) and `token_estimate = 0`. /// /// **Token-estimate proxy.** Until a real tokenizer is wired in (P3), the /// estimator counts UTF-8 bytes and divides by [`BYTES_PER_TOKEN`]. The /// constant is deliberately small (3) so the proxy *over*-estimates token /// count — chunks sized against this proxy are guaranteed to fit in any /// real BPE tokenizer's budget for English (~4 bytes/token) or Korean /// (~3 bytes/token under E5/M-BERT). See [`BYTES_PER_TOKEN`] for rationale. /// /// **`policy.respect_markdown_headings`.** This field flows into /// `policy_hash` (so flipping it yields fresh chunk IDs), but the /// chunker variant `md-heading-v1` unconditionally treats headings as /// boundaries by design — the `md-heading-v1` name is the contract. To /// disable heading awareness, ship a different `chunker_version`; none /// is shipped in P1-5. #[derive(Clone, Copy, Debug, Default)] pub struct MdHeadingV1Chunker; impl Chunker for MdHeadingV1Chunker { fn chunker_version(&self) -> ChunkerVersion { ChunkerVersion(VERSION_LABEL.to_string()) } /// Compute the policy hash folded into `chunk_id` per design §4.2. /// /// # Panics /// /// Panics if canonical JSON serialization of `ChunkPolicy` fails. /// This is unreachable in practice — `ChunkPolicy` is composed of /// owned primitives (`usize`, `bool`, owned `String`) and /// `serde_json_canonicalizer::to_vec` only fails on /// non-serializable values such as non-finite floats or maps with /// non-string keys, neither of which can be constructed via /// `ChunkPolicy`'s public surface. The `expect` is preserved as a /// future-proofing guard against drift if `ChunkPolicy` ever gains /// a field with such a property. fn policy_hash(&self, policy: &ChunkPolicy) -> String { let bytes = serde_json_canonicalizer::to_vec(policy) .expect("canonical JSON serialization of ChunkPolicy must not fail"); let hex = blake3::hash(&bytes).to_hex().to_string(); hex[..POLICY_HASH_HEX_LEN].to_string() } fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { let policy_hash = self.policy_hash(policy); let chunker_version = self.chunker_version(); let mut out: Vec = Vec::new(); // Running accumulator: the paragraphs/lists/quotes (and the // optional leading heading) that will be glued into the next // emitted chunk. let mut acc = ChunkAcc::default(); for block in &doc.blocks { match block { Block::Heading(_) => { // §0/§14 priority 1: heading is a hard boundary. // Flush whatever has accumulated, then seed a new // accumulator that owns this heading. flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); acc.push_block(block); } Block::Code(_) | Block::Table(_) => { // Atomic non-splittable text blocks. Flush running // accumulator, then emit the atomic block as its // own chunk. (Code never splits per priority 2; // tables stay single per priority 3.) flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); let mut single = ChunkAcc::default(); single.push_block(block); flush(&mut single, doc, &chunker_version, &policy_hash, &mut out); } Block::ImageRef(_) | Block::AudioRef(_) => { // Independent searchable artifacts. token_estimate=0 // is enforced inside `build_chunk` for these kinds. flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); let mut single = ChunkAcc::default(); single.push_block(block); flush(&mut single, doc, &chunker_version, &policy_hash, &mut out); } Block::Paragraph(_) | Block::List(_) | Block::Quote(_) => { // Soft-split candidates. If adding this block would // exceed target_tokens (and we already have at least // one non-heading block in the accumulator), emit // the current chunk and seed the next one with // overlap from the prior tail. let next_tokens = estimate_block_tokens(block); // Note: `acc.text_tokens` already includes the prior // chunk's overlap seed. The clamp in // `collect_overlap_seed` keeps seed ≤ target/2, so // a flush here never produces a chunk smaller than // the seed budget. let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens && acc.has_non_heading_content(); if would_exceed { let overlap_seed = collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens); flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); // Seed next accumulator with the prior chunk's // tail blocks (paragraph-level overlap). The // heading is *not* re-included here — it lives // on the prior chunk. The follow-on chunk's // heading_path is taken from the first seeded // block (which carries the same path, as it sat // under the same heading). for b in overlap_seed { acc.push_block(b); } } acc.push_block(block); } } } flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); tracing::debug!( target: "kebab-chunk", doc_id = %doc.doc_id, chunks = out.len(), "md-heading-v1 chunked", ); Ok(out) } } /// Internal accumulator: pointers to blocks (lifetime-bound to the /// `CanonicalDocument`) plus the running token estimate of their text. #[derive(Default)] struct ChunkAcc<'a> { blocks: Vec<&'a Block>, text_tokens: usize, } impl<'a> ChunkAcc<'a> { fn push_block(&mut self, b: &'a Block) { self.text_tokens += estimate_block_tokens(b); self.blocks.push(b); } fn is_empty(&self) -> bool { self.blocks.is_empty() } /// True if any non-heading block sits in the accumulator. Used to /// avoid splitting a chunk that contains only its leading heading /// (which would emit a heading-only chunk before any prose). fn has_non_heading_content(&self) -> bool { self.blocks.iter().any(|b| !matches!(b, Block::Heading(_))) } } /// Drain `acc` into a fresh `Chunk` and push to `out`. No-op when empty. fn flush( acc: &mut ChunkAcc<'_>, doc: &CanonicalDocument, chunker_version: &ChunkerVersion, policy_hash: &str, out: &mut Vec, ) { if acc.is_empty() { return; } let blocks = std::mem::take(&mut acc.blocks); acc.text_tokens = 0; out.push(build_chunk(doc, &blocks, chunker_version, policy_hash)); } /// Collect the trailing blocks of `acc` (in document order) whose /// combined token estimate fits under the seed budget. The heading /// block (if it leads the accumulator) is excluded from the seed — /// re-emitting the heading would conflate it with the next chunk's /// own heading_path provenance. /// /// The seed budget is clamped to `min(overlap_tokens, target_tokens / 2)`. /// Without the clamp, an `overlap_tokens >= target_tokens` policy /// degenerates into 1-block-per-chunk: the seed already exceeds budget /// before any new content lands, so the very next paragraph trips the /// `would_exceed` flush. Halving the target guarantees the seed leaves /// at least target/2 worth of room for fresh content in the next chunk. fn collect_overlap_seed<'a>( acc: &ChunkAcc<'a>, overlap_tokens: usize, target_tokens: usize, ) -> Vec<&'a Block> { let seed_budget = overlap_tokens.min(target_tokens / 2); if seed_budget == 0 { return Vec::new(); } let mut taken = Vec::new(); let mut budget = seed_budget; for b in acc.blocks.iter().rev() { if matches!(b, Block::Heading(_)) { // Don't propagate the heading itself into the next chunk; // its `heading_path` carries naturally on the next blocks // (kb-normalize stamps every block under a heading with // that heading's path). continue; } let est = estimate_block_tokens(b); if est > budget && !taken.is_empty() { break; } taken.push(*b); budget = budget.saturating_sub(est); if budget == 0 { break; } } taken.reverse(); taken } /// Construct a `Chunk` from a non-empty contiguous slice of blocks. fn build_chunk( doc: &CanonicalDocument, blocks: &[&Block], chunker_version: &ChunkerVersion, policy_hash: &str, ) -> Chunk { debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block"); let block_ids: Vec = blocks.iter().map(|b| common(b).block_id.clone()).collect(); let source_spans: Vec = blocks .iter() .map(|b| common(b).source_span.clone()) .collect(); // heading_path: pick the first non-Heading block's heading_path // (which already includes every parent heading per kb-normalize). // When the FIRST block is a Heading — either a heading-only chunk, // or a chunk that leads with `# H1` immediately followed by another // Heading or atomic block — the Heading block's own // `common.heading_path` records only its *parents* (kb-normalize // does not include a heading inside its own path). We synthesize // the leading heading into the path so the citation context is not // lost on patterns like `# Alpha\n## Beta\n...`. let heading_path = match blocks[0] { Block::Heading(h) => { let mut path = h.common.heading_path.clone(); path.push(h.text.clone()); path } _ => common(blocks[0]).heading_path.clone(), }; // Text rendering: simple double-newline join of each block's // contribution. We deliberately pick a stable, low-fidelity // representation — embedding-quality rewrites land in P3. let mut text = String::new(); let mut is_image_or_audio_only = true; for (i, b) in blocks.iter().enumerate() { let part = render_block_text(b); if !matches!(b, Block::ImageRef(_) | Block::AudioRef(_)) { is_image_or_audio_only = false; } if i > 0 { text.push_str("\n\n"); } text.push_str(&part); } let token_estimate = if is_image_or_audio_only { 0 } else { // Token estimate is bytes / BYTES_PER_TOKEN, rounded up so the // proxy never under-counts. text.len().div_ceil(BYTES_PER_TOKEN) }; let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash); Chunk { chunk_id, doc_id: DocumentId(doc.doc_id.0.clone()), block_ids, tokenized_korean_text: crate::tokenize_korean_morphological(&text), text, heading_path, source_spans, token_estimate, chunker_version: chunker_version.clone(), policy_hash: policy_hash.to_string(), } } /// Render a block's contribution to a chunk's `text`. The rendering is /// deliberately minimal — embedding-time normalization is a P3 concern. fn render_block_text(b: &Block) -> String { match b { Block::Heading(h) => h.text.clone(), Block::Paragraph(p) | Block::Quote(p) => p.text.clone(), Block::List(l) => l .items .iter() .map(|it| it.text.as_str()) .collect::>() .join("\n"), Block::Code(c) => c.code.clone(), Block::Table(t) => { // Headers row joined with " | ", then each row likewise. let mut s = t.headers.join(" | "); for row in &t.rows { s.push('\n'); s.push_str(&row.join(" | ")); } s } // ImageRef text portion follows the P6-4 (β) plain-concat // contract — `[alt, ocr.joined, caption.text]` joined by // `\n\n`, dropping empty parts. Filename fallback for empty // alt keeps lexical search hits on filenames working even when // P6-1's filename auto-fill is bypassed. Block::ImageRef(i) => { let alt = if i.alt.is_empty() { // P6-1 falls back to filename so this branch is // defensive — keep it lest a future test fixture or // synthetic block path skip the auto-fill. i.src .rsplit('/') .next() .filter(|s| !s.is_empty()) .unwrap_or("[image]") .to_string() } else { i.alt.clone() }; let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str()); let cap = i.caption.as_ref().map_or("", |c| c.text.as_str()); [alt.as_str(), ocr, cap] .iter() .filter(|s| !s.is_empty()) .copied() .collect::>() .join("\n\n") } // AudioRef has no caption preview yet (transcript joins land // in P8). Empty string per task spec. Block::AudioRef(_) => String::new(), } } fn estimate_block_tokens(b: &Block) -> usize { match b { // ImageRef / AudioRef contribute 0 — they are independent // chunks and never participate in size accounting. Block::ImageRef(_) | Block::AudioRef(_) => 0, _ => render_block_text(b).len().div_ceil(BYTES_PER_TOKEN), } } /// Borrow the `CommonBlock` of any [`Block`] variant. fn common(b: &Block) -> &kebab_core::CommonBlock { match b { Block::Heading(h) => &h.common, Block::Paragraph(t) | Block::Quote(t) => &t.common, Block::List(l) => &l.common, Block::Code(c) => &c.common, Block::Table(t) => &t.common, Block::ImageRef(i) => &i.common, Block::AudioRef(a) => &a.common, } } #[cfg(test)] mod tests { use super::*; use kebab_core::{ AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block, }; use time::OffsetDateTime; fn make_doc(blocks: Vec) -> CanonicalDocument { CanonicalDocument { doc_id: kebab_core::DocumentId("d".repeat(32)), source_asset_id: AssetId("a".repeat(32)), workspace_path: WorkspacePath::new("notes/test.md".into()).unwrap(), title: "Test".into(), lang: Lang("en".into()), blocks, metadata: Metadata { aliases: vec![], tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), source_type: SourceType::Note, trust_level: TrustLevel::Primary, user_id_alias: None, user: Default::default(), repo: None, git_branch: None, git_commit: None, code_lang: None, source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: kebab_core::ParserVersion("test-parser-0".into()), schema_version: 1, doc_version: 1, last_chunker_version: None, last_embedding_version: None, } } fn doc_id() -> kebab_core::DocumentId { kebab_core::DocumentId("d".repeat(32)) } fn span(start: u32, end: u32) -> SourceSpan { SourceSpan::Line { start, end } } fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock { CommonBlock { block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s), heading_path: heading_path.to_vec(), source_span: s, } } fn heading(level: u8, text: &str, ordinal: u32, line: u32) -> Block { Block::Heading(HeadingBlock { common: common_for("heading", &[], ordinal, span(line, line)), level, text: text.into(), }) } /// Heading variant that carries a parent path — kb-normalize stamps /// every block under `# Alpha` with `heading_path = []` for the H1 /// itself but `["Alpha"]` for the H2 that follows. Tests covering /// the heading-only chunk path (I2) need that asymmetry. fn heading_with_parents( level: u8, text: &str, parents: &[&str], ordinal: u32, line: u32, ) -> Block { let hp: Vec = parents.iter().map(|s| (*s).into()).collect(); Block::Heading(HeadingBlock { common: common_for("heading", &hp, ordinal, span(line, line)), level, text: text.into(), }) } fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::Paragraph(TextBlock { common: common_for("paragraph", &hp, ordinal, span(line, line)), text: text.into(), inlines: vec![], }) } fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::Code(CodeBlock { common: common_for("code", &hp, ordinal, s), lang: Some("rust".into()), code: code.into(), }) } fn table( headers: Vec<&str>, rows: Vec>, heading_path: &[&str], ordinal: u32, s: SourceSpan, ) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::Table(TableBlock { common: common_for("table", &hp, ordinal, s), headers: headers.into_iter().map(String::from).collect(), rows: rows .into_iter() .map(|r| r.into_iter().map(String::from).collect()) .collect(), }) } fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::ImageRef(ImageRefBlock { common: common_for("imageref", &hp, ordinal, span(line, line)), asset_id: None, src: "img.png".into(), alt: alt.into(), ocr: None, caption: None, }) } fn default_policy(target: usize, overlap: usize) -> ChunkPolicy { ChunkPolicy { target_tokens: target, overlap_tokens: overlap, respect_markdown_headings: true, chunker_version: ChunkerVersion(VERSION_LABEL.into()), } } #[test] fn chunker_version_is_md_heading_v1() { assert_eq!( MdHeadingV1Chunker.chunker_version(), ChunkerVersion(VERSION_LABEL.to_string()) ); } #[test] fn policy_hash_is_deterministic_and_16_hex() { let p = default_policy(500, 80); let h1 = MdHeadingV1Chunker.policy_hash(&p); let h2 = MdHeadingV1Chunker.policy_hash(&p); assert_eq!(h1, h2); assert_eq!(h1.len(), POLICY_HASH_HEX_LEN); assert!(h1.bytes().all(|b| b.is_ascii_hexdigit())); } #[test] fn policy_hash_differs_when_policy_differs() { let p1 = default_policy(500, 80); let p2 = default_policy(500, 0); assert_ne!( MdHeadingV1Chunker.policy_hash(&p1), MdHeadingV1Chunker.policy_hash(&p2) ); } /// Heading boundary respected: two H2 sections produce separate /// chunks; no chunk's block_ids straddle the H2→H2 boundary. #[test] fn heading_boundary_respected() { let blocks = vec![ heading(2, "First", 0, 1), paragraph("body of first", &["First"], 0, 2), heading(2, "Second", 1, 3), paragraph("body of second", &["Second"], 0, 4), ]; let doc = make_doc(blocks); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(10_000, 0)) .unwrap(); assert_eq!(chunks.len(), 2); // First chunk = (heading "First", paragraph) assert_eq!(chunks[0].block_ids.len(), 2); // Second chunk = (heading "Second", paragraph) assert_eq!(chunks[1].block_ids.len(), 2); // heading_path on chunk 0 belongs to "First" section. assert_eq!(chunks[0].heading_path, vec!["First".to_string()]); assert_eq!(chunks[1].heading_path, vec!["Second".to_string()]); } /// A code block of ~800 tokens (≈2400 bytes) stays in a single /// chunk even when target=500. #[test] fn code_block_never_splits() { // 2400 bytes ≈ 800 tokens at BYTES_PER_TOKEN=3. let big = "x".repeat(2400); let blocks = vec![code_block(&big, &[], 0, span(1, 50))]; let doc = make_doc(blocks); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(500, 80)) .unwrap(); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].block_ids.len(), 1); assert!(chunks[0].token_estimate > 500); } /// A table of size < 2× target stays in a single chunk. #[test] fn table_stays_single_chunk_when_small() { let t = table( vec!["a", "b", "c"], vec![vec!["1", "2", "3"], vec!["4", "5", "6"]], &[], 0, span(1, 4), ); let blocks = vec![t]; let doc = make_doc(blocks); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(500, 80)) .unwrap(); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].block_ids.len(), 1); } /// A long sequence of paragraphs splits at target_tokens with /// overlap_tokens worth of seeded paragraph from the prior chunk. #[test] fn long_section_splits_with_overlap() { // Each paragraph is 60 bytes ≈ 20 tokens. target=50, overlap=20 // → after ~3 paragraphs we hit the target; the next chunk // starts seeded with one paragraph from the prior tail. let mut bs = vec![heading(2, "Long", 0, 1)]; for i in 0..6u32 { bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2)); } let doc = make_doc(bs); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(50, 20)) .unwrap(); assert!( chunks.len() >= 2, "expected ≥2 chunks, got {}: {chunks:#?}", chunks.len() ); // Every chunk lives under the same heading_path "Long". for c in &chunks { assert_eq!(c.heading_path, vec!["Long".to_string()]); } // Overlap propagates: the last block_id of chunk N appears in // chunk N+1's block_ids (paragraph-level overlap rule). for w in chunks.windows(2) { let prev_tail = w[0].block_ids.last().unwrap(); assert!( w[1].block_ids.contains(prev_tail), "chunk N+1 must seed from chunk N's tail; \ prev_tail={prev_tail:?}, next ids={:?}", w[1].block_ids ); } } /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text /// joined by `\n\n`, dropping empty parts. Verifies all four /// (alt-only, alt+ocr, alt+caption, alt+ocr+caption) shapes. #[test] fn image_ref_p6_4_plain_concat_drops_empty_parts() { use kebab_core::{ModelCaption, OcrText}; let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| { Block::ImageRef(ImageRefBlock { common: common_for("imageref", &[], 0, span(1, 1)), asset_id: None, src: "img.png".into(), alt: alt.into(), ocr: ocr.map(|t| OcrText { joined: t.into(), regions: vec![], engine: "test".into(), engine_version: "v1".into(), }), caption: cap.map(|t| ModelCaption { text: t.into(), model: "m".into(), model_version: "v".into(), }), }) }; // alt-only — no separators between empty parts. assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png"); // alt + ocr — joined by exactly one `\n\n`. assert_eq!( render_block_text(&mk("photo.png", Some("Hello"), None)), "photo.png\n\nHello" ); // alt + caption. assert_eq!( render_block_text(&mk("photo.png", None, Some("a red square"))), "photo.png\n\na red square" ); // alt + ocr + caption — three parts joined by `\n\n` each. assert_eq!( render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))), "photo.png\n\nHello\n\na red square" ); // empty alt — falls back to filename derived from `src`. let blk = mk("", Some("text from image"), None); assert_eq!( render_block_text(&blk), "img.png\n\ntext from image", "empty alt must fall back to the basename of `src`" ); } /// ImageRef → own chunk, token_estimate=0. #[test] fn image_ref_emits_own_chunk_zero_tokens() { let blocks = vec![ heading(2, "With image", 0, 1), paragraph("intro", &["With image"], 0, 2), image_ref("a cat", &["With image"], 0, 3), paragraph("after", &["With image"], 1, 4), ]; let doc = make_doc(blocks); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(10_000, 0)) .unwrap(); // Expect: (heading + intro), (image), (after). The image must // be its own chunk and carry token_estimate=0. assert!(chunks.len() >= 3, "unexpected chunk count: {chunks:#?}"); let img_chunk = chunks .iter() .find(|c| c.text == "a cat") .expect("image chunk present"); assert_eq!(img_chunk.token_estimate, 0); assert_eq!(img_chunk.block_ids.len(), 1); } /// Identical input + identical policy → identical chunk_ids over /// 1000 iterations. #[test] fn deterministic_chunk_ids_1000() { let blocks = vec![ heading(2, "Det", 0, 1), paragraph("body 1", &["Det"], 0, 2), paragraph("body 2", &["Det"], 1, 3), heading(2, "Det 2", 1, 4), paragraph("body 3", &["Det 2"], 0, 5), ]; let doc = make_doc(blocks); let policy = default_policy(50, 10); let baseline: Vec = MdHeadingV1Chunker .chunk(&doc, &policy) .unwrap() .into_iter() .map(|c| c.chunk_id.0) .collect(); for _ in 0..1000 { let again: Vec = MdHeadingV1Chunker .chunk(&doc, &policy) .unwrap() .into_iter() .map(|c| c.chunk_id.0) .collect(); assert_eq!(again, baseline); } } /// I2 regression: when a Heading is followed immediately by another /// Heading or atomic block (no intervening prose), the resulting /// heading-only / heading-led chunk must carry the heading text in /// its own `heading_path`. Pattern: `# Alpha`, `## Beta`, code. /// /// Before the fix, chunk[0] (Heading-only "Alpha") would have /// `heading_path = []` because `kb-normalize` does not stamp a /// heading inside its own path; the chunker fell back to the /// heading's parent path. After the fix it is `["Alpha"]`. /// /// `chunk_id` recipe (`doc_id, chunker_version, block_ids, /// policy_hash`) does NOT include `heading_path`, so this fix does /// NOT shift chunk_ids — only `heading_path` fields. #[test] fn heading_only_chunk_carries_self_in_path() { // # Alpha (H1, no parents) // ## Beta (H2, parent = ["Alpha"]) // ```rust ... ``` (code, heading_path = ["Alpha", "Beta"]) let blocks = vec![ heading_with_parents(1, "Alpha", &[], 0, 1), heading_with_parents(2, "Beta", &["Alpha"], 0, 2), code_block("fn x() {}", &["Alpha", "Beta"], 0, span(3, 3)), ]; let doc = make_doc(blocks); let chunks = MdHeadingV1Chunker .chunk(&doc, &default_policy(10_000, 0)) .unwrap(); // Three chunks: Heading-only Alpha, Heading-only Beta, code. assert_eq!(chunks.len(), 3, "got {chunks:#?}"); assert_eq!(chunks[0].heading_path, vec!["Alpha".to_string()]); assert_eq!( chunks[1].heading_path, vec!["Alpha".to_string(), "Beta".to_string()] ); assert_eq!( chunks[2].heading_path, vec!["Alpha".to_string(), "Beta".to_string()] ); } /// I3 regression: a pathological policy with /// `overlap_tokens >= target_tokens` must NOT degenerate into /// 1-block-per-chunk. The seed budget is clamped to `target/2`, /// guaranteeing every flushed chunk has space for fresh content. #[test] fn overlap_clamped_when_overlap_exceeds_target() { // 5 paragraphs of ~20 tokens each (60 bytes / 3 BPT). // target = 50, overlap = 200 (4× target → would trip flush // immediately without clamp). let mut bs = vec![heading_with_parents(2, "Long", &[], 0, 1)]; for i in 0..5u32 { bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2)); } let doc = make_doc(bs); let policy = ChunkPolicy { target_tokens: 50, overlap_tokens: 200, respect_markdown_headings: true, chunker_version: ChunkerVersion(VERSION_LABEL.into()), }; let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).unwrap(); // Without the clamp, every chunk after the first would have // exactly 1 paragraph (because seed alone already exceeds // target and acc.has_non_heading_content() is true the moment // any seed lands). With the clamp, follow-on chunks must hold // at least the seed paragraph + the new paragraph = ≥2 blocks. for (i, c) in chunks.iter().enumerate() { // The very first chunk includes the heading + first para // (no seed), so it is also ≥2. Subsequent chunks must be // seed+new ≥ 2. assert!( c.block_ids.len() >= 2, "chunk {i} degenerated to {} block(s); pathology not \ prevented: {chunks:#?}", c.block_ids.len() ); } } }