p1-5: implement md-heading-v1 chunking rules

Fills in MdHeadingV1Chunker::chunk with the priority-ordered ruleset from the design (§0 / §14): 1. Heading is a hard boundary; the heading itself starts and is included in its chunk so heading text is retrievable. 2. Code blocks never split, regardless of size. 3. Tables stay single-chunk (row-split deferred per task spec). 4. Long sections split at target_tokens with paragraph-level overlap_tokens worth of seeded tail blocks. 5. ImageRef / AudioRef each become their own chunk with token_estimate = 0. 6. heading_path lifts from the first contributing non-Heading block; source_spans concatenate in document order. 7. chunk_id derives from id_for_chunk(doc_id, chunker_version, block_ids, policy_hash) per §4.2. Covers the unit + determinism rows of the P1-5 test plan: heading boundary respected, 800-token code block stays single, small table stays single, long paragraph chain splits with overlap, ImageRef chunk has token_estimate=0, and 1000-iter chunk_id determinism. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 16:30:26 +00:00
parent 8142449eb7
commit 0237022d0e
1 changed files with 593 additions and 37 deletions
--- a/crates/kb-chunk/src/md_heading_v1.rs
+++ b/crates/kb-chunk/src/md_heading_v1.rs
@@ -1,7 +1,8 @@
 //! `md-heading-v1` — heading-aware Markdown chunker.

 use kb_core::{
-    CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion,
+    Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
+    ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
 };

 /// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
@@ -9,6 +10,18 @@ use kb_core::{
 /// must ship with a documented migration plan.
 const VERSION_LABEL: &str = "md-heading-v1";

+/// Bytes-per-token proxy. We over-estimate (smaller divisor → larger
+/// token count) so that real tokenizers downstream never see a chunk
+/// exceeding their budget. English averages ~4 bytes/token under BPE,
+/// Korean averages ~3 bytes/token under E5; picking 3 covers both.
+const BYTES_PER_TOKEN: usize = 3;
+
+/// Maximum hex characters of `blake3(canonical_json(policy))` retained
+/// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is
+/// far beyond enough to disambiguate the handful of policy variants a
+/// single workspace will see.
+const POLICY_HASH_HEX_LEN: usize = 16;
+
 /// Heading-aware Markdown chunker.
 ///
 /// Implements [`kb_core::Chunker`] for Markdown-derived
@@ -48,22 +61,10 @@ const VERSION_LABEL: &str = "md-heading-v1";
 /// constant is deliberately small (3) so the proxy *over*-estimates token
 /// count — chunks sized against this proxy are guaranteed to fit in any
 /// real BPE tokenizer's budget for English (~4 bytes/token) or Korean
-/// (~3 bytes/token under E5/M-BERT). See `BYTES_PER_TOKEN` for rationale.
+/// (~3 bytes/token under E5/M-BERT). See [`BYTES_PER_TOKEN`] for rationale.
 #[derive(Clone, Copy, Debug, Default)]
 pub struct MdHeadingV1Chunker;

-/// Bytes-per-token proxy. We over-estimate (smaller divisor → larger
-/// token count) so that real tokenizers downstream never see a chunk
-/// exceeding their budget. English averages ~4 bytes/token under BPE,
-/// Korean averages ~3 bytes/token under E5; picking 3 covers both.
-const BYTES_PER_TOKEN: usize = 3;
-
-/// Maximum hex characters of `blake3(canonical_json(policy))` retained
-/// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is
-/// far beyond enough to disambiguate the handful of policy variants a
-/// single workspace will see.
-const POLICY_HASH_HEX_LEN: usize = 16;
-
 impl Chunker for MdHeadingV1Chunker {
    fn chunker_version(&self) -> ChunkerVersion {
        ChunkerVersion(VERSION_LABEL.to_string())
@@ -78,16 +79,436 @@ impl Chunker for MdHeadingV1Chunker {

    fn chunk(
        &self,
-        _doc: &CanonicalDocument,
-        _policy: &ChunkPolicy,
+        doc: &CanonicalDocument,
+        policy: &ChunkPolicy,
    ) -> anyhow::Result<Vec<Chunk>> {
-        Ok(Vec::new())
+        let policy_hash = self.policy_hash(policy);
+        let chunker_version = self.chunker_version();
+        let mut out: Vec<Chunk> = Vec::new();
+
+        // Running accumulator: the paragraphs/lists/quotes (and the
+        // optional leading heading) that will be glued into the next
+        // emitted chunk.
+        let mut acc = ChunkAcc::default();
+
+        for block in &doc.blocks {
+            match block {
+                Block::Heading(_) => {
+                    // §0/§14 priority 1: heading is a hard boundary.
+                    // Flush whatever has accumulated, then seed a new
+                    // accumulator that owns this heading.
+                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+                    acc.push_block(block);
+                }
+                Block::Code(_) | Block::Table(_) => {
+                    // Atomic non-splittable text blocks. Flush running
+                    // accumulator, then emit the atomic block as its
+                    // own chunk. (Code never splits per priority 2;
+                    // tables stay single per priority 3.)
+                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+                    let mut single = ChunkAcc::default();
+                    single.push_block(block);
+                    flush(&mut single, doc, &chunker_version, &policy_hash, &mut out);
+                }
+                Block::ImageRef(_) | Block::AudioRef(_) => {
+                    // Independent searchable artifacts. token_estimate=0
+                    // is enforced inside `build_chunk` for these kinds.
+                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+                    let mut single = ChunkAcc::default();
+                    single.push_block(block);
+                    flush(&mut single, doc, &chunker_version, &policy_hash, &mut out);
+                }
+                Block::Paragraph(_) | Block::List(_) | Block::Quote(_) => {
+                    // Soft-split candidates. If adding this block would
+                    // exceed target_tokens (and we already have at least
+                    // one non-heading block in the accumulator), emit
+                    // the current chunk and seed the next one with
+                    // overlap from the prior tail.
+                    let next_tokens = estimate_block_tokens(block);
+                    let would_exceed = acc.text_tokens + next_tokens
+                        > policy.target_tokens
+                        && acc.has_non_heading_content();
+                    if would_exceed {
+                        let overlap_seed = collect_overlap_seed(
+                            &acc,
+                            policy.overlap_tokens,
+                        );
+                        flush(
+                            &mut acc,
+                            doc,
+                            &chunker_version,
+                            &policy_hash,
+                            &mut out,
+                        );
+                        // Seed next accumulator with the prior chunk's
+                        // tail blocks (paragraph-level overlap). The
+                        // heading is *not* re-included here — it lives
+                        // on the prior chunk. The follow-on chunk's
+                        // heading_path is taken from the first seeded
+                        // block (which carries the same path, as it sat
+                        // under the same heading).
+                        for b in overlap_seed {
+                            acc.push_block(b);
+                        }
+                    }
+                    acc.push_block(block);
+                }
+            }
+        }
+        flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
+
+        tracing::debug!(
+            target: "kb-chunk",
+            doc_id = %doc.doc_id,
+            chunks = out.len(),
+            "md-heading-v1 chunked",
+        );
+
+        Ok(out)
+    }
+}
+
+/// Internal accumulator: pointers to blocks (lifetime-bound to the
+/// `CanonicalDocument`) plus the running token estimate of their text.
+#[derive(Default)]
+struct ChunkAcc<'a> {
+    blocks: Vec<&'a Block>,
+    text_tokens: usize,
+}
+
+impl<'a> ChunkAcc<'a> {
+    fn push_block(&mut self, b: &'a Block) {
+        self.text_tokens += estimate_block_tokens(b);
+        self.blocks.push(b);
+    }
+
+    fn is_empty(&self) -> bool {
+        self.blocks.is_empty()
+    }
+
+    /// True if any non-heading block sits in the accumulator. Used to
+    /// avoid splitting a chunk that contains only its leading heading
+    /// (which would emit a heading-only chunk before any prose).
+    fn has_non_heading_content(&self) -> bool {
+        self.blocks.iter().any(|b| !matches!(b, Block::Heading(_)))
+    }
+}
+
+/// Drain `acc` into a fresh `Chunk` and push to `out`. No-op when empty.
+fn flush(
+    acc: &mut ChunkAcc<'_>,
+    doc: &CanonicalDocument,
+    chunker_version: &ChunkerVersion,
+    policy_hash: &str,
+    out: &mut Vec<Chunk>,
+) {
+    if acc.is_empty() {
+        return;
+    }
+    let blocks = std::mem::take(&mut acc.blocks);
+    acc.text_tokens = 0;
+    out.push(build_chunk(doc, &blocks, chunker_version, policy_hash));
+}
+
+/// Collect the trailing blocks of `acc` (in document order) whose
+/// combined token estimate fits under `overlap_tokens`. The heading
+/// block (if it leads the accumulator) is excluded from the seed —
+/// re-emitting the heading would conflate it with the next chunk's
+/// own heading_path provenance.
+fn collect_overlap_seed<'a>(
+    acc: &ChunkAcc<'a>,
+    overlap_tokens: usize,
+) -> Vec<&'a Block> {
+    if overlap_tokens == 0 {
+        return Vec::new();
+    }
+    let mut taken = Vec::new();
+    let mut budget = overlap_tokens;
+    for b in acc.blocks.iter().rev() {
+        if matches!(b, Block::Heading(_)) {
+            // Don't propagate the heading itself into the next chunk;
+            // its `heading_path` carries naturally on the next blocks
+            // (kb-normalize stamps every block under a heading with
+            // that heading's path).
+            continue;
+        }
+        let est = estimate_block_tokens(b);
+        if est > budget && !taken.is_empty() {
+            break;
+        }
+        taken.push(*b);
+        budget = budget.saturating_sub(est);
+        if budget == 0 {
+            break;
+        }
+    }
+    taken.reverse();
+    taken
+}
+
+/// Construct a `Chunk` from a non-empty contiguous slice of blocks.
+fn build_chunk(
+    doc: &CanonicalDocument,
+    blocks: &[&Block],
+    chunker_version: &ChunkerVersion,
+    policy_hash: &str,
+) -> Chunk {
+    debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
+
+    let block_ids: Vec<BlockId> =
+        blocks.iter().map(|b| common(b).block_id.clone()).collect();
+    let source_spans: Vec<SourceSpan> =
+        blocks.iter().map(|b| common(b).source_span.clone()).collect();
+
+    // heading_path: pick the first non-Heading block's heading_path; if
+    // every block is a Heading (only the heading itself made it into
+    // this chunk), use that heading's heading_path.
+    let heading_path = blocks
+        .iter()
+        .find(|b| !matches!(b, Block::Heading(_)))
+        .map(|b| common(b).heading_path.clone())
+        .unwrap_or_else(|| common(blocks[0]).heading_path.clone());
+
+    // Text rendering: simple double-newline join of each block's
+    // contribution. We deliberately pick a stable, low-fidelity
+    // representation — embedding-quality rewrites land in P3.
+    let mut text = String::new();
+    let mut is_image_or_audio_only = true;
+    for (i, b) in blocks.iter().enumerate() {
+        let part = render_block_text(b);
+        if !matches!(b, Block::ImageRef(_) | Block::AudioRef(_)) {
+            is_image_or_audio_only = false;
+        }
+        if i > 0 {
+            text.push_str("\n\n");
+        }
+        text.push_str(&part);
+    }
+
+    let token_estimate = if is_image_or_audio_only {
+        0
+    } else {
+        // Token estimate is bytes / BYTES_PER_TOKEN, rounded up so the
+        // proxy never under-counts.
+        text.len().div_ceil(BYTES_PER_TOKEN)
+    };
+
+    let chunk_id = id_for_chunk(
+        &doc.doc_id,
+        chunker_version,
+        &block_ids,
+        policy_hash,
+    );
+
+    Chunk {
+        chunk_id,
+        doc_id: DocumentId(doc.doc_id.0.clone()),
+        block_ids,
+        text,
+        heading_path,
+        source_spans,
+        token_estimate,
+        chunker_version: chunker_version.clone(),
+    }
+}
+
+/// Render a block's contribution to a chunk's `text`. The rendering is
+/// deliberately minimal — embedding-time normalization is a P3 concern.
+fn render_block_text(b: &Block) -> String {
+    match b {
+        Block::Heading(h) => h.text.clone(),
+        Block::Paragraph(p) | Block::Quote(p) => p.text.clone(),
+        Block::List(l) => l
+            .items
+            .iter()
+            .map(|it| it.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n"),
+        Block::Code(c) => c.code.clone(),
+        Block::Table(t) => {
+            // Headers row joined with " | ", then each row likewise.
+            let mut s = t.headers.join(" | ");
+            for row in &t.rows {
+                s.push('\n');
+                s.push_str(&row.join(" | "));
+            }
+            s
+        }
+        // ImageRef text portion = alt (per task spec). Fall back to
+        // model caption text if alt is empty.
+        Block::ImageRef(i) => {
+            if !i.alt.is_empty() {
+                i.alt.clone()
+            } else {
+                i.caption
+                    .as_ref()
+                    .map(|c| c.text.clone())
+                    .unwrap_or_default()
+            }
+        }
+        // AudioRef has no caption preview yet (transcript joins land
+        // in P8). Empty string per task spec.
+        Block::AudioRef(_) => String::new(),
+    }
+}
+
+fn estimate_block_tokens(b: &Block) -> usize {
+    match b {
+        // ImageRef / AudioRef contribute 0 — they are independent
+        // chunks and never participate in size accounting.
+        Block::ImageRef(_) | Block::AudioRef(_) => 0,
+        _ => render_block_text(b).len().div_ceil(BYTES_PER_TOKEN),
+    }
+}
+
+/// Borrow the `CommonBlock` of any [`Block`] variant.
+fn common(b: &Block) -> &kb_core::CommonBlock {
+    match b {
+        Block::Heading(h) => &h.common,
+        Block::Paragraph(t) | Block::Quote(t) => &t.common,
+        Block::List(l) => &l.common,
+        Block::Code(c) => &c.common,
+        Block::Table(t) => &t.common,
+        Block::ImageRef(i) => &i.common,
+        Block::AudioRef(a) => &a.common,
    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;
+    use kb_core::{
+        AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
+        Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
+        WorkspacePath, id_for_block,
+    };
+    use time::OffsetDateTime;
+
+    fn make_doc(blocks: Vec<Block>) -> CanonicalDocument {
+        CanonicalDocument {
+            doc_id: kb_core::DocumentId("d".repeat(32)),
+            source_asset_id: AssetId("a".repeat(32)),
+            workspace_path: WorkspacePath::new("notes/test.md".into()).unwrap(),
+            title: "Test".into(),
+            lang: Lang("en".into()),
+            blocks,
+            metadata: Metadata {
+                aliases: vec![],
+                tags: vec![],
+                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+            },
+            provenance: Provenance { events: vec![] },
+            parser_version: kb_core::ParserVersion("test-parser-0".into()),
+            schema_version: 1,
+            doc_version: 1,
+        }
+    }
+
+    fn doc_id() -> kb_core::DocumentId {
+        kb_core::DocumentId("d".repeat(32))
+    }
+
+    fn span(start: u32, end: u32) -> SourceSpan {
+        SourceSpan::Line { start, end }
+    }
+
+    fn common_for(
+        kind: &str,
+        heading_path: &[String],
+        ordinal: u32,
+        s: SourceSpan,
+    ) -> CommonBlock {
+        CommonBlock {
+            block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
+            heading_path: heading_path.to_vec(),
+            source_span: s,
+        }
+    }
+
+    fn heading(level: u8, text: &str, ordinal: u32, line: u32) -> Block {
+        Block::Heading(HeadingBlock {
+            common: common_for("heading", &[], ordinal, span(line, line)),
+            level,
+            text: text.into(),
+        })
+    }
+
+    fn paragraph(
+        text: &str,
+        heading_path: &[&str],
+        ordinal: u32,
+        line: u32,
+    ) -> Block {
+        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
+        Block::Paragraph(TextBlock {
+            common: common_for("paragraph", &hp, ordinal, span(line, line)),
+            text: text.into(),
+            inlines: vec![],
+        })
+    }
+
+    fn code_block(
+        code: &str,
+        heading_path: &[&str],
+        ordinal: u32,
+        s: SourceSpan,
+    ) -> Block {
+        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
+        Block::Code(CodeBlock {
+            common: common_for("code", &hp, ordinal, s),
+            lang: Some("rust".into()),
+            code: code.into(),
+        })
+    }
+
+    fn table(
+        headers: Vec<&str>,
+        rows: Vec<Vec<&str>>,
+        heading_path: &[&str],
+        ordinal: u32,
+        s: SourceSpan,
+    ) -> Block {
+        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
+        Block::Table(TableBlock {
+            common: common_for("table", &hp, ordinal, s),
+            headers: headers.into_iter().map(String::from).collect(),
+            rows: rows
+                .into_iter()
+                .map(|r| r.into_iter().map(String::from).collect())
+                .collect(),
+        })
+    }
+
+    fn image_ref(
+        alt: &str,
+        heading_path: &[&str],
+        ordinal: u32,
+        line: u32,
+    ) -> Block {
+        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
+        Block::ImageRef(ImageRefBlock {
+            common: common_for("imageref", &hp, ordinal, span(line, line)),
+            asset_id: None,
+            src: "img.png".into(),
+            alt: alt.into(),
+            ocr: None,
+            caption: None,
+        })
+    }
+
+    fn default_policy(target: usize, overlap: usize) -> ChunkPolicy {
+        ChunkPolicy {
+            target_tokens: target,
+            overlap_tokens: overlap,
+            respect_markdown_headings: true,
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
+    }

    #[test]
    fn chunker_version_is_md_heading_v1() {
@@ -99,14 +520,9 @@ mod tests {

    #[test]
    fn policy_hash_is_deterministic_and_16_hex() {
-        let policy = ChunkPolicy {
-            target_tokens: 500,
-            overlap_tokens: 80,
-            respect_markdown_headings: true,
-            chunker_version: ChunkerVersion(VERSION_LABEL.to_string()),
-        };
-        let h1 = MdHeadingV1Chunker.policy_hash(&policy);
-        let h2 = MdHeadingV1Chunker.policy_hash(&policy);
+        let p = default_policy(500, 80);
+        let h1 = MdHeadingV1Chunker.policy_hash(&p);
+        let h2 = MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(h1, h2);
        assert_eq!(h1.len(), POLICY_HASH_HEX_LEN);
        assert!(h1.bytes().all(|b| b.is_ascii_hexdigit()));
@@ -114,21 +530,161 @@ mod tests {

    #[test]
    fn policy_hash_differs_when_policy_differs() {
-        let p1 = ChunkPolicy {
-            target_tokens: 500,
-            overlap_tokens: 80,
-            respect_markdown_headings: true,
-            chunker_version: ChunkerVersion(VERSION_LABEL.to_string()),
-        };
-        let p2 = ChunkPolicy {
-            target_tokens: 500,
-            overlap_tokens: 0, // <-- only this differs
-            respect_markdown_headings: true,
-            chunker_version: ChunkerVersion(VERSION_LABEL.to_string()),
-        };
+        let p1 = default_policy(500, 80);
+        let p2 = default_policy(500, 0);
        assert_ne!(
            MdHeadingV1Chunker.policy_hash(&p1),
            MdHeadingV1Chunker.policy_hash(&p2)
        );
    }
+
+    /// Heading boundary respected: two H2 sections produce separate
+    /// chunks; no chunk's block_ids straddle the H2→H2 boundary.
+    #[test]
+    fn heading_boundary_respected() {
+        let blocks = vec![
+            heading(2, "First", 0, 1),
+            paragraph("body of first", &["First"], 0, 2),
+            heading(2, "Second", 1, 3),
+            paragraph("body of second", &["Second"], 0, 4),
+        ];
+        let doc = make_doc(blocks);
+        let chunks = MdHeadingV1Chunker
+            .chunk(&doc, &default_policy(10_000, 0))
+            .unwrap();
+        assert_eq!(chunks.len(), 2);
+        // First chunk = (heading "First", paragraph)
+        assert_eq!(chunks[0].block_ids.len(), 2);
+        // Second chunk = (heading "Second", paragraph)
+        assert_eq!(chunks[1].block_ids.len(), 2);
+        // heading_path on chunk 0 belongs to "First" section.
+        assert_eq!(chunks[0].heading_path, vec!["First".to_string()]);
+        assert_eq!(chunks[1].heading_path, vec!["Second".to_string()]);
+    }
+
+    /// A code block of ~800 tokens (≈2400 bytes) stays in a single
+    /// chunk even when target=500.
+    #[test]
+    fn code_block_never_splits() {
+        // 2400 bytes ≈ 800 tokens at BYTES_PER_TOKEN=3.
+        let big = "x".repeat(2400);
+        let blocks = vec![code_block(&big, &[], 0, span(1, 50))];
+        let doc = make_doc(blocks);
+        let chunks = MdHeadingV1Chunker
+            .chunk(&doc, &default_policy(500, 80))
+            .unwrap();
+        assert_eq!(chunks.len(), 1);
+        assert_eq!(chunks[0].block_ids.len(), 1);
+        assert!(chunks[0].token_estimate > 500);
+    }
+
+    /// A table of size < 2× target stays in a single chunk.
+    #[test]
+    fn table_stays_single_chunk_when_small() {
+        let t = table(
+            vec!["a", "b", "c"],
+            vec![vec!["1", "2", "3"], vec!["4", "5", "6"]],
+            &[],
+            0,
+            span(1, 4),
+        );
+        let blocks = vec![t];
+        let doc = make_doc(blocks);
+        let chunks = MdHeadingV1Chunker
+            .chunk(&doc, &default_policy(500, 80))
+            .unwrap();
+        assert_eq!(chunks.len(), 1);
+        assert_eq!(chunks[0].block_ids.len(), 1);
+    }
+
+    /// A long sequence of paragraphs splits at target_tokens with
+    /// overlap_tokens worth of seeded paragraph from the prior chunk.
+    #[test]
+    fn long_section_splits_with_overlap() {
+        // Each paragraph is 60 bytes ≈ 20 tokens. target=50, overlap=20
+        // → after ~3 paragraphs we hit the target; the next chunk
+        // starts seeded with one paragraph from the prior tail.
+        let mut bs = vec![heading(2, "Long", 0, 1)];
+        for i in 0..6u32 {
+            bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2));
+        }
+        let doc = make_doc(bs);
+        let chunks = MdHeadingV1Chunker
+            .chunk(&doc, &default_policy(50, 20))
+            .unwrap();
+        assert!(
+            chunks.len() >= 2,
+            "expected ≥2 chunks, got {}: {chunks:#?}",
+            chunks.len()
+        );
+        // Every chunk lives under the same heading_path "Long".
+        for c in &chunks {
+            assert_eq!(c.heading_path, vec!["Long".to_string()]);
+        }
+        // Overlap propagates: the last block_id of chunk N appears in
+        // chunk N+1's block_ids (paragraph-level overlap rule).
+        for w in chunks.windows(2) {
+            let prev_tail = w[0].block_ids.last().unwrap();
+            assert!(
+                w[1].block_ids.contains(prev_tail),
+                "chunk N+1 must seed from chunk N's tail; \
+                 prev_tail={prev_tail:?}, next ids={:?}",
+                w[1].block_ids
+            );
+        }
+    }
+
+    /// ImageRef → own chunk, token_estimate=0.
+    #[test]
+    fn image_ref_emits_own_chunk_zero_tokens() {
+        let blocks = vec![
+            heading(2, "With image", 0, 1),
+            paragraph("intro", &["With image"], 0, 2),
+            image_ref("a cat", &["With image"], 0, 3),
+            paragraph("after", &["With image"], 1, 4),
+        ];
+        let doc = make_doc(blocks);
+        let chunks = MdHeadingV1Chunker
+            .chunk(&doc, &default_policy(10_000, 0))
+            .unwrap();
+        // Expect: (heading + intro), (image), (after). The image must
+        // be its own chunk and carry token_estimate=0.
+        assert!(chunks.len() >= 3, "unexpected chunk count: {chunks:#?}");
+        let img_chunk = chunks
+            .iter()
+            .find(|c| c.text == "a cat")
+            .expect("image chunk present");
+        assert_eq!(img_chunk.token_estimate, 0);
+        assert_eq!(img_chunk.block_ids.len(), 1);
+    }
+
+    /// Identical input + identical policy → identical chunk_ids over
+    /// 1000 iterations.
+    #[test]
+    fn deterministic_chunk_ids_1000() {
+        let blocks = vec![
+            heading(2, "Det", 0, 1),
+            paragraph("body 1", &["Det"], 0, 2),
+            paragraph("body 2", &["Det"], 1, 3),
+            heading(2, "Det 2", 1, 4),
+            paragraph("body 3", &["Det 2"], 0, 5),
+        ];
+        let doc = make_doc(blocks);
+        let policy = default_policy(50, 10);
+        let baseline: Vec<String> = MdHeadingV1Chunker
+            .chunk(&doc, &policy)
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
+        for _ in 0..1000 {
+            let again: Vec<String> = MdHeadingV1Chunker
+                .chunk(&doc, &policy)
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
+            assert_eq!(again, baseline);
+        }
+    }
 }