kebab/crates/kebab-chunk/src/md_heading_v1.rs

//! `md-heading-v1` — heading-aware Markdown chunker.

use kebab_core::{
    Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
    SourceSpan, id_for_chunk,
};

/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
/// invalidates every downstream embedding record (design §9), so any change
/// must ship with a documented migration plan.
const VERSION_LABEL: &str = "md-heading-v1";

/// Bytes-per-token proxy. We over-estimate (smaller divisor → larger
/// token count) so that real tokenizers downstream never see a chunk
/// exceeding their budget. English averages ~4 bytes/token under BPE,
/// Korean averages ~3 bytes/token under E5; picking 3 covers both.
const BYTES_PER_TOKEN: usize = 3;

/// Maximum hex characters of `blake3(canonical_json(policy))` retained
/// in `policy_hash`. 16 hex chars = 64 bits of policy entropy, which is
/// far beyond enough to disambiguate the handful of policy variants a
/// single workspace will see.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Heading-aware Markdown chunker.
///
/// Implements [`kebab_core::Chunker`] for Markdown-derived
/// [`CanonicalDocument`]s.
///
/// **Behavior contract** (design §0 / §14, in priority order):
///
/// 1. **Heading boundary first.** Chunks never span a `Block::Heading`.
///    The Heading block itself starts a new chunk and is included in that
///    chunk's `block_ids` so heading text is retrievable.
/// 2. **Never split a code block.** A `Block::Code` always lives in a
///    single chunk even when it exceeds `target_tokens`.
/// 3. **Tables stay in one chunk.** A `Block::Table` is emitted as a
///    single chunk regardless of size — the row-split refinement is
///    deferred per the P1-5 task spec.
/// 4. **Long sections split by paragraph.** Within a heading section
///    the chunker accumulates blocks until adding the next would exceed
///    `target_tokens`; it then emits the chunk and seeds the next chunk
///    with the previous chunk's tail blocks contributing roughly
///    `overlap_tokens` of content (paragraph-level overlap).
/// 5. **`heading_path` propagates.** Each chunk's `heading_path` is the
///    `heading_path` of its first contributing non-Heading block, or —
///    when the chunk leads with (or contains only) a Heading — the
///    parent path **plus the heading's own text** so heading-only or
///    heading-led chunks never lose their citation context.
/// 6. **`source_spans` merge.** A chunk lists every contributing block's
///    `source_span` in document order.
/// 7. **Version + policy hash recorded.** Each chunk records
///    `chunker_version = "md-heading-v1"`. The current `policy_hash` is
///    folded into the `chunk_id` recipe (design §4.2) so changing
///    `target_tokens` / `overlap_tokens` produces fresh chunk IDs.
///
/// `ImageRef` and `AudioRef` blocks are emitted as their own chunks so
/// future image/audio search can locate them. Their `text` is the alt /
/// caption preview (empty string if unavailable) and `token_estimate = 0`.
///
/// **Token-estimate proxy.** Until a real tokenizer is wired in (P3), the
/// estimator counts UTF-8 bytes and divides by [`BYTES_PER_TOKEN`]. The
/// constant is deliberately small (3) so the proxy *over*-estimates token
/// count — chunks sized against this proxy are guaranteed to fit in any
/// real BPE tokenizer's budget for English (~4 bytes/token) or Korean
/// (~3 bytes/token under E5/M-BERT). See [`BYTES_PER_TOKEN`] for rationale.
///
/// **`policy.respect_markdown_headings`.** This field flows into
/// `policy_hash` (so flipping it yields fresh chunk IDs), but the
/// chunker variant `md-heading-v1` unconditionally treats headings as
/// boundaries by design — the `md-heading-v1` name is the contract. To
/// disable heading awareness, ship a different `chunker_version`; none
/// is shipped in P1-5.
#[derive(Clone, Copy, Debug, Default)]
pub struct MdHeadingV1Chunker;

impl Chunker for MdHeadingV1Chunker {
    fn chunker_version(&self) -> ChunkerVersion {
        ChunkerVersion(VERSION_LABEL.to_string())
    }

    /// Compute the policy hash folded into `chunk_id` per design §4.2.
    ///
    /// # Panics
    ///
    /// Panics if canonical JSON serialization of `ChunkPolicy` fails.
    /// This is unreachable in practice — `ChunkPolicy` is composed of
    /// owned primitives (`usize`, `bool`, owned `String`) and
    /// `serde_json_canonicalizer::to_vec` only fails on
    /// non-serializable values such as non-finite floats or maps with
    /// non-string keys, neither of which can be constructed via
    /// `ChunkPolicy`'s public surface. The `expect` is preserved as a
    /// future-proofing guard against drift if `ChunkPolicy` ever gains
    /// a field with such a property.
    fn policy_hash(&self, policy: &ChunkPolicy) -> String {
        let bytes = serde_json_canonicalizer::to_vec(policy)
            .expect("canonical JSON serialization of ChunkPolicy must not fail");
        let hex = blake3::hash(&bytes).to_hex().to_string();
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        let policy_hash = self.policy_hash(policy);
        let chunker_version = self.chunker_version();
        let mut out: Vec<Chunk> = Vec::new();

        // Running accumulator: the paragraphs/lists/quotes (and the
        // optional leading heading) that will be glued into the next
        // emitted chunk.
        let mut acc = ChunkAcc::default();

        for block in &doc.blocks {
            match block {
                Block::Heading(_) => {
                    // §0/§14 priority 1: heading is a hard boundary.
                    // Flush whatever has accumulated, then seed a new
                    // accumulator that owns this heading.
                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
                    acc.push_block(block);
                }
                Block::Code(_) | Block::Table(_) => {
                    // Atomic non-splittable text blocks. Flush running
                    // accumulator, then emit the atomic block as its
                    // own chunk. (Code never splits per priority 2;
                    // tables stay single per priority 3.)
                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
                    let mut single = ChunkAcc::default();
                    single.push_block(block);
                    flush(&mut single, doc, &chunker_version, &policy_hash, &mut out);
                }
                Block::ImageRef(_) | Block::AudioRef(_) => {
                    // Independent searchable artifacts. token_estimate=0
                    // is enforced inside `build_chunk` for these kinds.
                    flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
                    let mut single = ChunkAcc::default();
                    single.push_block(block);
                    flush(&mut single, doc, &chunker_version, &policy_hash, &mut out);
                }
                Block::Paragraph(_) | Block::List(_) | Block::Quote(_) => {
                    // Soft-split candidates. If adding this block would
                    // exceed target_tokens (and we already have at least
                    // one non-heading block in the accumulator), emit
                    // the current chunk and seed the next one with
                    // overlap from the prior tail.
                    let next_tokens = estimate_block_tokens(block);
                    // Note: `acc.text_tokens` already includes the prior
                    // chunk's overlap seed. The clamp in
                    // `collect_overlap_seed` keeps seed ≤ target/2, so
                    // a flush here never produces a chunk smaller than
                    // the seed budget.
                    let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
                        && acc.has_non_heading_content();
                    if would_exceed {
                        let overlap_seed =
                            collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
                        flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
                        // Seed next accumulator with the prior chunk's
                        // tail blocks (paragraph-level overlap). The
                        // heading is *not* re-included here — it lives
                        // on the prior chunk. The follow-on chunk's
                        // heading_path is taken from the first seeded
                        // block (which carries the same path, as it sat
                        // under the same heading).
                        for b in overlap_seed {
                            acc.push_block(b);
                        }
                    }
                    acc.push_block(block);
                }
            }
        }
        flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);

        tracing::debug!(
            target: "kebab-chunk",
            doc_id = %doc.doc_id,
            chunks = out.len(),
            "md-heading-v1 chunked",
        );

        Ok(out)
    }
}

/// Internal accumulator: pointers to blocks (lifetime-bound to the
/// `CanonicalDocument`) plus the running token estimate of their text.
#[derive(Default)]
struct ChunkAcc<'a> {
    blocks: Vec<&'a Block>,
    text_tokens: usize,
}

impl<'a> ChunkAcc<'a> {
    fn push_block(&mut self, b: &'a Block) {
        self.text_tokens += estimate_block_tokens(b);
        self.blocks.push(b);
    }

    fn is_empty(&self) -> bool {
        self.blocks.is_empty()
    }

    /// True if any non-heading block sits in the accumulator. Used to
    /// avoid splitting a chunk that contains only its leading heading
    /// (which would emit a heading-only chunk before any prose).
    fn has_non_heading_content(&self) -> bool {
        self.blocks.iter().any(|b| !matches!(b, Block::Heading(_)))
    }
}

/// Drain `acc` into a fresh `Chunk` and push to `out`. No-op when empty.
fn flush(
    acc: &mut ChunkAcc<'_>,
    doc: &CanonicalDocument,
    chunker_version: &ChunkerVersion,
    policy_hash: &str,
    out: &mut Vec<Chunk>,
) {
    if acc.is_empty() {
        return;
    }
    let blocks = std::mem::take(&mut acc.blocks);
    acc.text_tokens = 0;
    out.push(build_chunk(doc, &blocks, chunker_version, policy_hash));
}

/// Collect the trailing blocks of `acc` (in document order) whose
/// combined token estimate fits under the seed budget. The heading
/// block (if it leads the accumulator) is excluded from the seed —
/// re-emitting the heading would conflate it with the next chunk's
/// own heading_path provenance.
///
/// The seed budget is clamped to `min(overlap_tokens, target_tokens / 2)`.
/// Without the clamp, an `overlap_tokens >= target_tokens` policy
/// degenerates into 1-block-per-chunk: the seed already exceeds budget
/// before any new content lands, so the very next paragraph trips the
/// `would_exceed` flush. Halving the target guarantees the seed leaves
/// at least target/2 worth of room for fresh content in the next chunk.
fn collect_overlap_seed<'a>(
    acc: &ChunkAcc<'a>,
    overlap_tokens: usize,
    target_tokens: usize,
) -> Vec<&'a Block> {
    let seed_budget = overlap_tokens.min(target_tokens / 2);
    if seed_budget == 0 {
        return Vec::new();
    }
    let mut taken = Vec::new();
    let mut budget = seed_budget;
    for b in acc.blocks.iter().rev() {
        if matches!(b, Block::Heading(_)) {
            // Don't propagate the heading itself into the next chunk;
            // its `heading_path` carries naturally on the next blocks
            // (kb-normalize stamps every block under a heading with
            // that heading's path).
            continue;
        }
        let est = estimate_block_tokens(b);
        if est > budget && !taken.is_empty() {
            break;
        }
        taken.push(*b);
        budget = budget.saturating_sub(est);
        if budget == 0 {
            break;
        }
    }
    taken.reverse();
    taken
}

/// Construct a `Chunk` from a non-empty contiguous slice of blocks.
fn build_chunk(
    doc: &CanonicalDocument,
    blocks: &[&Block],
    chunker_version: &ChunkerVersion,
    policy_hash: &str,
) -> Chunk {
    debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");

    let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
    let source_spans: Vec<SourceSpan> = blocks
        .iter()
        .map(|b| common(b).source_span.clone())
        .collect();

    // heading_path: pick the first non-Heading block's heading_path
    // (which already includes every parent heading per kb-normalize).
    // When the FIRST block is a Heading — either a heading-only chunk,
    // or a chunk that leads with `# H1` immediately followed by another
    // Heading or atomic block — the Heading block's own
    // `common.heading_path` records only its *parents* (kb-normalize
    // does not include a heading inside its own path). We synthesize
    // the leading heading into the path so the citation context is not
    // lost on patterns like `# Alpha\n## Beta\n...`.
    let heading_path = match blocks[0] {
        Block::Heading(h) => {
            let mut path = h.common.heading_path.clone();
            path.push(h.text.clone());
            path
        }
        _ => common(blocks[0]).heading_path.clone(),
    };

    // Text rendering: simple double-newline join of each block's
    // contribution. We deliberately pick a stable, low-fidelity
    // representation — embedding-quality rewrites land in P3.
    let mut text = String::new();
    let mut is_image_or_audio_only = true;
    for (i, b) in blocks.iter().enumerate() {
        let part = render_block_text(b);
        if !matches!(b, Block::ImageRef(_) | Block::AudioRef(_)) {
            is_image_or_audio_only = false;
        }
        if i > 0 {
            text.push_str("\n\n");
        }
        text.push_str(&part);
    }

    let token_estimate = if is_image_or_audio_only {
        0
    } else {
        // Token estimate is bytes / BYTES_PER_TOKEN, rounded up so the
        // proxy never under-counts.
        text.len().div_ceil(BYTES_PER_TOKEN)
    };

    let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);

    Chunk {
        chunk_id,
        doc_id: DocumentId(doc.doc_id.0.clone()),
        block_ids,
        tokenized_korean_text: crate::tokenize_korean_morphological(&text),
        text,
        heading_path,
        source_spans,
        token_estimate,
        chunker_version: chunker_version.clone(),
        policy_hash: policy_hash.to_string(),
        aliases: None,
    }
}

/// Render a block's contribution to a chunk's `text`. The rendering is
/// deliberately minimal — embedding-time normalization is a P3 concern.
fn render_block_text(b: &Block) -> String {
    match b {
        Block::Heading(h) => h.text.clone(),
        Block::Paragraph(p) | Block::Quote(p) => p.text.clone(),
        Block::List(l) => l
            .items
            .iter()
            .map(|it| it.text.as_str())
            .collect::<Vec<_>>()
            .join("\n"),
        Block::Code(c) => c.code.clone(),
        Block::Table(t) => {
            // Headers row joined with " | ", then each row likewise.
            let mut s = t.headers.join(" | ");
            for row in &t.rows {
                s.push('\n');
                s.push_str(&row.join(" | "));
            }
            s
        }
        // ImageRef text portion follows the P6-4 (β) plain-concat
        // contract — `[alt, ocr.joined, caption.text]` joined by
        // `\n\n`, dropping empty parts. Filename fallback for empty
        // alt keeps lexical search hits on filenames working even when
        // P6-1's filename auto-fill is bypassed.
        Block::ImageRef(i) => {
            let alt = if i.alt.is_empty() {
                // P6-1 falls back to filename so this branch is
                // defensive — keep it lest a future test fixture or
                // synthetic block path skip the auto-fill.
                i.src
                    .rsplit('/')
                    .next()
                    .filter(|s| !s.is_empty())
                    .unwrap_or("[image]")
                    .to_string()
            } else {
                i.alt.clone()
            };
            let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
            let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
            [alt.as_str(), ocr, cap]
                .iter()
                .filter(|s| !s.is_empty())
                .copied()
                .collect::<Vec<_>>()
                .join("\n\n")
        }
        // AudioRef has no caption preview yet (transcript joins land
        // in P8). Empty string per task spec.
        Block::AudioRef(_) => String::new(),
    }
}

fn estimate_block_tokens(b: &Block) -> usize {
    match b {
        // ImageRef / AudioRef contribute 0 — they are independent
        // chunks and never participate in size accounting.
        Block::ImageRef(_) | Block::AudioRef(_) => 0,
        _ => render_block_text(b).len().div_ceil(BYTES_PER_TOKEN),
    }
}

/// Borrow the `CommonBlock` of any [`Block`] variant.
fn common(b: &Block) -> &kebab_core::CommonBlock {
    match b {
        Block::Heading(h) => &h.common,
        Block::Paragraph(t) | Block::Quote(t) => &t.common,
        Block::List(l) => &l.common,
        Block::Code(c) => &c.common,
        Block::Table(t) => &t.common,
        Block::ImageRef(i) => &i.common,
        Block::AudioRef(a) => &a.common,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
        SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
    };
    use time::OffsetDateTime;

    fn make_doc(blocks: Vec<Block>) -> CanonicalDocument {
        CanonicalDocument {
            doc_id: kebab_core::DocumentId("d".repeat(32)),
            source_asset_id: AssetId("a".repeat(32)),
            workspace_path: WorkspacePath::new("notes/test.md".into()).unwrap(),
            title: "Test".into(),
            lang: Lang("en".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: None,
                git_branch: None,
                git_commit: None,
                code_lang: None,
            },
            provenance: Provenance { events: vec![] },
            parser_version: kebab_core::ParserVersion("test-parser-0".into()),
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    fn doc_id() -> kebab_core::DocumentId {
        kebab_core::DocumentId("d".repeat(32))
    }

    fn span(start: u32, end: u32) -> SourceSpan {
        SourceSpan::Line { start, end }
    }

    fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
        CommonBlock {
            block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
            heading_path: heading_path.to_vec(),
            source_span: s,
        }
    }

    fn heading(level: u8, text: &str, ordinal: u32, line: u32) -> Block {
        Block::Heading(HeadingBlock {
            common: common_for("heading", &[], ordinal, span(line, line)),
            level,
            text: text.into(),
        })
    }

    /// Heading variant that carries a parent path — kb-normalize stamps
    /// every block under `# Alpha` with `heading_path = []` for the H1
    /// itself but `["Alpha"]` for the H2 that follows. Tests covering
    /// the heading-only chunk path (I2) need that asymmetry.
    fn heading_with_parents(
        level: u8,
        text: &str,
        parents: &[&str],
        ordinal: u32,
        line: u32,
    ) -> Block {
        let hp: Vec<String> = parents.iter().map(|s| (*s).into()).collect();
        Block::Heading(HeadingBlock {
            common: common_for("heading", &hp, ordinal, span(line, line)),
            level,
            text: text.into(),
        })
    }

    fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::Paragraph(TextBlock {
            common: common_for("paragraph", &hp, ordinal, span(line, line)),
            text: text.into(),
            inlines: vec![],
        })
    }

    fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::Code(CodeBlock {
            common: common_for("code", &hp, ordinal, s),
            lang: Some("rust".into()),
            code: code.into(),
        })
    }

    fn table(
        headers: Vec<&str>,
        rows: Vec<Vec<&str>>,
        heading_path: &[&str],
        ordinal: u32,
        s: SourceSpan,
    ) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::Table(TableBlock {
            common: common_for("table", &hp, ordinal, s),
            headers: headers.into_iter().map(String::from).collect(),
            rows: rows
                .into_iter()
                .map(|r| r.into_iter().map(String::from).collect())
                .collect(),
        })
    }

    fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::ImageRef(ImageRefBlock {
            common: common_for("imageref", &hp, ordinal, span(line, line)),
            asset_id: None,
            src: "img.png".into(),
            alt: alt.into(),
            ocr: None,
            caption: None,
        })
    }

    fn default_policy(target: usize, overlap: usize) -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: target,
            overlap_tokens: overlap,
            respect_markdown_headings: true,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_md_heading_v1() {
        assert_eq!(
            MdHeadingV1Chunker.chunker_version(),
            ChunkerVersion(VERSION_LABEL.to_string())
        );
    }

    #[test]
    fn policy_hash_is_deterministic_and_16_hex() {
        let p = default_policy(500, 80);
        let h1 = MdHeadingV1Chunker.policy_hash(&p);
        let h2 = MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(h1, h2);
        assert_eq!(h1.len(), POLICY_HASH_HEX_LEN);
        assert!(h1.bytes().all(|b| b.is_ascii_hexdigit()));
    }

    #[test]
    fn policy_hash_differs_when_policy_differs() {
        let p1 = default_policy(500, 80);
        let p2 = default_policy(500, 0);
        assert_ne!(
            MdHeadingV1Chunker.policy_hash(&p1),
            MdHeadingV1Chunker.policy_hash(&p2)
        );
    }

    /// Heading boundary respected: two H2 sections produce separate
    /// chunks; no chunk's block_ids straddle the H2→H2 boundary.
    #[test]
    fn heading_boundary_respected() {
        let blocks = vec![
            heading(2, "First", 0, 1),
            paragraph("body of first", &["First"], 0, 2),
            heading(2, "Second", 1, 3),
            paragraph("body of second", &["Second"], 0, 4),
        ];
        let doc = make_doc(blocks);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(10_000, 0))
            .unwrap();
        assert_eq!(chunks.len(), 2);
        // First chunk = (heading "First", paragraph)
        assert_eq!(chunks[0].block_ids.len(), 2);
        // Second chunk = (heading "Second", paragraph)
        assert_eq!(chunks[1].block_ids.len(), 2);
        // heading_path on chunk 0 belongs to "First" section.
        assert_eq!(chunks[0].heading_path, vec!["First".to_string()]);
        assert_eq!(chunks[1].heading_path, vec!["Second".to_string()]);
    }

    /// A code block of ~800 tokens (≈2400 bytes) stays in a single
    /// chunk even when target=500.
    #[test]
    fn code_block_never_splits() {
        // 2400 bytes ≈ 800 tokens at BYTES_PER_TOKEN=3.
        let big = "x".repeat(2400);
        let blocks = vec![code_block(&big, &[], 0, span(1, 50))];
        let doc = make_doc(blocks);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(500, 80))
            .unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].block_ids.len(), 1);
        assert!(chunks[0].token_estimate > 500);
    }

    /// A table of size < 2× target stays in a single chunk.
    #[test]
    fn table_stays_single_chunk_when_small() {
        let t = table(
            vec!["a", "b", "c"],
            vec![vec!["1", "2", "3"], vec!["4", "5", "6"]],
            &[],
            0,
            span(1, 4),
        );
        let blocks = vec![t];
        let doc = make_doc(blocks);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(500, 80))
            .unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].block_ids.len(), 1);
    }

    /// A long sequence of paragraphs splits at target_tokens with
    /// overlap_tokens worth of seeded paragraph from the prior chunk.
    #[test]
    fn long_section_splits_with_overlap() {
        // Each paragraph is 60 bytes ≈ 20 tokens. target=50, overlap=20
        // → after ~3 paragraphs we hit the target; the next chunk
        // starts seeded with one paragraph from the prior tail.
        let mut bs = vec![heading(2, "Long", 0, 1)];
        for i in 0..6u32 {
            bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2));
        }
        let doc = make_doc(bs);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(50, 20))
            .unwrap();
        assert!(
            chunks.len() >= 2,
            "expected ≥2 chunks, got {}: {chunks:#?}",
            chunks.len()
        );
        // Every chunk lives under the same heading_path "Long".
        for c in &chunks {
            assert_eq!(c.heading_path, vec!["Long".to_string()]);
        }
        // Overlap propagates: the last block_id of chunk N appears in
        // chunk N+1's block_ids (paragraph-level overlap rule).
        for w in chunks.windows(2) {
            let prev_tail = w[0].block_ids.last().unwrap();
            assert!(
                w[1].block_ids.contains(prev_tail),
                "chunk N+1 must seed from chunk N's tail; \
                 prev_tail={prev_tail:?}, next ids={:?}",
                w[1].block_ids
            );
        }
    }

    /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text
    /// joined by `\n\n`, dropping empty parts. Verifies all four
    /// (alt-only, alt+ocr, alt+caption, alt+ocr+caption) shapes.
    #[test]
    fn image_ref_p6_4_plain_concat_drops_empty_parts() {
        use kebab_core::{ModelCaption, OcrText};

        let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| {
            Block::ImageRef(ImageRefBlock {
                common: common_for("imageref", &[], 0, span(1, 1)),
                asset_id: None,
                src: "img.png".into(),
                alt: alt.into(),
                ocr: ocr.map(|t| OcrText {
                    joined: t.into(),
                    regions: vec![],
                    engine: "test".into(),
                    engine_version: "v1".into(),
                }),
                caption: cap.map(|t| ModelCaption {
                    text: t.into(),
                    model: "m".into(),
                    model_version: "v".into(),
                }),
            })
        };

        // alt-only — no separators between empty parts.
        assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png");

        // alt + ocr — joined by exactly one `\n\n`.
        assert_eq!(
            render_block_text(&mk("photo.png", Some("Hello"), None)),
            "photo.png\n\nHello"
        );

        // alt + caption.
        assert_eq!(
            render_block_text(&mk("photo.png", None, Some("a red square"))),
            "photo.png\n\na red square"
        );

        // alt + ocr + caption — three parts joined by `\n\n` each.
        assert_eq!(
            render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))),
            "photo.png\n\nHello\n\na red square"
        );

        // empty alt — falls back to filename derived from `src`.
        let blk = mk("", Some("text from image"), None);
        assert_eq!(
            render_block_text(&blk),
            "img.png\n\ntext from image",
            "empty alt must fall back to the basename of `src`"
        );
    }

    /// ImageRef → own chunk, token_estimate=0.
    #[test]
    fn image_ref_emits_own_chunk_zero_tokens() {
        let blocks = vec![
            heading(2, "With image", 0, 1),
            paragraph("intro", &["With image"], 0, 2),
            image_ref("a cat", &["With image"], 0, 3),
            paragraph("after", &["With image"], 1, 4),
        ];
        let doc = make_doc(blocks);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(10_000, 0))
            .unwrap();
        // Expect: (heading + intro), (image), (after). The image must
        // be its own chunk and carry token_estimate=0.
        assert!(chunks.len() >= 3, "unexpected chunk count: {chunks:#?}");
        let img_chunk = chunks
            .iter()
            .find(|c| c.text == "a cat")
            .expect("image chunk present");
        assert_eq!(img_chunk.token_estimate, 0);
        assert_eq!(img_chunk.block_ids.len(), 1);
    }

    /// Identical input + identical policy → identical chunk_ids over
    /// 1000 iterations.
    #[test]
    fn deterministic_chunk_ids_1000() {
        let blocks = vec![
            heading(2, "Det", 0, 1),
            paragraph("body 1", &["Det"], 0, 2),
            paragraph("body 2", &["Det"], 1, 3),
            heading(2, "Det 2", 1, 4),
            paragraph("body 3", &["Det 2"], 0, 5),
        ];
        let doc = make_doc(blocks);
        let policy = default_policy(50, 10);
        let baseline: Vec<String> = MdHeadingV1Chunker
            .chunk(&doc, &policy)
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        for _ in 0..1000 {
            let again: Vec<String> = MdHeadingV1Chunker
                .chunk(&doc, &policy)
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect();
            assert_eq!(again, baseline);
        }
    }

    /// I2 regression: when a Heading is followed immediately by another
    /// Heading or atomic block (no intervening prose), the resulting
    /// heading-only / heading-led chunk must carry the heading text in
    /// its own `heading_path`. Pattern: `# Alpha`, `## Beta`, code.
    ///
    /// Before the fix, chunk[0] (Heading-only "Alpha") would have
    /// `heading_path = []` because `kb-normalize` does not stamp a
    /// heading inside its own path; the chunker fell back to the
    /// heading's parent path. After the fix it is `["Alpha"]`.
    ///
    /// `chunk_id` recipe (`doc_id, chunker_version, block_ids,
    /// policy_hash`) does NOT include `heading_path`, so this fix does
    /// NOT shift chunk_ids — only `heading_path` fields.
    #[test]
    fn heading_only_chunk_carries_self_in_path() {
        // # Alpha (H1, no parents)
        // ## Beta (H2, parent = ["Alpha"])
        // ```rust ... ``` (code, heading_path = ["Alpha", "Beta"])
        let blocks = vec![
            heading_with_parents(1, "Alpha", &[], 0, 1),
            heading_with_parents(2, "Beta", &["Alpha"], 0, 2),
            code_block("fn x() {}", &["Alpha", "Beta"], 0, span(3, 3)),
        ];
        let doc = make_doc(blocks);
        let chunks = MdHeadingV1Chunker
            .chunk(&doc, &default_policy(10_000, 0))
            .unwrap();
        // Three chunks: Heading-only Alpha, Heading-only Beta, code.
        assert_eq!(chunks.len(), 3, "got {chunks:#?}");
        assert_eq!(chunks[0].heading_path, vec!["Alpha".to_string()]);
        assert_eq!(
            chunks[1].heading_path,
            vec!["Alpha".to_string(), "Beta".to_string()]
        );
        assert_eq!(
            chunks[2].heading_path,
            vec!["Alpha".to_string(), "Beta".to_string()]
        );
    }

    /// I3 regression: a pathological policy with
    /// `overlap_tokens >= target_tokens` must NOT degenerate into
    /// 1-block-per-chunk. The seed budget is clamped to `target/2`,
    /// guaranteeing every flushed chunk has space for fresh content.
    #[test]
    fn overlap_clamped_when_overlap_exceeds_target() {
        // 5 paragraphs of ~20 tokens each (60 bytes / 3 BPT).
        // target = 50, overlap = 200 (4× target → would trip flush
        // immediately without clamp).
        let mut bs = vec![heading_with_parents(2, "Long", &[], 0, 1)];
        for i in 0..5u32 {
            bs.push(paragraph(&"x".repeat(60), &["Long"], i, i + 2));
        }
        let doc = make_doc(bs);
        let policy = ChunkPolicy {
            target_tokens: 50,
            overlap_tokens: 200,
            respect_markdown_headings: true,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        };
        let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).unwrap();
        // Without the clamp, every chunk after the first would have
        // exactly 1 paragraph (because seed alone already exceeds
        // target and acc.has_non_heading_content() is true the moment
        // any seed lands). With the clamp, follow-on chunks must hold
        // at least the seed paragraph + the new paragraph = ≥2 blocks.
        for (i, c) in chunks.iter().enumerate() {
            // The very first chunk includes the heading + first para
            // (no seed), so it is also ≥2. Subsequent chunks must be
            // seed+new ≥ 2.
            assert!(
                c.block_ids.len() >= 2,
                "chunk {i} degenerated to {} block(s); pathology not \
                 prevented: {chunks:#?}",
                c.block_ids.len()
            );
        }
    }
}