p1-5: heading-only chunk carries self in heading_path
When a Heading block opens a chunk and is followed by another Heading or an atomic block (Code, Table, ImageRef, AudioRef) with no intervening prose, the prior fallback used `common.heading_path` from the heading itself — which per kb-normalize convention does NOT include the heading's own text. Result: heading-only and heading-led chunks for `# Alpha\n## Beta\n...` patterns landed with `heading_path = []`, losing citation context. Synthesize the leading heading into the chunk's heading_path when blocks[0] is a Heading: parent path + heading.text. The first non-Heading branch (existing logic for normal mid-section chunks) is unchanged. `chunk_id` recipe is `(doc_id, chunker_version, block_ids, policy_hash)` — `heading_path` is not in the recipe, so this fix does NOT shift chunk_ids. Snapshot baseline `long-section.chunks.snapshot.json` also unchanged because every heading in that fixture is followed by a paragraph (the bug only triggers on direct heading→heading or heading→atomic adjacency). Adds `heading_with_parents` test helper and a regression test pinning the `# Alpha\n## Beta\n[code]` pattern. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -260,14 +260,23 @@ fn build_chunk(
|
||||
let source_spans: Vec<SourceSpan> =
|
||||
blocks.iter().map(|b| common(b).source_span.clone()).collect();
|
||||
|
||||
// heading_path: pick the first non-Heading block's heading_path; if
|
||||
// every block is a Heading (only the heading itself made it into
|
||||
// this chunk), use that heading's heading_path.
|
||||
let heading_path = blocks
|
||||
.iter()
|
||||
.find(|b| !matches!(b, Block::Heading(_)))
|
||||
.map(|b| common(b).heading_path.clone())
|
||||
.unwrap_or_else(|| common(blocks[0]).heading_path.clone());
|
||||
// heading_path: pick the first non-Heading block's heading_path
|
||||
// (which already includes every parent heading per kb-normalize).
|
||||
// When the FIRST block is a Heading — either a heading-only chunk,
|
||||
// or a chunk that leads with `# H1` immediately followed by another
|
||||
// Heading or atomic block — the Heading block's own
|
||||
// `common.heading_path` records only its *parents* (kb-normalize
|
||||
// does not include a heading inside its own path). We synthesize
|
||||
// the leading heading into the path so the citation context is not
|
||||
// lost on patterns like `# Alpha\n## Beta\n...`.
|
||||
let heading_path = match blocks[0] {
|
||||
Block::Heading(h) => {
|
||||
let mut path = h.common.heading_path.clone();
|
||||
path.push(h.text.clone());
|
||||
path
|
||||
}
|
||||
_ => common(blocks[0]).heading_path.clone(),
|
||||
};
|
||||
|
||||
// Text rendering: simple double-newline join of each block's
|
||||
// contribution. We deliberately pick a stable, low-fidelity
|
||||
@@ -438,6 +447,25 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
/// Heading variant that carries a parent path — kb-normalize stamps
|
||||
/// every block under `# Alpha` with `heading_path = []` for the H1
|
||||
/// itself but `["Alpha"]` for the H2 that follows. Tests covering
|
||||
/// the heading-only chunk path (I2) need that asymmetry.
|
||||
fn heading_with_parents(
|
||||
level: u8,
|
||||
text: &str,
|
||||
parents: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
let hp: Vec<String> = parents.iter().map(|s| (*s).into()).collect();
|
||||
Block::Heading(HeadingBlock {
|
||||
common: common_for("heading", &hp, ordinal, span(line, line)),
|
||||
level,
|
||||
text: text.into(),
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
@@ -687,4 +715,44 @@ mod tests {
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
|
||||
/// I2 regression: when a Heading is followed immediately by another
|
||||
/// Heading or atomic block (no intervening prose), the resulting
|
||||
/// heading-only / heading-led chunk must carry the heading text in
|
||||
/// its own `heading_path`. Pattern: `# Alpha`, `## Beta`, code.
|
||||
///
|
||||
/// Before the fix, chunk[0] (Heading-only "Alpha") would have
|
||||
/// `heading_path = []` because `kb-normalize` does not stamp a
|
||||
/// heading inside its own path; the chunker fell back to the
|
||||
/// heading's parent path. After the fix it is `["Alpha"]`.
|
||||
///
|
||||
/// `chunk_id` recipe (`doc_id, chunker_version, block_ids,
|
||||
/// policy_hash`) does NOT include `heading_path`, so this fix does
|
||||
/// NOT shift chunk_ids — only `heading_path` fields.
|
||||
#[test]
|
||||
fn heading_only_chunk_carries_self_in_path() {
|
||||
// # Alpha (H1, no parents)
|
||||
// ## Beta (H2, parent = ["Alpha"])
|
||||
// ```rust ... ``` (code, heading_path = ["Alpha", "Beta"])
|
||||
let blocks = vec![
|
||||
heading_with_parents(1, "Alpha", &[], 0, 1),
|
||||
heading_with_parents(2, "Beta", &["Alpha"], 0, 2),
|
||||
code_block("fn x() {}", &["Alpha", "Beta"], 0, span(3, 3)),
|
||||
];
|
||||
let doc = make_doc(blocks);
|
||||
let chunks = MdHeadingV1Chunker
|
||||
.chunk(&doc, &default_policy(10_000, 0))
|
||||
.unwrap();
|
||||
// Three chunks: Heading-only Alpha, Heading-only Beta, code.
|
||||
assert_eq!(chunks.len(), 3, "got {chunks:#?}");
|
||||
assert_eq!(chunks[0].heading_path, vec!["Alpha".to_string()]);
|
||||
assert_eq!(
|
||||
chunks[1].heading_path,
|
||||
vec!["Alpha".to_string(), "Beta".to_string()]
|
||||
);
|
||||
assert_eq!(
|
||||
chunks[2].heading_path,
|
||||
vec!["Alpha".to_string(), "Beta".to_string()]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user