p1-5: add long-section fixture + Vec<Chunk> snapshot test

Bakes the chunker output for fixtures/markdown/long-section.md (3 H1s,
nested H2 under Alpha, a 43-line fenced code block, a 3-col x 4-row
table, and a multi-paragraph Gamma section) into the JSON snapshot
baseline.
Confirms the priority rules end-to-end:

  - Heading boundaries hold across H1 → H2 → H1 transitions
  - The code block emits one chunk at 427 tokens > target=200
  - The table stays single-chunk
  - Gamma's paragraph stream splits with one block of overlap seed

A second test runs the full parse → normalize → chunk pipeline 5
times and asserts identical chunk_ids each pass.

Drops the unused `kb-config` and `serde` from regular dependencies —
they were declared but no source path imports them; `serde` flows in
transitively via `kb-core` as a public API requirement, and
`ChunkingCfg` lives in `kb-config` but the chunker takes
`ChunkPolicy` directly. Production deps are now exactly the allowed
set actually used: anyhow, blake3, kb-core, serde_json_canonicalizer,
tracing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 16:33:29 +00:00
parent 0237022d0e
commit 58f7b8573d
5 changed files with 457 additions and 4 deletions

2
Cargo.lock generated
View File

@@ -545,11 +545,9 @@ version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"kb-config",
"kb-core",
"kb-normalize",
"kb-parse-md",
"serde",
"serde_json",
"serde_json_canonicalizer",
"time",

View File

@@ -9,8 +9,6 @@ description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chu
[dependencies]
kb-core = { path = "../kb-core" }
kb-config = { path = "../kb-config" }
serde = { workspace = true }
serde_json_canonicalizer = "0.3"
blake3 = { workspace = true }
anyhow = { workspace = true }

View File

@@ -0,0 +1,177 @@
//! Snapshot test pinning the `Vec<Chunk>` JSON for the
//! `fixtures/markdown/long-section.md` fixture.
//!
//! This is an integration test. `kb-parse-md` and `kb-normalize` are
//! dev-dep only — `cargo tree -p kb-chunk --depth 1 -e normal` (the
//! default edge set also lists dev-deps, so restrict it to normal
//! edges) confirms they are not regular deps. The §8 module-boundary
//! rule is preserved.
//!
//! The chunker output is fully deterministic given fixed inputs, so we
//! pin the entire `Vec<Chunk>` JSON.
//!
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
use std::path::PathBuf;
use kb_chunk::MdHeadingV1Chunker;
use kb_core::{
AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType,
ParserVersion, RawAsset, SourceUri, WorkspacePath,
};
use kb_normalize::build_canonical_document;
use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
use serde_json::Value;
use time::OffsetDateTime;
/// Locates the shared markdown fixture directory.
///
/// The path starts at this crate's manifest directory (captured at compile
/// time), climbs two levels, and descends into `fixtures/markdown`.
fn fixtures_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    for segment in ["..", "..", "fixtures", "markdown"] {
        dir.push(segment);
    }
    dir
}
/// Builds a `RawAsset` in which everything except the workspace path is a
/// hard-coded constant, so identifiers derived from it stay stable from run
/// to run.
///
/// `workspace_path` is handed to `WorkspacePath::new`.
///
/// # Panics
///
/// Panics if `WorkspacePath::new` rejects `workspace_path`, or if the fixed
/// Unix timestamp cannot be converted to an `OffsetDateTime`.
fn fixed_asset(workspace_path: &str) -> RawAsset {
    // Validate the only caller-supplied input before assembling the rest.
    let workspace_path = WorkspacePath::new(workspace_path.into()).unwrap();

    // The same on-disk location and all-zero checksum are each used twice.
    let on_disk = || PathBuf::from("/tmp/long-section.md");
    let zero_checksum = || Checksum("0".repeat(64));

    RawAsset {
        asset_id: AssetId("a".repeat(32)),
        source_uri: SourceUri::File(on_disk()),
        workspace_path,
        media_type: MediaType::Markdown,
        byte_len: 0,
        checksum: zero_checksum(),
        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
        stored: AssetStorage::Reference {
            path: on_disk(),
            sha: zero_checksum(),
        },
    }
}
/// Pins the complete `Vec<Chunk>` JSON produced for `long-section.md`.
///
/// Runs frontmatter parsing, block parsing, normalization and chunking with a
/// fixed asset, parser version and chunk policy, then compares the serialized
/// chunks against `long-section.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, the baseline is
/// (re)written whenever it is missing, is not valid JSON, or differs from the
/// current output, and the test passes.
///
/// # Panics
///
/// Panics if any pipeline stage fails; if `UPDATE_SNAPSHOTS` is unset and the
/// baseline cannot be read or parsed; or if the output drifted from the
/// baseline.
#[test]
fn long_section_chunks_snapshot() {
    let dir = fixtures_dir();
    let bytes = std::fs::read(dir.join("long-section.md")).expect("fixture readable");
    let asset = fixed_asset("notes/long-section.md");
    let hints = BodyHints {
        first_h1: Some("Alpha".into()),
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
    };

    let (metadata, fm_span, _fm_warns) =
        parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
    // Line offset handed to `parse_blocks`: one more than the number of
    // newlines up to the end of the frontmatter span, or 1 when there is no
    // frontmatter. The fixture is a few dozen lines long, so narrowing the
    // count to `u32` cannot truncate.
    let body_offset_lines: u32 = match fm_span {
        Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
        None => 1,
    };
    let (blocks, parse_warns) =
        parse_blocks(&bytes, body_offset_lines).expect("blocks parse");

    // Pin parser_version so doc_id / block_ids are reproducible.
    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
    let mut metadata = metadata;
    metadata.aliases.sort();
    metadata.tags.sort();
    let doc =
        build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
            .expect("build_canonical_document");

    // Pin policy so policy_hash and chunk_ids are reproducible.
    let policy = ChunkPolicy {
        target_tokens: 200,
        overlap_tokens: 40,
        respect_markdown_headings: true,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
    };
    let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    let baseline_path = dir.join("long-section.chunks.snapshot.json");
    let baseline_read = std::fs::read_to_string(&baseline_path);

    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
        // Re-bake mode: a missing or corrupt baseline must not block the
        // update, so any read or parse failure is treated as "stale".
        let on_disk: Option<Value> = baseline_read
            .ok()
            .and_then(|text| serde_json::from_str(&text).ok());
        if on_disk.as_ref() != Some(&actual) {
            let pretty = serde_json::to_string_pretty(&actual).unwrap();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            eprintln!("updated baseline {}", baseline_path.display());
        }
        return;
    }

    let baseline_text = match baseline_read {
        Ok(text) => text,
        Err(e) => panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        ),
    };
    let expected: Value =
        serde_json::from_str(&baseline_text).expect("baseline parses as json");
    if actual != expected {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        panic!(
            "long-section chunks snapshot drift\n\
             --- expected ({}) ---\n{baseline_text}\n\
             --- actual ---\n{pretty}\n\
             If intentional, re-run with UPDATE_SNAPSHOTS=1.",
            baseline_path.display()
        );
    }
}
/// Determinism cross-check: re-running the same pipeline yields the same
/// chunk_ids byte-for-byte.
///
/// The full parse → normalize → chunk pipeline is executed several times over
/// the same fixture bytes with a fixed asset, parser version and policy. The
/// chunk ids of the first pass become the reference that every later pass
/// must reproduce exactly.
///
/// # Panics
///
/// Panics if any pipeline stage fails, if a pass yields no chunks (which
/// would make the comparison vacuous), or if a pass yields different ids.
#[test]
fn long_section_chunks_are_deterministic() {
    const PASSES: usize = 5;

    let dir = fixtures_dir();
    let bytes = std::fs::read(dir.join("long-section.md")).expect("fixture readable");
    let asset = fixed_asset("notes/long-section.md");
    let hints = BodyHints {
        first_h1: Some("Alpha".into()),
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
    };
    let policy = ChunkPolicy {
        target_tokens: 200,
        overlap_tokens: 40,
        respect_markdown_headings: true,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
    };
    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());

    let mut baseline: Option<Vec<String>> = None;
    for pass in 0..PASSES {
        let (metadata, fm_span, _fm_warns) =
            parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
        // Derive the body offset from the frontmatter span rather than
        // hard-coding 1, so this test keeps feeding `parse_blocks` a correct
        // offset if the fixture ever gains frontmatter. The fixture is a few
        // dozen lines long, so narrowing the count to `u32` cannot truncate.
        let body_offset_lines: u32 = match fm_span {
            Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
            None => 1,
        };
        let (blocks, parse_warns) =
            parse_blocks(&bytes, body_offset_lines).expect("blocks parse");

        let mut metadata = metadata;
        metadata.aliases.sort();
        metadata.tags.sort();
        let doc = build_canonical_document(
            &asset,
            metadata,
            blocks,
            &parser_version,
            parse_warns,
        )
        .expect("build_canonical_document");

        let ids: Vec<String> = MdHeadingV1Chunker
            .chunk(&doc, &policy)
            .expect("chunk")
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        // An empty result would make every pass trivially "equal".
        assert!(!ids.is_empty(), "pass {pass}: chunker produced no chunks");

        match &baseline {
            None => baseline = Some(ids),
            Some(first) => assert_eq!(first, &ids, "chunk_ids changed on pass {pass}"),
        }
    }
}

View File

@@ -0,0 +1,206 @@
[
{
"block_ids": [
"39308c41feedcbbc2f92d5d133366f6d",
"5e978557db4fd5d88807b00ce0d8ca01",
"52fbbe749357ad142492968e8febafb2"
],
"chunk_id": "04903321ed830fcb4b8a50fa795e6c14",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Alpha"
],
"source_spans": [
{
"end": 1,
"kind": "line",
"start": 1
},
{
"end": 3,
"kind": "line",
"start": 3
},
{
"end": 5,
"kind": "line",
"start": 5
}
],
"text": "Alpha\n\nAlpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading.\n\nAlpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream.",
"token_estimate": 155
},
{
"block_ids": [
"839080233875e832d37ba80d4b9ef97a",
"1390fa96500a55669123383889c472c4"
],
"chunk_id": "661a4e5ae606d4327eee70bd4e346b52",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Alpha",
"Alpha Sub"
],
"source_spans": [
{
"end": 7,
"kind": "line",
"start": 7
},
{
"end": 9,
"kind": "line",
"start": 9
}
],
"text": "Alpha Sub\n\nSome prose under the alpha sub-heading. The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading.",
"token_estimate": 52
},
{
"block_ids": [
"7e923dfac89c5d8a31879418ec194026"
],
"chunk_id": "c8b0f5d9405fa8c36eb70dd9005a29dc",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Alpha",
"Alpha Sub"
],
"source_spans": [
{
"end": 53,
"kind": "line",
"start": 11
}
],
"text": "// A code block long enough to easily clear any reasonable target_tokens\n// so the never-split-code-block rule is exercised by this fixture. The\n// rest of the function body is intentional filler: line after line of\n// content that, were the chunker permitted to split it, would exceed\n// the target threshold and force a break in the middle of the snippet.\nfn long_code_example_one() {\n let mut numbers = Vec::new();\n for i in 0..10 {\n numbers.push(i * 2);\n }\n let mut total = 0_i64;\n for n in &numbers {\n total += *n as i64;\n }\n println!(\"total = {total}\");\n}\n\nfn long_code_example_two() {\n let words = [\"alpha\", \"beta\", \"gamma\", \"delta\", \"epsilon\"];\n for w in words.iter() {\n if w.starts_with('a') {\n println!(\"starts with a: {w}\");\n } else if w.starts_with('b') {\n println!(\"starts with b: {w}\");\n } else if w.starts_with('g') {\n println!(\"starts with g: {w}\");\n } else {\n println!(\"other: {w}\");\n }\n }\n}\n\nfn long_code_example_three() {\n let mut buf = String::new();\n for ch in \"lorem ipsum dolor sit amet\".chars() {\n if ch.is_ascii_alphabetic() {\n buf.push(ch.to_ascii_uppercase());\n }\n }\n println!(\"buf = {buf}\");\n}",
"token_estimate": 427
},
{
"block_ids": [
"53e0b44f880cca19d9f0ff99d917f4f6",
"8f794bb2314006e07fb7650ad28d2bb9"
],
"chunk_id": "3a01e78c14f3d2e3737d9b0b1411a535",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Beta"
],
"source_spans": [
{
"end": 55,
"kind": "line",
"start": 55
},
{
"end": 57,
"kind": "line",
"start": 57
}
],
"text": "Beta\n\nBeta paragraph one. The beta section opens with an introductory paragraph that sets up the table appearing further down.",
"token_estimate": 42
},
{
"block_ids": [
"dc1a3da1f6c0de0cc0ecaf93deb3ed30"
],
"chunk_id": "6acd3b817583ebfd2f6639db2c47b4f0",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Beta"
],
"source_spans": [
{
"end": 64,
"kind": "line",
"start": 59
}
],
"text": "name | kind | note\none | small | first row\ntwo | medium | second row\nthree | large | third row\nfour | huge | fourth row",
"token_estimate": 40
},
{
"block_ids": [
"8b8ba26ffe0e34d4a33c26ce0d302654"
],
"chunk_id": "f79e267b7e498702e1bd35d2a373e5c5",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Beta"
],
"source_spans": [
{
"end": 66,
"kind": "line",
"start": 66
}
],
"text": "Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma.",
"token_estimate": 48
},
{
"block_ids": [
"a5bb8d0a4f33ef9276f287c6b2876864",
"6358dda59f10540018ef85d776ee2ec2",
"1ee4ebef26433d6d6b585d7bd6497028"
],
"chunk_id": "880fa807ed5aac2c31b76de8294ed270",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Gamma"
],
"source_spans": [
{
"end": 68,
"kind": "line",
"start": 68
},
{
"end": 70,
"kind": "line",
"start": 70
},
{
"end": 72,
"kind": "line",
"start": 72
}
],
"text": "Gamma\n\nGamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further.\n\nGamma paragraph two. We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.",
"token_estimate": 157
},
{
"block_ids": [
"1ee4ebef26433d6d6b585d7bd6497028",
"38db826bf29bd64a90a698926d94d83e"
],
"chunk_id": "6584ae54bbf25ea275ee380648eb3ccb",
"chunker_version": "md-heading-v1",
"doc_id": "550b21c4a6a3c526f4f39b759a5fb740",
"heading_path": [
"Gamma"
],
"source_spans": [
{
"end": 72,
"kind": "line",
"start": 72
},
{
"end": 74,
"kind": "line",
"start": 74
}
],
"text": "Gamma paragraph two. We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.\n\nGamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.",
"token_estimate": 153
}
]

View File

@@ -0,0 +1,74 @@
# Alpha
Alpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading.
Alpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream.
## Alpha Sub
Some prose under the alpha sub-heading. The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading.
```rust
// A code block long enough to easily clear any reasonable target_tokens
// so the never-split-code-block rule is exercised by this fixture. The
// rest of the function body is intentional filler: line after line of
// content that, were the chunker permitted to split it, would exceed
// the target threshold and force a break in the middle of the snippet.
fn long_code_example_one() {
let mut numbers = Vec::new();
for i in 0..10 {
numbers.push(i * 2);
}
let mut total = 0_i64;
for n in &numbers {
total += *n as i64;
}
println!("total = {total}");
}
fn long_code_example_two() {
let words = ["alpha", "beta", "gamma", "delta", "epsilon"];
for w in words.iter() {
if w.starts_with('a') {
println!("starts with a: {w}");
} else if w.starts_with('b') {
println!("starts with b: {w}");
} else if w.starts_with('g') {
println!("starts with g: {w}");
} else {
println!("other: {w}");
}
}
}
fn long_code_example_three() {
let mut buf = String::new();
for ch in "lorem ipsum dolor sit amet".chars() {
if ch.is_ascii_alphabetic() {
buf.push(ch.to_ascii_uppercase());
}
}
println!("buf = {buf}");
}
```
# Beta
Beta paragraph one. The beta section opens with an introductory paragraph that sets up the table appearing further down.
| name | kind | note |
|-------|--------|--------------|
| one | small | first row |
| two | medium | second row |
| three | large | third row |
| four | huge | fourth row |
Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma.
# Gamma
Gamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further.
Gamma paragraph two. We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.
Gamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.