diff --git a/Cargo.lock b/Cargo.lock index fec852a..11bdbb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -545,11 +545,9 @@ version = "0.1.0" dependencies = [ "anyhow", "blake3", - "kb-config", "kb-core", "kb-normalize", "kb-parse-md", - "serde", "serde_json", "serde_json_canonicalizer", "time", diff --git a/crates/kb-chunk/Cargo.toml b/crates/kb-chunk/Cargo.toml index 035a43d..2238643 100644 --- a/crates/kb-chunk/Cargo.toml +++ b/crates/kb-chunk/Cargo.toml @@ -9,8 +9,6 @@ description = "Chunkers that turn kb-core::CanonicalDocument into kb-core::Chu [dependencies] kb-core = { path = "../kb-core" } -kb-config = { path = "../kb-config" } -serde = { workspace = true } serde_json_canonicalizer = "0.3" blake3 = { workspace = true } anyhow = { workspace = true } diff --git a/crates/kb-chunk/tests/long_section_snapshot.rs b/crates/kb-chunk/tests/long_section_snapshot.rs new file mode 100644 index 0000000..3148300 --- /dev/null +++ b/crates/kb-chunk/tests/long_section_snapshot.rs @@ -0,0 +1,177 @@ +//! Snapshot test pinning the `Vec<Chunk>` JSON for the +//! `fixtures/markdown/long-section.md` fixture. +//! +//! This is an integration test. `kb-parse-md` and `kb-normalize` are +//! dev-dep only — `cargo tree -p kb-chunk --depth 1 -e no-dev` (dev +//! edges excluded; the default scope includes them) confirms they are +//! not regular deps. The §8 module-boundary rule is preserved. +//! +//! The chunker output is fully deterministic given fixed inputs, so we +//! pin the entire `Vec<Chunk>` JSON. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. 
+
+use std::path::PathBuf;
+
+use kb_chunk::MdHeadingV1Chunker;
+use kb_core::{
+    AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType,
+    ParserVersion, RawAsset, SourceUri, WorkspacePath,
+};
+use kb_normalize::build_canonical_document;
+use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
+use serde_json::Value;
+use time::OffsetDateTime;
+
+/// Location of the `fixtures/markdown` directory, reached by walking two
+/// levels up from this crate's manifest directory.
+fn fixtures_dir() -> PathBuf {
+    let mut root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    root.extend(["..", "..", "fixtures", "markdown"]);
+    root
+}
+
+/// Builds a `RawAsset` made solely of constants (apart from
+/// `workspace_path`), so nothing about the asset varies between runs.
+///
+/// Panics if `workspace_path` is rejected by `WorkspacePath::new`.
+fn fixed_asset(workspace_path: &str) -> RawAsset {
+    let workspace_path = WorkspacePath::new(workspace_path.into()).unwrap();
+    let on_disk = PathBuf::from("/tmp/long-section.md");
+    let zero_checksum = || Checksum("0".repeat(64));
+    let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
+
+    RawAsset {
+        asset_id: AssetId("a".repeat(32)),
+        source_uri: SourceUri::File(on_disk.clone()),
+        workspace_path,
+        media_type: MediaType::Markdown,
+        byte_len: 0,
+        checksum: zero_checksum(),
+        discovered_at,
+        stored: AssetStorage::Reference {
+            path: on_disk,
+            sha: zero_checksum(),
+        },
+    }
+}
+
+/// Writes `pretty` followed by a single newline to `path`, panicking on any
+/// I/O failure.
+fn store_baseline(path: &std::path::Path, pretty: &str) {
+    std::fs::write(path, format!("{pretty}\n")).unwrap();
+}
+
+/// Chunks the `long-section.md` fixture with pinned inputs and compares the
+/// resulting JSON against `long-section.chunks.snapshot.json`.
+///
+/// With `UPDATE_SNAPSHOTS` set, an unreadable or drifting baseline is
+/// (re)written instead of failing the test.
+#[test]
+fn long_section_chunks_snapshot() {
+    let dir = fixtures_dir();
+    let fixture_path = dir.join("long-section.md");
+    let bytes = std::fs::read(fixture_path).expect("fixture readable");
+
+    let asset = fixed_asset("notes/long-section.md");
+    let hints = BodyHints {
+        first_h1: Some("Alpha".into()),
+        fs_ctime: asset.discovered_at,
+        fs_mtime: asset.discovered_at,
+        fallback_lang: Some("en".into()),
+    };
+    let (mut metadata, fm_span, _fm_warns) =
+        parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
+
+    // Line offset handed to `parse_blocks`: the number of newlines up to the
+    // end of the frontmatter span plus one, or 1 when no span was reported.
+    let body_offset_lines: u32 = fm_span.map_or(1, |span| {
+        let newlines = bytes[..span.end].iter().filter(|b| **b == b'\n').count();
+        newlines as u32 + 1
+    });
+    let (blocks, parse_warns) =
+        parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
+
+    metadata.aliases.sort();
+    metadata.tags.sort();
+
+    // A fixed parser version keeps doc_id / block_ids reproducible.
+    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
+    let doc =
+        build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
+            .expect("build_canonical_document");
+
+    // A fixed policy keeps policy_hash and chunk_ids reproducible.
+    let policy = ChunkPolicy {
+        target_tokens: 200,
+        overlap_tokens: 40,
+        respect_markdown_headings: true,
+        chunker_version: ChunkerVersion("md-heading-v1".into()),
+    };
+
+    let chunks = MdHeadingV1Chunker.chunk(&doc, &policy).expect("chunk");
+    let actual = serde_json::to_value(&chunks).unwrap();
+    let pretty = serde_json::to_string_pretty(&actual).unwrap();
+
+    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();
+    let baseline_path = dir.join("long-section.chunks.snapshot.json");
+
+    let baseline_text = match (std::fs::read_to_string(&baseline_path), update_requested) {
+        (Ok(text), _) => text,
+        // No readable baseline, but the caller asked for one to be baked.
+        (Err(_), true) => {
+            store_baseline(&baseline_path, &pretty);
+            return;
+        }
+        (Err(e), false) => panic!(
+            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
+            baseline_path.display()
+        ),
+    };
+    let expected: Value =
+        serde_json::from_str(&baseline_text).expect("baseline parses as json");
+
+    if actual == expected {
+        return;
+    }
+    if update_requested {
+        store_baseline(&baseline_path, &pretty);
+        eprintln!("updated baseline {}", baseline_path.display());
+        return;
+    }
+    panic!(
+        "long-section chunks snapshot drift\n\
+         --- expected ({}) ---\n{baseline_text}\n\
+         --- actual ---\n{pretty}\n\
+         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
+        baseline_path.display()
+    );
+}
+
+/// Determinism cross-check: re-running the same pipeline yields the same
+/// chunk_ids byte-for-byte.
+#[test]
+fn long_section_chunks_are_deterministic() {
+    // Runs the parse -> normalize -> chunk pipeline five times over the same
+    // fixture bytes and asserts that every run yields the same ordered list
+    // of chunk ids. Panics (fails the test) on the first mismatch.
+    let dir = fixtures_dir();
+    let bytes = std::fs::read(dir.join("long-section.md")).expect("fixture readable");
+
+    // Same pinned asset, hints, policy and parser version as
+    // `long_section_chunks_snapshot`.
+    let asset = fixed_asset("notes/long-section.md");
+    let hints = BodyHints {
+        first_h1: Some("Alpha".into()),
+        fs_ctime: asset.discovered_at,
+        fs_mtime: asset.discovered_at,
+        fallback_lang: Some("en".into()),
+    };
+
+    let policy = ChunkPolicy {
+        target_tokens: 200,
+        overlap_tokens: 40,
+        respect_markdown_headings: true,
+        chunker_version: ChunkerVersion("md-heading-v1".into()),
+    };
+    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
+
+    // Chunk ids of the first run; every later run is compared against them.
+    let mut baseline: Option<Vec<_>> = None;
+    for _ in 0..5 {
+        let (mut metadata, fm_span, _fm_warns) =
+            parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
+        // Derive the line offset from the frontmatter span exactly as the
+        // snapshot test does, rather than hard-coding 1, so both tests feed
+        // `parse_blocks` identical arguments.
+        let body_offset_lines: u32 = match fm_span {
+            Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
+            None => 1,
+        };
+        let (blocks, parse_warns) =
+            parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
+        metadata.aliases.sort();
+        metadata.tags.sort();
+        let doc =
+            build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
+                .expect("build_canonical_document");
+        let ids: Vec<_> = MdHeadingV1Chunker
+            .chunk(&doc, &policy)
+            .expect("chunk")
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
+        match &baseline {
+            None => baseline = Some(ids),
+            Some(prev) => assert_eq!(prev, &ids),
+        }
+    }
+}
diff --git a/fixtures/markdown/long-section.chunks.snapshot.json b/fixtures/markdown/long-section.chunks.snapshot.json new file mode 100644 index 0000000..ef27566 --- /dev/null +++ b/fixtures/markdown/long-section.chunks.snapshot.json @@ -0,0 +1,206 @@ +[ + { + "block_ids": [ + "39308c41feedcbbc2f92d5d133366f6d", + "5e978557db4fd5d88807b00ce0d8ca01", + "52fbbe749357ad142492968e8febafb2" + ], + "chunk_id": "04903321ed830fcb4b8a50fa795e6c14", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Alpha" + ], + "source_spans": [ + { + "end": 1, + "kind": "line", + "start": 1 + }, + { + "end": 3, + "kind": 
"line", + "start": 3 + }, + { + "end": 5, + "kind": "line", + "start": 5 + } + ], + "text": "Alpha\n\nAlpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading.\n\nAlpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream.", + "token_estimate": 155 + }, + { + "block_ids": [ + "839080233875e832d37ba80d4b9ef97a", + "1390fa96500a55669123383889c472c4" + ], + "chunk_id": "661a4e5ae606d4327eee70bd4e346b52", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Alpha", + "Alpha Sub" + ], + "source_spans": [ + { + "end": 7, + "kind": "line", + "start": 7 + }, + { + "end": 9, + "kind": "line", + "start": 9 + } + ], + "text": "Alpha Sub\n\nSome prose under the alpha sub-heading. The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading.", + "token_estimate": 52 + }, + { + "block_ids": [ + "7e923dfac89c5d8a31879418ec194026" + ], + "chunk_id": "c8b0f5d9405fa8c36eb70dd9005a29dc", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Alpha", + "Alpha Sub" + ], + "source_spans": [ + { + "end": 53, + "kind": "line", + "start": 11 + } + ], + "text": "// A code block long enough to easily clear any reasonable target_tokens\n// so the never-split-code-block rule is exercised by this fixture. 
The\n// rest of the function body is intentional filler: line after line of\n// content that, were the chunker permitted to split it, would exceed\n// the target threshold and force a break in the middle of the snippet.\nfn long_code_example_one() {\n let mut numbers = Vec::new();\n for i in 0..10 {\n numbers.push(i * 2);\n }\n let mut total = 0_i64;\n for n in &numbers {\n total += *n as i64;\n }\n println!(\"total = {total}\");\n}\n\nfn long_code_example_two() {\n let words = [\"alpha\", \"beta\", \"gamma\", \"delta\", \"epsilon\"];\n for w in words.iter() {\n if w.starts_with('a') {\n println!(\"starts with a: {w}\");\n } else if w.starts_with('b') {\n println!(\"starts with b: {w}\");\n } else if w.starts_with('g') {\n println!(\"starts with g: {w}\");\n } else {\n println!(\"other: {w}\");\n }\n }\n}\n\nfn long_code_example_three() {\n let mut buf = String::new();\n for ch in \"lorem ipsum dolor sit amet\".chars() {\n if ch.is_ascii_alphabetic() {\n buf.push(ch.to_ascii_uppercase());\n }\n }\n println!(\"buf = {buf}\");\n}", + "token_estimate": 427 + }, + { + "block_ids": [ + "53e0b44f880cca19d9f0ff99d917f4f6", + "8f794bb2314006e07fb7650ad28d2bb9" + ], + "chunk_id": "3a01e78c14f3d2e3737d9b0b1411a535", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 55, + "kind": "line", + "start": 55 + }, + { + "end": 57, + "kind": "line", + "start": 57 + } + ], + "text": "Beta\n\nBeta paragraph one. 
The beta section opens with an introductory paragraph that sets up the table appearing further down.", + "token_estimate": 42 + }, + { + "block_ids": [ + "dc1a3da1f6c0de0cc0ecaf93deb3ed30" + ], + "chunk_id": "6acd3b817583ebfd2f6639db2c47b4f0", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 64, + "kind": "line", + "start": 59 + } + ], + "text": "name | kind | note\none | small | first row\ntwo | medium | second row\nthree | large | third row\nfour | huge | fourth row", + "token_estimate": 40 + }, + { + "block_ids": [ + "8b8ba26ffe0e34d4a33c26ce0d302654" + ], + "chunk_id": "f79e267b7e498702e1bd35d2a373e5c5", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Beta" + ], + "source_spans": [ + { + "end": 66, + "kind": "line", + "start": 66 + } + ], + "text": "Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma.", + "token_estimate": 48 + }, + { + "block_ids": [ + "a5bb8d0a4f33ef9276f287c6b2876864", + "6358dda59f10540018ef85d776ee2ec2", + "1ee4ebef26433d6d6b585d7bd6497028" + ], + "chunk_id": "880fa807ed5aac2c31b76de8294ed270", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Gamma" + ], + "source_spans": [ + { + "end": 68, + "kind": "line", + "start": 68 + }, + { + "end": 70, + "kind": "line", + "start": 70 + }, + { + "end": 72, + "kind": "line", + "start": 72 + } + ], + "text": "Gamma\n\nGamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further.\n\nGamma paragraph two. 
We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.", + "token_estimate": 157 + }, + { + "block_ids": [ + "1ee4ebef26433d6d6b585d7bd6497028", + "38db826bf29bd64a90a698926d94d83e" + ], + "chunk_id": "6584ae54bbf25ea275ee380648eb3ccb", + "chunker_version": "md-heading-v1", + "doc_id": "550b21c4a6a3c526f4f39b759a5fb740", + "heading_path": [ + "Gamma" + ], + "source_spans": [ + { + "end": 72, + "kind": "line", + "start": 72 + }, + { + "end": 74, + "kind": "line", + "start": 74 + } + ], + "text": "Gamma paragraph two. We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail.\n\nGamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.", + "token_estimate": 153 + } +] diff --git a/fixtures/markdown/long-section.md b/fixtures/markdown/long-section.md new file mode 100644 index 0000000..07d672b --- /dev/null +++ b/fixtures/markdown/long-section.md @@ -0,0 +1,74 @@ +# Alpha + +Alpha intro paragraph one. This first paragraph in the alpha section gives a brief overview of what is to follow and serves as the lead-in for the subsequent material covered under the alpha heading. + +Alpha intro paragraph two. The second paragraph extends the discussion with additional sentences, padding out the paragraph so that paragraph-level chunk splitting actually has multiple candidates to consider when deciding where to slice the content stream. + +## Alpha Sub + +Some prose under the alpha sub-heading. 
The nested heading should still be respected as a chunk boundary distinct from the parent alpha heading. + +```rust +// A code block long enough to easily clear any reasonable target_tokens +// so the never-split-code-block rule is exercised by this fixture. The +// rest of the function body is intentional filler: line after line of +// content that, were the chunker permitted to split it, would exceed +// the target threshold and force a break in the middle of the snippet. +fn long_code_example_one() { + let mut numbers = Vec::new(); + for i in 0..10 { + numbers.push(i * 2); + } + let mut total = 0_i64; + for n in &numbers { + total += *n as i64; + } + println!("total = {total}"); +} + +fn long_code_example_two() { + let words = ["alpha", "beta", "gamma", "delta", "epsilon"]; + for w in words.iter() { + if w.starts_with('a') { + println!("starts with a: {w}"); + } else if w.starts_with('b') { + println!("starts with b: {w}"); + } else if w.starts_with('g') { + println!("starts with g: {w}"); + } else { + println!("other: {w}"); + } + } +} + +fn long_code_example_three() { + let mut buf = String::new(); + for ch in "lorem ipsum dolor sit amet".chars() { + if ch.is_ascii_alphabetic() { + buf.push(ch.to_ascii_uppercase()); + } + } + println!("buf = {buf}"); +} +``` + +# Beta + +Beta paragraph one. The beta section opens with an introductory paragraph that sets up the table appearing further down. + +| name | kind | note | +|-------|--------|--------------| +| one | small | first row | +| two | medium | second row | +| three | large | third row | +| four | huge | fourth row | + +Beta closing paragraph. After the table we have one more paragraph of prose that anchors the end of the beta section before we move on to gamma. + +# Gamma + +Gamma paragraph one. The gamma section is intentionally long to exercise the paragraph-level split with overlap rule when chunking under a single heading without any nested sub-headings to break things up further. + +Gamma paragraph two. 
We continue accumulating prose so that the running token estimator climbs steadily and eventually trips the target_tokens threshold, forcing the chunker to emit a chunk and seed the next chunk with overlap from the prior tail. + +Gamma paragraph three. Yet another paragraph under the gamma heading, padded with words to ensure the byte count clears the threshold and the splitting behaviour shows up unambiguously in the snapshot output.