feat(p10-1d): code-cpp-ast-v1 chunker + snapshot test
Identical chunker body to code-c-ast-v1 (per-language work happens in the CppAstExtractor, Task C). Snapshot fixture covers nested namespace + class + ctor/dtor + method + operator overload + template fn + free fn + top-level main, verifying namespace::Class::method symbol convention per design §3.4. 5 chunks emitted: - <top-level> (includes, namespace opening) - kebab::chunk::MdHeadingV1Chunker (class unit) - kebab::identity (template function) - kebab::global_helper (free function in namespace) - main (top-level main function) Template function symbols emit without <T> parameters per spec convention. Namespace::Class::method pattern verified. All tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
322
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
322
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
@@ -0,0 +1,322 @@
|
||||
//! `code-cpp-ast-v1` — maps a tree-sitter-derived C++ AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-cpp-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeCppAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCppAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-cpp-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/a.cpp".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
lang: Some("cpp".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_cpp_ast_v1() {
|
||||
assert_eq!(CodeCppAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-cpp-ast-v1".into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "int parse() {\n\t// x\n}"),
|
||||
("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"),
|
||||
]);
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<Vec<_>>().join("");
|
||||
let code = format!("int big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort(); ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
})];
|
||||
let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeCppAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
|
||||
let base: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeCppAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@
|
||||
//! It consumes `CanonicalDocument` purely through `kb-core` types.
|
||||
|
||||
mod code_c_ast_v1;
|
||||
mod code_cpp_ast_v1;
|
||||
mod code_go_ast_v1;
|
||||
mod code_java_ast_v1;
|
||||
mod code_js_ast_v1;
|
||||
@@ -32,6 +33,7 @@ pub mod manifest_file_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
|
||||
pub use code_c_ast_v1::CodeCAstV1Chunker;
|
||||
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
|
||||
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
||||
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
||||
pub use code_js_ast_v1::CodeJsAstV1Chunker;
|
||||
|
||||
200
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
200
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C++ code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_c_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCppAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.cpp".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units (C++ specific):
|
||||
// 0. includes + namespace opening (lines 1–4, ≤200)
|
||||
// 1. class definition (lines 6–20, ≤200)
|
||||
// 2. template function (lines 22–25, ≤200)
|
||||
// 3. namespace closing + free fn (lines 27–29, ≤200)
|
||||
// 4. main fn (lines 31–34, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
4,
|
||||
"#include <string>\n#include <vector>\n\nnamespace kebab {".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::chunk::MdHeadingV1Chunker",
|
||||
6,
|
||||
20,
|
||||
"class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::identity",
|
||||
22,
|
||||
25,
|
||||
"template <typename T>\nT identity(T value) {\n return value;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::global_helper",
|
||||
27,
|
||||
29,
|
||||
"void global_helper() {\n // free function in kebab namespace\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
31,
|
||||
34,
|
||||
"int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.cpp".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-cpp-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_cpp_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-cpp-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_cpp_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
107
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
107
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"53292605459065d170cd36c118e20546"
|
||||
],
|
||||
"chunk_id": "50a5b324300d9082eac4ce2a422810e1",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 4,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <string>\n#include <vector>\n\nnamespace kebab {",
|
||||
"token_estimate": 18
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"f349acad94c9fa4cf9ad1c0a93e83610"
|
||||
],
|
||||
"chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 20,
|
||||
"line_start": 6,
|
||||
"symbol": "kebab::chunk::MdHeadingV1Chunker"
|
||||
}
|
||||
],
|
||||
"text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};",
|
||||
"token_estimate": 95
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"8b9811387717d0bd4abf84abcc35b8b1"
|
||||
],
|
||||
"chunk_id": "d9326d252905b665b2adb9a416c20451",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 25,
|
||||
"line_start": 22,
|
||||
"symbol": "kebab::identity"
|
||||
}
|
||||
],
|
||||
"text": "template <typename T>\nT identity(T value) {\n return value;\n}",
|
||||
"token_estimate": 21
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"1754cb6b971f6a4cb292f144a4f0570b"
|
||||
],
|
||||
"chunk_id": "56ee5f991de4a413c016da8dc4acfc35",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 29,
|
||||
"line_start": 27,
|
||||
"symbol": "kebab::global_helper"
|
||||
}
|
||||
],
|
||||
"text": "void global_helper() {\n // free function in kebab namespace\n}",
|
||||
"token_estimate": 22
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"14b5f3393d6d25f822f5b70763d24acd"
|
||||
],
|
||||
"chunk_id": "c0d7c043cdd575c530db3909b54cc906",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 34,
|
||||
"line_start": 31,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}",
|
||||
"token_estimate": 23
|
||||
}
|
||||
]
|
||||
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace kebab {
|
||||
namespace chunk {
|
||||
|
||||
class MdHeadingV1Chunker {
|
||||
public:
|
||||
MdHeadingV1Chunker() = default;
|
||||
~MdHeadingV1Chunker() = default;
|
||||
|
||||
std::string chunk_doc(const std::string& doc) {
|
||||
return doc;
|
||||
}
|
||||
|
||||
int operator()(int x) const {
|
||||
return x * 2;
|
||||
}
|
||||
|
||||
private:
|
||||
int counter_ = 0;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
T identity(T value) {
|
||||
return value;
|
||||
}
|
||||
|
||||
} // namespace chunk
|
||||
|
||||
void global_helper() {
|
||||
// free function in kebab namespace
|
||||
}
|
||||
|
||||
} // namespace kebab
|
||||
|
||||
int main() {
|
||||
kebab::chunk::MdHeadingV1Chunker c;
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user