feat(p10-2): manifest-file-v1 chunker (whole-file 1 chunk, symbol <manifest>)

Emits 1 Chunk per manifest file (Cargo.toml / pyproject.toml / package.json /
tsconfig.json / pom.xml / build.gradle / go.mod). Symbol unified to
"<manifest>"; manifest type distinguished by code_lang (toml / json / xml /
groovy / go-mod) read from Block::Code.lang. Oversize >200 lines splits via
tier2_shared::push_chunks_with_oversize.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 13:11:46 +00:00
parent 51004ac593
commit 22d4161728
7 changed files with 353 additions and 0 deletions

View File

@@ -27,6 +27,7 @@ mod pdf_page_v1;
mod tier2_shared;
pub mod k8s_manifest_resource_v1;
pub mod dockerfile_file_v1;
pub mod manifest_file_v1;
pub use code_go_ast_v1::CodeGoAstV1Chunker;
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
@@ -39,3 +40,4 @@ pub use md_heading_v1::MdHeadingV1Chunker;
pub use pdf_page_v1::PdfPageV1Chunker;
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
pub use manifest_file_v1::ManifestFileV1Chunker;

View File

@@ -0,0 +1,58 @@
//! p10-2: manifest whole-file chunker (Tier 2).
//!
//! Reads entire manifest file (Cargo.toml / package.json / pom.xml / go.mod /
//! build.gradle / pyproject.toml / tsconfig.json) and emits a single Chunk
//! with symbol "<manifest>", code_lang read from Block::Code.lang, line range
//! 1..EOF. Oversize >200 lines splits into line-windows sharing the symbol via
//! tier2_shared::push_chunks_with_oversize.
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
use anyhow::Result;
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
pub const VERSION_LABEL: &str = "manifest-file-v1";
#[derive(Clone, Copy, Debug, Default)]
pub struct ManifestFileV1Chunker;
impl Chunker for ManifestFileV1Chunker {
fn chunker_version(&self) -> ChunkerVersion {
ChunkerVersion(VERSION_LABEL.to_string())
}
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
policy_hash(policy)
}
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
// Expect a single Block::Code carrying the full manifest text.
let (text, lang) = match doc.blocks.first() {
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
_ => return Ok(vec![]),
};
let total_lines = text.lines().count().max(1) as u32;
let mut chunks = Vec::new();
push_chunks_with_oversize(
&mut chunks,
doc,
policy,
text,
1,
total_lines,
"<manifest>",
lang,
VERSION_LABEL,
)?;
tracing::debug!(
target: "kebab-chunk",
doc_id = %doc.doc_id,
chunks = chunks.len(),
"manifest-file-v1 chunked",
);
Ok(chunks)
}
}

View File

@@ -0,0 +1,7 @@
[package]
name = "demo"
version = "0.1.0"
edition = "2021"
[dependencies]
serde = "1"

View File

@@ -0,0 +1,5 @@
module example.com/demo
go 1.22
require github.com/spf13/cobra v1.8.0

View File

@@ -0,0 +1,7 @@
{
"name": "demo",
"version": "0.1.0",
"dependencies": {
"react": "^18.0.0"
}
}

View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0">
<modelVersion>4.0.0</modelVersion>
<groupId>com.demo</groupId>
<artifactId>demo</artifactId>
<version>0.1.0</version>
</project>

View File

@@ -0,0 +1,267 @@
//! Behavioural tests for `ManifestFileV1Chunker`.
//!
//! Documents are constructed manually (no kebab-parse-code dependency) by
//! placing the raw manifest text into a single `Block::Code`, mirroring the
//! pattern used in `dockerfile_file_v1.rs`.
use std::path::PathBuf;
use kebab_chunk::ManifestFileV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
// ── helpers ──────────────────────────────────────────────────────────────────
fn fixtures_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("tests")
.join("fixtures")
}
/// Build a `CanonicalDocument` with a single `Block::Code` containing manifest text.
fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument {
let wp = WorkspacePath(format!("build/{}", manifest_filename(lang)));
let aid = AssetId("m".repeat(64));
let pv = ParserVersion("code-manifest-v1".into());
let doc_id = id_for_doc(&wp, &aid, &pv);
let line_count = manifest_text.lines().count() as u32;
let span = SourceSpan::Code {
line_start: 1,
line_end: line_count.max(1),
symbol: None,
lang: Some(lang.into()),
};
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
let block = Block::Code(CodeBlock {
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some(lang.into()),
code: manifest_text.to_string(),
});
CanonicalDocument {
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: format!("Manifest ({})", lang),
lang: Lang("und".into()),
blocks: vec![block],
metadata: Metadata {
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some(lang.into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn manifest_filename(lang: &str) -> &'static str {
match lang {
"toml" => "Cargo.toml",
"json" => "package.json",
"xml" => "pom.xml",
"go-mod" => "go.mod",
_ => "manifest",
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion("manifest-file-v1".into()),
}
}
// ── tests ─────────────────────────────────────────────────────────────────────
/// A Cargo.toml fixture must emit exactly 1 chunk with the correct symbol,
/// lang, and line range.
#[test]
fn cargo_toml_single_chunk_with_toml_lang() {
let fixture_path = fixtures_dir().join("sample_cargo.toml");
let text = std::fs::read_to_string(&fixture_path)
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("toml", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
assert_eq!(
chunks.len(),
1,
"expected 1 chunk, got {}: {chunks:#?}",
chunks.len()
);
let span = chunks[0].source_spans.first().expect("at least one span");
match span {
SourceSpan::Code {
line_start,
line_end: _,
symbol,
lang,
} => {
assert_eq!(*line_start, 1, "line_start must be 1");
assert_eq!(
symbol.as_deref(),
Some("<manifest>"),
"symbol must be '<manifest>'"
);
assert_eq!(lang.as_deref(), Some("toml"), "lang must be 'toml'");
}
other => panic!("expected SourceSpan::Code, got {other:?}"),
}
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
}
/// A package.json fixture must emit exactly 1 chunk with the correct symbol,
/// lang, and line range.
#[test]
fn package_json_single_chunk_with_json_lang() {
let fixture_path = fixtures_dir().join("sample_package.json");
let text = std::fs::read_to_string(&fixture_path)
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("json", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
assert_eq!(
chunks.len(),
1,
"expected 1 chunk, got {}: {chunks:#?}",
chunks.len()
);
let span = chunks[0].source_spans.first().expect("at least one span");
match span {
SourceSpan::Code {
line_start,
line_end: _,
symbol,
lang,
} => {
assert_eq!(*line_start, 1, "line_start must be 1");
assert_eq!(
symbol.as_deref(),
Some("<manifest>"),
"symbol must be '<manifest>'"
);
assert_eq!(lang.as_deref(), Some("json"), "lang must be 'json'");
}
other => panic!("expected SourceSpan::Code, got {other:?}"),
}
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
}
/// A pom.xml fixture must emit exactly 1 chunk with the correct symbol,
/// lang, and line range.
#[test]
fn pom_xml_single_chunk_with_xml_lang() {
let fixture_path = fixtures_dir().join("sample_pom.xml");
let text = std::fs::read_to_string(&fixture_path)
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("xml", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
assert_eq!(
chunks.len(),
1,
"expected 1 chunk, got {}: {chunks:#?}",
chunks.len()
);
let span = chunks[0].source_spans.first().expect("at least one span");
match span {
SourceSpan::Code {
line_start,
line_end: _,
symbol,
lang,
} => {
assert_eq!(*line_start, 1, "line_start must be 1");
assert_eq!(
symbol.as_deref(),
Some("<manifest>"),
"symbol must be '<manifest>'"
);
assert_eq!(lang.as_deref(), Some("xml"), "lang must be 'xml'");
}
other => panic!("expected SourceSpan::Code, got {other:?}"),
}
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
}
/// A go.mod fixture must emit exactly 1 chunk with the correct symbol,
/// lang, and line range.
#[test]
fn go_mod_single_chunk_with_go_mod_lang() {
let fixture_path = fixtures_dir().join("sample_go.mod");
let text = std::fs::read_to_string(&fixture_path)
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("go-mod", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
assert_eq!(
chunks.len(),
1,
"expected 1 chunk, got {}: {chunks:#?}",
chunks.len()
);
let span = chunks[0].source_spans.first().expect("at least one span");
match span {
SourceSpan::Code {
line_start,
line_end: _,
symbol,
lang,
} => {
assert_eq!(*line_start, 1, "line_start must be 1");
assert_eq!(
symbol.as_deref(),
Some("<manifest>"),
"symbol must be '<manifest>'"
);
assert_eq!(lang.as_deref(), Some("go-mod"), "lang must be 'go-mod'");
}
other => panic!("expected SourceSpan::Code, got {other:?}"),
}
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
}