feat(p10-2): dockerfile-file-v1 chunker (whole-file 1 chunk, symbol <dockerfile>)
Reads the entire Dockerfile / Dockerfile.* / *.dockerfile content and emits a single Chunk with symbol "<dockerfile>", code_lang "dockerfile", and line range 1..EOF. Oversize files (>200 lines) are split into line-windows that share the symbol via tier2_shared::push_chunks_with_oversize.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
57
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
57
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
//! p10-2: Dockerfile whole-file chunker (Tier 2).
//!
//! Reads the entire Dockerfile content and emits a single `Chunk` with symbol
//! `"<dockerfile>"`, code_lang `"dockerfile"`, and line range 1..EOF.
//! Oversize files (>200 lines) are split into line-windows sharing the symbol
//! via `tier2_shared::push_chunks_with_oversize`.
|
||||||
|
|
||||||
|
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||||
|
use anyhow::Result;
|
||||||
|
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
|
||||||
|
|
||||||
|
/// Version label for this chunker; returned by `chunker_version` and passed to
/// `push_chunks_with_oversize` when emitting chunks.
pub const VERSION_LABEL: &str = "dockerfile-file-v1";
|
||||||
|
|
||||||
|
/// Whole-file chunker for Dockerfiles (`dockerfile-file-v1`).
///
/// Stateless unit type: it carries no configuration, so it is cheap to copy
/// and can be obtained via `Default`. All behaviour lives in its `Chunker`
/// implementation.
#[derive(Clone, Copy, Debug, Default)]
pub struct DockerfileFileV1Chunker;
|
||||||
|
|
||||||
|
impl Chunker for DockerfileFileV1Chunker {
|
||||||
|
fn chunker_version(&self) -> ChunkerVersion {
|
||||||
|
ChunkerVersion(VERSION_LABEL.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||||
|
policy_hash(policy)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||||
|
// Expect a single Block::Code carrying the full Dockerfile text.
|
||||||
|
let text = match doc.blocks.first() {
|
||||||
|
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||||
|
_ => return Ok(vec![]),
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_lines = text.lines().count().max(1) as u32;
|
||||||
|
let mut chunks = Vec::new();
|
||||||
|
|
||||||
|
push_chunks_with_oversize(
|
||||||
|
&mut chunks,
|
||||||
|
doc,
|
||||||
|
policy,
|
||||||
|
text,
|
||||||
|
1,
|
||||||
|
total_lines,
|
||||||
|
"<dockerfile>",
|
||||||
|
"dockerfile",
|
||||||
|
VERSION_LABEL,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
tracing::debug!(
|
||||||
|
target: "kebab-chunk",
|
||||||
|
doc_id = %doc.doc_id,
|
||||||
|
chunks = chunks.len(),
|
||||||
|
"dockerfile-file-v1 chunked",
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(chunks)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -26,6 +26,7 @@ mod md_heading_v1;
|
|||||||
mod pdf_page_v1;
|
mod pdf_page_v1;
|
||||||
mod tier2_shared;
|
mod tier2_shared;
|
||||||
pub mod k8s_manifest_resource_v1;
|
pub mod k8s_manifest_resource_v1;
|
||||||
|
pub mod dockerfile_file_v1;
|
||||||
|
|
||||||
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
||||||
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
||||||
@@ -37,3 +38,4 @@ pub use code_ts_ast_v1::CodeTsAstV1Chunker;
|
|||||||
pub use md_heading_v1::MdHeadingV1Chunker;
|
pub use md_heading_v1::MdHeadingV1Chunker;
|
||||||
pub use pdf_page_v1::PdfPageV1Chunker;
|
pub use pdf_page_v1::PdfPageV1Chunker;
|
||||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||||
|
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||||
|
|||||||
134
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
134
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
//! Behavioural tests for `DockerfileFileV1Chunker`.
//!
//! Documents are constructed manually (no kebab-parse-code dependency) by
//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the
//! pattern used in `k8s_manifest_resource_v1.rs`.
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use kebab_chunk::DockerfileFileV1Chunker;
|
||||||
|
use kebab_core::{
|
||||||
|
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||||
|
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||||
|
WorkspacePath, id_for_block, id_for_doc,
|
||||||
|
};
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn fixtures_dir() -> PathBuf {
|
||||||
|
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||||
|
.join("tests")
|
||||||
|
.join("fixtures")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`.
|
||||||
|
fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
|
||||||
|
let wp = WorkspacePath("build/Dockerfile".into());
|
||||||
|
let aid = AssetId("d".repeat(64));
|
||||||
|
let pv = ParserVersion("code-dockerfile-v1".into());
|
||||||
|
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||||
|
|
||||||
|
let line_count = dockerfile_text.lines().count() as u32;
|
||||||
|
let span = SourceSpan::Code {
|
||||||
|
line_start: 1,
|
||||||
|
line_end: line_count.max(1),
|
||||||
|
symbol: None,
|
||||||
|
lang: Some("dockerfile".into()),
|
||||||
|
};
|
||||||
|
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||||
|
let block = Block::Code(CodeBlock {
|
||||||
|
common: CommonBlock {
|
||||||
|
block_id: bid,
|
||||||
|
heading_path: vec![],
|
||||||
|
source_span: span,
|
||||||
|
},
|
||||||
|
lang: Some("dockerfile".into()),
|
||||||
|
code: dockerfile_text.to_string(),
|
||||||
|
});
|
||||||
|
|
||||||
|
CanonicalDocument {
|
||||||
|
doc_id,
|
||||||
|
source_asset_id: aid,
|
||||||
|
workspace_path: wp,
|
||||||
|
title: "Dockerfile".into(),
|
||||||
|
lang: Lang("und".into()),
|
||||||
|
blocks: vec![block],
|
||||||
|
metadata: Metadata {
|
||||||
|
aliases: vec![],
|
||||||
|
tags: vec![],
|
||||||
|
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||||
|
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||||
|
source_type: SourceType::Note,
|
||||||
|
trust_level: TrustLevel::Primary,
|
||||||
|
user_id_alias: None,
|
||||||
|
user: Default::default(),
|
||||||
|
repo: Some("kebab".into()),
|
||||||
|
git_branch: Some("main".into()),
|
||||||
|
git_commit: Some("0".repeat(40)),
|
||||||
|
code_lang: Some("dockerfile".into()),
|
||||||
|
},
|
||||||
|
provenance: Provenance { events: vec![] },
|
||||||
|
parser_version: pv,
|
||||||
|
schema_version: 1,
|
||||||
|
doc_version: 1,
|
||||||
|
last_chunker_version: None,
|
||||||
|
last_embedding_version: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn policy() -> ChunkPolicy {
|
||||||
|
ChunkPolicy {
|
||||||
|
target_tokens: 500,
|
||||||
|
overlap_tokens: 80,
|
||||||
|
respect_markdown_headings: false,
|
||||||
|
chunker_version: ChunkerVersion("dockerfile-file-v1".into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// A simple 5-line Dockerfile fixture must emit exactly 1 chunk with the
/// correct symbol, lang, and line range.
#[test]
fn dockerfile_emits_single_chunk() {
    let fixture_path = fixtures_dir().join("sample.dockerfile");
    let text = std::fs::read_to_string(&fixture_path)
        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));

    let doc = dockerfile_doc(&text);
    let chunks = DockerfileFileV1Chunker
        .chunk(&doc, &policy())
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        1,
        "expected 1 chunk, got {}: {chunks:#?}",
        chunks.len()
    );

    // Inspect the Chunk's source_spans for symbol / lang / line range.
    let span = chunks[0].source_spans.first().expect("at least one span");
    match span {
        SourceSpan::Code {
            line_start,
            line_end,
            symbol,
            lang,
        } => {
            assert_eq!(*line_start, 1, "line_start must be 1");
            assert_eq!(*line_end, 5, "line_end must be 5 (5-line fixture)");
            assert_eq!(
                symbol.as_deref(),
                Some("<dockerfile>"),
                "symbol must be '<dockerfile>'"
            );
            assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'");
        }
        other => panic!("expected SourceSpan::Code, got {other:?}"),
    }

    // Verify chunker_version label.
    assert_eq!(chunks[0].chunker_version.0, "dockerfile-file-v1");
}
|
||||||
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
FROM rust:1.94-slim AS builder
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . .
|
||||||
|
RUN cargo build --release
|
||||||
|
CMD ["/app/target/release/kebab"]
|
||||||
Reference in New Issue
Block a user