From 51004ac59327680022afe5090875f391f0875272 Mon Sep 17 00:00:00 2001
From: altair823
Date: Wed, 20 May 2026 13:09:13 +0000
Subject: [PATCH] feat(p10-2): dockerfile-file-v1 chunker (whole-file 1 chunk,
 symbol "")

Reads entire Dockerfile / Dockerfile.* / *.dockerfile content and emits a
single Chunk with symbol "", code_lang "dockerfile", line range 1..EOF.
Oversize >200 lines splits into line-windows sharing the symbol via
tier2_shared::push_chunks_with_oversize.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/kebab-chunk/src/dockerfile_file_v1.rs  |  57 ++++++++
 crates/kebab-chunk/src/lib.rs                 |   2 +
 .../kebab-chunk/tests/dockerfile_file_v1.rs   | 134 ++++++++++++++++++
 .../tests/fixtures/sample.dockerfile          |   5 +
 4 files changed, 198 insertions(+)
 create mode 100644 crates/kebab-chunk/src/dockerfile_file_v1.rs
 create mode 100644 crates/kebab-chunk/tests/dockerfile_file_v1.rs
 create mode 100644 crates/kebab-chunk/tests/fixtures/sample.dockerfile

diff --git a/crates/kebab-chunk/src/dockerfile_file_v1.rs b/crates/kebab-chunk/src/dockerfile_file_v1.rs
new file mode 100644
index 0000000..519d1ae
--- /dev/null
+++ b/crates/kebab-chunk/src/dockerfile_file_v1.rs
@@ -0,0 +1,57 @@
+//! p10-2: dockerfile whole-file chunker (Tier 2).
+//!
+//! Reads entire Dockerfile content and emits a single Chunk with symbol
+//! "", code_lang "dockerfile", line range 1..EOF.
+//! Oversize >200 lines splits into line-windows sharing the symbol via
+//! tier2_shared::push_chunks_with_oversize.
+
+use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
+use anyhow::Result;
+use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
+
+pub const VERSION_LABEL: &str = "dockerfile-file-v1";
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct DockerfileFileV1Chunker;
+
+impl Chunker for DockerfileFileV1Chunker {
+    fn chunker_version(&self) -> ChunkerVersion {
+        ChunkerVersion(VERSION_LABEL.to_string())
+    }
+
+    fn policy_hash(&self, policy: &ChunkPolicy) -> String {
+        policy_hash(policy)
+    }
+
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
+        // The whole Dockerfile is expected in the first block as a Block::Code; else emit nothing.
+        let text = match doc.blocks.first() {
+            Some(Block::Code(cb)) => cb.code.as_str(),
+            _ => return Ok(vec![]),
+        };
+        // `.max(1)` reports line 1 even for empty text; `try_from` fails rather than truncating.
+        let total_lines = u32::try_from(text.lines().count().max(1))?;
+        let mut chunks = Vec::new();
+
+        push_chunks_with_oversize(
+            &mut chunks,
+            doc,
+            policy,
+            text,
+            1,
+            total_lines,
+            "", // NOTE(review): empty symbol string — confirm this is the intended symbol
+            "dockerfile",
+            VERSION_LABEL,
+        )?;
+
+        tracing::debug!(
+            target: "kebab-chunk",
+            doc_id = %doc.doc_id,
+            chunks = chunks.len(),
+            "dockerfile-file-v1 chunked",
+        );
+
+        Ok(chunks)
+    }
+}
diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs
index 516620a..a700e91 100644
--- a/crates/kebab-chunk/src/lib.rs
+++ b/crates/kebab-chunk/src/lib.rs
@@ -26,6 +26,7 @@ mod md_heading_v1;
 mod pdf_page_v1;
 mod tier2_shared;
 pub mod k8s_manifest_resource_v1;
+pub mod dockerfile_file_v1;
 
 pub use code_go_ast_v1::CodeGoAstV1Chunker;
 pub use code_java_ast_v1::CodeJavaAstV1Chunker;
@@ -37,3 +38,4 @@ pub use code_ts_ast_v1::CodeTsAstV1Chunker;
 pub use md_heading_v1::MdHeadingV1Chunker;
 pub use pdf_page_v1::PdfPageV1Chunker;
 pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
+pub use dockerfile_file_v1::DockerfileFileV1Chunker;
diff --git a/crates/kebab-chunk/tests/dockerfile_file_v1.rs b/crates/kebab-chunk/tests/dockerfile_file_v1.rs
new file mode 100644
index 
0000000..44dd94a
--- /dev/null
+++ b/crates/kebab-chunk/tests/dockerfile_file_v1.rs
@@ -0,0 +1,134 @@
+//! Behavioural tests for `DockerfileFileV1Chunker`.
+//!
+//! Documents are constructed manually (no kebab-parse-code dependency) by
+//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the
+//! pattern used in `k8s_manifest_resource_v1.rs`.
+
+use std::path::PathBuf;
+
+use kebab_chunk::DockerfileFileV1Chunker;
+use kebab_core::{
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
+};
+use time::OffsetDateTime;
+
+// ── helpers ──────────────────────────────────────────────────────────────────
+/// Fixture directory `$CARGO_MANIFEST_DIR/tests/fixtures`; `env!` resolves it at compile time.
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("fixtures")
+}
+
+/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`.
+fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
+    let wp = WorkspacePath("build/Dockerfile".into());
+    let aid = AssetId("d".repeat(64));
+    let pv = ParserVersion("code-dockerfile-v1".into());
+    let doc_id = id_for_doc(&wp, &aid, &pv);
+    // The block spans the whole text; `.max(1)` makes an empty text still report line 1.
+    let line_count = dockerfile_text.lines().count() as u32;
+    let span = SourceSpan::Code {
+        line_start: 1,
+        line_end: line_count.max(1),
+        symbol: None,
+        lang: Some("dockerfile".into()),
+    };
+    let bid = id_for_block(&doc_id, "code", &[], 0, &span);
+    let block = Block::Code(CodeBlock {
+        common: CommonBlock {
+            block_id: bid,
+            heading_path: vec![],
+            source_span: span,
+        },
+        lang: Some("dockerfile".into()),
+        code: dockerfile_text.to_string(),
+    });
+
+    CanonicalDocument {
+        doc_id,
+        source_asset_id: aid,
+        workspace_path: wp,
+        title: "Dockerfile".into(),
+        lang: Lang("und".into()),
+        blocks: vec![block],
+        metadata: Metadata {
+            aliases: vec![],
+            tags: vec![],
+            created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+            updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+            source_type: SourceType::Note,
+            trust_level: TrustLevel::Primary,
+            user_id_alias: None,
+            user: Default::default(),
+            repo: Some("kebab".into()),
+            git_branch: Some("main".into()),
+            git_commit: Some("0".repeat(40)),
+            code_lang: Some("dockerfile".into()),
+        },
+        provenance: Provenance { events: vec![] },
+        parser_version: pv,
+        schema_version: 1,
+        doc_version: 1,
+        last_chunker_version: None,
+        last_embedding_version: None,
+    }
+}
+
+fn policy() -> ChunkPolicy { // fixed policy handed to the chunker by the tests
+    ChunkPolicy {
+        target_tokens: 500,
+        overlap_tokens: 80,
+        respect_markdown_headings: false,
+        chunker_version: ChunkerVersion("dockerfile-file-v1".into()), // the chunker's label
+    }
+}
+
+// ── tests ─────────────────────────────────────────────────────────────────────
+
+/// A simple 5-line Dockerfile fixture must emit exactly 1 chunk with the
+/// correct symbol, lang, and line range.
+#[test]
+fn dockerfile_emits_single_chunk() {
+    let fixture_path = fixtures_dir().join("sample.dockerfile");
+    let text = std::fs::read_to_string(&fixture_path)
+        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
+    // Wrap the fixture text in a single-block document and run the chunker on it.
+    let doc = dockerfile_doc(&text);
+    let chunks = DockerfileFileV1Chunker
+        .chunk(&doc, &policy())
+        .expect("chunk");
+
+    assert_eq!(
+        chunks.len(),
+        1,
+        "expected 1 chunk, got {}: {chunks:#?}",
+        chunks.len()
+    );
+
+    // Inspect the first source span of the only chunk for symbol / lang / line range.
+    let span = chunks[0].source_spans.first().expect("at least one span");
+    match span {
+        SourceSpan::Code {
+            line_start,
+            line_end,
+            symbol,
+            lang,
+        } => {
+            assert_eq!(*line_start, 1, "line_start must be 1");
+            assert_eq!(*line_end, 5, "line_end must be 5 (5-line fixture)");
+            assert_eq!(
+                symbol.as_deref(),
+                Some(""), // NOTE(review): empty symbol — confirm it is the intended value
+                "symbol must be ''"
+            );
+            assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'");
+        }
+        other => panic!("expected SourceSpan::Code, got {other:?}"),
+    }
+
+    // The chunk must carry the chunker's version label.
+    assert_eq!(chunks[0].chunker_version.0, "dockerfile-file-v1");
+}
diff --git a/crates/kebab-chunk/tests/fixtures/sample.dockerfile b/crates/kebab-chunk/tests/fixtures/sample.dockerfile
new file mode 100644
index 0000000..94352b8
--- /dev/null
+++ b/crates/kebab-chunk/tests/fixtures/sample.dockerfile
@@ -0,0 +1,5 @@
+FROM rust:1.94-slim AS builder
+WORKDIR /app
+COPY . .
+RUN cargo build --release
+CMD ["/app/target/release/kebab"]