diff --git a/Cargo.lock b/Cargo.lock index 311a7be..28cbf89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4181,6 +4181,7 @@ dependencies = [ "kebab-parse-md", "serde_json", "serde_json_canonicalizer", + "serde_yaml", "time", "tracing", ] diff --git a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs new file mode 100644 index 0000000..71a4104 --- /dev/null +++ b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs @@ -0,0 +1,169 @@ +//! p10-2: k8s manifest resource-aware chunker. +//! +//! Splits a multi-document YAML file on `^---\s*$` boundaries, recognises +//! documents that have both `apiVersion` and `kind` string fields as k8s +//! resources, and emits one `Chunk` per resource (with oversize >200-line +//! fallback). Non-k8s documents are skipped; invalid YAML yields 0 chunks +//! for the entire file. + +use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1"; + +#[derive(Clone, Copy, Debug, Default)] +pub struct K8sManifestResourceV1Chunker; + +impl Chunker for K8sManifestResourceV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result> { + // Expect a single Block::Code carrying the full YAML text. + let text = match doc.blocks.first() { + Some(Block::Code(cb)) => cb.code.as_str(), + _ => return Ok(vec![]), + }; + + let slices = split_yaml_documents(text); + let mut chunks: Vec = Vec::new(); + + for slice in slices { + // Invalid YAML in any document → return 0 chunks for the file. + let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + + let Some(mapping) = value.as_mapping() else { + continue; + }; + + let api = mapping + .get("apiVersion") + .and_then(|v| v.as_str()) + .unwrap_or(""); + let kind = mapping + .get("kind") + .and_then(|v| v.as_str()) + .unwrap_or(""); + + // Skip non-k8s documents. + if api.is_empty() || kind.is_empty() { + continue; + } + + let metadata = mapping + .get("metadata") + .and_then(|v| v.as_mapping()); + let name = metadata + .and_then(|m| m.get("name")) + .and_then(|v| v.as_str()) + .unwrap_or(""); + let namespace = metadata + .and_then(|m| m.get("namespace")) + .and_then(|v| v.as_str()); + + let symbol = match namespace { + Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"), + _ => format!("{kind}/{name}"), + }; + + push_chunks_with_oversize( + &mut chunks, + doc, + policy, + slice.text, + slice.line_start, + slice.line_end, + &symbol, + "yaml", + VERSION_LABEL, + )?; + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "k8s-manifest-resource-v1 chunked", + ); + + Ok(chunks) + } +} + +struct YamlSlice<'a> { + text: &'a str, + line_start: u32, + line_end: u32, +} + +/// Split raw YAML text into per-document slices on `---` separator lines. +/// Line numbers are 1-indexed. +fn split_yaml_documents(text: &str) -> Vec> { + let lines: Vec<&str> = text.lines().collect(); + + // Collect indices of separator lines (0-based), then append a sentinel at + // the end so the last slice is always terminated. + let mut separators: Vec = lines + .iter() + .enumerate() + .filter_map(|(i, l)| { + let trimmed = l.trim_end(); + if trimmed == "---" + || trimmed.starts_with("--- ") + || trimmed.starts_with("---\t") + { + Some(i) + } else { + None + } + }) + .collect(); + separators.push(lines.len()); + + let mut slices: Vec> = Vec::new(); + let mut doc_start_line: usize = 0; // 0-based index of current doc start + + for sep_line in separators { + if sep_line > doc_start_line { + let start_byte = byte_offset_of_line(text, doc_start_line); + let end_byte = byte_offset_of_line(text, sep_line); + let slice_text = &text[start_byte..end_byte]; + if !slice_text.trim().is_empty() { + slices.push(YamlSlice { + text: slice_text, + line_start: (doc_start_line + 1) as u32, + line_end: sep_line as u32, + }); + } + } + doc_start_line = sep_line + 1; + } + + slices +} + +/// Return the byte offset of the start of `line_idx` (0-based line index). +fn byte_offset_of_line(text: &str, line_idx: usize) -> usize { + if line_idx == 0 { + return 0; + } + let mut count = 0usize; + for (i, c) in text.char_indices() { + if c == '\n' { + count += 1; + if count == line_idx { + return i + 1; + } + } + } + text.len() +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 750d18e..516620a 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -24,6 +24,8 @@ mod code_rust_ast_v1; mod code_ts_ast_v1; mod md_heading_v1; mod pdf_page_v1; +mod tier2_shared; +pub mod k8s_manifest_resource_v1; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; @@ -34,3 +36,4 @@ pub use code_rust_ast_v1::CodeRustAstV1Chunker; pub use code_ts_ast_v1::CodeTsAstV1Chunker; pub use md_heading_v1::MdHeadingV1Chunker; pub use pdf_page_v1::PdfPageV1Chunker; +pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs new file mode 100644 index 0000000..f52173c --- /dev/null +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -0,0 +1,142 @@ +//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build). +//! +//! Mirrors `code_rust_ast_v1`'s Chunk-construction pattern exactly so that +//! id / hashes / token-count / ChunkPolicy semantics stay identical across +//! Tier 1 (AST) and Tier 2 (resource-aware) chunkers. + +use anyhow::Result; +use kebab_core::{ + BlockId, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, DocumentId, SourceSpan, + id_for_chunk, +}; + +pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; + +/// Compute the policy hash the same way `code_rust_ast_v1` does. +pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() +} + +/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting +/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize. +/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl. +#[allow(clippy::too_many_arguments)] +pub(crate) fn push_chunks_with_oversize( + out: &mut Vec, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + chunker_version: &str, +) -> Result<()> { + let n_lines = (line_end - line_start + 1).max(1); + let cv = ChunkerVersion(chunker_version.to_string()); + let base_policy_hash = policy_hash(policy); + + if n_lines <= AST_CHUNK_MAX_LINES { + out.push(build_chunk( + doc, + &cv, + &base_policy_hash, + text, + line_start, + line_end, + symbol, + lang, + None, + )); + return Ok(()); + } + + let lines: Vec<&str> = text.lines().collect(); + let total = lines.len(); + let mut window_start = line_start; + let mut i = 0usize; + while i < total { + let take = (AST_CHUNK_MAX_LINES as usize).min(total - i); + let window_text = lines[i..i + take].join("\n"); + let window_end = window_start + take as u32 - 1; + out.push(build_chunk( + doc, + &cv, + &base_policy_hash, + &window_text, + window_start, + window_end, + symbol, + lang, + Some(window_start), + )); + i += take; + window_start = window_end + 1; + } + Ok(()) +} + +/// Build a single `Chunk`, mirroring `make_chunk` in `code_rust_ast_v1.rs` +/// exactly (same id recipe, same token estimate, same field set). +/// +/// `split_key` is `Some(line_start_of_window)` for oversize splits, `None` +/// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None` +/// split_key pattern in 1A-2. +#[allow(clippy::too_many_arguments)] +fn build_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + base_policy_hash: &str, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + split_key: Option, +) -> Chunk { + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol.to_string()), + lang: Some(lang.to_string()), + }; + + // id_hash mirrors code_rust_ast_v1's make_chunk logic: + // split_key Some(k) => "{base_policy_hash}#L{k}" + // split_key None => base_policy_hash + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => base_policy_hash.to_string(), + }; + + // block_ids: Tier 2 chunkers have no per-block structure (the whole file + // is one Block::Code), so we pass an empty slice — same as using the doc- + // level slice without explicit block granularity. + let block_ids: Vec = vec![]; + + let chunk_id = id_for_chunk( + &DocumentId(doc.doc_id.0.clone()), + chunker_version, + &block_ids, + &id_hash, + ); + + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids, + text: text.to_string(), + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} diff --git a/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml b/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml new file mode 100644 index 0000000..b7f61f0 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-server + namespace: prod +spec: + replicas: 3 + selector: + matchLabels: + app: api-server + template: + metadata: + labels: + app: api-server + spec: + containers: + - name: api + image: example/api:1.2.3 +--- +apiVersion: v1 +kind: Service +metadata: + name: api-server + namespace: prod +spec: + selector: + app: api-server + ports: + - port: 80 + targetPort: 8080 +--- +# Non-k8s document — apiVersion missing +kind: ClusterIP +foo: bar diff --git a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs new file mode 100644 index 0000000..42625a0 --- /dev/null +++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs @@ -0,0 +1,205 @@ +//! Behavioural tests for `K8sManifestResourceV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing the raw YAML text into a single `Block::Code`, mirroring the +//! pattern used in `code_rust_ast_snapshot.rs`. + +use std::path::PathBuf; + +use kebab_chunk::K8sManifestResourceV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing `yaml_text`. +fn yaml_doc(yaml_text: &str) -> CanonicalDocument { + let wp = WorkspacePath("manifests/deploy.yaml".into()); + let aid = AssetId("c".repeat(64)); + let pv = ParserVersion("code-yaml-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = yaml_text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some("yaml".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("yaml".into()), + code: yaml_text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "deploy.yaml".into(), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("yaml".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("k8s-manifest-resource-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// Three YAML documents: 2 valid k8s resources + 1 non-k8s (no apiVersion). +/// The chunker must emit exactly 2 chunks with the correct symbols and lang. +#[test] +fn k8s_multi_doc_emits_one_chunk_per_resource() { + let fixture_path = fixtures_dir().join("sample_k8s.yaml"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = yaml_doc(&text); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 2, + "expected 2 k8s chunks, got {}: {chunks:#?}", + chunks.len() + ); + + let symbols: Vec<&str> = chunks + .iter() + .map(|c| { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + symbol.as_deref().expect("symbol must be Some for k8s chunks") + } + other => panic!("expected Code span, got {other:?}"), + } + }) + .collect(); + + assert_eq!( + symbols, + vec!["Deployment/prod/api-server", "Service/prod/api-server"], + "symbols mismatch: {symbols:?}" + ); + + // Verify lang = "yaml" on every chunk. + for chunk in &chunks { + match &chunk.source_spans[0] { + SourceSpan::Code { lang, .. } => { + assert_eq!(lang.as_deref(), Some("yaml"), "lang must be 'yaml'"); + } + other => panic!("expected Code span, got {other:?}"), + } + } + + // Verify chunker_version label. + for chunk in &chunks { + assert_eq!(chunk.chunker_version.0, "k8s-manifest-resource-v1"); + } +} + +/// A YAML document with an indentation error (tab in a space-indented context) +/// must cause the chunker to return 0 chunks for the entire file. +#[test] +fn k8s_invalid_yaml_emits_zero_chunks() { + // serde_yaml 0.9 is lenient about duplicate keys (last wins), so use a + // genuine YAML structural error (unclosed flow sequence) to force a parse + // failure. + let actually_bad = "apiVersion: v1\nkind: Service\nfoo: [\nbar\n"; + + let doc = yaml_doc(actually_bad); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk should not error — return Ok(vec![]) for invalid yaml"); + + assert_eq!( + chunks.len(), + 0, + "invalid YAML must yield 0 chunks, got {}: {chunks:#?}", + chunks.len() + ); +} + +/// A cluster-scoped resource (no `metadata.namespace`) must produce a symbol +/// of the form `/` (two components, no namespace segment). +#[test] +fn k8s_cluster_scoped_resource_symbol() { + let yaml = "\ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-admin +rules: +- apiGroups: [\"*\"] + resources: [\"*\"] + verbs: [\"*\"] +"; + + let doc = yaml_doc(yaml); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk for cluster-scoped resource, got {}: {chunks:#?}", + chunks.len() + ); + + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!( + symbol.as_deref(), + Some("ClusterRole/cluster-admin"), + "cluster-scoped symbol must be /" + ); + assert_eq!(lang.as_deref(), Some("yaml")); + } + other => panic!("expected Code span, got {other:?}"), + } +}