From 226ce8b744a673699aa026ed32885a871ddfe17b Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 13:19:54 +0000 Subject: [PATCH] feat(p10-2): activate Tier 2 chunkers in ingest_one_code_asset dispatch Adds yaml / dockerfile / toml / json / xml / groovy / go-mod arms to the existing 7-arm AST match. parser_version unified to "none-v1" for Tier 2. synthesize_tier2_document builds a minimal Document (single Block::Code with raw file text) since Tier 2 has no parse step. allowlist in ingest_one_asset extended to admit Tier 2 langs. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 163 ++++++++++++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 5 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 08c89cc..149b687 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -948,10 +948,11 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. MediaType::Code(lang) if matches!(lang.as_str(), - "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin") => + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") => { return ingest_one_code_asset( app, @@ -1831,6 +1832,9 @@ fn ingest_one_code_asset( "go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()), "java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()), "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()), + // p10-2: Tier 2 has no parse step — sentinel "none-v1". + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + => ParserVersion("none-v1".to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1842,7 +1846,12 @@ fn ingest_one_code_asset( "javascript" => CodeJsAstV1Chunker.chunker_version(), "go" => CodeGoAstV1Chunker.chunker_version(), "java" => CodeJavaAstV1Chunker.chunker_version(), - "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + // p10-2 Tier 2: + "yaml" => K8sManifestResourceV1Chunker.chunker_version(), + "dockerfile" => DockerfileFileV1Chunker.chunker_version(), + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1890,6 +1899,10 @@ fn ingest_one_code_asset( "kotlin" => KotlinAstExtractor::new() .extract(&ctx, &bytes) .context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)")?, + // p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes. + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { + synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)? + } other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1913,9 +1926,20 @@ fn ingest_one_code_asset( "java" => CodeJavaAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)")?, - "kotlin" => CodeKotlinAstV1Chunker + "kotlin" => CodeKotlinAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)")?, + // p10-2 Tier 2: + "yaml" => K8sManifestResourceV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::K8sManifestResourceV1Chunker::chunk")?, + "dockerfile" => DockerfileFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::DockerfileFileV1Chunker::chunk")?, + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::ManifestFileV1Chunker::chunk")?, other => anyhow::bail!("unreachable (chunk): {other}"), }; @@ -2011,6 +2035,135 @@ fn ingest_one_code_asset( }) } +/// p10-2: Build a minimal [`CanonicalDocument`] for Tier 2 code assets +/// (yaml / dockerfile / toml / json / xml / groovy / go-mod) that have +/// no AST extractor. Produces a single `Block::Code` whose source span +/// covers the entire file, mirroring the shape the Tier 1 extractors +/// produce for glue / top-level regions. +fn synthesize_tier2_document( + asset: &RawAsset, + bytes: &[u8], + code_lang: &str, + parser_version: &ParserVersion, +) -> anyhow::Result { + use anyhow::Context as _; + use kebab_core::{ + BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent, + ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc, + }; + + let text = std::str::from_utf8(bytes) + .with_context(|| format!("tier2 doc not utf-8: {}", asset.workspace_path.0))? + .to_string(); + + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version); + + let n_lines = text.lines().count().max(1) as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: n_lines, + symbol: Some("".to_string()), + lang: Some(code_lang.to_string()), + }; + let block_id: BlockId = id_for_block( + &doc_id, + "code", + &[], + 0, + &span, + ); + let block = kebab_core::Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: vec![], + source_span: span, + }, + lang: Some(code_lang.to_string()), + code: text, + }); + + let now = time::OffsetDateTime::now_utc(); + let events = vec![ + ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }, + ProvenanceEvent { + at: now, + agent: "kb-app".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; tier2_synthesized; lang={}", + parser_version.0, code_lang + )), + }, + ]; + + // Resolve abs path for repo detection (mirrors RustAstExtractor pattern). + let workspace_root = std::path::PathBuf::new(); // not needed for detect_repo walk + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => p.clone(), + kebab_core::SourceUri::Kb(_) => workspace_root, + }; + let (repo, git_branch, git_commit) = match kebab_parse_code::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let title = { + let fname = asset.workspace_path.0 + .rsplit('/') + .next() + .unwrap_or(&asset.workspace_path.0); + // strip extension + match fname.rfind('.') { + Some(i) => fname[..i].to_string(), + None => fname.to_string(), + } + }; + + let metadata = Metadata { + aliases: vec![], + tags: vec![], + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: serde_json::Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some(code_lang.to_string()), + }; + + tracing::debug!( + target: "kebab-app", + "synthesized tier2 doc_id={} workspace_path={} lang={}", + doc_id.0, + asset.workspace_path.0, + code_lang, + ); + + Ok(kebab_core::CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks: vec![block], + metadata, + provenance: Provenance { events }, + parser_version: parser_version.clone(), + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) +} + /// Pull the BCP-47 language hint from the canonical document. P6-1 /// stamps `Lang("und")` by default; image-pipeline OCR / caption /// adapters special-case "und" so the hint is intentionally dropped