From d1560be80d30e27c9c5c408a72113fcea03099f3 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:56:45 +0000 Subject: [PATCH] feat(p10-1d): activate C + C++ in ingest_one_code_asset dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the four dispatch matches (parser_version / chunker_version / extract / chunks) + allowlist + tier3_fallback_cv with "c" + "cpp" arms. C uses CAstExtractor + CodeCAstV1Chunker; C++ uses CppAstExtractor + CodeCppAstV1Chunker. Both languages are Tier 3-fallback-eligible (e.g. a .h file with C++ syntax may fail the tree-sitter-c parse → Tier 3 paragraph fallback per the p10-3 wrapper). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 907d93b..ddf211a 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use 
kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::{GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; +use kebab_parse_code::{CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -948,12 +948,12 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added. MediaType::Code(lang) if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" - | "shell") => + | "shell" | "c" | "cpp") => { return ingest_one_code_asset( app, @@ -1838,6 +1838,9 @@ fn ingest_one_code_asset( => ParserVersion("none-v1".to_string()), // p10-3: shell direct routes to Tier 3 (no parse step). "shell" => ParserVersion("none-v1".to_string()), + // p10-1D: C + C++ AST extractors. + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1857,6 +1860,9 @@ fn ingest_one_code_asset( => ManifestFileV1Chunker.chunker_version(), // p10-3: "shell" => CodeTextParagraphV1Chunker.chunker_version(), + // p10-1D: C + C++ AST chunkers. 
+ "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1911,6 +1917,13 @@ fn ingest_one_code_asset( } // p10-3: shell reuses the same synthesizer. "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version), + // p10-1D: C + C++ AST extractors. + "c" => CAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CAstExtractor::extract (code:c)"), + "cpp" => CppAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CppAstExtractor::extract (code:cpp)"), other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1987,6 +2000,13 @@ fn ingest_one_code_asset( "shell" => CodeTextParagraphV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"), + // p10-1D: C + C++ AST chunkers. + "c" => CodeCAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCAstV1Chunker::chunk (code:c)"), + "cpp" => CodeCppAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCppAstV1Chunker::chunk (code:cpp)"), other => anyhow::bail!("unreachable (chunk): {other}"), } };