From 8bdb3e8090f2990f3ec33e6aadfd3a9b1821a0ee Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 00:35:53 +0000 Subject: [PATCH] refactor(p10-1b): generalize ingest_one_code_asset for multi-language dispatch Rust path observably unchanged (verified by existing code_ingest_smoke tests). Python/TS/JS arms bail with TODO; per-lang extractor + chunker land in subsequent tasks. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 64 +++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 39f43e8..17bc1d9 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -918,8 +918,10 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 Task 8: Rust code ingest. - MediaType::Code(lang) if lang == "rust" => { + // p10-1A-2 / 1B: code ingest dispatch. + MediaType::Code(lang) + if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript") => + { return ingest_one_code_asset( app, asset, @@ -928,6 +930,7 @@ fn ingest_one_asset( vector_store, existing_doc_ids, force_reingest, + lang.as_str(), ); } // p10-1A-2: non-Rust Code, Audio, and Other are not yet wired; @@ -1642,6 +1645,7 @@ fn ingest_one_pdf_asset( /// /// All other steps (incremental skip, byte read, ExtractContext, put_*, /// embed, purge_vector_orphans) are identical to the PDF function. +#[allow(clippy::too_many_arguments)] fn ingest_one_code_asset( app: &App, asset: &RawAsset, @@ -1650,6 +1654,7 @@ fn ingest_one_code_asset( vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, force_reingest: bool, + code_lang: &str, // <-- NEW (p10-1b Task D) ) -> anyhow::Result { let path = match &asset.source_uri { SourceUri::File(p) => p.clone(), @@ -1671,17 +1676,28 @@ fn ingest_one_code_asset( }); } }; - // p10-1A-2 task 8: incremental-ingest early-skip for the code flow. - // Code docs use `code-rust-v1` as the parser_version and - // `CodeRustAstV1Chunker` as the chunker — both pinned per-medium - // today (no config knob). - let code_parser_version = - ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()); + + // p10-1b Task D: parser_version per-lang. + let parser_version = match code_lang { + "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), + "python" => anyhow::bail!("python ingest not yet wired (p10-1b Task G)"), + "typescript" => anyhow::bail!("typescript ingest not yet wired (p10-1b Task J)"), + "javascript" => anyhow::bail!("javascript ingest not yet wired (p10-1b Task L)"), + other => anyhow::bail!("unsupported code_lang: {other}"), + }; + + // p10-1b Task D: chunker_version per-lang (Python/TS/JS are unreachable here; + // they bail above and get real chunkers in Tasks G/J/L). + let chunker_version = match code_lang { + "rust" => CodeRustAstV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), + }; + if let Some(item) = try_skip_unchanged( app, asset, - &code_parser_version, - &CodeRustAstV1Chunker.chunker_version(), + &parser_version, + &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, )? { @@ -1697,20 +1713,26 @@ fn ingest_one_code_asset( workspace_root: &workspace_root, config: &extract_config, }; - let mut canonical = RustAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::RustAstExtractor::extract")?; - // Per-medium chunker selection: Rust code always uses code-rust-ast-v1 - // regardless of `config.chunking.chunker_version`. - let chunker = CodeRustAstV1Chunker; - let chunks = chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeRustAstV1Chunker::chunk")?; + // p10-1b Task D: extractor per-lang. + let mut canonical = match code_lang { + "rust" => RustAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::RustAstExtractor::extract (code:rust)")?, + other => anyhow::bail!("unreachable (extract): {other}"), + }; + + // p10-1b Task D: chunker per-lang. + let chunks = match code_lang { + "rust" => CodeRustAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?, + other => anyhow::bail!("unreachable (chunk): {other}"), + }; // Stamp chunker + embedding versions so incremental skip detection has // data on the second run. - canonical.last_chunker_version = Some(chunker.chunker_version()); + canonical.last_chunker_version = Some(chunker_version.clone()); if let Some(emb) = embedder { canonical.last_embedding_version = Some(emb.model_version()); } @@ -1794,7 +1816,7 @@ fn ingest_one_code_asset( block_count: u32::try_from(canonical.blocks.len()).ok(), chunk_count: u32::try_from(chunks.len()).ok(), parser_version: Some(canonical.parser_version.clone()), - chunker_version: Some(chunker.chunker_version()), + chunker_version: Some(chunker_version), warnings, error: None, })