refactor(p10-1b): generalize ingest_one_code_asset for multi-language dispatch
Rust path observably unchanged (verified by existing code_ingest_smoke tests). Python/TS/JS arms bail with TODO; per-lang extractor + chunker land in subsequent tasks. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -918,8 +918,10 @@ fn ingest_one_asset(
|
||||
force_reingest,
|
||||
);
|
||||
}
|
||||
// p10-1A-2 Task 8: Rust code ingest.
|
||||
MediaType::Code(lang) if lang == "rust" => {
|
||||
// p10-1A-2 / 1B: code ingest dispatch.
|
||||
MediaType::Code(lang)
|
||||
if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript") =>
|
||||
{
|
||||
return ingest_one_code_asset(
|
||||
app,
|
||||
asset,
|
||||
@@ -928,6 +930,7 @@ fn ingest_one_asset(
|
||||
vector_store,
|
||||
existing_doc_ids,
|
||||
force_reingest,
|
||||
lang.as_str(),
|
||||
);
|
||||
}
|
||||
// p10-1A-2: non-Rust Code, Audio, and Other are not yet wired;
|
||||
@@ -1642,6 +1645,7 @@ fn ingest_one_pdf_asset(
|
||||
///
|
||||
/// All other steps (incremental skip, byte read, ExtractContext, put_*,
|
||||
/// embed, purge_vector_orphans) are identical to the PDF function.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn ingest_one_code_asset(
|
||||
app: &App,
|
||||
asset: &RawAsset,
|
||||
@@ -1650,6 +1654,7 @@ fn ingest_one_code_asset(
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
force_reingest: bool,
|
||||
code_lang: &str, // <-- NEW (p10-1b Task D)
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
let path = match &asset.source_uri {
|
||||
SourceUri::File(p) => p.clone(),
|
||||
@@ -1671,17 +1676,28 @@ fn ingest_one_code_asset(
|
||||
});
|
||||
}
|
||||
};
|
||||
// p10-1A-2 task 8: incremental-ingest early-skip for the code flow.
|
||||
// Code docs use `code-rust-v1` as the parser_version and
|
||||
// `CodeRustAstV1Chunker` as the chunker — both pinned per-medium
|
||||
// today (no config knob).
|
||||
let code_parser_version =
|
||||
ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string());
|
||||
|
||||
// p10-1b Task D: parser_version per-lang.
|
||||
let parser_version = match code_lang {
|
||||
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
|
||||
"python" => anyhow::bail!("python ingest not yet wired (p10-1b Task G)"),
|
||||
"typescript" => anyhow::bail!("typescript ingest not yet wired (p10-1b Task J)"),
|
||||
"javascript" => anyhow::bail!("javascript ingest not yet wired (p10-1b Task L)"),
|
||||
other => anyhow::bail!("unsupported code_lang: {other}"),
|
||||
};
|
||||
|
||||
// p10-1b Task D: chunker_version per-lang (Python/TS/JS are unreachable here;
|
||||
// they bail above and get real chunkers in Tasks G/J/L).
|
||||
let chunker_version = match code_lang {
|
||||
"rust" => CodeRustAstV1Chunker.chunker_version(),
|
||||
other => anyhow::bail!("unreachable chunker_version: {other}"),
|
||||
};
|
||||
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
&code_parser_version,
|
||||
&CodeRustAstV1Chunker.chunker_version(),
|
||||
&parser_version,
|
||||
&chunker_version,
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
)? {
|
||||
@@ -1697,20 +1713,26 @@ fn ingest_one_code_asset(
|
||||
workspace_root: &workspace_root,
|
||||
config: &extract_config,
|
||||
};
|
||||
let mut canonical = RustAstExtractor::new()
|
||||
.extract(&ctx, &bytes)
|
||||
.context("kb-parse-code::RustAstExtractor::extract")?;
|
||||
|
||||
// Per-medium chunker selection: Rust code always uses code-rust-ast-v1
|
||||
// regardless of `config.chunking.chunker_version`.
|
||||
let chunker = CodeRustAstV1Chunker;
|
||||
let chunks = chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::CodeRustAstV1Chunker::chunk")?;
|
||||
// p10-1b Task D: extractor per-lang.
|
||||
let mut canonical = match code_lang {
|
||||
"rust" => RustAstExtractor::new()
|
||||
.extract(&ctx, &bytes)
|
||||
.context("kb-parse-code::RustAstExtractor::extract (code:rust)")?,
|
||||
other => anyhow::bail!("unreachable (extract): {other}"),
|
||||
};
|
||||
|
||||
// p10-1b Task D: chunker per-lang.
|
||||
let chunks = match code_lang {
|
||||
"rust" => CodeRustAstV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?,
|
||||
other => anyhow::bail!("unreachable (chunk): {other}"),
|
||||
};
|
||||
|
||||
// Stamp chunker + embedding versions so incremental skip detection has
|
||||
// data on the second run.
|
||||
canonical.last_chunker_version = Some(chunker.chunker_version());
|
||||
canonical.last_chunker_version = Some(chunker_version.clone());
|
||||
if let Some(emb) = embedder {
|
||||
canonical.last_embedding_version = Some(emb.model_version());
|
||||
}
|
||||
@@ -1794,7 +1816,7 @@ fn ingest_one_code_asset(
|
||||
block_count: u32::try_from(canonical.blocks.len()).ok(),
|
||||
chunk_count: u32::try_from(chunks.len()).ok(),
|
||||
parser_version: Some(canonical.parser_version.clone()),
|
||||
chunker_version: Some(chunker.chunker_version()),
|
||||
chunker_version: Some(chunker_version),
|
||||
warnings,
|
||||
error: None,
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user