feat(p10-3): Tier 1/2 → Tier 3 fallback wrapper in ingest_one_code_asset

After the chunks match resolves, an Ok(empty) result (Tier 2 invalid YAML
/ non-k8s YAML / similar) or Err (Tier 1 extractor / chunker failure) is
retried against CodeTextParagraphV1Chunker. On retry, chunker_version is
swapped to "code-text-paragraph-v1" and canonical.parser_version to
"none-v1" so downstream stamping + try_skip_unchanged remain consistent.

Extract failure is handled similarly — when a Tier 1 extractor errors
(e.g. tree-sitter parse failure), a synthesize_tier2_document-shaped
fallback doc is built from raw bytes and routed through Tier 3 chunker
directly (extract_fell_back guard).

shell direct path + Tier 2 extract synthesize_tier2_document failures
are exempted from the fallback chain (they ARE Tier 3 already, or the
error is real).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 11:32:49 +00:00
parent 88d7fbc182
commit 5051ea7534

View File

@@ -1842,7 +1842,7 @@ fn ingest_one_code_asset(
};
// p10-1b Task D/G/J/L: chunker_version per-lang.
let chunker_version = match code_lang {
let mut chunker_version = match code_lang {
"rust" => CodeRustAstV1Chunker.chunker_version(),
"python" => CodePythonAstV1Chunker.chunker_version(),
"typescript" => CodeTsAstV1Chunker.chunker_version(),
@@ -1882,76 +1882,145 @@ fn ingest_one_code_asset(
};
// p10-1b Task D/G/J/L: extractor per-lang.
let mut canonical = match code_lang {
// p10-3: capture Result so Tier 1 extractor errors can fall back to Tier 3.
let canonical_result: anyhow::Result<kebab_core::CanonicalDocument> = match code_lang {
"rust" => RustAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::RustAstExtractor::extract (code:rust)")?,
.context("kb-parse-code::RustAstExtractor::extract (code:rust)"),
"python" => PythonAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::PythonAstExtractor::extract (code:python)")?,
.context("kb-parse-code::PythonAstExtractor::extract (code:python)"),
"typescript" => TypescriptAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)")?,
.context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)"),
"javascript" => JavascriptAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)")?,
.context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)"),
"go" => GoAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::GoAstExtractor::extract (code:go)")?,
.context("kb-parse-code::GoAstExtractor::extract (code:go)"),
"java" => JavaAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::JavaAstExtractor::extract (code:java)")?,
.context("kb-parse-code::JavaAstExtractor::extract (code:java)"),
"kotlin" => KotlinAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)")?,
.context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)"),
// p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes.
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => {
synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)?
synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)
}
// p10-3: shell reuses the same synthesizer.
"shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version)?,
"shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version),
other => anyhow::bail!("unreachable (extract): {other}"),
};
// p10-3: Tier 1 extractor failure → fall back to Tier 3 synthesized doc.
// Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate.
let mut canonical = match canonical_result {
Ok(d) => d,
Err(e) if code_lang == "shell"
|| matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") =>
{
return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang");
}
Err(e) => {
// Tier 1 extractor errored — fall back to Tier 3 synthesized doc.
tracing::warn!(
workspace_path = %asset.workspace_path.0,
code_lang = code_lang,
error = %e,
"tier1 extract errored; falling back to tier 3 synthesized doc"
);
chunker_version = CodeTextParagraphV1Chunker.chunker_version();
let tier3_parser_version = ParserVersion("none-v1".to_string());
synthesize_tier2_document(asset, &bytes, code_lang, &tier3_parser_version)
.context("synthesize_tier2_document for tier 3 fallback after extract error")?
}
};
// p10-1b Task D/G/J/L: chunker per-lang.
let chunks = match code_lang {
"rust" => CodeRustAstV1Chunker
// p10-3: track whether the extract stage already fell back to Tier 3.
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
// from the extract_fell_back guard with the !matches! exclusion.
let extract_fell_back = canonical.parser_version.0 == "none-v1"
&& !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell");
let chunks_result: anyhow::Result<Vec<Chunk>> = if extract_fell_back {
// Tier 1 lang whose extractor errored — go straight to Tier 3 chunker.
CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?,
"python" => CodePythonAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)")?,
"typescript" => CodeTsAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTsAstV1Chunker::chunk (code:typescript)")?,
"javascript" => CodeJsAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeJsAstV1Chunker::chunk (code:javascript)")?,
"go" => CodeGoAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeGoAstV1Chunker::chunk (code:go)")?,
"java" => CodeJavaAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)")?,
"kotlin" => CodeKotlinAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)")?,
// p10-2 Tier 2:
"yaml" => K8sManifestResourceV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::K8sManifestResourceV1Chunker::chunk")?,
"dockerfile" => DockerfileFileV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::DockerfileFileV1Chunker::chunk")?,
"toml" | "json" | "xml" | "groovy" | "go-mod"
=> ManifestFileV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::ManifestFileV1Chunker::chunk")?,
// p10-3:
"shell" => CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)")?,
other => anyhow::bail!("unreachable (chunk): {other}"),
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 after extract fallback)")
} else {
match code_lang {
"rust" => CodeRustAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)"),
"python" => CodePythonAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)"),
"typescript" => CodeTsAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTsAstV1Chunker::chunk (code:typescript)"),
"javascript" => CodeJsAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeJsAstV1Chunker::chunk (code:javascript)"),
"go" => CodeGoAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeGoAstV1Chunker::chunk (code:go)"),
"java" => CodeJavaAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)"),
"kotlin" => CodeKotlinAstV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)"),
// p10-2 Tier 2:
"yaml" => K8sManifestResourceV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::K8sManifestResourceV1Chunker::chunk"),
"dockerfile" => DockerfileFileV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::DockerfileFileV1Chunker::chunk"),
"toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::ManifestFileV1Chunker::chunk"),
// p10-3:
"shell" => CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"),
other => anyhow::bail!("unreachable (chunk): {other}"),
}
};
// p10-3: Tier 1/2 0-chunk OR error → Tier 3 fallback retry.
// "shell" direct path is already Tier 3 — don't retry-double-up.
let chunks: Vec<Chunk> = match chunks_result {
Ok(v) if !v.is_empty() => v,
other if code_lang == "shell" => other?, // shell propagates directly
Ok(_empty) => {
tracing::warn!(
workspace_path = %asset.workspace_path.0,
code_lang = code_lang,
"tier1/2 emitted 0 chunks; falling back to tier 3 (code-text-paragraph-v1)"
);
chunker_version = CodeTextParagraphV1Chunker.chunker_version();
canonical.parser_version = ParserVersion("none-v1".to_string());
CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback)")?
}
Err(e) => {
tracing::warn!(
workspace_path = %asset.workspace_path.0,
code_lang = code_lang,
error = %e,
"tier1/2 chunker errored; falling back to tier 3 (code-text-paragraph-v1)"
);
chunker_version = CodeTextParagraphV1Chunker.chunker_version();
canonical.parser_version = ParserVersion("none-v1".to_string());
CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")?
}
};
// Stamp chunker + embedding versions so incremental skip detection has