fix(p10-3+p10-1d): land the missing try_skip_unchanged fallback-aware fix

PR #155 (p10-3) merged WITHOUT the reviewer's required Option B1 fix —
the implementer reported a commit SHA (2a39513) that never made it to main.
Result: every reingest of a Tier 3-fallback file (non-k8s YAML, invalid
YAML, AST extractor failure) re-runs full extract + chunk + embed because
the parser/chunker version comparison can never match (stored is
code-text-paragraph-v1 / none-v1, but caller uses Tier 1/2 dispatch
values).

This commit:
1. Adds the 7th param `fallback_chunker_version: Option<&ChunkerVersion>`
   to try_skip_unchanged + the stored_is_tier3_fallback detection branch
   (skip parser/chunker equality, keep embedder check).
2. Threads `None` through non-code call sites (md / image / pdf).
3. Code call site computes tier3_fallback_cv covering all Tier 1/2 langs
   that can fall back: rust / python / ts / js / go / java / kotlin /
   yaml / dockerfile / toml / json / xml / groovy / go-mod / c / cpp
   (p10-1D additions).
4. Adds tier3_yaml_fallback_reingest_is_unchanged + tier3_shell_reingest_is_unchanged
   regression tests (the originally-promised PR #155 regression coverage
   that also never made it to main).

Smoke tests: 14 + 2 = 16 PASS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 14:19:17 +00:00
parent d1560be80d
commit 1034de25a2
2 changed files with 159 additions and 0 deletions

View File

@@ -795,6 +795,7 @@ fn try_skip_unchanged(
current_chunker_version: &ChunkerVersion,
current_embedding_version: Option<&kebab_core::EmbeddingVersion>,
force_reingest: bool,
fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix
) -> anyhow::Result<Option<kebab_core::IngestItem>> {
if force_reingest {
return Ok(None);
@@ -829,6 +830,50 @@ fn try_skip_unchanged(
if existing_doc.source_asset_id != asset.asset_id {
return Ok(None);
}
// p10-3 fix: detect "stored doc was previously Tier 3 fallback".
// When a Tier 1/2 extractor emits empty chunks, the fallback wrapper
// retries with CodeTextParagraphV1Chunker and stores
// last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1".
// On the next ingest the caller computes current_parser_version /
// current_chunker_version from the Tier 1/2 dispatch (e.g.
// "k8s-manifest-resource-v1"), which can never match the stored
// fallback values, causing spurious re-ingests. Detect this case
// and bypass the parser/chunker equality checks — only the embedder
// version still must match.
let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| {
existing_doc.last_chunker_version.as_ref() == Some(fbv)
&& existing_doc.parser_version.0 == "none-v1"
});
if stored_is_tier3_fallback {
// Embedder version still must match.
let embedder_match = existing_doc.last_embedding_version.as_ref()
== current_embedding_version;
if !embedder_match {
return Ok(None);
}
let candidate_doc_id = existing_doc.doc_id.clone();
tracing::debug!(
target: "kebab-app::ingest",
path = %asset.workspace_path.0,
doc_id = %candidate_doc_id.0,
"skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality"
);
return Ok(Some(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Unchanged,
doc_id: Some(candidate_doc_id),
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: u32::try_from(existing_doc.blocks.len()).ok(),
chunk_count: None,
parser_version: Some(existing_doc.parser_version.clone()),
chunker_version: existing_doc.last_chunker_version.clone(),
warnings: Vec::new(),
error: None,
}));
}
// 2. Parser unchanged: parser_version is baked into id_for_doc so
// a version bump yields a different doc_id and the row above
// would have been missing. Checking here explicitly keeps the
@@ -1017,6 +1062,7 @@ fn ingest_one_asset(
&MdHeadingV1Chunker.chunker_version(),
embedder.map(|e| e.model_version()).as_ref(),
force_reingest,
None,
)? {
return Ok(item);
}
@@ -1211,6 +1257,7 @@ fn ingest_one_image_asset(
&MdHeadingV1Chunker.chunker_version(),
embedder.map(|e| e.model_version()).as_ref(),
force_reingest,
None,
)? {
return Ok(item);
}
@@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset(
&PdfPageV1Chunker.chunker_version(),
embedder.map(|e| e.model_version()).as_ref(),
force_reingest,
None,
)? {
return Ok(item);
}
@@ -1866,6 +1914,18 @@ fn ingest_one_code_asset(
other => anyhow::bail!("unreachable chunker_version: {other}"),
};
// p10-3 fix: if this lang can fall back to Tier 3, compute the fallback
// chunker_version so try_skip_unchanged can detect the stored-as-Tier-3
// state and skip parser/chunker equality checks.
let tier3_fallback_cv: Option<ChunkerVersion> = match code_lang {
"rust" | "python" | "typescript" | "javascript"
| "go" | "java" | "kotlin"
| "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
| "c" | "cpp" // p10-1D
=> Some(CodeTextParagraphV1Chunker.chunker_version()),
_ => None,
};
if let Some(item) = try_skip_unchanged(
app,
asset,
@@ -1873,6 +1933,7 @@ fn ingest_one_code_asset(
&chunker_version,
embedder.map(|e| e.model_version()).as_ref(),
force_reingest,
tier3_fallback_cv.as_ref(),
)? {
return Ok(item);
}

View File

@@ -1064,3 +1064,101 @@ fn rust_file_re_ingest_is_unchanged() {
);
assert_eq!(item2.doc_id, item1.doc_id);
}
/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3
/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must
/// report Unchanged on the second ingest rather than re-processing.
/// Before the fix, try_skip_unchanged returned None because the stored
/// last_chunker_version ("code-text-paragraph-v1" / parser_version
/// "none-v1") never matched the caller's dispatch values.
#[test]
fn tier3_yaml_fallback_reingest_is_unchanged() {
let env = TestEnv::lexical_only();
std::fs::write(
env.workspace_root.join("docker-compose.yml"),
"version: '3'\nservices:\n api:\n image: nginx:latest\n",
)
.unwrap();
let report1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let item1 = report1
.items
.as_ref()
.expect("items present")
.iter()
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
.expect("docker-compose.yml in first report");
assert!(
matches!(item1.kind, IngestItemKind::New),
"first ingest must be New, got {:?}", item1.kind
);
assert_eq!(
item1.chunker_version.as_ref().map(|c| c.0.as_str()),
Some("code-text-paragraph-v1"),
"first ingest must use Tier 3 fallback chunker"
);
let report2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let item2 = report2
.items
.as_ref()
.expect("items present")
.iter()
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
.expect("docker-compose.yml in second report");
assert!(
matches!(item2.kind, IngestItemKind::Unchanged),
"second ingest must be Unchanged, got {:?}", item2.kind
);
}
/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback)
/// must also report Unchanged on re-ingest. Shell goes straight to
/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false
/// (parser_version is "none-v1" and chunker matches the current dispatch),
/// but the normal equality path should pass regardless.
#[test]
fn tier3_shell_reingest_is_unchanged() {
let env = TestEnv::lexical_only();
std::fs::write(
env.workspace_root.join("deploy.sh"),
"#!/usr/bin/env bash\nset -e\necho hello\n",
)
.unwrap();
let report1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let item1 = report1
.items
.as_ref()
.expect("items present")
.iter()
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
.expect("deploy.sh in first report");
assert!(
matches!(item1.kind, IngestItemKind::New),
"first ingest must be New, got {:?}", item1.kind
);
let report2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let item2 = report2
.items
.as_ref()
.expect("items present")
.iter()
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
.expect("deploy.sh in second report");
assert!(
matches!(item2.kind, IngestItemKind::Unchanged),
"shell reingest must be Unchanged, got {:?}", item2.kind
);
}