fix(p10-3+p10-1d): land the missing try_skip_unchanged fallback-aware fix
PR #155 (p10-3) merged WITHOUT the reviewer's required Option B1 fix — the implementer reported a commit SHA (2a39513) that never made it to main. Result: every reingest of a Tier 3-fallback file (non-k8s YAML, invalid YAML, AST extractor failure) re-runs full extract + chunk + embed because the parser/chunker version comparison can never match (stored is code-text-paragraph-v1 / none-v1, but caller uses Tier 1/2 dispatch values). This commit: 1. Adds the 7th param `fallback_chunker_version: Option<&ChunkerVersion>` to try_skip_unchanged + the stored_is_tier3_fallback detection branch (skip parser/chunker equality, keep embedder check). 2. Threads `None` through non-code call sites (md / image / pdf). 3. Code call site computes tier3_fallback_cv covering all Tier 1/2 langs that can fall back: rust / python / ts / js / go / java / kotlin / yaml / dockerfile / toml / json / xml / groovy / go-mod / c / cpp (p10-1D additions). 4. Adds tier3_yaml_fallback_reingest_is_unchanged + tier3_shell_reingest_is_unchanged regression tests (the originally-promised PR #155 regression coverage that also never made it to main). Smoke tests: 14 + 2 = 16 PASS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -795,6 +795,7 @@ fn try_skip_unchanged(
|
||||
current_chunker_version: &ChunkerVersion,
|
||||
current_embedding_version: Option<&kebab_core::EmbeddingVersion>,
|
||||
force_reingest: bool,
|
||||
fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix
|
||||
) -> anyhow::Result<Option<kebab_core::IngestItem>> {
|
||||
if force_reingest {
|
||||
return Ok(None);
|
||||
@@ -829,6 +830,50 @@ fn try_skip_unchanged(
|
||||
if existing_doc.source_asset_id != asset.asset_id {
|
||||
return Ok(None);
|
||||
}
|
||||
// p10-3 fix: detect "stored doc was previously Tier 3 fallback".
|
||||
// When a Tier 1/2 extractor emits empty chunks, the fallback wrapper
|
||||
// retries with CodeTextParagraphV1Chunker and stores
|
||||
// last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1".
|
||||
// On the next ingest the caller computes current_parser_version /
|
||||
// current_chunker_version from the Tier 1/2 dispatch (e.g.
|
||||
// "k8s-manifest-resource-v1"), which can never match the stored
|
||||
// fallback values, causing spurious re-ingests. Detect this case
|
||||
// and bypass the parser/chunker equality checks — only the embedder
|
||||
// version still must match.
|
||||
let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| {
|
||||
existing_doc.last_chunker_version.as_ref() == Some(fbv)
|
||||
&& existing_doc.parser_version.0 == "none-v1"
|
||||
});
|
||||
|
||||
if stored_is_tier3_fallback {
|
||||
// Embedder version still must match.
|
||||
let embedder_match = existing_doc.last_embedding_version.as_ref()
|
||||
== current_embedding_version;
|
||||
if !embedder_match {
|
||||
return Ok(None);
|
||||
}
|
||||
let candidate_doc_id = existing_doc.doc_id.clone();
|
||||
tracing::debug!(
|
||||
target: "kebab-app::ingest",
|
||||
path = %asset.workspace_path.0,
|
||||
doc_id = %candidate_doc_id.0,
|
||||
"skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality"
|
||||
);
|
||||
return Ok(Some(kebab_core::IngestItem {
|
||||
kind: kebab_core::IngestItemKind::Unchanged,
|
||||
doc_id: Some(candidate_doc_id),
|
||||
doc_path: asset.workspace_path.clone(),
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
byte_len: Some(asset.byte_len),
|
||||
block_count: u32::try_from(existing_doc.blocks.len()).ok(),
|
||||
chunk_count: None,
|
||||
parser_version: Some(existing_doc.parser_version.clone()),
|
||||
chunker_version: existing_doc.last_chunker_version.clone(),
|
||||
warnings: Vec::new(),
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
|
||||
// 2. Parser unchanged: parser_version is baked into id_for_doc so
|
||||
// a version bump yields a different doc_id and the row above
|
||||
// would have been missing. Checking here explicitly keeps the
|
||||
@@ -1017,6 +1062,7 @@ fn ingest_one_asset(
|
||||
&MdHeadingV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
None,
|
||||
)? {
|
||||
return Ok(item);
|
||||
}
|
||||
@@ -1211,6 +1257,7 @@ fn ingest_one_image_asset(
|
||||
&MdHeadingV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
None,
|
||||
)? {
|
||||
return Ok(item);
|
||||
}
|
||||
@@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset(
|
||||
&PdfPageV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
None,
|
||||
)? {
|
||||
return Ok(item);
|
||||
}
|
||||
@@ -1866,6 +1914,18 @@ fn ingest_one_code_asset(
|
||||
other => anyhow::bail!("unreachable chunker_version: {other}"),
|
||||
};
|
||||
|
||||
// p10-3 fix: if this lang can fall back to Tier 3, compute the fallback
|
||||
// chunker_version so try_skip_unchanged can detect the stored-as-Tier-3
|
||||
// state and skip parser/chunker equality checks.
|
||||
let tier3_fallback_cv: Option<ChunkerVersion> = match code_lang {
|
||||
"rust" | "python" | "typescript" | "javascript"
|
||||
| "go" | "java" | "kotlin"
|
||||
| "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
|
||||
| "c" | "cpp" // p10-1D
|
||||
=> Some(CodeTextParagraphV1Chunker.chunker_version()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
@@ -1873,6 +1933,7 @@ fn ingest_one_code_asset(
|
||||
&chunker_version,
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
tier3_fallback_cv.as_ref(),
|
||||
)? {
|
||||
return Ok(item);
|
||||
}
|
||||
|
||||
@@ -1064,3 +1064,101 @@ fn rust_file_re_ingest_is_unchanged() {
|
||||
);
|
||||
assert_eq!(item2.doc_id, item1.doc_id);
|
||||
}
|
||||
|
||||
/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3
|
||||
/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must
|
||||
/// report Unchanged on the second ingest rather than re-processing.
|
||||
/// Before the fix, try_skip_unchanged returned None because the stored
|
||||
/// last_chunker_version ("code-text-paragraph-v1" / parser_version
|
||||
/// "none-v1") never matched the caller's dispatch values.
|
||||
#[test]
|
||||
fn tier3_yaml_fallback_reingest_is_unchanged() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
std::fs::write(
|
||||
env.workspace_root.join("docker-compose.yml"),
|
||||
"version: '3'\nservices:\n api:\n image: nginx:latest\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
.expect("items present")
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
|
||||
.expect("docker-compose.yml in first report");
|
||||
assert!(
|
||||
matches!(item1.kind, IngestItemKind::New),
|
||||
"first ingest must be New, got {:?}", item1.kind
|
||||
);
|
||||
assert_eq!(
|
||||
item1.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
Some("code-text-paragraph-v1"),
|
||||
"first ingest must use Tier 3 fallback chunker"
|
||||
);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let item2 = report2
|
||||
.items
|
||||
.as_ref()
|
||||
.expect("items present")
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
|
||||
.expect("docker-compose.yml in second report");
|
||||
assert!(
|
||||
matches!(item2.kind, IngestItemKind::Unchanged),
|
||||
"second ingest must be Unchanged, got {:?}", item2.kind
|
||||
);
|
||||
}
|
||||
|
||||
/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback)
|
||||
/// must also report Unchanged on re-ingest. Shell goes straight to
|
||||
/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false
|
||||
/// (parser_version is "none-v1" and chunker matches the current dispatch),
|
||||
/// but the normal equality path should pass regardless.
|
||||
#[test]
|
||||
fn tier3_shell_reingest_is_unchanged() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
std::fs::write(
|
||||
env.workspace_root.join("deploy.sh"),
|
||||
"#!/usr/bin/env bash\nset -e\necho hello\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
.expect("items present")
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
|
||||
.expect("deploy.sh in first report");
|
||||
assert!(
|
||||
matches!(item1.kind, IngestItemKind::New),
|
||||
"first ingest must be New, got {:?}", item1.kind
|
||||
);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let item2 = report2
|
||||
.items
|
||||
.as_ref()
|
||||
.expect("items present")
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
|
||||
.expect("deploy.sh in second report");
|
||||
assert!(
|
||||
matches!(item2.kind, IngestItemKind::Unchanged),
|
||||
"shell reingest must be Unchanged, got {:?}", item2.kind
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user