From 1034de25a2b401a317d31bc5c7eca36cc2bbca6f Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 14:19:17 +0000 Subject: [PATCH] fix(p10-3+p10-1d): land the missing try_skip_unchanged fallback-aware fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #155 (p10-3) merged WITHOUT the reviewer's required Option B1 fix — the implementer reported a commit SHA (2a39513) that never made it to main. Result: every reingest of a Tier 3-fallback file (non-k8s YAML, invalid YAML, AST extractor failure) re-runs full extract + chunk + embed because the parser/chunker version comparison can never match (stored is code-text-paragraph-v1 / none-v1, but caller uses Tier 1/2 dispatch values). This commit: 1. Adds the 7th param `fallback_chunker_version: Option<&ChunkerVersion>` to try_skip_unchanged + the stored_is_tier3_fallback detection branch (skip parser/chunker equality, keep embedder check). 2. Threads `None` through non-code call sites (md / image / pdf). 3. Code call site computes tier3_fallback_cv covering all Tier 1/2 langs that can fall back: rust / python / ts / js / go / java / kotlin / yaml / dockerfile / toml / json / xml / groovy / go-mod / c / cpp (p10-1D additions). 4. Adds tier3_yaml_fallback_reingest_is_unchanged + tier3_shell_reingest_is_unchanged regression tests (the originally-promised PR #155 regression coverage that also never made it to main). Smoke tests: 14 + 2 = 16 PASS. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 61 +++++++++++++ crates/kebab-app/tests/code_ingest_smoke.rs | 98 +++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index ddf211a..37013e3 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -795,6 +795,7 @@ fn try_skip_unchanged( current_chunker_version: &ChunkerVersion, current_embedding_version: Option<&kebab_core::EmbeddingVersion>, force_reingest: bool, + fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix ) -> anyhow::Result> { if force_reingest { return Ok(None); @@ -829,6 +830,50 @@ fn try_skip_unchanged( if existing_doc.source_asset_id != asset.asset_id { return Ok(None); } + // p10-3 fix: detect "stored doc was previously Tier 3 fallback". + // When a Tier 1/2 extractor emits empty chunks, the fallback wrapper + // retries with CodeTextParagraphV1Chunker and stores + // last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1". + // On the next ingest the caller computes current_parser_version / + // current_chunker_version from the Tier 1/2 dispatch (e.g. + // "k8s-manifest-resource-v1"), which can never match the stored + // fallback values, causing spurious re-ingests. Detect this case + // and bypass the parser/chunker equality checks — only the embedder + // version still must match. + let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| { + existing_doc.last_chunker_version.as_ref() == Some(fbv) + && existing_doc.parser_version.0 == "none-v1" + }); + + if stored_is_tier3_fallback { + // Embedder version still must match. + let embedder_match = existing_doc.last_embedding_version.as_ref() + == current_embedding_version; + if !embedder_match { + return Ok(None); + } + let candidate_doc_id = existing_doc.doc_id.clone(); + tracing::debug!( + target: "kebab-app::ingest", + path = %asset.workspace_path.0, + doc_id = %candidate_doc_id.0, + "skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality" + ); + return Ok(Some(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Unchanged, + doc_id: Some(candidate_doc_id), + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(existing_doc.blocks.len()).ok(), + chunk_count: None, + parser_version: Some(existing_doc.parser_version.clone()), + chunker_version: existing_doc.last_chunker_version.clone(), + warnings: Vec::new(), + error: None, + })); + } + // 2. Parser unchanged: parser_version is baked into id_for_doc so // a version bump yields a different doc_id and the row above // would have been missing. Checking here explicitly keeps the @@ -1017,6 +1062,7 @@ fn ingest_one_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1211,6 +1257,7 @@ fn ingest_one_image_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset( &PdfPageV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1866,6 +1914,18 @@ fn ingest_one_code_asset( other => anyhow::bail!("unreachable chunker_version: {other}"), }; + // p10-3 fix: if this lang can fall back to Tier 3, compute the fallback + // chunker_version so try_skip_unchanged can detect the stored-as-Tier-3 + // state and skip parser/chunker equality checks. + let tier3_fallback_cv: Option = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1D + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, + }; + if let Some(item) = try_skip_unchanged( app, asset, @@ -1873,6 +1933,7 @@ fn ingest_one_code_asset( &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, + tier3_fallback_cv.as_ref(), )? { return Ok(item); } diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index a462666..4acb12e 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1064,3 +1064,101 @@ fn rust_file_re_ingest_is_unchanged() { ); assert_eq!(item2.doc_id, item1.doc_id); } + +/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3 +/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must +/// report Unchanged on the second ingest rather than re-processing. +/// Before the fix, try_skip_unchanged returned None because the stored +/// last_chunker_version ("code-text-paragraph-v1" / parser_version +/// "none-v1") never matched the caller's dispatch values. +#[test] +fn tier3_yaml_fallback_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + assert_eq!( + item1.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "first ingest must use Tier 3 fallback chunker" + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "second ingest must be Unchanged, got {:?}", item2.kind + ); +} + +/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) +/// must also report Unchanged on re-ingest. Shell goes straight to +/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false +/// (parser_version is "none-v1" and chunker matches the current dispatch), +/// but the normal equality path should pass regardless. +#[test] +fn tier3_shell_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "shell reingest must be Unchanged, got {:?}", item2.kind + ); +}