diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index ddf211a..37013e3 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -795,6 +795,7 @@ fn try_skip_unchanged( current_chunker_version: &ChunkerVersion, current_embedding_version: Option<&kebab_core::EmbeddingVersion>, force_reingest: bool, + fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix ) -> anyhow::Result> { if force_reingest { return Ok(None); @@ -829,6 +830,50 @@ fn try_skip_unchanged( if existing_doc.source_asset_id != asset.asset_id { return Ok(None); } + // p10-3 fix: detect "stored doc was previously Tier 3 fallback". + // When a Tier 1/2 extractor emits empty chunks, the fallback wrapper + // retries with CodeTextParagraphV1Chunker and stores + // last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1". + // On the next ingest the caller computes current_parser_version / + // current_chunker_version from the Tier 1/2 dispatch (e.g. + // "k8s-manifest-resource-v1"), which can never match the stored + // fallback values, causing spurious re-ingests. Detect this case + // and bypass the parser/chunker equality checks — only the embedder + // version still must match. + let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| { + existing_doc.last_chunker_version.as_ref() == Some(fbv) + && existing_doc.parser_version.0 == "none-v1" + }); + + if stored_is_tier3_fallback { + // Embedder version still must match. + let embedder_match = existing_doc.last_embedding_version.as_ref() + == current_embedding_version; + if !embedder_match { + return Ok(None); + } + let candidate_doc_id = existing_doc.doc_id.clone(); + tracing::debug!( + target: "kebab-app::ingest", + path = %asset.workspace_path.0, + doc_id = %candidate_doc_id.0, + "skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality" + ); + return Ok(Some(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Unchanged, + doc_id: Some(candidate_doc_id), + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(existing_doc.blocks.len()).ok(), + chunk_count: None, + parser_version: Some(existing_doc.parser_version.clone()), + chunker_version: existing_doc.last_chunker_version.clone(), + warnings: Vec::new(), + error: None, + })); + } + // 2. Parser unchanged: parser_version is baked into id_for_doc so // a version bump yields a different doc_id and the row above // would have been missing. Checking here explicitly keeps the @@ -1017,6 +1062,7 @@ fn ingest_one_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1211,6 +1257,7 @@ fn ingest_one_image_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset( &PdfPageV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1866,6 +1914,18 @@ fn ingest_one_code_asset( other => anyhow::bail!("unreachable chunker_version: {other}"), }; + // p10-3 fix: if this lang can fall back to Tier 3, compute the fallback + // chunker_version so try_skip_unchanged can detect the stored-as-Tier-3 + // state and skip parser/chunker equality checks. + let tier3_fallback_cv: Option = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1D + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, + }; + if let Some(item) = try_skip_unchanged( app, asset, @@ -1873,6 +1933,7 @@ fn ingest_one_code_asset( &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, + tier3_fallback_cv.as_ref(), )? { return Ok(item); } diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index a462666..4acb12e 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1064,3 +1064,101 @@ fn rust_file_re_ingest_is_unchanged() { ); assert_eq!(item2.doc_id, item1.doc_id); } + +/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3 +/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must +/// report Unchanged on the second ingest rather than re-processing. +/// Before the fix, try_skip_unchanged returned None because the stored +/// last_chunker_version ("code-text-paragraph-v1" / parser_version +/// "none-v1") never matched the caller's dispatch values. +#[test] +fn tier3_yaml_fallback_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + assert_eq!( + item1.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "first ingest must use Tier 3 fallback chunker" + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "second ingest must be Unchanged, got {:?}", item2.kind + ); +} + +/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) +/// must also report Unchanged on re-ingest. Shell goes straight to +/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false +/// (parser_version is "none-v1" and chunker matches the current dispatch), +/// but the normal equality path should pass regardless. +#[test] +fn tier3_shell_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "shell reingest must be Unchanged, got {:?}", item2.kind + ); +}