From 641b92af7d1dcac41d121d7cf9e39a55ae2b4d8d Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 05:27:21 +0000 Subject: [PATCH 1/2] fix(dogfood): document-centric try_skip_unchanged for twin-file idempotency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Identical-content files at different workspace paths share one assets row (assets.asset_id = blake3 content hash, PRIMARY KEY). The UPSERT `ON CONFLICT(asset_id) DO UPDATE SET workspace_path = excluded` made twin files overwrite each other's workspace_path on every ingest, so `get_asset_by_workspace_path(path1)` returned the OTHER twin's row (or None) — break idempotent unchanged-detection for both files. Fix: switch try_skip_unchanged to document-centric lookup. `documents. workspace_path` is already UNIQUE (V001) and `id_for_doc(path, ...)` includes path, so each twin has its own stable document row. Compare `doc.source_asset_id` with the new asset's checksum instead of going through the assets table. Dogfood (multi-root: kebab-docs + httpx + zod + lodash) showed 27 of 726 docs marked Updated on every idempotent re-ingest — all 27 are twin-file victims (empty `__init__.py` ×3, AGENTS.md ↔ CLAUDE.md same content, duplicate logo PDFs/JPGs). After: re-ingest reports 0 new / 0 updated / 726 unchanged. No schema migration needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 52 ++++++----- .../kebab-app/tests/twin_files_idempotent.rs | 90 +++++++++++++++++++ crates/kebab-core/src/traits.rs | 14 +++ crates/kebab-store-sqlite/src/documents.rs | 66 ++++++++++++++ 4 files changed, 198 insertions(+), 24 deletions(-) create mode 100644 crates/kebab-app/tests/twin_files_idempotent.rs diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index dd837f7..2e4b3c2 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -773,31 +773,19 @@ fn try_skip_unchanged( if force_reingest { return Ok(None); } - let existing_asset = match app + // Document-centric skip: look up the existing document row by + // workspace_path directly. This avoids the twin-file flip-flop + // that the old asset-side lookup suffers from — multiple files + // with identical content share one `assets` row whose + // `workspace_path` is overwritten on every UPSERT, so + // `get_asset_by_workspace_path(path1)` could return the OTHER + // twin's path (or None) after any ingest of the twin. The + // `documents` table has a UNIQUE index on `workspace_path` (V001), + // so each twin has its own stable row regardless of asset de-dup. + let existing_doc = match app .sqlite - .get_asset_by_workspace_path(&asset.workspace_path) + .get_document_by_workspace_path(&asset.workspace_path) { - Ok(Some(a)) => a, - Ok(None) => return Ok(None), - Err(e) => { - tracing::debug!( - target: "kebab-app", - path = %asset.workspace_path.0, - error = %e, - "skip-check: get_asset_by_workspace_path failed; falling through to re-process" - ); - return Ok(None); - } - }; - if existing_asset.checksum != asset.checksum { - return Ok(None); - } - let candidate_doc_id = kebab_core::id_for_doc( - &asset.workspace_path, - &asset.asset_id, - current_parser_version, - ); - let existing_doc = match app.sqlite.get_document(&candidate_doc_id) { Ok(Some(d)) => d, Ok(None) => return Ok(None), Err(e) => { @@ -805,21 +793,37 @@ fn try_skip_unchanged( target: "kebab-app", path = %asset.workspace_path.0, error = %e, - "skip-check: get_document failed; falling through to re-process" + "skip-check: get_document_by_workspace_path failed; falling through to re-process" ); return Ok(None); } }; + // 1. Content unchanged: the freshly-computed asset_id (blake3 + // content hash) must match what this document was ingested from. + if existing_doc.source_asset_id != asset.asset_id { + return Ok(None); + } + // 2. Parser unchanged: parser_version is baked into id_for_doc so + // a version bump yields a different doc_id and the row above + // would have been missing. Checking here explicitly keeps the + // logic self-documenting and guards against future id_for_doc + // changes. + if existing_doc.parser_version != *current_parser_version { + return Ok(None); + } + // 3. Chunker unchanged. let chunker_match = existing_doc.last_chunker_version.as_ref() == Some(current_chunker_version); if !chunker_match { return Ok(None); } + // 4. Embedder unchanged. let embedder_match = existing_doc.last_embedding_version.as_ref() == current_embedding_version; if !embedder_match { return Ok(None); } + let candidate_doc_id = existing_doc.doc_id.clone(); tracing::debug!( target: "kebab-app::ingest", path = %asset.workspace_path.0, diff --git a/crates/kebab-app/tests/twin_files_idempotent.rs b/crates/kebab-app/tests/twin_files_idempotent.rs new file mode 100644 index 0000000..9b08bf4 --- /dev/null +++ b/crates/kebab-app/tests/twin_files_idempotent.rs @@ -0,0 +1,90 @@ +//! Regression test for the twin-file idempotency bug. +//! +//! Identical-content files at different workspace paths share one +//! `assets` row (`asset_id` = blake3 content hash, PRIMARY KEY). The +//! old UPSERT `ON CONFLICT(asset_id) DO UPDATE SET workspace_path = +//! excluded.workspace_path` made each twin overwrite the other's path +//! on every ingest, so `get_asset_by_workspace_path(path1)` returned +//! None (or the wrong twin) → re-process every time. +//! +//! Fix: `try_skip_unchanged` now uses `get_document_by_workspace_path` +//! instead. `documents.workspace_path` is UNIQUE (V001) so each twin +//! has its own stable document row. +//! +//! Assertion contract: +//! 1st ingest → 2 New (one per twin) +//! 2nd ingest → 0 New, 0 Updated, 2 Unchanged + +mod common; + +use common::TestEnv; +use kebab_app::ingest_with_config; +use kebab_core::IngestItemKind; + +#[test] +fn twin_files_second_ingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + // Write two files with identical content at different paths. + let pkg_a = env.workspace_root.join("pkg_a"); + let pkg_b = env.workspace_root.join("pkg_b"); + std::fs::create_dir_all(&pkg_a).unwrap(); + std::fs::create_dir_all(&pkg_b).unwrap(); + + let content = b"# shared\nThis content is identical in both files.\n"; + std::fs::write(pkg_a.join("__init__.py"), content).unwrap(); + std::fs::write(pkg_b.join("__init__.py"), content).unwrap(); + + // First ingest — both files must be New. + let first = ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest must succeed"); + assert_eq!(first.errors, 0, "first ingest: no errors; report={first:?}"); + + let items = first.items.as_ref().expect("items must be present"); + let twin_items: Vec<_> = items + .iter() + .filter(|i| { + i.doc_path.0.ends_with("__init__.py") + }) + .collect(); + assert_eq!( + twin_items.len(), + 2, + "first ingest: expected exactly 2 __init__.py items; items={items:?}" + ); + for item in &twin_items { + assert_eq!( + item.kind, + IngestItemKind::New, + "first ingest: each twin must be New; item={item:?}" + ); + } + + // Second ingest — same files, same content → both must be Unchanged. + let second = ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest must succeed"); + assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}"); + assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}"); + assert_eq!( + second.updated, 0, + "second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}" + ); + + let second_items = second.items.as_ref().expect("items must be present"); + let twin_items2: Vec<_> = second_items + .iter() + .filter(|i| i.doc_path.0.ends_with("__init__.py")) + .collect(); + assert_eq!( + twin_items2.len(), + 2, + "second ingest: expected exactly 2 __init__.py items; items={second_items:?}" + ); + for item in &twin_items2 { + assert_eq!( + item.kind, + IngestItemKind::Unchanged, + "second ingest: each twin must be Unchanged; item={item:?}" + ); + } +} diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs index bb4d0c3..74ed71d 100644 --- a/crates/kebab-core/src/traits.rs +++ b/crates/kebab-core/src/traits.rs @@ -169,6 +169,20 @@ pub trait DocumentStore { &self, path: &WorkspacePath, ) -> anyhow::Result>; + + /// Look up a document row by its workspace path. Used by the + /// document-centric skip path in `try_skip_unchanged` to avoid the + /// twin-file flip-flop that the asset-side lookup suffers from + /// (multiple files with identical content share one `assets` row + /// whose `workspace_path` is overwritten on every UPSERT, so + /// `get_asset_by_workspace_path` returns the wrong twin's path). + /// + /// `documents.workspace_path` is UNIQUE (V001), so each twin has + /// its own stable document row regardless of the asset de-dup. + fn get_document_by_workspace_path( + &self, + path: &WorkspacePath, + ) -> anyhow::Result>; } pub trait VectorStore { diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index b769b03..d7d2a0a 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -286,6 +286,72 @@ impl kebab_core::DocumentStore for SqliteStore { } } + fn get_document_by_workspace_path( + &self, + path: &kebab_core::WorkspacePath, + ) -> Result> { + let conn = self.lock_conn(); + let row: Option = conn + .query_row( + "SELECT + doc_id, asset_id, workspace_path, title, lang, + source_type, trust_level, parser_version, + doc_version, schema_version, metadata_json, + provenance_json, created_at, updated_at, + last_chunker_version, last_embedding_version + FROM documents WHERE workspace_path = ?", + params![path.0], + document_row_from_sql, + ) + .map(Some) + .or_else(rows_optional) + .map_err(StoreError::from)?; + let Some(row) = row else { return Ok(None) }; + + let doc_id = kebab_core::DocumentId(row.doc_id.clone()); + let mut blocks_stmt = conn + .prepare( + "SELECT payload_json FROM blocks + WHERE doc_id = ? ORDER BY ordinal ASC", + ) + .map_err(StoreError::from)?; + let block_rows = blocks_stmt + .query_map(params![row.doc_id], |r| { + let payload_json: String = r.get(0)?; + Ok(payload_json) + }) + .map_err(StoreError::from)?; + let mut blocks: Vec = Vec::new(); + for block_row in block_rows { + let payload_json = block_row.map_err(StoreError::from)?; + let block: kebab_core::Block = serde_json::from_str(&payload_json) + .context("deserialize block payload_json")?; + blocks.push(block); + } + + let metadata: kebab_core::Metadata = serde_json::from_str(&row.metadata_json) + .context("deserialize metadata_json")?; + let provenance: kebab_core::Provenance = + serde_json::from_str(&row.provenance_json) + .context("deserialize provenance_json")?; + + Ok(Some(kebab_core::CanonicalDocument { + doc_id, + source_asset_id: kebab_core::AssetId(row.asset_id), + workspace_path: kebab_core::WorkspacePath(row.workspace_path), + title: row.title.unwrap_or_default(), + lang: kebab_core::Lang(row.lang.unwrap_or_default()), + blocks, + metadata, + provenance, + parser_version: kebab_core::ParserVersion(row.parser_version), + schema_version: row.schema_version as u32, + doc_version: row.doc_version as u32, + last_chunker_version: row.last_chunker_version.map(kebab_core::ChunkerVersion), + last_embedding_version: row.last_embedding_version.map(kebab_core::EmbeddingVersion), + })) + } + fn list_documents( &self, filter: &kebab_core::DocFilter, -- 2.49.1 From 360f825f3a9ead841b57611fb9a5d6e2497e6e56 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 05:35:17 +0000 Subject: [PATCH 2/2] docs(dogfood): refresh try_skip_unchanged doc-comment to match new flow (PR #146 review) Round 1 review found the function-level doc-comment still described the old asset-side algorithm (item 2 asset-row checksum, item 3 id_for_doc miss). Updated to the document-centric flow. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 2e4b3c2..e627b06 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -748,15 +748,18 @@ struct ImagePipeline<'a> { /// hold (per design §9 cascade rule): /// /// 1. `force_reingest == false` — caller hasn't asked to bypass skip. -/// 2. The freshly-scanned asset's blake3 checksum equals what the -/// existing `assets` row stores at the same `workspace_path`. -/// 3. The doc keyed on `(workspace_path, asset_id, current_parser_version)` -/// exists. If the parser_version changed, `id_for_doc` produces a -/// different `doc_id` so the lookup misses → no skip → re-process. -/// 4. The existing doc's stamped `last_chunker_version` AND -/// `last_embedding_version` match the values the caller is about -/// to use (`Some(v) == Some(v)` and `None == None` — see design -/// doc for the `None == None` rule when no embedder is configured). +/// 2. A document already exists at this `workspace_path` +/// (`get_document_by_workspace_path`). The lookup is document-side, not +/// asset-side, so twin files (identical content at different paths) each +/// hit their own stable doc row — `documents.workspace_path` is UNIQUE +/// while `assets` may dedupe content into a single row with a flip-flop +/// `workspace_path` column (dogfood bug #4, see `tasks/HOTFIXES.md`). +/// 3. The existing doc's `source_asset_id` equals the freshly-scanned +/// asset's blake3 checksum (content unchanged). +/// 4. The existing doc's `parser_version` matches the current extractor's +/// `parser_version` (extractor not upgraded). Combined with `chunker_version` +/// and `last_embedding_version` checks immediately below — full cascade +/// per design §9. /// /// Returns `Ok(None)` (proceed with full re-process) when any check /// fails or any DB read errors out — the skip path is opportunistic; -- 2.49.1