fix(dogfood): document-centric try_skip_unchanged for twin-file idempotency

Identical-content files at different workspace paths share one assets row
(assets.asset_id = blake3 content hash, PRIMARY KEY). The UPSERT
`ON CONFLICT(asset_id) DO UPDATE SET workspace_path = excluded` made
twin files overwrite each other's workspace_path on every ingest, so
`get_asset_by_workspace_path(path1)` returned the OTHER twin's row (or
None) — breaking idempotent unchanged-detection for both files.

Fix: switch try_skip_unchanged to document-centric lookup.
`documents.workspace_path` is already UNIQUE (V001) and `id_for_doc(path, ...)`
includes path, so each twin has its own stable document row. Compare
`doc.source_asset_id` with the new asset's checksum instead of going
through the assets table.

Dogfood (multi-root: kebab-docs + httpx + zod + lodash) showed 27 of
726 docs marked Updated on every idempotent re-ingest — all 27 are
twin-file victims (empty `__init__.py` ×3, AGENTS.md ↔ CLAUDE.md
same content, duplicate logo PDFs/JPGs).

After: re-ingest reports 0 new / 0 updated / 726 unchanged.

No schema migration needed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 05:27:21 +00:00
parent 08fb743598
commit 641b92af7d
4 changed files with 198 additions and 24 deletions

View File

@@ -286,6 +286,72 @@ impl kebab_core::DocumentStore for SqliteStore {
}
}
/// Fetches the document stored under `path`, together with its blocks.
///
/// The `documents` table is queried by `workspace_path`. When a row is
/// found, the matching `blocks` rows are read in ascending `ordinal` order
/// and each `payload_json` is deserialized into a block.
///
/// Returns `Ok(None)` when the lookup yields no row. The query error is
/// routed through `rows_optional`, which presumably turns the "no rows"
/// case into `None` — confirm against its definition.
///
/// # Errors
///
/// SQL failures are converted through `StoreError`. Malformed
/// `payload_json`, `metadata_json` or `provenance_json` values surface as
/// deserialization errors carrying a short context message.
fn get_document_by_workspace_path(
    &self,
    path: &kebab_core::WorkspacePath,
) -> Result<Option<kebab_core::CanonicalDocument>> {
    let conn = self.lock_conn();

    // Step 1: locate the document row for this workspace path.
    let lookup: Option<DocumentRow> = conn
        .query_row(
            "SELECT
                doc_id, asset_id, workspace_path, title, lang,
                source_type, trust_level, parser_version,
                doc_version, schema_version, metadata_json,
                provenance_json, created_at, updated_at,
                last_chunker_version, last_embedding_version
             FROM documents WHERE workspace_path = ?",
            params![path.0],
            document_row_from_sql,
        )
        .map(Some)
        .or_else(rows_optional)
        .map_err(StoreError::from)?;
    let record = match lookup {
        Some(found) => found,
        None => return Ok(None),
    };

    // Step 2: load the document's blocks in their stored order.
    let mut stmt = conn
        .prepare(
            "SELECT payload_json FROM blocks
             WHERE doc_id = ? ORDER BY ordinal ASC",
        )
        .map_err(StoreError::from)?;
    let payloads = stmt
        .query_map(params![record.doc_id], |sql_row| sql_row.get::<_, String>(0))
        .map_err(StoreError::from)?;
    let mut blocks = Vec::new();
    for payload in payloads {
        let json = payload.map_err(StoreError::from)?;
        blocks.push(
            serde_json::from_str::<kebab_core::Block>(&json)
                .context("deserialize block payload_json")?,
        );
    }

    // Step 3: decode the JSON side-columns, then assemble the document.
    let metadata = serde_json::from_str::<kebab_core::Metadata>(&record.metadata_json)
        .context("deserialize metadata_json")?;
    let provenance = serde_json::from_str::<kebab_core::Provenance>(&record.provenance_json)
        .context("deserialize provenance_json")?;

    Ok(Some(kebab_core::CanonicalDocument {
        doc_id: kebab_core::DocumentId(record.doc_id),
        source_asset_id: kebab_core::AssetId(record.asset_id),
        workspace_path: kebab_core::WorkspacePath(record.workspace_path),
        // NULL title / lang columns fall back to the empty default.
        title: record.title.unwrap_or_default(),
        lang: kebab_core::Lang(record.lang.unwrap_or_default()),
        blocks,
        metadata,
        provenance,
        parser_version: kebab_core::ParserVersion(record.parser_version),
        schema_version: record.schema_version as u32,
        doc_version: record.doc_version as u32,
        last_chunker_version: record
            .last_chunker_version
            .map(kebab_core::ChunkerVersion),
        last_embedding_version: record
            .last_embedding_version
            .map(kebab_core::EmbeddingVersion),
    }))
}
fn list_documents(
&self,
filter: &kebab_core::DocFilter,