diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs index e8a456c..d0f1eea 100644 --- a/crates/kebab-app/src/fetch.rs +++ b/crates/kebab-app/src/fetch.rs @@ -189,10 +189,12 @@ fn fetch_span( // (markdown / note / paper / reference / inbox) is the *user-facing* // category, not the rendering format — the actual byte-level format // lives on the source `RawAsset.media_type`. Look it up via - // workspace_path (unique key per asset). - if let Some(asset) = ::get_asset_by_workspace_path( + // doc.source_asset_id (PRIMARY KEY) so twin files (identical content + // at different paths) always read *this* document's own asset row, + // not whichever twin last wrote `assets.workspace_path`. + if let Some(asset) = ::get_asset( &app.sqlite, - &doc.workspace_path, + &doc.source_asset_id, )? { if matches!( asset.media_type, diff --git a/crates/kebab-app/tests/twin_files_fetch_span.rs b/crates/kebab-app/tests/twin_files_fetch_span.rs new file mode 100644 index 0000000..b8c1d9b --- /dev/null +++ b/crates/kebab-app/tests/twin_files_fetch_span.rs @@ -0,0 +1,176 @@ +//! Regression test for the twin-file fetch_span media-type lookup bug. +//! +//! Twin files (identical content at different workspace paths) share one +//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old +//! `fetch_span` implementation called +//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the +//! media type was PDF/audio (and therefore reject span fetch). For a twin +//! file that lookup could silently return the *other* twin's asset row if +//! `assets.workspace_path` had been overwritten on the most recent ingest of +//! the sibling — making the media-type branch decision incorrect. +//! +//! Fix: `fetch_span` now uses the 2-step lookup +//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset` +//! so the result is always anchored to the requesting document, not +//! whichever twin last updated `assets.workspace_path`. +//! +//! This test builds a twin-file scenario (two .md files at different paths +//! with identical content), ingests both, then calls `fetch_span` on each +//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset +//! row's workspace_path happened to point at the wrong twin the span could +//! return an incorrect `span_not_supported` for a non-PDF/audio file, or +//! conversely allow span on a PDF twin by accident. After the fix, the +//! lookup is always doc-specific. + +mod common; + +use common::TestEnv; +use kebab_app::ingest_with_config; +use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind}; + +#[test] +fn twin_files_fetch_span_uses_correct_asset() { + let env = TestEnv::lexical_only(); + + // Write two markdown files with identical content at different paths. + let dir_a = env.workspace_root.join("src_a"); + let dir_b = env.workspace_root.join("src_b"); + std::fs::create_dir_all(&dir_a).unwrap(); + std::fs::create_dir_all(&dir_b).unwrap(); + + // The content must produce at least 1 line so span fetch is non-trivial. + let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n"; + std::fs::write(dir_a.join("note.md"), content).unwrap(); + std::fs::write(dir_b.join("note.md"), content).unwrap(); + + // Ingest all files (fixture workspace + our two new twins). + let report = ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors; report={report:?}"); + + // Both twin paths must appear as New in the report. + let items = report.items.as_ref().expect("items must be present"); + let twin_items: Vec<_> = items + .iter() + .filter(|i| { + i.doc_path.0.ends_with("src_a/note.md") + || i.doc_path.0.ends_with("src_b/note.md") + }) + .collect(); + assert_eq!( + twin_items.len(), + 2, + "exactly 2 twin items expected; items={items:?}" + ); + for item in &twin_items { + assert_eq!( + item.kind, + IngestItemKind::New, + "each twin must be New; item={item:?}" + ); + } + + // Resolve doc_ids for both workspace paths. + // The ingest layer normalises workspace_path to the path relative to + // workspace_root (e.g. "src_a/note.md"), so we look up by that form. + let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap(); + store.run_migrations().unwrap(); + + // Find the twin items by matching on suffix so the test is robust to + // however the workspace root is represented. + let items = report.items.as_ref().expect("items must be present"); + let path_a_str = items + .iter() + .find(|i| i.doc_path.0.ends_with("src_a/note.md")) + .map(|i| i.doc_path.0.clone()) + .expect("src_a/note.md must appear in ingest report"); + let path_b_str = items + .iter() + .find(|i| i.doc_path.0.ends_with("src_b/note.md")) + .map(|i| i.doc_path.0.clone()) + .expect("src_b/note.md must appear in ingest report"); + + let path_a = kebab_core::WorkspacePath(path_a_str); + let path_b = kebab_core::WorkspacePath(path_b_str); + + let doc_a = store + .get_document_by_workspace_path(&path_a) + .expect("get_document_by_workspace_path path_a") + .expect("doc_a must exist after ingest"); + let doc_b = store + .get_document_by_workspace_path(&path_b) + .expect("get_document_by_workspace_path path_b") + .expect("doc_b must exist after ingest"); + + // Both twins share one asset_id (same content hash). + assert_eq!( + doc_a.source_asset_id, doc_b.source_asset_id, + "twin files must share one asset_id" + ); + + // Open App and issue span fetch on each twin's doc_id. + let app = env.app(); + + let result_a = app + .fetch( + FetchQuery::Span { + doc_id: doc_a.doc_id.clone(), + line_start: 1, + line_end: 2, + }, + FetchOpts::default(), + ) + .expect("fetch_span on twin A must succeed for a markdown file"); + assert_eq!(result_a.kind, FetchKind::Span); + assert!( + result_a.text.as_deref().is_some_and(|t| !t.is_empty()), + "span text for twin A must not be empty" + ); + + let result_b = app + .fetch( + FetchQuery::Span { + doc_id: doc_b.doc_id.clone(), + line_start: 1, + line_end: 2, + }, + FetchOpts::default(), + ) + .expect("fetch_span on twin B must succeed for a markdown file"); + assert_eq!(result_b.kind, FetchKind::Span); + assert!( + result_b.text.as_deref().is_some_and(|t| !t.is_empty()), + "span text for twin B must not be empty" + ); + + // Ingest again to force the asset.workspace_path flip-flop, then + // re-check. Pre-fix this was the scenario that triggered the bug: + // after the second ingest the asset row's workspace_path could point + // at either twin, making one twin's span fetch behave incorrectly. + let report2 = ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest must succeed"); + assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}"); + + // Re-open app after second ingest and verify span still works on both. + let app2 = env.app(); + + app2.fetch( + FetchQuery::Span { + doc_id: doc_a.doc_id.clone(), + line_start: 1, + line_end: 3, + }, + FetchOpts::default(), + ) + .expect("fetch_span on twin A after flip-flop must still succeed"); + + app2.fetch( + FetchQuery::Span { + doc_id: doc_b.doc_id.clone(), + line_start: 1, + line_end: 3, + }, + FetchOpts::default(), + ) + .expect("fetch_span on twin B after flip-flop must still succeed"); +} diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs index fee79ad..83c33f1 100644 --- a/crates/kebab-core/src/traits.rs +++ b/crates/kebab-core/src/traits.rs @@ -8,7 +8,7 @@ use serde_json::Value; use crate::asset::{RawAsset, WorkspacePath}; use crate::chunk::Chunk; use crate::document::{Block, CanonicalDocument}; -use crate::ids::{ChunkId, DocumentId}; +use crate::ids::{AssetId, ChunkId, DocumentId}; use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; use crate::media::MediaType; use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery}; @@ -161,10 +161,23 @@ pub trait DocumentStore { fn get_document(&self, id: &DocumentId) -> anyhow::Result>; fn get_chunk(&self, id: &ChunkId) -> anyhow::Result>; fn list_documents(&self, filter: &DocFilter) -> anyhow::Result>; + /// Look up an asset row by its `asset_id` (PRIMARY KEY = blake3 + /// content hash). Twin-file safe: asset_id is PK so there is + /// exactly one row per unique content hash, regardless of how many + /// `documents` rows share it. Use this instead of + /// `get_asset_by_workspace_path` when you already have a + /// `CanonicalDocument` (which carries `source_asset_id`). + fn get_asset(&self, id: &AssetId) -> anyhow::Result>; + /// p9-fb-23: look up an asset row by its workspace path. Used by /// the incremental-ingest skip path to compare the freshly /// computed blake3 checksum against what's already in SQLite. The /// schema enforces a unique workspace_path per asset. + /// + /// NOTE: for twin files (identical content at different paths), + /// `assets.workspace_path` is "last-registered path" — it + /// flip-flops on every ingest. Prefer `get_asset` (by asset_id) + /// when you have a `CanonicalDocument.source_asset_id`. fn get_asset_by_workspace_path( &self, path: &WorkspacePath, diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index fe0921a..277eb9a 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -264,6 +264,28 @@ impl kebab_core::DocumentStore for SqliteStore { })) } + fn get_asset( + &self, + id: &kebab_core::AssetId, + ) -> Result> { + let conn = self.lock_conn(); + let result = conn.query_row( + r#"SELECT + asset_id, source_uri, workspace_path, media_type, + byte_len, checksum, storage_kind, storage_path, + discovered_at + FROM assets + WHERE asset_id = ?"#, + rusqlite::params![id.0.as_str()], + asset_from_row, + ); + match result { + Ok(asset) => Ok(Some(asset)), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e.into()), + } + } + fn get_asset_by_workspace_path( &self, path: &kebab_core::WorkspacePath, @@ -632,7 +654,8 @@ fn rows_optional(err: rusqlite::Error) -> rusqlite::Result> { /// Reconstruct a [`kebab_core::RawAsset`] from one `assets` row. /// Row mapper for `RawAsset`. Column names are self-documenting; the -/// SELECT in [`DocumentStore::get_asset_by_workspace_path`] must include +/// SELECTs in [`DocumentStore::get_asset`] and +/// [`DocumentStore::get_asset_by_workspace_path`] must both include /// all nine columns by their schema names. fn asset_from_row(row: &rusqlite::Row<'_>) -> rusqlite::Result { use std::path::PathBuf; diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index d6621b4..aa1ff19 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -652,6 +652,20 @@ pub fn purge_deleted_workspace_path( /// path (which has bytes + computed `storage_kind/path`) and the /// `DocumentStore::put_asset` path (which only has the `RawAsset` and /// reads `storage_kind/path` from `asset.stored`). +/// +/// **`assets.workspace_path` is "last-registered path" semantics for +/// twin files** (two source files with identical content share one +/// `assets` row keyed on `asset_id = blake3(content)`). Each ingest +/// of either twin overwrites `workspace_path` with whichever path was +/// seen most recently — this is intentional and correct after PR #146 +/// made `try_skip_unchanged` document-centric (uses +/// `get_document_by_workspace_path`, not `get_asset_by_workspace_path`) +/// and PR #149 made `reset --orphans-only` document-centric too. +/// Do NOT "fix" the flip-flop by adding a UNIQUE constraint on +/// `workspace_path` in the `assets` table — twin de-dup is load-bearing. +/// When you need media_type for a known document, use the 2-step lookup +/// `get_document_by_workspace_path` → `doc.source_asset_id` → +/// `get_asset(asset_id)` so the result is twin-safe. pub(crate) fn upsert_asset_row( conn: &Connection, asset: &kebab_core::RawAsset,