fix(dogfood): document-centric fetch_span + remove get_asset_by_workspace_path

assets.workspace_path is INTENTIONALLY 'last-registered path' for twin files (identical content at different paths share one asset row PK'd by blake3 content hash). PR #146 made try_skip_unchanged document-centric; PR #149 made reset --orphans-only document-centric; this PR removes the last caller of get_asset_by_workspace_path (fetch.rs:193 in fetch_span, which used it to reject PDF/audio media — for twins this could read the wrong asset's media_type and pick the wrong branch). Replaced with the natural 2-step lookup: get_document_by_workspace_path (PR #146) → doc.source_asset_id → get_asset (NEW trait method, asset_id is PRIMARY KEY so flip-flop-immune by construction). Then removed get_asset_by_workspace_path trait method + SqliteStore impl — 0 callers after the refactor. UPSERT doc-comment refreshed in store.rs to make the 'last-registered' semantics explicit so future readers don't try to 'fix' the flip-flop. Dogfood follow-up (PR #142 1B + multi-root corpus). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 08:03:38 +00:00
parent 1e6de9fe9f
commit 453ec15df4
5 changed files with 233 additions and 5 deletions
--- a/crates/kebab-app/src/fetch.rs
+++ b/crates/kebab-app/src/fetch.rs
@@ -189,10 +189,12 @@ fn fetch_span(
    // (markdown / note / paper / reference / inbox) is the *user-facing*
    // category, not the rendering format — the actual byte-level format
    // lives on the source `RawAsset.media_type`. Look it up via
-    // workspace_path (unique key per asset).
-    if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset_by_workspace_path(
+    // doc.source_asset_id (PRIMARY KEY) so twin files (identical content
+    // at different paths) always read *this* document's own asset row,
+    // not whichever twin last wrote `assets.workspace_path`.
+    if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset(
        &app.sqlite,
-        &doc.workspace_path,
+        &doc.source_asset_id,
    )? {
        if matches!(
            asset.media_type,
--- a/crates/kebab-app/tests/twin_files_fetch_span.rs
+++ b/crates/kebab-app/tests/twin_files_fetch_span.rs
@@ -0,0 +1,176 @@
+//! Regression test for the twin-file fetch_span media-type lookup bug.
+//!
+//! Twin files (identical content at different workspace paths) share one
+//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
+//! `fetch_span` implementation called
+//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
+//! media type was PDF/audio (and therefore reject span fetch). For a twin
+//! file that lookup could silently return the *other* twin's asset row if
+//! `assets.workspace_path` had been overwritten on the most recent ingest of
+//! the sibling — making the media-type branch decision incorrect.
+//!
+//! Fix: `fetch_span` now uses the 2-step lookup
+//!   `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
+//! so the result is always anchored to the requesting document, not
+//! whichever twin last updated `assets.workspace_path`.
+//!
+//! This test builds a twin-file scenario (two .md files at different paths
+//! with identical content), ingests both, then calls `fetch_span` on each
+//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
+//! row's workspace_path happened to point at the wrong twin the span could
+//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
+//! conversely allow span on a PDF twin by accident. After the fix, the
+//! lookup is always doc-specific.
+
+mod common;
+
+use common::TestEnv;
+use kebab_app::ingest_with_config;
+use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
+
+#[test]
+fn twin_files_fetch_span_uses_correct_asset() {
+    let env = TestEnv::lexical_only();
+
+    // Write two markdown files with identical content at different paths.
+    let dir_a = env.workspace_root.join("src_a");
+    let dir_b = env.workspace_root.join("src_b");
+    std::fs::create_dir_all(&dir_a).unwrap();
+    std::fs::create_dir_all(&dir_b).unwrap();
+
+    // The content must produce at least 1 line so span fetch is non-trivial.
+    let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";
+    std::fs::write(dir_a.join("note.md"), content).unwrap();
+    std::fs::write(dir_b.join("note.md"), content).unwrap();
+
+    // Ingest all files (fixture workspace + our two new twins).
+    let report = ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");
+    assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
+
+    // Both twin paths must appear as New in the report.
+    let items = report.items.as_ref().expect("items must be present");
+    let twin_items: Vec<_> = items
+        .iter()
+        .filter(|i| {
+            i.doc_path.0.ends_with("src_a/note.md")
+                || i.doc_path.0.ends_with("src_b/note.md")
+        })
+        .collect();
+    assert_eq!(
+        twin_items.len(),
+        2,
+        "exactly 2 twin items expected; items={items:?}"
+    );
+    for item in &twin_items {
+        assert_eq!(
+            item.kind,
+            IngestItemKind::New,
+            "each twin must be New; item={item:?}"
+        );
+    }
+
+    // Resolve doc_ids for both workspace paths.
+    // The ingest layer normalises workspace_path to the path relative to
+    // workspace_root (e.g. "src_a/note.md"), so we look up by that form.
+    let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
+    store.run_migrations().unwrap();
+
+    // Find the twin items by matching on suffix so the test is robust to
+    // however the workspace root is represented.
+    let items = report.items.as_ref().expect("items must be present");
+    let path_a_str = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("src_a/note.md"))
+        .map(|i| i.doc_path.0.clone())
+        .expect("src_a/note.md must appear in ingest report");
+    let path_b_str = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("src_b/note.md"))
+        .map(|i| i.doc_path.0.clone())
+        .expect("src_b/note.md must appear in ingest report");
+
+    let path_a = kebab_core::WorkspacePath(path_a_str);
+    let path_b = kebab_core::WorkspacePath(path_b_str);
+
+    let doc_a = store
+        .get_document_by_workspace_path(&path_a)
+        .expect("get_document_by_workspace_path path_a")
+        .expect("doc_a must exist after ingest");
+    let doc_b = store
+        .get_document_by_workspace_path(&path_b)
+        .expect("get_document_by_workspace_path path_b")
+        .expect("doc_b must exist after ingest");
+
+    // Both twins share one asset_id (same content hash).
+    assert_eq!(
+        doc_a.source_asset_id, doc_b.source_asset_id,
+        "twin files must share one asset_id"
+    );
+
+    // Open App and issue span fetch on each twin's doc_id.
+    let app = env.app();
+
+    let result_a = app
+        .fetch(
+            FetchQuery::Span {
+                doc_id: doc_a.doc_id.clone(),
+                line_start: 1,
+                line_end: 2,
+            },
+            FetchOpts::default(),
+        )
+        .expect("fetch_span on twin A must succeed for a markdown file");
+    assert_eq!(result_a.kind, FetchKind::Span);
+    assert!(
+        result_a.text.as_deref().is_some_and(|t| !t.is_empty()),
+        "span text for twin A must not be empty"
+    );
+
+    let result_b = app
+        .fetch(
+            FetchQuery::Span {
+                doc_id: doc_b.doc_id.clone(),
+                line_start: 1,
+                line_end: 2,
+            },
+            FetchOpts::default(),
+        )
+        .expect("fetch_span on twin B must succeed for a markdown file");
+    assert_eq!(result_b.kind, FetchKind::Span);
+    assert!(
+        result_b.text.as_deref().is_some_and(|t| !t.is_empty()),
+        "span text for twin B must not be empty"
+    );
+
+    // Ingest again to force the asset.workspace_path flip-flop, then
+    // re-check. Pre-fix this was the scenario that triggered the bug:
+    // after the second ingest the asset row's workspace_path could point
+    // at either twin, making one twin's span fetch behave incorrectly.
+    let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("second ingest must succeed");
+    assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
+
+    // Re-open app after second ingest and verify span still works on both.
+    let app2 = env.app();
+
+    app2.fetch(
+        FetchQuery::Span {
+            doc_id: doc_a.doc_id.clone(),
+            line_start: 1,
+            line_end: 3,
+        },
+        FetchOpts::default(),
+    )
+    .expect("fetch_span on twin A after flip-flop must still succeed");
+
+    app2.fetch(
+        FetchQuery::Span {
+            doc_id: doc_b.doc_id.clone(),
+            line_start: 1,
+            line_end: 3,
+        },
+        FetchOpts::default(),
+    )
+    .expect("fetch_span on twin B after flip-flop must still succeed");
+}
--- a/crates/kebab-core/src/traits.rs
+++ b/crates/kebab-core/src/traits.rs
@@ -8,7 +8,7 @@ use serde_json::Value;
 use crate::asset::{RawAsset, WorkspacePath};
 use crate::chunk::Chunk;
 use crate::document::{Block, CanonicalDocument};
-use crate::ids::{ChunkId, DocumentId};
+use crate::ids::{AssetId, ChunkId, DocumentId};
 use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
 use crate::media::MediaType;
 use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
@@ -161,10 +161,23 @@ pub trait DocumentStore {
    fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
    fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
    fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
+    /// Look up an asset row by its `asset_id` (PRIMARY KEY = blake3
+    /// content hash). Twin-file safe: asset_id is PK so there is
+    /// exactly one row per unique content hash, regardless of how many
+    /// `documents` rows share it. Use this instead of
+    /// `get_asset_by_workspace_path` when you already have a
+    /// `CanonicalDocument` (which carries `source_asset_id`).
+    fn get_asset(&self, id: &AssetId) -> anyhow::Result<Option<RawAsset>>;
+
    /// p9-fb-23: look up an asset row by its workspace path. Used by
    /// the incremental-ingest skip path to compare the freshly
    /// computed blake3 checksum against what's already in SQLite. The
    /// schema enforces a unique workspace_path per asset.
+    ///
+    /// NOTE: for twin files (identical content at different paths),
+    /// `assets.workspace_path` is "last-registered path" — it
+    /// flip-flops on every ingest. Prefer `get_asset` (by asset_id)
+    /// when you have a `CanonicalDocument.source_asset_id`.
    fn get_asset_by_workspace_path(
        &self,
        path: &WorkspacePath,
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -264,6 +264,28 @@ impl kebab_core::DocumentStore for SqliteStore {
        }))
    }

+    fn get_asset(
+        &self,
+        id: &kebab_core::AssetId,
+    ) -> Result<Option<kebab_core::RawAsset>> {
+        let conn = self.lock_conn();
+        let result = conn.query_row(
+            r#"SELECT
+                asset_id, source_uri, workspace_path, media_type,
+                byte_len, checksum, storage_kind, storage_path,
+                discovered_at
+            FROM assets
+            WHERE asset_id = ?"#,
+            rusqlite::params![id.0.as_str()],
+            asset_from_row,
+        );
+        match result {
+            Ok(asset) => Ok(Some(asset)),
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
+            Err(e) => Err(e.into()),
+        }
+    }
+
    fn get_asset_by_workspace_path(
        &self,
        path: &kebab_core::WorkspacePath,
@@ -632,7 +654,8 @@ fn rows_optional<T>(err: rusqlite::Error) -> rusqlite::Result<Option<T>> {

 /// Reconstruct a [`kebab_core::RawAsset`] from one `assets` row.
 /// Row mapper for `RawAsset`. Column names are self-documenting; the
-/// SELECT in [`DocumentStore::get_asset_by_workspace_path`] must include
+/// SELECTs in [`DocumentStore::get_asset`] and
+/// [`DocumentStore::get_asset_by_workspace_path`] must both include
 /// all nine columns by their schema names.
 fn asset_from_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<kebab_core::RawAsset> {
    use std::path::PathBuf;
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -652,6 +652,20 @@ pub fn purge_deleted_workspace_path(
 /// path (which has bytes + computed `storage_kind/path`) and the
 /// `DocumentStore::put_asset` path (which only has the `RawAsset` and
 /// reads `storage_kind/path` from `asset.stored`).
+///
+/// **`assets.workspace_path` is "last-registered path" semantics for
+/// twin files** (two source files with identical content share one
+/// `assets` row keyed on `asset_id = blake3(content)`). Each ingest
+/// of either twin overwrites `workspace_path` with whichever path was
+/// seen most recently — this is intentional and correct after PR #146
+/// made `try_skip_unchanged` document-centric (uses
+/// `get_document_by_workspace_path`, not `get_asset_by_workspace_path`)
+/// and PR #149 made `reset --orphans-only` document-centric too.
+/// Do NOT "fix" the flip-flop by adding a UNIQUE constraint on
+/// `workspace_path` in the `assets` table — twin de-dup is load-bearing.
+/// When you need media_type for a known document, use the 2-step lookup
+/// `get_document_by_workspace_path` → `doc.source_asset_id` →
+/// `get_asset(asset_id)` so the result is twin-safe.
 pub(crate) fn upsert_asset_row(
    conn: &Connection,
    asset: &kebab_core::RawAsset,