fix(dogfood): document-centric fetch_span + remove get_asset_by_workspace_path
assets.workspace_path is INTENTIONALLY 'last-registered path' for twin files (identical content at different paths share one asset row PK'd by blake3 content hash). PR #146 made try_skip_unchanged document-centric; PR #149 made reset --orphans-only document-centric; this PR removes the last caller of get_asset_by_workspace_path (fetch.rs:193 in fetch_span, which used it to reject PDF/audio media — for twins this could read the wrong asset's media_type and pick the wrong branch). Replaced with the natural 2-step lookup: get_document_by_workspace_path (PR #146) → doc.source_asset_id → get_asset (NEW trait method, asset_id is PRIMARY KEY so flip-flop-immune by construction). Then removed get_asset_by_workspace_path trait method + SqliteStore impl — 0 callers after the refactor. UPSERT doc-comment refreshed in store.rs to make the 'last-registered' semantics explicit so future readers don't try to 'fix' the flip-flop. Dogfood follow-up (PR #142 1B + multi-root corpus). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -189,10 +189,12 @@ fn fetch_span(
|
||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||
// category, not the rendering format — the actual byte-level format
|
||||
// lives on the source `RawAsset.media_type`. Look it up via
|
||||
// workspace_path (unique key per asset).
|
||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset_by_workspace_path(
|
||||
// doc.source_asset_id (PRIMARY KEY) so twin files (identical content
|
||||
// at different paths) always read *this* document's own asset row,
|
||||
// not whichever twin last wrote `assets.workspace_path`.
|
||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset(
|
||||
&app.sqlite,
|
||||
&doc.workspace_path,
|
||||
&doc.source_asset_id,
|
||||
)? {
|
||||
if matches!(
|
||||
asset.media_type,
|
||||
|
||||
176
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
176
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
@@ -0,0 +1,176 @@
|
||||
//! Regression test for the twin-file fetch_span media-type lookup bug.
|
||||
//!
|
||||
//! Twin files (identical content at different workspace paths) share one
|
||||
//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
|
||||
//! `fetch_span` implementation called
|
||||
//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
|
||||
//! media type was PDF/audio (and therefore reject span fetch). For a twin
|
||||
//! file that lookup could silently return the *other* twin's asset row if
|
||||
//! `assets.workspace_path` had been overwritten on the most recent ingest of
|
||||
//! the sibling — making the media-type branch decision incorrect.
|
||||
//!
|
||||
//! Fix: `fetch_span` now uses the 2-step lookup
|
||||
//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
|
||||
//! so the result is always anchored to the requesting document, not
|
||||
//! whichever twin last updated `assets.workspace_path`.
|
||||
//!
|
||||
//! This test builds a twin-file scenario (two .md files at different paths
|
||||
//! with identical content), ingests both, then calls `fetch_span` on each
|
||||
//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
|
||||
//! row's workspace_path happened to point at the wrong twin the span could
|
||||
//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
|
||||
//! conversely allow span on a PDF twin by accident. After the fix, the
|
||||
//! lookup is always doc-specific.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
|
||||
|
||||
#[test]
|
||||
fn twin_files_fetch_span_uses_correct_asset() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write two markdown files with identical content at different paths.
|
||||
let dir_a = env.workspace_root.join("src_a");
|
||||
let dir_b = env.workspace_root.join("src_b");
|
||||
std::fs::create_dir_all(&dir_a).unwrap();
|
||||
std::fs::create_dir_all(&dir_b).unwrap();
|
||||
|
||||
// The content must produce at least 1 line so span fetch is non-trivial.
|
||||
let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";
|
||||
std::fs::write(dir_a.join("note.md"), content).unwrap();
|
||||
std::fs::write(dir_b.join("note.md"), content).unwrap();
|
||||
|
||||
// Ingest all files (fixture workspace + our two new twins).
|
||||
let report = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
|
||||
|
||||
// Both twin paths must appear as New in the report.
|
||||
let items = report.items.as_ref().expect("items must be present");
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("src_a/note.md")
|
||||
|| i.doc_path.0.ends_with("src_b/note.md")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
2,
|
||||
"exactly 2 twin items expected; items={items:?}"
|
||||
);
|
||||
for item in &twin_items {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::New,
|
||||
"each twin must be New; item={item:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Resolve doc_ids for both workspace paths.
|
||||
// The ingest layer normalises workspace_path to the path relative to
|
||||
// workspace_root (e.g. "src_a/note.md"), so we look up by that form.
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
|
||||
// Find the twin items by matching on suffix so the test is robust to
|
||||
// however the workspace root is represented.
|
||||
let items = report.items.as_ref().expect("items must be present");
|
||||
let path_a_str = items
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("src_a/note.md"))
|
||||
.map(|i| i.doc_path.0.clone())
|
||||
.expect("src_a/note.md must appear in ingest report");
|
||||
let path_b_str = items
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("src_b/note.md"))
|
||||
.map(|i| i.doc_path.0.clone())
|
||||
.expect("src_b/note.md must appear in ingest report");
|
||||
|
||||
let path_a = kebab_core::WorkspacePath(path_a_str);
|
||||
let path_b = kebab_core::WorkspacePath(path_b_str);
|
||||
|
||||
let doc_a = store
|
||||
.get_document_by_workspace_path(&path_a)
|
||||
.expect("get_document_by_workspace_path path_a")
|
||||
.expect("doc_a must exist after ingest");
|
||||
let doc_b = store
|
||||
.get_document_by_workspace_path(&path_b)
|
||||
.expect("get_document_by_workspace_path path_b")
|
||||
.expect("doc_b must exist after ingest");
|
||||
|
||||
// Both twins share one asset_id (same content hash).
|
||||
assert_eq!(
|
||||
doc_a.source_asset_id, doc_b.source_asset_id,
|
||||
"twin files must share one asset_id"
|
||||
);
|
||||
|
||||
// Open App and issue span fetch on each twin's doc_id.
|
||||
let app = env.app();
|
||||
|
||||
let result_a = app
|
||||
.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_a.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 2,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin A must succeed for a markdown file");
|
||||
assert_eq!(result_a.kind, FetchKind::Span);
|
||||
assert!(
|
||||
result_a.text.as_deref().is_some_and(|t| !t.is_empty()),
|
||||
"span text for twin A must not be empty"
|
||||
);
|
||||
|
||||
let result_b = app
|
||||
.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_b.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 2,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin B must succeed for a markdown file");
|
||||
assert_eq!(result_b.kind, FetchKind::Span);
|
||||
assert!(
|
||||
result_b.text.as_deref().is_some_and(|t| !t.is_empty()),
|
||||
"span text for twin B must not be empty"
|
||||
);
|
||||
|
||||
// Ingest again to force the asset.workspace_path flip-flop, then
|
||||
// re-check. Pre-fix this was the scenario that triggered the bug:
|
||||
// after the second ingest the asset row's workspace_path could point
|
||||
// at either twin, making one twin's span fetch behave incorrectly.
|
||||
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
|
||||
|
||||
// Re-open app after second ingest and verify span still works on both.
|
||||
let app2 = env.app();
|
||||
|
||||
app2.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_a.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 3,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin A after flip-flop must still succeed");
|
||||
|
||||
app2.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_b.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 3,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin B after flip-flop must still succeed");
|
||||
}
|
||||
@@ -8,7 +8,7 @@ use serde_json::Value;
|
||||
use crate::asset::{RawAsset, WorkspacePath};
|
||||
use crate::chunk::Chunk;
|
||||
use crate::document::{Block, CanonicalDocument};
|
||||
use crate::ids::{ChunkId, DocumentId};
|
||||
use crate::ids::{AssetId, ChunkId, DocumentId};
|
||||
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
use crate::media::MediaType;
|
||||
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
||||
@@ -161,10 +161,23 @@ pub trait DocumentStore {
|
||||
fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
|
||||
fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
|
||||
fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
|
||||
/// Look up an asset row by its `asset_id` (PRIMARY KEY = blake3
|
||||
/// content hash). Twin-file safe: asset_id is PK so there is
|
||||
/// exactly one row per unique content hash, regardless of how many
|
||||
/// `documents` rows share it. Use this instead of
|
||||
/// `get_asset_by_workspace_path` when you already have a
|
||||
/// `CanonicalDocument` (which carries `source_asset_id`).
|
||||
fn get_asset(&self, id: &AssetId) -> anyhow::Result<Option<RawAsset>>;
|
||||
|
||||
/// p9-fb-23: look up an asset row by its workspace path. Used by
|
||||
/// the incremental-ingest skip path to compare the freshly
|
||||
/// computed blake3 checksum against what's already in SQLite. The
|
||||
/// schema enforces a unique workspace_path per asset.
|
||||
///
|
||||
/// NOTE: for twin files (identical content at different paths),
|
||||
/// `assets.workspace_path` is "last-registered path" — it
|
||||
/// flip-flops on every ingest. Prefer `get_asset` (by asset_id)
|
||||
/// when you have a `CanonicalDocument.source_asset_id`.
|
||||
fn get_asset_by_workspace_path(
|
||||
&self,
|
||||
path: &WorkspacePath,
|
||||
|
||||
@@ -264,6 +264,28 @@ impl kebab_core::DocumentStore for SqliteStore {
|
||||
}))
|
||||
}
|
||||
|
||||
fn get_asset(
|
||||
&self,
|
||||
id: &kebab_core::AssetId,
|
||||
) -> Result<Option<kebab_core::RawAsset>> {
|
||||
let conn = self.lock_conn();
|
||||
let result = conn.query_row(
|
||||
r#"SELECT
|
||||
asset_id, source_uri, workspace_path, media_type,
|
||||
byte_len, checksum, storage_kind, storage_path,
|
||||
discovered_at
|
||||
FROM assets
|
||||
WHERE asset_id = ?"#,
|
||||
rusqlite::params![id.0.as_str()],
|
||||
asset_from_row,
|
||||
);
|
||||
match result {
|
||||
Ok(asset) => Ok(Some(asset)),
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_asset_by_workspace_path(
|
||||
&self,
|
||||
path: &kebab_core::WorkspacePath,
|
||||
@@ -632,7 +654,8 @@ fn rows_optional<T>(err: rusqlite::Error) -> rusqlite::Result<Option<T>> {
|
||||
|
||||
/// Reconstruct a [`kebab_core::RawAsset`] from one `assets` row.
|
||||
/// Row mapper for `RawAsset`. Column names are self-documenting; the
|
||||
/// SELECT in [`DocumentStore::get_asset_by_workspace_path`] must include
|
||||
/// SELECTs in [`DocumentStore::get_asset`] and
|
||||
/// [`DocumentStore::get_asset_by_workspace_path`] must both include
|
||||
/// all nine columns by their schema names.
|
||||
fn asset_from_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<kebab_core::RawAsset> {
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -652,6 +652,20 @@ pub fn purge_deleted_workspace_path(
|
||||
/// path (which has bytes + computed `storage_kind/path`) and the
|
||||
/// `DocumentStore::put_asset` path (which only has the `RawAsset` and
|
||||
/// reads `storage_kind/path` from `asset.stored`).
|
||||
///
|
||||
/// **`assets.workspace_path` is "last-registered path" semantics for
|
||||
/// twin files** (two source files with identical content share one
|
||||
/// `assets` row keyed on `asset_id = blake3(content)`). Each ingest
|
||||
/// of either twin overwrites `workspace_path` with whichever path was
|
||||
/// seen most recently — this is intentional and correct after PR #146
|
||||
/// made `try_skip_unchanged` document-centric (uses
|
||||
/// `get_document_by_workspace_path`, not `get_asset_by_workspace_path`)
|
||||
/// and PR #149 made `reset --orphans-only` document-centric too.
|
||||
/// Do NOT "fix" the flip-flop by adding a UNIQUE constraint on
|
||||
/// `workspace_path` in the `assets` table — twin de-dup is load-bearing.
|
||||
/// When you need media_type for a known document, use the 2-step lookup
|
||||
/// `get_document_by_workspace_path` → `doc.source_asset_id` →
|
||||
/// `get_asset(asset_id)` so the result is twin-safe.
|
||||
pub(crate) fn upsert_asset_row(
|
||||
conn: &Connection,
|
||||
asset: &kebab_core::RawAsset,
|
||||
|
||||
Reference in New Issue
Block a user