Files
kebab/crates/kebab-app/tests/twin_files_fetch_span.rs
altair823 453ec15df4 fix(dogfood): document-centric fetch_span + remove get_asset_by_workspace_path
assets.workspace_path is INTENTIONALLY 'last-registered path' for twin
files (identical content at different paths share one asset row PK'd by
blake3 content hash). PR #146 made try_skip_unchanged document-centric;
PR #149 made reset --orphans-only document-centric; this PR removes the
last caller of get_asset_by_workspace_path (fetch.rs:193 in fetch_span,
which used it to reject PDF/audio media — for twins this could read the
wrong asset's media_type and pick the wrong branch).

Replaced with the natural 2-step lookup: get_document_by_workspace_path
(PR #146) → doc.source_asset_id → get_asset (NEW trait method, asset_id
is PRIMARY KEY so flip-flop-immune by construction).

Then removed get_asset_by_workspace_path trait method + SqliteStore impl
— 0 callers after the refactor.

UPSERT doc-comment refreshed in store.rs to make the 'last-registered'
semantics explicit so future readers don't try to 'fix' the flip-flop.

Dogfood follow-up (PR #142 1B + multi-root corpus).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 08:03:38 +00:00

177 lines
6.7 KiB
Rust

//! Regression test for the twin-file fetch_span media-type lookup bug.
//!
//! Twin files (identical content at different workspace paths) share one
//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
//! `fetch_span` implementation called
//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
//! media type was PDF/audio (and therefore reject span fetch). For a twin
//! file that lookup could silently return the *other* twin's asset row if
//! `assets.workspace_path` had been overwritten on the most recent ingest of
//! the sibling — making the media-type branch decision incorrect.
//!
//! Fix: `fetch_span` now uses the 2-step lookup
//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
//! so the result is always anchored to the requesting document, not
//! whichever twin last updated `assets.workspace_path`.
//!
//! This test builds a twin-file scenario (two .md files at different paths
//! with identical content), ingests both, then calls `fetch_span` on each
//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
//! row's workspace_path happened to point at the wrong twin the span could
//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
//! conversely allow span on a PDF twin by accident. After the fix, the
//! lookup is always doc-specific.
mod common;
use common::TestEnv;
use kebab_app::ingest_with_config;
use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
#[test]
fn twin_files_fetch_span_uses_correct_asset() {
let env = TestEnv::lexical_only();
// Write two markdown files with identical content at different paths.
let dir_a = env.workspace_root.join("src_a");
let dir_b = env.workspace_root.join("src_b");
std::fs::create_dir_all(&dir_a).unwrap();
std::fs::create_dir_all(&dir_b).unwrap();
// The content must produce at least 1 line so span fetch is non-trivial.
let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";
std::fs::write(dir_a.join("note.md"), content).unwrap();
std::fs::write(dir_b.join("note.md"), content).unwrap();
// Ingest all files (fixture workspace + our two new twins).
let report = ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
// Both twin paths must appear as New in the report.
let items = report.items.as_ref().expect("items must be present");
let twin_items: Vec<_> = items
.iter()
.filter(|i| {
i.doc_path.0.ends_with("src_a/note.md")
|| i.doc_path.0.ends_with("src_b/note.md")
})
.collect();
assert_eq!(
twin_items.len(),
2,
"exactly 2 twin items expected; items={items:?}"
);
for item in &twin_items {
assert_eq!(
item.kind,
IngestItemKind::New,
"each twin must be New; item={item:?}"
);
}
// Resolve doc_ids for both workspace paths.
// The ingest layer normalises workspace_path to the path relative to
// workspace_root (e.g. "src_a/note.md"), so we look up by that form.
let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
store.run_migrations().unwrap();
// Find the twin items by matching on suffix so the test is robust to
// however the workspace root is represented.
let items = report.items.as_ref().expect("items must be present");
let path_a_str = items
.iter()
.find(|i| i.doc_path.0.ends_with("src_a/note.md"))
.map(|i| i.doc_path.0.clone())
.expect("src_a/note.md must appear in ingest report");
let path_b_str = items
.iter()
.find(|i| i.doc_path.0.ends_with("src_b/note.md"))
.map(|i| i.doc_path.0.clone())
.expect("src_b/note.md must appear in ingest report");
let path_a = kebab_core::WorkspacePath(path_a_str);
let path_b = kebab_core::WorkspacePath(path_b_str);
let doc_a = store
.get_document_by_workspace_path(&path_a)
.expect("get_document_by_workspace_path path_a")
.expect("doc_a must exist after ingest");
let doc_b = store
.get_document_by_workspace_path(&path_b)
.expect("get_document_by_workspace_path path_b")
.expect("doc_b must exist after ingest");
// Both twins share one asset_id (same content hash).
assert_eq!(
doc_a.source_asset_id, doc_b.source_asset_id,
"twin files must share one asset_id"
);
// Open App and issue span fetch on each twin's doc_id.
let app = env.app();
let result_a = app
.fetch(
FetchQuery::Span {
doc_id: doc_a.doc_id.clone(),
line_start: 1,
line_end: 2,
},
FetchOpts::default(),
)
.expect("fetch_span on twin A must succeed for a markdown file");
assert_eq!(result_a.kind, FetchKind::Span);
assert!(
result_a.text.as_deref().is_some_and(|t| !t.is_empty()),
"span text for twin A must not be empty"
);
let result_b = app
.fetch(
FetchQuery::Span {
doc_id: doc_b.doc_id.clone(),
line_start: 1,
line_end: 2,
},
FetchOpts::default(),
)
.expect("fetch_span on twin B must succeed for a markdown file");
assert_eq!(result_b.kind, FetchKind::Span);
assert!(
result_b.text.as_deref().is_some_and(|t| !t.is_empty()),
"span text for twin B must not be empty"
);
// Ingest again to force the asset.workspace_path flip-flop, then
// re-check. Pre-fix this was the scenario that triggered the bug:
// after the second ingest the asset row's workspace_path could point
// at either twin, making one twin's span fetch behave incorrectly.
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest must succeed");
assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
// Re-open app after second ingest and verify span still works on both.
let app2 = env.app();
app2.fetch(
FetchQuery::Span {
doc_id: doc_a.doc_id.clone(),
line_start: 1,
line_end: 3,
},
FetchOpts::default(),
)
.expect("fetch_span on twin A after flip-flop must still succeed");
app2.fetch(
FetchQuery::Span {
doc_id: doc_b.doc_id.clone(),
line_start: 1,
line_end: 3,
},
FetchOpts::default(),
)
.expect("fetch_span on twin B after flip-flop must still succeed");
}