Files
kebab/crates/kb-store-sqlite/tests/asset_writer.rs
altair823 e41279de96 p1-6: harden store boundary (atomic asset write, poison-tolerant mutex, AssetId validation)
Three Important review fixes on the kb-store-sqlite write path:

I1. Atomic asset write. put_asset_with_bytes now stages bytes to
    `<final>.tmp.<pid>.<n>`, fsyncs, UPSERTs the row, then `rename`s into
    place (atomic on POSIX same-fs). On any failure between staging and
    rename we best-effort `remove_file` the temp so the previous orphan
    risk on UPSERT failure is gone. Reference mode is unchanged (no I/O,
    no orphan risk).

I2. Poison-tolerant mutex. New `lock_conn` helper does
    `.lock().unwrap_or_else(|p| p.into_inner())`, so a single panic mid-
    transaction no longer poisons every subsequent store call. The
    rusqlite Transaction Drop already rolls back on panic, leaving the
    Connection state safe to reuse. All 13 prior `.expect("sqlite mutex
    poisoned")` sites in store.rs / documents.rs / jobs.rs now route
    through `lock_conn`.

I3. AssetId shape validation. `kb_core::AssetId(pub String)` lets a
    hand-construction bypass the `FromStr` 32-hex invariant. Added
    `validate_asset_id` (32 ASCII hex chars) at every store entry that
    turns an AssetId into a path: `put_asset_with_bytes` and
    `DocumentStore::put_asset`. This shuts a potential path-traversal via
    `assets_path_for`'s `&id[..2]` shard slice.

Tests:
- `put_asset_with_bytes_orphan_cleanup_on_upsert_failure` — pre-seeds a
  row that takes the same `workspace_path` (UNIQUE), so the UPSERT trips
  a constraint not covered by `ON CONFLICT(asset_id)`. Asserts no final
  file and no leaked `*.tmp.*`.
- `put_asset_with_bytes_rejects_invalid_asset_id` — passes
  `AssetId("../etc/passwd_padded_to_xx_xxxxx")` (32 chars, contains `/`).
  Asserts error and zero filesystem artifacts under `data_dir/assets/`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 17:33:19 +00:00

236 lines
8.7 KiB
Rust

//! Asset writer tests: copy mode (file written 0o644), reference mode
//! (no copy, row records source), and checksum mismatch (Conflict).
use std::path::PathBuf;
use kb_core::{AssetId, AssetStorage, Checksum, MediaType, RawAsset, SourceUri, WorkspacePath};
use kb_store_sqlite::SqliteStore;
use time::OffsetDateTime;
mod common;
fn fixed_asset(_bytes: &[u8], byte_len: u64, declared_checksum: &str) -> RawAsset {
RawAsset {
// 32-hex AssetId per kb-core newtype invariant.
asset_id: AssetId("a".repeat(32)),
source_uri: SourceUri::File(PathBuf::from("/some/source.md")),
workspace_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
media_type: MediaType::Markdown,
byte_len,
checksum: Checksum(declared_checksum.into()),
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
stored: AssetStorage::Reference {
path: PathBuf::from("/some/source.md"),
sha: Checksum("0".repeat(64)),
},
}
}
fn b3_full_hex(bytes: &[u8]) -> String {
blake3::hash(bytes).to_hex().to_string()
}
#[test]
fn copy_mode_writes_file_with_0o644_and_correct_bytes() {
let env = common::TestEnv::with_threshold(100);
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
let bytes = b"hello, sqlite";
let cs = b3_full_hex(bytes);
let asset = fixed_asset(bytes, bytes.len() as u64, &cs);
store.put_asset_with_bytes(&asset, bytes).expect("write");
// Path: data_dir/assets/aa/aaaaaa…aa
let aa = &asset.asset_id.0[..2];
let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0);
assert!(dest.exists(), "asset file not written at {}", dest.display());
let on_disk = std::fs::read(&dest).unwrap();
assert_eq!(on_disk, bytes);
// Mode 0o644 on Unix.
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let mode = std::fs::metadata(&dest).unwrap().permissions().mode() & 0o777;
assert_eq!(mode, 0o644, "expected 0o644, got 0o{mode:o}");
}
// Row recorded copied.
let storage_kind: String = env.with_conn(|c| {
c.query_row(
"SELECT storage_kind FROM assets WHERE asset_id = ?",
[&asset.asset_id.0],
|r| r.get(0),
)
});
assert_eq!(storage_kind, "copied");
}
#[test]
fn reference_mode_does_not_write_file_but_records_path() {
// copy_threshold_mb=0 → every byte lands on the reference branch.
let env = common::TestEnv::with_threshold(0);
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
let bytes = b"big-pretend-bytes";
let cs = b3_full_hex(bytes);
// byte_len declared > 0 so the threshold check picks reference. With
// copy_threshold_bytes=0 even byte_len=1 trips the else branch.
let mut asset = fixed_asset(bytes, 1, &cs);
asset.source_uri = SourceUri::File(PathBuf::from("/path/to/original.md"));
store.put_asset_with_bytes(&asset, bytes).expect("ref write");
let aa = &asset.asset_id.0[..2];
let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0);
assert!(!dest.exists(), "reference mode must not copy bytes");
let (storage_kind, storage_path): (String, String) = env.with_conn(|c| {
c.query_row(
"SELECT storage_kind, storage_path FROM assets WHERE asset_id = ?",
[&asset.asset_id.0],
|r| Ok((r.get(0)?, r.get(1)?)),
)
});
assert_eq!(storage_kind, "reference");
assert_eq!(storage_path, "/path/to/original.md");
}
#[test]
fn put_asset_with_bytes_orphan_cleanup_on_upsert_failure() {
// Goal: prove that if the row UPSERT fails AFTER the bytes have been
// staged on disk, no `<aa>/<asset_id>` file is left behind.
//
// Lever: the `assets` table has a UNIQUE INDEX on `workspace_path`
// (V001), but the UPSERT is `ON CONFLICT(asset_id)`. So if some other
// row already owns this `workspace_path`, the INSERT half of the
// UPSERT trips a UNIQUE constraint that the ON CONFLICT clause does
// NOT handle — UPSERT errors. The new asset's bytes were already
// staged; we assert they are NOT visible at the final destination.
let env = common::TestEnv::with_threshold(100);
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
// Pre-populate a row that owns `notes/foo.md` (the workspace_path our
// fixture asset will also claim) under a *different* asset_id.
env.with_conn(|c| {
c.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
rusqlite::params![
"b".repeat(32),
"file:///elsewhere/foo.md",
"notes/foo.md",
"\"markdown\"",
7_i64,
"0".repeat(64),
"reference",
"/elsewhere/foo.md",
"2024-01-01T00:00:00Z",
],
)
});
let bytes = b"hello, sqlite";
let cs = b3_full_hex(bytes);
let asset = fixed_asset(bytes, bytes.len() as u64, &cs);
let err = store
.put_asset_with_bytes(&asset, bytes)
.expect_err("UPSERT must fail on workspace_path UNIQUE violation");
let msg = format!("{err:#}");
assert!(
msg.to_lowercase().contains("unique") || msg.to_lowercase().contains("constraint"),
"expected UNIQUE constraint failure, got: {msg}"
);
// Final destination must NOT exist (no orphan).
let aa = &asset.asset_id.0[..2];
let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0);
assert!(
!dest.exists(),
"asset bytes were left orphan at {} after UPSERT failure",
dest.display()
);
// No `*.tmp.*` either — temp file must be cleaned up too.
let shard_dir = env.data_dir().join("assets").join(aa);
if let Ok(entries) = std::fs::read_dir(&shard_dir) {
for entry in entries.flatten() {
let name = entry.file_name();
let s = name.to_string_lossy();
assert!(
!s.contains(".tmp."),
"temp file leaked at {}",
entry.path().display()
);
}
}
}
#[test]
fn put_asset_with_bytes_rejects_invalid_asset_id() {
// `kb_core::AssetId(pub String)` lets a hand-construction bypass the
// 32-hex `FromStr` invariant. The store boundary must reject any ID
// whose shape would let path construction escape `data_dir/assets/`.
let env = common::TestEnv::with_threshold(100);
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
// 32 chars but contains a `/` — would let `assets_path_for` stitch
// together a path outside the shard tree.
let evil_id = "../etc/passwd_padded_to_xx_xxxxx".to_string();
assert_eq!(evil_id.len(), 32, "test fixture must be 32 chars to exercise length-only checks");
let mut asset = fixed_asset(b"x", 1, &b3_full_hex(b"x"));
asset.asset_id = AssetId(evil_id.clone());
let err = store
.put_asset_with_bytes(&asset, b"x")
.expect_err("must reject non-hex AssetId");
let msg = format!("{err:#}");
assert!(
msg.contains("invalid AssetId shape"),
"expected AssetId-shape rejection, got: {msg}"
);
// And the bytes must NOT have been staged anywhere under the assets
// tree (no I/O should have happened before validation).
let assets_dir = env.data_dir().join("assets");
if assets_dir.exists() {
for entry in std::fs::read_dir(&assets_dir).unwrap().flatten() {
// Recurse one level into shard dirs and assert empty.
if let Some(sub) = std::fs::read_dir(entry.path()).unwrap().flatten().next() {
panic!(
"invalid AssetId still produced filesystem artifact at {}",
sub.path().display()
);
}
}
}
}
#[test]
fn checksum_mismatch_returns_conflict() {
let env = common::TestEnv::new();
let store = SqliteStore::open(&env.config()).unwrap();
store.run_migrations().unwrap();
let bytes = b"the real bytes";
// Tampered checksum: hash a different payload.
let wrong_cs = b3_full_hex(b"different bytes");
let asset = fixed_asset(bytes, bytes.len() as u64, &wrong_cs);
let err = store
.put_asset_with_bytes(&asset, bytes)
.expect_err("must reject checksum mismatch");
let msg = format!("{err:#}");
assert!(
msg.contains("checksum mismatch") || msg.contains("conflict"),
"expected Conflict-flavoured error, got: {msg}"
);
}