feat(store-sqlite): add truncate_embedding_records helper

Wipes every row from embedding_records and returns the deleted row
count. Used by the upcoming `kebab reset --vector-only` to keep SQLite
consistent after the on-disk Lance store is removed.

Plan deviation from the original spec (task 1):
- Original test plan opened SqliteStore with a raw path; the actual
  signature is `SqliteStore::open(&Config)`, so the integration test
  builds a Config with `storage.data_dir` pointed at a tempdir.
- Original return type was Result<()>; bumped to Result<u64> so the
  caller (kebab-app::reset) can surface the truncated count in the
  reset_report.v1 wire payload without a separate COUNT query.

p9-fb-06 task 1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-02 18:08:22 +00:00
parent 8784606028
commit cf65afaef0
2 changed files with 135 additions and 0 deletions

View File

@@ -107,6 +107,22 @@ impl SqliteStore {
/// WHERE embedding_id IN (?, ?, …)`) inside one transaction —
/// avoids the per-row `execute()` round-trip the previous
/// implementation paid.
// NOTE(review): the `///` lines immediately above this point look like the
// tail of the doc comment for `mark_embedding_records_committed` (they
// describe a `WHERE embedding_id IN (…)` update). If so, this method was
// inserted between that comment and its function — confirm and relocate.
///
/// Delete all rows of `embedding_records` and report how many were
/// deleted.
///
/// Intended for `kebab reset --vector-only`: once the Lance store has
/// been removed from disk, clearing this table keeps SQLite from
/// referencing vector rows that no longer exist.
///
/// Only `embedding_records` is touched. `chunks` and `documents` are
/// left in place so a later `kebab ingest` can re-embed the existing
/// chunks without parsing the sources again.
///
/// # Errors
///
/// Returns the underlying `execute` error, wrapped with the statement
/// text as context, if the `DELETE` fails.
pub fn truncate_embedding_records(&self) -> Result<u64> {
    let deleted = self
        .lock_conn()
        .execute("DELETE FROM embedding_records", [])
        .context("DELETE FROM embedding_records")?;
    Ok(deleted as u64)
}
pub fn mark_embedding_records_committed(
&self,
embedding_ids: &[String],

View File

@@ -0,0 +1,119 @@
//! `truncate_embedding_records` wipes every row regardless of status.
//!
//! Used by `kebab reset --vector-only` to keep SQLite in sync after the
//! Lance vector store is deleted off-disk. The helper is exposed at the
//! integration-test boundary so consumers (kebab-app's reset module) can
//! verify its semantics without reaching into private store internals.
use kebab_config::Config;
use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore};
use rusqlite::params;
use tempfile::TempDir;
use time::OffsetDateTime;
/// Build a default [`Config`] whose `storage.data_dir` is the path of the
/// given temporary directory (converted lossily to a `String`).
fn config_for(tmp: &TempDir) -> Config {
    let data_dir = tmp.path().to_string_lossy().into_owned();
    let mut cfg = Config::defaults();
    cfg.storage.data_dir = data_dir;
    cfg
}
/// Open a [`SqliteStore`] configured via [`config_for`] and run its
/// migrations. Panics if opening or migrating fails, which is the desired
/// behavior inside a test.
fn open_store(tmp: &TempDir) -> SqliteStore {
    let opened = SqliteStore::open(&config_for(tmp)).unwrap();
    opened.run_migrations().unwrap();
    opened
}
/// Insert one asset, one document that references it, and one chunk keyed
/// by `chunk_id` that references the document — in that order — so that an
/// `embedding_records` row pointing at `chunk_id` satisfies the chunks FK.
///
/// This duplicates the private `seed_chunk` helper of the in-crate
/// `embeddings::tests` module, which integration tests cannot reach from
/// outside the crate.
///
/// Panics if any of the three inserts fails.
fn seed_chunk(store: &SqliteStore, chunk_id: &str) {
    // Fixture values reused across the inserts below.
    const ASSET_ID: &str = "0123456789abcdef0123456789abcdef";
    const DOC_ID: &str = "fedcba9876543210fedcba9876543210";
    const EPOCH: &str = "1970-01-01T00:00:00Z";

    let db = store.read_conn();

    // 1. Parent asset row.
    // NOTE(review): the fourth bound value (media_type) is "{}" — looks like
    // a placeholder rather than a real media type; confirm it is intended.
    let insert_asset = "INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, ?, ?, ?, 'reference', '/tmp/x', ?)";
    db.execute(
        insert_asset,
        params![
            ASSET_ID,
            "file:///tmp/x",
            "x.md",
            "{}",
            0_i64,
            "deadbeef",
            EPOCH,
        ],
    )
    .unwrap();

    // 2. Document row pointing at the asset.
    let insert_document = "INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, parser_version, doc_version, schema_version,
metadata_json, provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, NULL, 'fs', 'unverified', 'v1', 1, 1, '{}', '{}', ?, ?)";
    db.execute(
        insert_document,
        params![DOC_ID, ASSET_ID, "x.md", EPOCH, EPOCH],
    )
    .unwrap();

    // 3. Chunk row pointing at the document; this is the row the caller's
    //    embedding record will reference.
    let insert_chunk = "INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)";
    db.execute(insert_chunk, params![chunk_id, DOC_ID, EPOCH])
        .unwrap();
}
/// Return the current number of rows in `embedding_records`, panicking if
/// the query fails.
fn count_rows(store: &SqliteStore) -> i64 {
    let total: i64 = store
        .read_conn()
        .query_row("SELECT COUNT(*) FROM embedding_records", [], |row| {
            row.get(0)
        })
        .unwrap();
    total
}
#[test]
/// With a single pending embedding record present, truncation must report
/// one removed row and leave the table empty.
fn truncate_removes_all_rows_and_returns_count() {
    let dir = TempDir::new().unwrap();
    let sqlite = open_store(&dir);

    // The embedding record needs an existing chunk to reference.
    let chunk_id = "11112222333344445555666677778888";
    seed_chunk(&sqlite, chunk_id);

    let pending = EmbeddingRecordRow {
        embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(),
        chunk_id: chunk_id.to_string(),
        model_id: "test-model".to_string(),
        model_version: "v1".to_string(),
        dimensions: 4,
        lance_table: "chunk_embeddings_test_model_4".to_string(),
        created_at: OffsetDateTime::now_utc(),
    };
    sqlite
        .put_embedding_records_pending(std::slice::from_ref(&pending))
        .unwrap();
    assert_eq!(count_rows(&sqlite), 1);

    let deleted = sqlite.truncate_embedding_records().unwrap();
    assert_eq!(deleted, 1);
    assert_eq!(count_rows(&sqlite), 0);
}
#[test]
/// Truncating a freshly migrated store (no embedding records) must succeed,
/// report zero removed rows, and leave the table empty.
fn truncate_on_empty_table_is_noop() {
    let dir = TempDir::new().unwrap();
    let sqlite = open_store(&dir);

    let deleted = sqlite.truncate_embedding_records().unwrap();

    assert_eq!(deleted, 0);
    assert_eq!(count_rows(&sqlite), 0);
}