diff --git a/crates/kebab-store-sqlite/src/embeddings.rs b/crates/kebab-store-sqlite/src/embeddings.rs index 0ef76a2..fcf3d34 100644 --- a/crates/kebab-store-sqlite/src/embeddings.rs +++ b/crates/kebab-store-sqlite/src/embeddings.rs @@ -107,6 +107,22 @@ impl SqliteStore { /// WHERE embedding_id IN (?, ?, …)`) inside one transaction — /// avoids the per-row `execute()` round-trip the previous /// implementation paid. + /// Wipe every row from `embedding_records`, returning the count of + /// rows that were removed. Called by `kebab reset --vector-only` so + /// SQLite cannot point at a Lance row that the reset just removed + /// off-disk. + /// + /// The function does NOT cascade to `chunks` or `documents` — those + /// are kept so the next `kebab ingest` re-embeds the existing chunk + /// set without re-parsing. + pub fn truncate_embedding_records(&self) -> Result { + let conn = self.lock_conn(); + let n = conn + .execute("DELETE FROM embedding_records", []) + .context("DELETE FROM embedding_records")?; + Ok(n as u64) + } + pub fn mark_embedding_records_committed( &self, embedding_ids: &[String], diff --git a/crates/kebab-store-sqlite/tests/truncate_embeddings.rs b/crates/kebab-store-sqlite/tests/truncate_embeddings.rs new file mode 100644 index 0000000..a24fc3e --- /dev/null +++ b/crates/kebab-store-sqlite/tests/truncate_embeddings.rs @@ -0,0 +1,119 @@ +//! `truncate_embedding_records` wipes every row regardless of status. +//! +//! Used by `kebab reset --vector-only` to keep SQLite in sync after the +//! Lance vector store is deleted off-disk. The helper is exposed at the +//! integration-test boundary so consumers (kebab-app's reset module) can +//! verify its semantics without reaching into private store internals. + +use kebab_config::Config; +use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore}; +use rusqlite::params; +use tempfile::TempDir; +use time::OffsetDateTime; + +fn config_for(tmp: &TempDir) -> Config { + let mut c = Config::defaults(); + c.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + c +} + +fn open_store(tmp: &TempDir) -> SqliteStore { + let cfg = config_for(tmp); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + store +} + +/// Seed an asset + document + chunk so an `embedding_records` row inserted +/// against `chunk_id` does not violate the chunks FK. Mirrors the helper +/// used by the in-crate `embeddings::tests` module — copied here because +/// integration tests cannot reach the private `seed_chunk` from outside +/// the crate. +fn seed_chunk(store: &SqliteStore, chunk_id: &str) { + let conn = store.read_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, ?, ?, 'reference', '/tmp/x', ?)", + params![ + "0123456789abcdef0123456789abcdef", + "file:///tmp/x", + "x.md", + "{}", + 0_i64, + "deadbeef", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, NULL, 'fs', 'unverified', 'v1', 1, 1, '{}', '{}', ?, ?)", + params![ + "fedcba9876543210fedcba9876543210", + "0123456789abcdef0123456789abcdef", + "x.md", + "1970-01-01T00:00:00Z", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)", + params![ + chunk_id, + "fedcba9876543210fedcba9876543210", + "1970-01-01T00:00:00Z" + ], + ) + .unwrap(); +} + +fn count_rows(store: &SqliteStore) -> i64 { + let conn = store.read_conn(); + conn.query_row("SELECT COUNT(*) FROM embedding_records", [], |r| r.get(0)) + .unwrap() +} + +#[test] +fn truncate_removes_all_rows_and_returns_count() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let chunk = "11112222333344445555666677778888"; + seed_chunk(&store, chunk); + + let row = EmbeddingRecordRow { + embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(), + chunk_id: chunk.to_string(), + model_id: "test-model".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "chunk_embeddings_test_model_4".to_string(), + created_at: OffsetDateTime::now_utc(), + }; + store + .put_embedding_records_pending(std::slice::from_ref(&row)) + .unwrap(); + assert_eq!(count_rows(&store), 1); + + let removed = store.truncate_embedding_records().unwrap(); + assert_eq!(removed, 1); + assert_eq!(count_rows(&store), 0); +} + +#[test] +fn truncate_on_empty_table_is_noop() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let removed = store.truncate_embedding_records().unwrap(); + assert_eq!(removed, 0); + assert_eq!(count_rows(&store), 0); +}