feat(cli): kebab reset (p9-fb-06) #49

Merged
altair823 merged 7 commits from feat/p9-fb-06-reset into main 2026-05-02 18:45:25 +00:00
2 changed files with 135 additions and 0 deletions
Showing only changes of commit cf65afaef0 - Show all commits

View File

@@ -107,6 +107,22 @@ impl SqliteStore {
/// WHERE embedding_id IN (?, ?, …)`) inside one transaction —
/// avoids the per-row `execute()` round-trip the previous
/// implementation paid.
/// Wipe every row from `embedding_records`, returning the count of

(critical / doc 흡수 버그) truncate_embedding_records 의 doc comment 가 mark_embedding_records_committed 함수의 doc comment 직후에 인접 — Rust 의 /// 는 다음 항목에 attach 되므로 위쪽 14 줄짜리 mark_committed doc 이 truncate 의 doc 으로 흡수되고, mark_committed 자체는 doc 없이 pub fn 만 남음.

Why: 이 함수의 doc 은 phase-3 commit 의 "왜 WHERE status='pending' 만 대상인가 / 왜 단일 transaction 인가" 같은 design rationale 을 담고 있어서 잃어버리면 미래 reader 가 WHERE status='pending' 의 의도 (tombstone 보존, 중복 batch 방지) 를 추적 불가.

How to apply: truncate_embedding_recordsimpl SqliteStore 블록 (mark_committed 의 닫는 } 다음) 으로 옮긴다. plan 의 원래 의도 ("Append at the end of impl SqliteStore") 와도 일치. 옮긴 후 mark_committed 의 doc comment 14 줄이 다시 그 함수에 attach 되는지 확인.

(critical / doc 흡수 버그) `truncate_embedding_records` 의 doc comment 가 `mark_embedding_records_committed` 함수의 doc comment 직후에 인접 — Rust 의 `///` 는 다음 항목에 attach 되므로 위쪽 14 줄짜리 mark_committed doc 이 truncate 의 doc 으로 흡수되고, mark_committed 자체는 doc 없이 `pub fn` 만 남음. Why: 이 함수의 doc 은 phase-3 commit 의 "왜 WHERE status='pending' 만 대상인가 / 왜 단일 transaction 인가" 같은 design rationale 을 담고 있어서 잃어버리면 미래 reader 가 `WHERE status='pending'` 의 의도 (tombstone 보존, 중복 batch 방지) 를 추적 불가. How to apply: `truncate_embedding_records` 를 `impl SqliteStore` 블록 **끝** (mark_committed 의 닫는 `}` 다음) 으로 옮긴다. plan 의 원래 의도 ("Append at the end of `impl SqliteStore`") 와도 일치. 옮긴 후 mark_committed 의 doc comment 14 줄이 다시 그 함수에 attach 되는지 확인.
/// rows that were removed. Called by `kebab reset --vector-only` so
/// SQLite cannot point at a Lance row that the reset just removed
/// off-disk.
///
/// The function does NOT cascade to `chunks` or `documents` — those
/// are kept so the next `kebab ingest` re-embeds the existing chunk
/// set without re-parsing.
pub fn truncate_embedding_records(&self) -> Result<u64> {
let conn = self.lock_conn();
let n = conn
.execute("DELETE FROM embedding_records", [])
.context("DELETE FROM embedding_records")?;
Ok(n as u64)
}
pub fn mark_embedding_records_committed(
&self,
embedding_ids: &[String],

View File

@@ -0,0 +1,119 @@
//! `truncate_embedding_records` wipes every row regardless of status.
//!
//! Used by `kebab reset --vector-only` to keep SQLite in sync after the
//! Lance vector store is deleted off-disk. The helper is exposed at the
//! integration-test boundary so consumers (kebab-app's reset module) can
//! verify its semantics without reaching into private store internals.
use kebab_config::Config;
use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore};
use rusqlite::params;
use tempfile::TempDir;
use time::OffsetDateTime;
fn config_for(tmp: &TempDir) -> Config {
let mut c = Config::defaults();
c.storage.data_dir = tmp.path().to_string_lossy().into_owned();
c
}
fn open_store(tmp: &TempDir) -> SqliteStore {
let cfg = config_for(tmp);
let store = SqliteStore::open(&cfg).unwrap();
store.run_migrations().unwrap();
store
}
/// Seed an asset + document + chunk so an `embedding_records` row inserted
/// against `chunk_id` does not violate the chunks FK. Mirrors the helper
/// used by the in-crate `embeddings::tests` module — copied here because
/// integration tests cannot reach the private `seed_chunk` from outside
/// the crate.
fn seed_chunk(store: &SqliteStore, chunk_id: &str) {
let conn = store.read_conn();
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, ?, ?, ?, 'reference', '/tmp/x', ?)",
params![
"0123456789abcdef0123456789abcdef",
"file:///tmp/x",
"x.md",
"{}",
0_i64,
"deadbeef",
"1970-01-01T00:00:00Z",
],
)
.unwrap();
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, parser_version, doc_version, schema_version,
metadata_json, provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, NULL, 'fs', 'unverified', 'v1', 1, 1, '{}', '{}', ?, ?)",
params![
"fedcba9876543210fedcba9876543210",
"0123456789abcdef0123456789abcdef",
"x.md",
"1970-01-01T00:00:00Z",
"1970-01-01T00:00:00Z",
],
)
.unwrap();
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)",
params![
chunk_id,
"fedcba9876543210fedcba9876543210",
"1970-01-01T00:00:00Z"
],
)
.unwrap();
}
fn count_rows(store: &SqliteStore) -> i64 {
let conn = store.read_conn();
conn.query_row("SELECT COUNT(*) FROM embedding_records", [], |r| r.get(0))
.unwrap()
}
#[test]
fn truncate_removes_all_rows_and_returns_count() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
let chunk = "11112222333344445555666677778888";
seed_chunk(&store, chunk);
let row = EmbeddingRecordRow {
embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(),
chunk_id: chunk.to_string(),
model_id: "test-model".to_string(),
model_version: "v1".to_string(),
dimensions: 4,
lance_table: "chunk_embeddings_test_model_4".to_string(),
created_at: OffsetDateTime::now_utc(),
};
store
.put_embedding_records_pending(std::slice::from_ref(&row))
.unwrap();
assert_eq!(count_rows(&store), 1);
let removed = store.truncate_embedding_records().unwrap();
assert_eq!(removed, 1);
assert_eq!(count_rows(&store), 0);
}
#[test]
fn truncate_on_empty_table_is_noop() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
let removed = store.truncate_embedding_records().unwrap();
assert_eq!(removed, 0);
assert_eq!(count_rows(&store), 0);
}