feat(store): V011 embedding_records FK 제거 + CASCADE 대체 명시 DELETE (sentinel 별칭 벡터)

별칭 dense 벡터를 sentinel chunk_id({orig}#alias)로 색인하려면 chunks 에 없는
chunk_id 가 embedding_records 에 들어가야 한다. V001 의 chunk_id REFERENCES chunks
ON DELETE CASCADE FK 가 이를 SQLite 787 로 막으므로 테이블을 FK 없이 재생성한다.
status/vector_committed(V003) + 3개 인덱스 보존, chunks_bd_tombstone_embeddings
trigger 무수정. DROP→RENAME 시 dangling trigger 재파싱을 피하려 legacy_alter_table=ON.

사라진 CASCADE 는 put_chunks + purge 두 경로(purge_orphan_at_workspace_path,
purge_deleted_workspace_path)의 명시 DELETE 로 대체 — chunks 삭제 직전 원본 +
{id}#alias sentinel embedding_records 를 함께 정리. corpus_revision baseline 2→3.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-30 13:41:20 +00:00
parent d279f343e7
commit 483b1ec06b
5 changed files with 254 additions and 12 deletions

View File

@@ -98,6 +98,18 @@ impl kebab_core::DocumentStore for SqliteStore {
.context("format chunk created_at")?;
let mut conn = self.lock_conn();
let tx = conn.transaction().map_err(StoreError::from)?;
// CASCADE 제거(V011) 대체: 이 doc 의 chunk 임베딩 레코드를 명시 정리.
// 원본 + sentinel({id}#alias) 둘 다. 별칭 dense 벡터(sentinel chunk_id)는
// chunks FK 가 없어 CASCADE 로 자동 정리되지 않으므로 여기서 직접 지운다.
// chunks 행이 살아있는 동안(아래 DELETE FROM chunks 직전) 실행해야 서브쿼리가
// chunk_id 를 본다. 설계 spec 2026-05-30-dense-alias-vectors-design.md §3.5-2.
tx.execute(
"DELETE FROM embedding_records WHERE chunk_id IN \
(SELECT chunk_id FROM chunks WHERE doc_id = ?1 \
UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id = ?1)",
params![doc.0],
)
.map_err(StoreError::from)?;
tx.execute("DELETE FROM chunks WHERE doc_id = ?", params![doc.0])
.map_err(StoreError::from)?;
let mut stmt = tx

View File

@@ -627,7 +627,20 @@ pub(crate) fn purge_orphan_at_workspace_path(
return Ok(());
};
// documents → blocks / chunks / embedding_records via CASCADE.
// CASCADE 제거(V011) 대체: 이 asset 의 문서 chunk 임베딩 레코드를 명시 정리.
// 원본 + sentinel({id}#alias) 둘 다. 별칭 dense 벡터는 chunks FK 가 없어
// documents→chunks CASCADE 로 자동 정리되지 않으므로 chunks 가 살아있는 동안
// 직접 지운다. 설계 spec 2026-05-30-dense-alias-vectors-design.md §3.5-2.
conn.execute(
"DELETE FROM embedding_records WHERE chunk_id IN \
(SELECT chunk_id FROM chunks WHERE doc_id IN \
(SELECT doc_id FROM documents WHERE asset_id = ?1) \
UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id IN \
(SELECT doc_id FROM documents WHERE asset_id = ?1))",
params![stale_asset_id],
)
.map_err(StoreError::from)?;
// documents → blocks / chunks via CASCADE.
conn.execute(
"DELETE FROM documents WHERE asset_id = ?",
params![stale_asset_id],
@@ -706,8 +719,20 @@ pub fn purge_deleted_workspace_path(
.map_err(StoreError::from)?;
drop(stmt);
// 2. DELETE the document row (CASCADE clears blocks / chunks /
// embedding_records via the FK constraints in V001).
// 1b. CASCADE 제거(V011) 대체: chunk 임베딩 레코드를 명시 정리(원본 +
// sentinel {id}#alias). 별칭 dense 벡터는 chunks FK 가 없어
// documents→chunks CASCADE 로 자동 정리되지 않는다. chunks 가
// 살아있는 동안(2번 DELETE 직전) 실행. spec §3.5-2.
conn.execute(
"DELETE FROM embedding_records WHERE chunk_id IN \
(SELECT chunk_id FROM chunks WHERE doc_id = ?1 \
UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id = ?1)",
rusqlite::params![doc_id],
)
.map_err(StoreError::from)?;
// 2. DELETE the document row (CASCADE clears blocks / chunks via the
// FK constraints in V001; embedding_records handled above).
conn.execute(
"DELETE FROM documents WHERE doc_id = ?",
rusqlite::params![doc_id],

View File

@@ -20,26 +20,26 @@ fn open_store(tmp: &TempDir) -> SqliteStore {
store
}
/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then both V009
/// and V010 migrations bump it by one each to invalidate any stale LRU
/// cache — so a fresh store after `run_migrations()` reads back as `2`.
/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then V009,
/// V010, and V011 migrations bump it by one each to invalidate any stale
/// LRU cache — so a fresh store after `run_migrations()` reads back as `3`.
#[test]
fn fresh_store_starts_at_post_migration_baseline() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
assert_eq!(store.corpus_revision(), 2);
assert_eq!(store.corpus_revision(), 3);
}
/// Each `bump_corpus_revision` returns the new value monotonically
/// from the post-migration baseline (V009 + V010 → 2).
/// from the post-migration baseline (V009 + V010 + V011 → 3).
#[test]
fn bump_increments_monotonically() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
assert_eq!(store.bump_corpus_revision().unwrap(), 3);
assert_eq!(store.bump_corpus_revision().unwrap(), 4);
assert_eq!(store.bump_corpus_revision().unwrap(), 5);
assert_eq!(store.corpus_revision(), 5);
assert_eq!(store.bump_corpus_revision().unwrap(), 6);
assert_eq!(store.corpus_revision(), 6);
}
/// `corpus_revision` survives a store re-open (persisted in SQLite).
@@ -52,6 +52,6 @@ fn revision_persists_across_reopen() {
store.bump_corpus_revision().unwrap();
} // store dropped — file closed
let store = open_store(&tmp);
assert_eq!(store.corpus_revision(), 4);
assert_eq!(store.bump_corpus_revision().unwrap(), 5);
assert_eq!(store.corpus_revision(), 5);
assert_eq!(store.bump_corpus_revision().unwrap(), 6);
}

View File

@@ -0,0 +1,164 @@
//! V011: `embedding_records.chunk_id` FK 제거 + CASCADE 대체 명시 DELETE.
//!
//! 별칭 dense 벡터는 sentinel chunk_id(`{orig}#alias`)로 색인되는데, 이 id 는
//! `chunks` 에 행이 없다. V001 의 `chunk_id REFERENCES chunks ON DELETE CASCADE`
//! FK 가 살아 있으면 sentinel `embedding_records` INSERT 가 SQLite 787 로 실패한다.
//! V011 이 FK 를 제거하고, 사라진 CASCADE 는 `put_chunks` / purge 경로의 명시
//! DELETE 로 대체한다(설계 spec 2026-05-30-dense-alias-vectors-design.md §3.5).
use kebab_config::Config;
use kebab_core::{
Chunk, ChunkId, ChunkerVersion, DocumentId, DocumentStore,
};
use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore};
use rusqlite::params;
use tempfile::TempDir;
use time::OffsetDateTime;
/// Opens a `SqliteStore` whose data directory is the given temporary
/// directory and runs all migrations on it before handing it back.
///
/// Panics if opening the store or running the migrations fails.
fn open_store(tmp: &TempDir) -> SqliteStore {
    let data_dir = tmp.path().to_string_lossy().into_owned();

    let mut config = Config::defaults();
    config.storage.data_dir = data_dir;

    let store = SqliteStore::open(&config).unwrap();
    store.run_migrations().unwrap();
    store
}
/// Document id shared by the fixtures in this file: `seed_chunk` inserts the
/// `documents` and `chunks` rows under it, and the re-ingest test passes the
/// same id to `put_chunks`.
const DOC_ID: &str = "fedcba9876543210fedcba9876543210";
/// Seed asset + document + one chunk so the *original* chunk_id has a
/// `chunks` row. The sentinel `{chunk_id}#alias` deliberately gets NO
/// chunks row — that is the case V011 must allow.
///
/// Inserts, in order: one `assets` row with a fixed asset id, one
/// `documents` row (`DOC_ID`) that references that asset, and one `chunks`
/// row (`chunk_id`) that references that document.
///
/// Panics if any of the three INSERTs fails (each result is unwrapped).
fn seed_chunk(store: &SqliteStore, chunk_id: &str) {
// NOTE(review): the INSERTs below go through `read_conn()`; presumably the
// returned handle is writable despite its name — confirm against SqliteStore.
let conn = store.read_conn();
// 1. Asset row. Its asset id is repeated verbatim in the document insert below.
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, '{}', 0, 'deadbeefdeadbeefdeadbeefdeadbeef',
'reference', '/tmp/x', '1970-01-01T00:00:00Z')",
params!["0123456789abcdef0123456789abcdef", "file:///tmp/x", "x.md"],
)
.unwrap();
// 2. Document row keyed by `DOC_ID`, pointing at the asset inserted above.
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, parser_version, doc_version, schema_version,
metadata_json, provenance_json, created_at, updated_at
) VALUES (?, ?, 'x.md', NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
'{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
params![DOC_ID, "0123456789abcdef0123456789abcdef"],
)
.unwrap();
// 3. Chunk row for the caller-supplied `chunk_id`, attached to `DOC_ID`.
//    No row is created for the `#alias` sentinel id.
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
'1970-01-01T00:00:00Z')",
params![chunk_id, DOC_ID],
)
.unwrap();
}
/// Builds an `EmbeddingRecordRow` fixture for the given embedding id and
/// chunk id. All remaining fields are fixed placeholder values
/// (model `m` / `v1`, 4 dimensions, lance table `t`, created at the Unix epoch).
fn embed_row(embedding_id: &str, chunk_id: &str) -> EmbeddingRecordRow {
    let embedding_id = String::from(embedding_id);
    let chunk_id = String::from(chunk_id);
    EmbeddingRecordRow {
        embedding_id,
        chunk_id,
        model_id: String::from("m"),
        model_version: String::from("v1"),
        dimensions: 4,
        lance_table: String::from("t"),
        created_at: OffsetDateTime::UNIX_EPOCH,
    }
}
/// Returns how many `embedding_records` rows carry exactly this `chunk_id`.
///
/// Panics if the count query fails.
fn embed_count(store: &SqliteStore, chunk_id: &str) -> i64 {
    let sql = "SELECT COUNT(*) FROM embedding_records WHERE chunk_id = ?";
    let conn = store.read_conn();
    let count: i64 = conn
        .query_row(sql, params![chunk_id], |row| row.get(0))
        .unwrap();
    count
}
/// After V011, inserting an `embedding_records` row whose chunk_id is a
/// sentinel (an id that has no row in `chunks`) must succeed without a
/// foreign-key violation.
#[test]
fn sentinel_embedding_record_insert_succeeds_without_fk() {
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let c1 = "11111111111111111111111111111111";
    seed_chunk(&store, c1);

    // Sentinel id `{c1}#alias`: only `c1` itself was seeded into `chunks`.
    let sentinel = format!("{c1}{}", kebab_core::ALIAS_SUFFIX);
    let rows = [embed_row("e_sentinel_0000000000000000000000", &sentinel)];

    let result = store.put_embedding_records_pending(&rows);
    assert!(
        result.is_ok(),
        "sentinel embedding_records insert must not violate a chunks FK after V011: {result:?}"
    );

    let persisted = embed_count(&store, &sentinel);
    assert_eq!(persisted, 1, "sentinel embedding row must be persisted");
}
/// Calling `put_chunks` again for a document (re-ingest) must leave zero
/// `embedding_records` rows for both the original chunk id and its sentinel
/// id: the explicit DELETE stands in for the removed CASCADE.
#[test]
fn put_chunks_cleans_original_and_sentinel_embeddings() {
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let c1 = "11111111111111111111111111111111";
    seed_chunk(&store, c1);
    let sentinel = format!("{c1}{}", kebab_core::ALIAS_SUFFIX);

    // Index one embedding record for the original id and one for the
    // sentinel id, then mark both as committed.
    let orig_embedding_id = "e_orig_000000000000000000000000000";
    let sentinel_embedding_id = "e_sentinel_0000000000000000000000";
    let pending = [
        embed_row(orig_embedding_id, c1),
        embed_row(sentinel_embedding_id, &sentinel),
    ];
    store.put_embedding_records_pending(&pending).unwrap();

    let committed_ids = [
        orig_embedding_id.to_string(),
        sentinel_embedding_id.to_string(),
    ];
    store
        .mark_embedding_records_committed(&committed_ids)
        .unwrap();

    // Precondition: both rows are present before the re-ingest.
    assert_eq!(embed_count(&store, c1), 1);
    assert_eq!(embed_count(&store, &sentinel), 1);

    // Re-ingest: write the same chunk again through `put_chunks`. The
    // explicit DELETE is expected to clear the original + sentinel
    // embedding records before the chunk is re-inserted.
    let doc_id = DocumentId(String::from(DOC_ID));
    let chunk = Chunk {
        chunk_id: ChunkId(String::from(c1)),
        doc_id: doc_id.clone(),
        block_ids: vec![],
        text: String::from("hi"),
        heading_path: vec![],
        source_spans: vec![],
        token_estimate: 1,
        chunker_version: ChunkerVersion(String::from("v1")),
        policy_hash: String::from("h"),
        tokenized_korean_text: None,
        aliases: None,
    };
    let chunks = std::slice::from_ref(&chunk);
    store.put_chunks(&doc_id, chunks).unwrap();

    let remaining_original = embed_count(&store, c1);
    assert_eq!(
        remaining_original,
        0,
        "original embedding_records must be cleaned on re-ingest (CASCADE replacement)"
    );

    let remaining_sentinel = embed_count(&store, &sentinel);
    assert_eq!(
        remaining_sentinel,
        0,
        "sentinel embedding_records must be cleaned on re-ingest (no chunks FK → explicit DELETE)"
    );
}