From 483b1ec06b2b34e1af485e4ecf120d6793342ba7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 30 May 2026 13:41:20 +0000 Subject: [PATCH] =?UTF-8?q?feat(store):=20V011=20embedding=5Frecords=20FK?= =?UTF-8?q?=20=EC=A0=9C=EA=B1=B0=20+=20CASCADE=20=EB=8C=80=EC=B2=B4=20?= =?UTF-8?q?=EB=AA=85=EC=8B=9C=20DELETE=20(sentinel=20=EB=B3=84=EC=B9=AD=20?= =?UTF-8?q?=EB=B2=A1=ED=84=B0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 별칭 dense 벡터를 sentinel chunk_id({orig}#alias)로 색인하려면 chunks 에 없는 chunk_id 가 embedding_records 에 들어가야 한다. V001 의 chunk_id REFERENCES chunks ON DELETE CASCADE FK 가 이를 SQLite 787 로 막으므로 테이블을 FK 없이 재생성한다. status/vector_committed(V003) + 3개 인덱스 보존, chunks_bd_tombstone_embeddings trigger 무수정. DROP→RENAME 시 dangling trigger 재파싱을 피하려 legacy_alter_table=ON. 사라진 CASCADE 는 put_chunks + purge 두 경로(purge_orphan_at_workspace_path, purge_deleted_workspace_path)의 명시 DELETE 로 대체 — chunks 삭제 직전 원본 + {id}#alias sentinel embedding_records 를 함께 정리. corpus_revision baseline 2→3. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/kebab-store-sqlite/src/documents.rs | 12 ++ crates/kebab-store-sqlite/src/store.rs | 31 +++- .../tests/corpus_revision.rs | 18 +- .../tests/embedding_records_fk.rs | 164 ++++++++++++++++++ .../V011__drop_embedding_records_fk.sql | 41 +++++ 5 files changed, 254 insertions(+), 12 deletions(-) create mode 100644 crates/kebab-store-sqlite/tests/embedding_records_fk.rs create mode 100644 migrations/V011__drop_embedding_records_fk.sql diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index b1cc3de..b8a964c 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -98,6 +98,18 @@ impl kebab_core::DocumentStore for SqliteStore { .context("format chunk created_at")?; let mut conn = self.lock_conn(); let tx = conn.transaction().map_err(StoreError::from)?; + // Replaces the CASCADE removed in V011: explicitly delete this doc's chunk embedding records, + // both originals and sentinels ({id}#alias). Alias dense vectors (sentinel chunk_id) have + // no chunks FK, so CASCADE never cleans them up automatically; delete them here directly. + // Must run while the chunks rows still exist (just before the DELETE FROM chunks below) so + // the subquery can see each chunk_id. Design spec 2026-05-30-dense-alias-vectors-design.md §3.5-2. + tx.execute( + "DELETE FROM embedding_records WHERE chunk_id IN \ + (SELECT chunk_id FROM chunks WHERE doc_id = ?1 \ + UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id = ?1)", + params![doc.0], + ) + .map_err(StoreError::from)?; tx.execute("DELETE FROM chunks WHERE doc_id = ?", params![doc.0]) .map_err(StoreError::from)?; let mut stmt = tx diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 0948470..1837557 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -627,7 +627,20 @@ pub(crate) fn purge_orphan_at_workspace_path( return Ok(()); }; - // documents → blocks / chunks / embedding_records via CASCADE. 
+ // Replaces the CASCADE removed in V011: explicitly delete the chunk embedding records of this + // asset's documents, both originals and sentinels ({id}#alias). Alias dense vectors have no + // chunks FK, so the documents→chunks CASCADE never cleans them up; delete them directly while + // the chunks rows still exist. Design spec 2026-05-30-dense-alias-vectors-design.md §3.5-2. + conn.execute( + "DELETE FROM embedding_records WHERE chunk_id IN \ + (SELECT chunk_id FROM chunks WHERE doc_id IN \ + (SELECT doc_id FROM documents WHERE asset_id = ?1) \ + UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id IN \ + (SELECT doc_id FROM documents WHERE asset_id = ?1))", + params![stale_asset_id], + ) + .map_err(StoreError::from)?; + // documents → blocks / chunks via CASCADE. conn.execute( "DELETE FROM documents WHERE asset_id = ?", params![stale_asset_id], @@ -706,8 +719,20 @@ pub fn purge_deleted_workspace_path( .map_err(StoreError::from)?; drop(stmt); - // 2. DELETE the document row (CASCADE clears blocks / chunks / - // embedding_records via the FK constraints in V001). + // 1b. Replaces the CASCADE removed in V011: explicitly delete the chunk embedding + // records (originals and {id}#alias sentinels). Alias dense vectors have no chunks FK, + // so the documents→chunks CASCADE never cleans them up. Must run while the chunks + // rows still exist (just before the DELETE in step 2). Spec §3.5-2. + conn.execute( + "DELETE FROM embedding_records WHERE chunk_id IN \ + (SELECT chunk_id FROM chunks WHERE doc_id = ?1 \ + UNION SELECT chunk_id || '#alias' FROM chunks WHERE doc_id = ?1)", + rusqlite::params![doc_id], + ) + .map_err(StoreError::from)?; + + // 2. DELETE the document row (CASCADE clears blocks / chunks via the + // FK constraints in V001; embedding_records handled above). 
conn.execute( "DELETE FROM documents WHERE doc_id = ?", rusqlite::params![doc_id], diff --git a/crates/kebab-store-sqlite/tests/corpus_revision.rs b/crates/kebab-store-sqlite/tests/corpus_revision.rs index 488d81a..1ac5db9 100644 --- a/crates/kebab-store-sqlite/tests/corpus_revision.rs +++ b/crates/kebab-store-sqlite/tests/corpus_revision.rs @@ -20,26 +20,26 @@ fn open_store(tmp: &TempDir) -> SqliteStore { store } -/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then both V009 -/// and V010 migrations bump it by one each to invalidate any stale LRU -/// cache — so a fresh store after `run_migrations()` reads back as `2`. +/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then V009, +/// V010, and V011 migrations bump it by one each to invalidate any stale +/// LRU cache — so a fresh store after `run_migrations()` reads back as `3`. #[test] fn fresh_store_starts_at_post_migration_baseline() { let tmp = TempDir::new().unwrap(); let store = open_store(&tmp); - assert_eq!(store.corpus_revision(), 2); + assert_eq!(store.corpus_revision(), 3); } /// Each `bump_corpus_revision` returns the new value monotonically -/// from the post-migration baseline (V009 + V010 → 2). +/// from the post-migration baseline (V009 + V010 + V011 → 3). #[test] fn bump_increments_monotonically() { let tmp = TempDir::new().unwrap(); let store = open_store(&tmp); - assert_eq!(store.bump_corpus_revision().unwrap(), 3); assert_eq!(store.bump_corpus_revision().unwrap(), 4); assert_eq!(store.bump_corpus_revision().unwrap(), 5); - assert_eq!(store.corpus_revision(), 5); + assert_eq!(store.bump_corpus_revision().unwrap(), 6); + assert_eq!(store.corpus_revision(), 6); } /// `corpus_revision` survives a store re-open (persisted in SQLite). 
@@ -52,6 +52,6 @@ fn revision_persists_across_reopen() { store.bump_corpus_revision().unwrap(); } // store dropped — file closed let store = open_store(&tmp); - assert_eq!(store.corpus_revision(), 4); - assert_eq!(store.bump_corpus_revision().unwrap(), 5); + assert_eq!(store.corpus_revision(), 5); + assert_eq!(store.bump_corpus_revision().unwrap(), 6); } diff --git a/crates/kebab-store-sqlite/tests/embedding_records_fk.rs b/crates/kebab-store-sqlite/tests/embedding_records_fk.rs new file mode 100644 index 0000000..d247a60 --- /dev/null +++ b/crates/kebab-store-sqlite/tests/embedding_records_fk.rs @@ -0,0 +1,164 @@ +//! V011: drop the `embedding_records.chunk_id` FK and replace its CASCADE with explicit DELETEs. +//! +//! Alias dense vectors are indexed under a sentinel chunk_id (`{orig}#alias`), and that id +//! has no row in `chunks`. While V001's `chunk_id REFERENCES chunks ON DELETE CASCADE` +//! FK is in place, inserting a sentinel `embedding_records` row fails with SQLite 787. +//! V011 removes the FK, and the lost CASCADE is replaced by explicit DELETEs in the +//! `put_chunks` / purge paths (design spec 2026-05-30-dense-alias-vectors-design.md §3.5). + +use kebab_config::Config; +use kebab_core::{ + Chunk, ChunkId, ChunkerVersion, DocumentId, DocumentStore, +}; +use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore}; +use rusqlite::params; +use tempfile::TempDir; +use time::OffsetDateTime; + +fn open_store(tmp: &TempDir) -> SqliteStore { + let mut c = Config::defaults(); + c.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + let store = SqliteStore::open(&c).unwrap(); + store.run_migrations().unwrap(); + store +} + +const DOC_ID: &str = "fedcba9876543210fedcba9876543210"; + +/// Seed asset + document + one chunk so the *original* chunk_id has a +/// `chunks` row. The sentinel `{chunk_id}#alias` deliberately gets NO +/// chunks row — that is the case V011 must allow. 
+fn seed_chunk(store: &SqliteStore, chunk_id: &str) { + let conn = store.read_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '{}', 0, 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', '/tmp/x', '1970-01-01T00:00:00Z')", + params!["0123456789abcdef0123456789abcdef", "file:///tmp/x", "x.md"], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, 'x.md', NULL, 'en', 'markdown', 'primary', 'v1', 1, 1, + '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![DOC_ID, "0123456789abcdef0123456789abcdef"], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]', + '1970-01-01T00:00:00Z')", + params![chunk_id, DOC_ID], + ) + .unwrap(); +} + +fn embed_row(embedding_id: &str, chunk_id: &str) -> EmbeddingRecordRow { + EmbeddingRecordRow { + embedding_id: embedding_id.to_string(), + chunk_id: chunk_id.to_string(), + model_id: "m".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "t".to_string(), + created_at: OffsetDateTime::UNIX_EPOCH, + } +} + +fn embed_count(store: &SqliteStore, chunk_id: &str) -> i64 { + let conn = store.read_conn(); + conn.query_row( + "SELECT COUNT(*) FROM embedding_records WHERE chunk_id = ?", + params![chunk_id], + |r| r.get::<_, i64>(0), + ) + .unwrap() +} + +/// After V011, inserting an `embedding_records` row with a sentinel chunk_id (an id that is +/// absent from `chunks`) must succeed without an FK violation. 
+#[test] +fn sentinel_embedding_record_insert_succeeds_without_fk() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + seed_chunk(&store, c1); + + // sentinel: `{c1}#alias`, which has no row in chunks. + let sentinel = format!("{c1}{}", kebab_core::ALIAS_SUFFIX); + let result = + store.put_embedding_records_pending(&[embed_row("e_sentinel_0000000000000000000000", &sentinel)]); + assert!( + result.is_ok(), + "sentinel embedding_records insert must not violate a chunks FK after V011: {result:?}" + ); + assert_eq!( + embed_count(&store, &sentinel), + 1, + "sentinel embedding row must be persisted" + ); +} + +/// When `put_chunks` is called again (re-ingest), the explicit DELETE must clean up both the +/// original and sentinel `embedding_records` of that doc, leaving no orphans (CASCADE replacement). +#[test] +fn put_chunks_cleans_original_and_sentinel_embeddings() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + seed_chunk(&store, c1); + let sentinel = format!("{c1}{}", kebab_core::ALIAS_SUFFIX); + + // Index the original and sentinel embedding_records (committed). + store + .put_embedding_records_pending(&[ + embed_row("e_orig_000000000000000000000000000", c1), + embed_row("e_sentinel_0000000000000000000000", &sentinel), + ]) + .unwrap(); + store + .mark_embedding_records_committed(&[ + "e_orig_000000000000000000000000000".to_string(), + "e_sentinel_0000000000000000000000".to_string(), + ]) + .unwrap(); + assert_eq!(embed_count(&store, c1), 1); + assert_eq!(embed_count(&store, &sentinel), 1); + + // Re-ingest: write the same chunk again via put_chunks. The explicit DELETE + // clears the original and sentinel embedding_records before the chunk is re-inserted. 
+ let doc_id = DocumentId(DOC_ID.to_string()); + let chunk = Chunk { + chunk_id: ChunkId(c1.to_string()), + doc_id: doc_id.clone(), + block_ids: Vec::new(), + text: "hi".to_string(), + heading_path: Vec::new(), + source_spans: Vec::new(), + token_estimate: 1, + chunker_version: ChunkerVersion("v1".to_string()), + policy_hash: "h".to_string(), + tokenized_korean_text: None, + aliases: None, + }; + store.put_chunks(&doc_id, std::slice::from_ref(&chunk)).unwrap(); + + assert_eq!( + embed_count(&store, c1), + 0, + "original embedding_records must be cleaned on re-ingest (CASCADE replacement)" + ); + assert_eq!( + embed_count(&store, &sentinel), + 0, + "sentinel embedding_records must be cleaned on re-ingest (no chunks FK → explicit DELETE)" + ); +} diff --git a/migrations/V011__drop_embedding_records_fk.sql b/migrations/V011__drop_embedding_records_fk.sql new file mode 100644 index 0000000..9f156f2 --- /dev/null +++ b/migrations/V011__drop_embedding_records_fk.sql @@ -0,0 +1,41 @@ +-- V011__drop_embedding_records_fk.sql — drop the embedding_records.chunk_id FK +-- so that vectors keyed by a sentinel chunk_id ({orig}#alias, an id absent from chunks) are allowed +-- (design spec 2026-05-30-dense-alias-vectors-design.md §3.5-1). SQLite cannot drop an FK via +-- ALTER, so the table is recreated. status/vector_committed (V003) and the indexes are preserved. +-- The removed CASCADE is replaced by explicit DELETEs in put_chunks/purge (§3.5-2). +PRAGMA foreign_keys=OFF; +-- legacy_alter_table=ON: right after DROP embedding_records, V003's +-- chunks_bd_tombstone_embeddings trigger (on the still-existing chunks table) +-- is left dangling, referencing the embedding_records table that is gone. If the later RENAME +-- ran in the default (legacy off) mode it would re-parse the whole schema and fail in that +-- trigger with "no such table: embedding_records". Legacy mode skips re-parsing +-- trigger/view bodies on RENAME, so it passes without touching the trigger +-- (see the table-redefinition procedure in the SQLite ALTER TABLE documentation). 
+PRAGMA legacy_alter_table=ON; + +CREATE TABLE embedding_records_new ( + embedding_id TEXT PRIMARY KEY, + chunk_id TEXT NOT NULL, -- FK removed (was REFERENCES chunks ON DELETE CASCADE) + model_id TEXT NOT NULL, + model_version TEXT NOT NULL, + dimensions INTEGER NOT NULL, + lance_table TEXT NOT NULL, + created_at TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + vector_committed INTEGER NOT NULL DEFAULT 0, + UNIQUE(chunk_id, model_id, model_version, dimensions) +); +INSERT INTO embedding_records_new + SELECT embedding_id, chunk_id, model_id, model_version, dimensions, + lance_table, created_at, status, vector_committed + FROM embedding_records; +DROP TABLE embedding_records; +ALTER TABLE embedding_records_new RENAME TO embedding_records; +CREATE INDEX idx_embed_chunk ON embedding_records(chunk_id); +CREATE INDEX idx_embed_model ON embedding_records(model_id, model_version, dimensions); +CREATE INDEX idx_embed_status ON embedding_records(status); + +PRAGMA legacy_alter_table=OFF; +PRAGMA foreign_keys=ON; + +UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';