diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 7ee324e..3f0f413 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -615,6 +615,7 @@ fn ingest_one_asset( // (per-document tx semantics per design §5.8); composing them is // the kb-app job. A failure mid-way leaves the DB in a state the // next ingest run can re-converge (UPSERT + DELETE-then-INSERT). + purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; app.sqlite .put_asset_with_bytes(asset, &bytes) .context("DocumentStore::put_asset_with_bytes")?; @@ -841,6 +842,7 @@ fn ingest_one_image_asset( .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?; // 5. Persist + embed — identical sequence to markdown. + purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; app.sqlite .put_asset_with_bytes(asset, &bytes) .context("DocumentStore::put_asset_with_bytes (image)")?; @@ -949,6 +951,46 @@ fn record_image_analysis_failure( warning_notes.push(note); } +/// HOTFIXES 2026-05-02 P7-3 follow-up: when a tracked file's bytes +/// change, `purge_orphan_at_workspace_path` (in `kebab-store-sqlite`) +/// sweeps the SQLite chain (documents → blocks / chunks / embedding_records) +/// but the LanceDB rows keyed on the now-deleted `chunk_id`s live in a +/// separate store. This helper fetches the stale `chunk_id`s from +/// SQLite **before** they get cascade-deleted, then deletes the +/// matching vectors from every Lance table. +/// +/// Called by every per-medium ingest helper at the same point — +/// immediately before `put_asset_with_bytes` runs, so the SELECT +/// still sees the old chunk_ids and the DELETE happens before the +/// new rows land. Empty workspace_path / no embedder → no-op. +fn purge_vector_orphans_for_workspace_path( + app: &App, + asset: &RawAsset, + vector_store: Option<&Arc>, +) -> anyhow::Result<()> { + let Some(vec_store) = vector_store else { + return Ok(()); + }; + let stale = app + .sqlite + .stale_chunk_ids_at(&asset.workspace_path.0, &asset.asset_id.0) + .context("SqliteStore::stale_chunk_ids_at")?; + if stale.is_empty() { + return Ok(()); + } + use kebab_core::VectorStore as _; + vec_store + .delete_by_chunk_ids(&stale) + .context("VectorStore::delete_by_chunk_ids (orphan vector cleanup)")?; + tracing::debug!( + target: "kebab-app", + path = %asset.workspace_path.0, + count = stale.len(), + "purged orphan vectors for edited asset" + ); + Ok(()) +} + /// P7-3: process one `MediaType::Pdf` asset end-to-end. /// /// - Reads bytes from disk. @@ -1024,6 +1066,7 @@ fn ingest_one_pdf_asset( .chunk(&canonical, chunk_policy) .context("kb-chunk::PdfPageV1Chunker::chunk")?; + purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; app.sqlite .put_asset_with_bytes(asset, &bytes) .context("DocumentStore::put_asset_with_bytes (pdf)")?; diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs index bbdf663..ca9cd52 100644 --- a/crates/kebab-core/src/traits.rs +++ b/crates/kebab-core/src/traits.rs @@ -171,6 +171,18 @@ pub trait VectorStore { k: usize, filters: &SearchFilters, ) -> anyhow::Result>; + + /// Delete every vector whose `chunk_id` appears in `chunk_ids`. + /// + /// Used by `kebab-app` after `purge_orphan_at_workspace_path` sweeps + /// the SQLite side on a byte-edit re-ingest, so the LanceDB rows + /// keyed on the now-deleted `chunk_id`s do not stay on disk + /// forever. Empty input is a no-op. The default impl is a no-op so + /// older `VectorStore` impls (e.g. test fakes) keep compiling + /// without behavioural change. + fn delete_by_chunk_ids(&self, _chunk_ids: &[crate::ids::ChunkId]) -> anyhow::Result<()> { + Ok(()) + } } pub trait JobRepo { diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index b2455fa..58cf21a 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -308,6 +308,50 @@ fn temp_path_for(dest: &Path) -> PathBuf { parent.join(format!("{file_name}.tmp.{pid}.{n}")) } +impl SqliteStore { + /// SELECT every `chunks.chunk_id` whose owning document points at a + /// stale `asset_id` for `workspace_path` (i.e. the file's bytes have + /// changed since the last ingest, producing a brand-new + /// `asset_id`). + /// + /// Called by `kebab-app::ingest_one_*_asset` BEFORE + /// `put_asset_with_bytes` so the caller can hand the IDs to + /// `VectorStore::delete_by_chunk_ids`. After the SQLite cleanup + /// runs (CASCADE on `documents` → `chunks`) the same chunk_ids + /// would be unreadable. Returns an empty Vec when no stale row + /// exists at `workspace_path`. + /// + /// Read-only — does not mutate. The actual sweep happens inside + /// `purge_orphan_at_workspace_path` further down the pipeline. + pub fn stale_chunk_ids_at( + &self, + workspace_path: &str, + new_asset_id: &str, + ) -> Result> { + let conn = self.lock_conn(); + let mut stmt = conn + .prepare( + "SELECT c.chunk_id + FROM chunks c + INNER JOIN documents d ON c.doc_id = d.doc_id + INNER JOIN assets a ON d.asset_id = a.asset_id + WHERE a.workspace_path = ?1 AND a.asset_id != ?2", + ) + .map_err(StoreError::from)?; + let rows = stmt + .query_map(params![workspace_path, new_asset_id], |row| { + row.get::<_, String>(0) + }) + .map_err(StoreError::from)?; + let mut out: Vec = Vec::new(); + for row in rows { + let id = row.map_err(StoreError::from)?; + out.push(kebab_core::ChunkId(id)); + } + Ok(out) + } +} + /// Sweep stale `assets` + `documents` + downstream rows when the file /// at `workspace_path` is being re-ingested with bytes that produce a /// **different** `asset_id` (i.e. the file was edited). @@ -330,13 +374,12 @@ fn temp_path_for(dest: &Path) -> PathBuf { /// delete the on-disk byte file at `storage_path` so the data dir /// doesn't accumulate orphans across edits. /// -/// **Caveat — vector store orphans.** `embedding_records.chunk_id` -/// CASCADE clears the SQLite side, but the LanceDB rows keyed on -/// `chunk_id` live in a separate store and are not touched here. -/// Stale vectors do not affect retrieval (search joins through -/// SQLite, so an orphan vector is never surfaced) but they consume -/// disk in `data_dir/lancedb/`. A future task should reconcile by -/// `chunk_id` set diff. Tracked alongside this entry in HOTFIXES. +/// **Vector store cleanup**: `embedding_records.chunk_id` CASCADE +/// clears the SQLite side, but the LanceDB rows live in a separate +/// store. The caller (`kebab-app::ingest_one_*_asset`) is responsible +/// for fetching `stale_chunk_ids_at` BEFORE this purge runs and +/// calling `VectorStore::delete_by_chunk_ids` on those IDs. The +/// follow-up PR for HOTFIXES 2026-05-02 P7-3 wires this in. pub(crate) fn purge_orphan_at_workspace_path( conn: &Connection, workspace_path: &str, diff --git a/crates/kebab-store-vector/src/store.rs b/crates/kebab-store-vector/src/store.rs index 2fbbc0e..1e04a03 100644 --- a/crates/kebab-store-vector/src/store.rs +++ b/crates/kebab-store-vector/src/store.rs @@ -283,6 +283,91 @@ impl VectorStore for LanceVectorStore { Ok(()) } + /// Delete every Lance row whose `chunk_id` matches one of the + /// supplied IDs. Iterates *all* `chunk_embeddings_*` tables in the + /// connection — a single chunk_id only ever lives in one table + /// (one-model-per-workspace today, see `INDEX_VERSION` in + /// `paths.rs`), but the loop keeps the helper correct should the + /// workspace ever maintain multiple tables (e.g. mid-migration + /// between embedding models). + /// + /// Wired in by `kebab-app::ingest_one_*_asset` after the SQLite + /// side has been swept by `purge_orphan_at_workspace_path` — + /// closes the "vector store orphan" caveat from HOTFIXES + /// 2026-05-02 P7-3. + fn delete_by_chunk_ids(&self, chunk_ids: &[kebab_core::ChunkId]) -> Result<()> { + if chunk_ids.is_empty() { + return Ok(()); + } + // SQL IN() list. chunk_ids are 32-hex-char blake3 prefixes + // (validated upstream), so SQL injection is structurally + // impossible — we still quote to keep the predicate + // syntactically valid. We chunk into batches of 200 to keep the + // WHERE clause within typical SQL parser limits. + const BATCH: usize = 200; + self.runtime.block_on(async { + let names = self + .connection + .table_names() + .execute() + .await + .context("table_names")?; + for name in names { + if !name.starts_with("chunk_embeddings_") { + continue; + } + let table = match self.connection.open_table(&name).execute().await { + Ok(t) => t, + Err(e) => { + tracing::warn!( + target: "kebab-store-vector", + table = %name, + error = %e, + "delete_by_chunk_ids: skipped unopenable table" + ); + continue; + } + }; + for batch in chunk_ids.chunks(BATCH) { + // chunk_ids in production come from `id_for_chunk` + // which always emits 32 ASCII hex chars. The + // `ChunkId(pub String)` newtype permits hand- + // construction that bypasses that invariant; assert + // it here so a misuse fails loudly in dev rather + // than slipping a tainted string into Lance's SQL + // parser. + debug_assert!( + batch + .iter() + .all(|id| id.0.bytes().all(|b| b.is_ascii_hexdigit())), + "ChunkId must be ASCII hex (id_for_chunk invariant) — \ + hand-constructed IDs that bypass this would let \ + Lance's SQL parser see arbitrary text" + ); + let list = batch + .iter() + .map(|id| format!("'{}'", id.0)) + .collect::>() + .join(","); + let predicate = format!("chunk_id IN ({list})"); + table + .delete(&predicate) + .await + .with_context(|| { + format!("Lance delete on {name} ({} ids)", batch.len()) + })?; + } + } + anyhow::Ok(()) + })?; + tracing::debug!( + target: "kebab-store-vector", + count = chunk_ids.len(), + "deleted vector rows by chunk_id" + ); + Ok(()) + } + fn search( &self, query_vec: &[f32], diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 82ae10a..c813f3b 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -217,6 +217,6 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장. - (P7-3) `config.chunking.chunker_version` 는 markdown 만 represent — PDF 자산은 `pdf-page-v1` 하드코딩. `config.toml` 의 `chunker_version = "md-heading-v1"` 을 봐도 PDF 는 영향 안 받음. HOTFIXES `2026-05-02 P7-3` entry 참조 (P+ chunker registry task 까지 유지). - (P7-3) 한 PDF 가 N 페이지면 `kebab ingest` 가 N 개 (또는 그 이상의, 페이지 길면 multi-chunk) 의 chunk 를 한 transaction 안에서 commit. 500 페이지 책 → 500+ chunk 한 번에 → embedding throughput 가 bottleneck. 임베딩 활성 워크스페이스에서 큰 PDF 를 처음 ingest 하면 분-단위 시간 + WAL 크기 증가 가능 — P+ 스케일 hardening task 까지 정상 동작이지만 비용은 측정 가능. -- (P7-3) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embeddings 를 sweep 하고 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). LanceDB 는 별도 store 라 옛 vector 가 잔존하지만 검색에는 영향 없음 (SQLite join 으로 surface 안 됨) — 디스크 cleanup 은 P+. +- (P7-3 + follow-up) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_vector_orphans_for_workspace_path` 가 옛 chunk_id 를 LanceDB 에서 먼저 삭제, 이어서 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embedding_records 를 SQLite 에서 sweep. 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). 두 store 모두 정합 — 옛 본문 검색 시 옛 chunks 가 더 이상 surface 되지 않음. 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조. diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index d6a0708..f8e49b8 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -32,7 +32,13 @@ git history. 3. DELETEs the stale `assets` row, freeing the `workspace_path` slot. 4. If the stale storage was `copied`, best-effort removes the byte file at `storage_path` so `data_dir/assets/` does not accumulate orphans across edits. -**Caveat (still deferred)**: `embedding_records.chunk_id` CASCADE clears the SQLite side, but the LanceDB rows keyed on `chunk_id` live in a separate store and are not touched. Stale vectors do not affect retrieval (search joins through SQLite, so an orphan vector is never surfaced) but they consume disk in `data_dir/lancedb/`. A future task should reconcile by `chunk_id` set diff. +**Vector store cleanup (closed by follow-up PR)**: `embedding_records.chunk_id` CASCADE clears the SQLite side, but LanceDB lives in a separate store. The follow-up PR adds: +- `VectorStore::delete_by_chunk_ids` trait method (default impl no-op for older fakes). +- `LanceVectorStore::delete_by_chunk_ids` iterates every `chunk_embeddings_*` table in the connection and runs `Table::delete("chunk_id IN (...)")` in batches of 200. +- `SqliteStore::stale_chunk_ids_at(workspace_path, new_asset_id)` SELECT helper (read-only) that fetches the stale chunk_ids before they get cascade-deleted. +- `kebab-app::purge_vector_orphans_for_workspace_path` orchestrator. Each per-medium ingest helper (`ingest_one_asset` markdown branch, `ingest_one_image_asset`, `ingest_one_pdf_asset`) calls it immediately before `put_asset_with_bytes` so the stale Lance rows go away in lockstep with the SQLite cascade. + +Verified end-to-end via the SMOKE runbook: edit a tracked PDF → re-ingest → vector search for the old body text returns the *new* chunks (semantic nearest-neighbour) and the old chunk_ids are not present in the vector store. The previously-`#[ignore]`d `re_ingest_edited_pdf_produces_new_doc_id` integration test runs by default after this fix, plus a dedicated unit test `put_asset_with_bytes_sweeps_workspace_path_orphan` in `kebab-store-sqlite::tests::asset_writer` that exercises the no-documents flavour. Verified end-to-end via the SMOKE runbook: `kebab ingest` → edit a tracked PDF → `kebab ingest` reports `new=1` for that asset (rest `updated`) and the prior doc/chunks are gone from `inspect` / `list docs`.