Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| eec90996aa | |||
| ce1c778b4a | |||
| 453ec15df4 | |||
| 1e6de9fe9f | |||
| 9fa2a1ebac | |||
| 749c6ae240 | |||
| 5f2bd9e97e | |||
| 1ce06c1e2d | |||
| d26efe167f | |||
| d6d165df01 | |||
| 2baa846c6b | |||
| 27baec82ea | |||
| acf8cf3be2 | |||
| ea5f7b22c8 | |||
| 5497c6e7b5 |
46
Cargo.lock
generated
46
Cargo.lock
generated
@@ -4127,7 +4127,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-app"
|
name = "kebab-app"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"base64 0.22.1",
|
"base64 0.22.1",
|
||||||
@@ -4172,7 +4172,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-chunk"
|
name = "kebab-chunk"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4187,7 +4187,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-cli"
|
name = "kebab-cli"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"clap",
|
"clap",
|
||||||
@@ -4208,7 +4208,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-config"
|
name = "kebab-config"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"dirs 5.0.1",
|
"dirs 5.0.1",
|
||||||
@@ -4223,7 +4223,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-core"
|
name = "kebab-core"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4237,7 +4237,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-embed"
|
name = "kebab-embed"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4251,7 +4251,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-embed-local"
|
name = "kebab-embed-local"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"fastembed",
|
"fastembed",
|
||||||
@@ -4264,7 +4264,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-eval"
|
name = "kebab-eval"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-app",
|
"kebab-app",
|
||||||
@@ -4283,7 +4283,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-llm"
|
name = "kebab-llm"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-core",
|
"kebab-core",
|
||||||
@@ -4292,7 +4292,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-llm-local"
|
name = "kebab-llm-local"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-config",
|
"kebab-config",
|
||||||
@@ -4309,7 +4309,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-mcp"
|
name = "kebab-mcp"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-app",
|
"kebab-app",
|
||||||
@@ -4327,7 +4327,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-normalize"
|
name = "kebab-normalize"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-core",
|
"kebab-core",
|
||||||
@@ -4342,7 +4342,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-parse-code"
|
name = "kebab-parse-code"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"gix",
|
"gix",
|
||||||
@@ -4360,7 +4360,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-parse-image"
|
name = "kebab-parse-image"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ab_glyph",
|
"ab_glyph",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@@ -4384,7 +4384,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-parse-md"
|
name = "kebab-parse-md"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"kebab-core",
|
"kebab-core",
|
||||||
@@ -4401,7 +4401,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-parse-pdf"
|
name = "kebab-parse-pdf"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4414,7 +4414,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-parse-types"
|
name = "kebab-parse-types"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"kebab-core",
|
"kebab-core",
|
||||||
"serde",
|
"serde",
|
||||||
@@ -4422,7 +4422,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-rag"
|
name = "kebab-rag"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4443,7 +4443,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-search"
|
name = "kebab-search"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"globset",
|
"globset",
|
||||||
@@ -4462,7 +4462,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-source-fs"
|
name = "kebab-source-fs"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4481,7 +4481,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-store-sqlite"
|
name = "kebab-store-sqlite"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"blake3",
|
"blake3",
|
||||||
@@ -4502,7 +4502,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-store-vector"
|
name = "kebab-store-vector"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"arrow",
|
"arrow",
|
||||||
@@ -4526,7 +4526,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "kebab-tui"
|
name = "kebab-tui"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"crossterm",
|
"crossterm",
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ edition = "2024"
|
|||||||
rust-version = "1.85"
|
rust-version = "1.85"
|
||||||
license = "MIT OR Apache-2.0"
|
license = "MIT OR Apache-2.0"
|
||||||
repository = "https://github.com/altair823/kebab"
|
repository = "https://github.com/altair823/kebab"
|
||||||
version = "0.8.3"
|
version = "0.11.1"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
anyhow = "1"
|
anyhow = "1"
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin ke
|
|||||||
|
|
||||||
업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`.
|
업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`.
|
||||||
|
|
||||||
제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed) 등.
|
제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed), **`kebab reset --orphans-only`** (현재 walker scope 밖에 있는 stored doc 만 정리 — `config.workspace.include` 좁히거나 sub-dir 옮긴 후 explicit reconcile; fs 의 file 은 건드리지 않음) 등.
|
||||||
|
|
||||||
## Quick start
|
## Quick start
|
||||||
|
|
||||||
|
|||||||
@@ -189,10 +189,12 @@ fn fetch_span(
|
|||||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||||
// category, not the rendering format — the actual byte-level format
|
// category, not the rendering format — the actual byte-level format
|
||||||
// lives on the source `RawAsset.media_type`. Look it up via
|
// lives on the source `RawAsset.media_type`. Look it up via
|
||||||
// workspace_path (unique key per asset).
|
// doc.source_asset_id (PRIMARY KEY) so twin files (identical content
|
||||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset_by_workspace_path(
|
// at different paths) always read *this* document's own asset row,
|
||||||
|
// not whichever twin last wrote `assets.workspace_path`.
|
||||||
|
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset(
|
||||||
&app.sqlite,
|
&app.sqlite,
|
||||||
&doc.workspace_path,
|
&doc.source_asset_id,
|
||||||
)? {
|
)? {
|
||||||
if matches!(
|
if matches!(
|
||||||
asset.media_type,
|
asset.media_type,
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ mod staleness;
|
|||||||
|
|
||||||
pub use app::{App, SearchResponse};
|
pub use app::{App, SearchResponse};
|
||||||
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
|
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
|
||||||
pub use reset::{ResetReport, ResetScope};
|
pub use reset::{ResetReport, ResetScope, enumerate_orphans};
|
||||||
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
|
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
|
||||||
pub use fetch::fetch_with_config;
|
pub use fetch::fetch_with_config;
|
||||||
#[doc(hidden)]
|
#[doc(hidden)]
|
||||||
@@ -375,6 +375,28 @@ pub fn ingest_with_config_opts(
|
|||||||
.map(|d| d.doc_id.0)
|
.map(|d| d.doc_id.0)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// Dogfood: post-walker sweep to remove stored docs whose source
|
||||||
|
// file has been deleted from the filesystem. Must run BEFORE the
|
||||||
|
// per-asset loop so the loop's New/Updated labelling is based on
|
||||||
|
// the post-purge store state (the purged doc_ids won't be in
|
||||||
|
// `existing_doc_ids` above — they were already removed, OR the
|
||||||
|
// sweep here removes them before we start counting).
|
||||||
|
//
|
||||||
|
// Critical design invariant: only purge when the file is TRULY
|
||||||
|
// absent from disk. A file that is still on disk but outside the
|
||||||
|
// current walker scope (config narrowing / include-glob change) is
|
||||||
|
// NOT purged — we leave it in place to protect against accidental
|
||||||
|
// data loss via config edits.
|
||||||
|
let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> = assets
|
||||||
|
.iter()
|
||||||
|
.map(|a| a.workspace_path.clone())
|
||||||
|
.collect();
|
||||||
|
let purged_deleted_files = sweep_deleted_files(
|
||||||
|
&app,
|
||||||
|
&scanned_paths,
|
||||||
|
vector_store.as_ref().map(|v| v.as_ref()),
|
||||||
|
)?;
|
||||||
|
|
||||||
let started_at = time::OffsetDateTime::now_utc();
|
let started_at = time::OffsetDateTime::now_utc();
|
||||||
|
|
||||||
let mut items: Vec<kebab_core::IngestItem> = Vec::new();
|
let mut items: Vec<kebab_core::IngestItem> = Vec::new();
|
||||||
@@ -647,11 +669,11 @@ pub fn ingest_with_config_opts(
|
|||||||
crate::ingest_progress::emit(progress, terminal_event);
|
crate::ingest_progress::emit(progress, terminal_event);
|
||||||
|
|
||||||
// p9-fb-19: bump the persistent corpus_revision counter when a
|
// p9-fb-19: bump the persistent corpus_revision counter when a
|
||||||
// commit landed (any new / updated). This invalidates every
|
// commit landed (any new / updated / purged). This invalidates every
|
||||||
// entry in any in-process LRU search cache (in this process or
|
// entry in any in-process LRU search cache (in this process or
|
||||||
// a sibling) on the next lookup. No-op when nothing changed
|
// a sibling) on the next lookup. No-op when nothing changed
|
||||||
// (skipped-only run) — the cache stays valid.
|
// (skipped-only run) — the cache stays valid.
|
||||||
if new_count > 0 || updated_count > 0 {
|
if new_count > 0 || updated_count > 0 || purged_deleted_files > 0 {
|
||||||
match app.sqlite.bump_corpus_revision() {
|
match app.sqlite.bump_corpus_revision() {
|
||||||
Ok(rev) => tracing::debug!(
|
Ok(rev) => tracing::debug!(
|
||||||
target: "kebab-app",
|
target: "kebab-app",
|
||||||
@@ -682,6 +704,7 @@ pub fn ingest_with_config_opts(
|
|||||||
skipped_generated: fs_skips.skipped_generated,
|
skipped_generated: fs_skips.skipped_generated,
|
||||||
skipped_size_exceeded: fs_skips.skipped_size_exceeded,
|
skipped_size_exceeded: fs_skips.skipped_size_exceeded,
|
||||||
skip_examples: fs_skips.skip_examples,
|
skip_examples: fs_skips.skip_examples,
|
||||||
|
purged_deleted_files,
|
||||||
items: if summary_only { None } else { Some(items) },
|
items: if summary_only { None } else { Some(items) },
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -1453,6 +1476,120 @@ fn purge_vector_orphans_for_workspace_path(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Dogfood: post-walker sweep that purges stored documents whose source
|
||||||
|
/// file has been physically deleted from the filesystem.
|
||||||
|
///
|
||||||
|
/// Algorithm:
|
||||||
|
/// 1. Query `documents` for every `workspace_path` currently stored.
|
||||||
|
/// 2. Compute `orphan_candidates = stored_paths - scanned_paths`.
|
||||||
|
/// 3. For each candidate: resolve to an absolute path and call
|
||||||
|
/// `Path::try_exists().unwrap_or(true)` — transient FS errors
|
||||||
|
/// (EACCES, NFS hiccup, ownership change) conservatively count as
|
||||||
|
/// "still present" so we never purge on uncertain signal. If the
|
||||||
|
/// file still exists on disk it was merely out-of-scope this run
|
||||||
|
/// (config narrowing / include-glob change) — leave it alone. Only
|
||||||
|
/// files that are truly absent trigger a purge.
|
||||||
|
/// 4. For absent files: call `purge_deleted_workspace_path` (SQLite
|
||||||
|
/// cascade delete + optional copied-asset file removal) and, if a
|
||||||
|
/// vector store is present, delete the associated vectors.
|
||||||
|
///
|
||||||
|
/// Returns the number of documents purged.
|
||||||
|
///
|
||||||
|
/// Non-fatal design: individual purge failures are logged as warnings
|
||||||
|
/// and skipped on a per-file basis; they do NOT abort the sweep — a
|
||||||
|
/// partial failure is preferable to blocking the rest of ingest. The
|
||||||
|
/// return value only counts successful purges.
|
||||||
|
fn sweep_deleted_files(
|
||||||
|
app: &App,
|
||||||
|
scanned_paths: &std::collections::HashSet<kebab_core::WorkspacePath>,
|
||||||
|
vector_store: Option<&kebab_store_vector::LanceVectorStore>,
|
||||||
|
) -> anyhow::Result<u32> {
|
||||||
|
use kebab_core::DocumentStore as _;
|
||||||
|
|
||||||
|
let stored_paths = app
|
||||||
|
.sqlite
|
||||||
|
.all_workspace_paths()
|
||||||
|
.context("sweep_deleted_files: all_workspace_paths")?;
|
||||||
|
|
||||||
|
if stored_paths.is_empty() {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let workspace_root = app.config.resolve_workspace_root();
|
||||||
|
let mut purged: u32 = 0;
|
||||||
|
|
||||||
|
for stored_path in stored_paths {
|
||||||
|
if scanned_paths.contains(&stored_path) {
|
||||||
|
continue; // still in scope — skip
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve to an absolute path and check existence on disk.
|
||||||
|
// Use `try_exists` + `unwrap_or(true)` so transient FS errors
|
||||||
|
// (EACCES on a path we lack read on, NFS hiccups, ownership
|
||||||
|
// change) are CONSERVATIVELY treated as "file still present" —
|
||||||
|
// never purge on uncertain signal (data-safety: PR #148 review).
|
||||||
|
// `exists()` would return false on Err and trigger a wrongful
|
||||||
|
// purge. Resolving the absolute path cannot itself fail
|
||||||
|
// (`Path::join` is infallible), so `try_exists` is the only
|
||||||
|
// fallible step; its `Err` case is what counts as still present.
|
||||||
|
let abs = workspace_root.join(&stored_path.0);
|
||||||
|
if abs.try_exists().unwrap_or(true) {
|
||||||
|
// File is on disk but not in this scan's scope (config
|
||||||
|
// narrowing). DO NOT purge — critical design constraint.
|
||||||
|
tracing::debug!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %stored_path.0,
|
||||||
|
"sweep_deleted_files: file on disk but out of scope — leaving in store"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// File is truly absent → purge.
|
||||||
|
let chunk_ids = match kebab_store_sqlite::purge_deleted_workspace_path(
|
||||||
|
&app.sqlite,
|
||||||
|
&stored_path,
|
||||||
|
) {
|
||||||
|
Ok(ids) => ids,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %stored_path.0,
|
||||||
|
error = %e,
|
||||||
|
"sweep_deleted_files: purge failed; skipping this path"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Purge associated vectors (best-effort; partial failure
|
||||||
|
// acceptable — orphan vectors get cleaned by `kebab reset
|
||||||
|
// --vector-only` if they accumulate).
|
||||||
|
if let Some(vec) = vector_store {
|
||||||
|
if !chunk_ids.is_empty() {
|
||||||
|
use kebab_core::VectorStore as _;
|
||||||
|
if let Err(e) = vec.delete_by_chunk_ids(&chunk_ids) {
|
||||||
|
tracing::warn!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %stored_path.0,
|
||||||
|
count = chunk_ids.len(),
|
||||||
|
error = %e,
|
||||||
|
"sweep_deleted_files: vector delete failed; SQLite side already cleaned"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %stored_path.0,
|
||||||
|
"sweep_deleted_files: purged document for deleted file"
|
||||||
|
);
|
||||||
|
purged = purged.saturating_add(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(purged)
|
||||||
|
}
|
||||||
|
|
||||||
/// P7-3: process one `MediaType::Pdf` asset end-to-end.
|
/// P7-3: process one `MediaType::Pdf` asset end-to-end.
|
||||||
///
|
///
|
||||||
/// - Reads bytes from disk.
|
/// - Reads bytes from disk.
|
||||||
|
|||||||
@@ -9,13 +9,19 @@
|
|||||||
//!
|
//!
|
||||||
//! `--vector-only` additionally truncates `embedding_records` in SQLite
|
//! `--vector-only` additionally truncates `embedding_records` in SQLite
|
||||||
//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
|
//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
|
||||||
|
//!
|
||||||
|
//! `--orphans-only` purges stored docs that are outside the current walker
|
||||||
|
//! scope (config narrowing / removed sub-directory). No filesystem paths are
|
||||||
|
//! removed — this is purely a store-level reconciliation.
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use kebab_config::{Config, expand_path};
|
use kebab_config::{Config, expand_path};
|
||||||
|
use kebab_core::WorkspacePath;
|
||||||
|
|
||||||
/// What the user asked to remove. Mutually exclusive — picked by the CLI
|
/// What the user asked to remove. Mutually exclusive — picked by the CLI
|
||||||
/// from a clap `ArgGroup`.
|
/// from a clap `ArgGroup`.
|
||||||
@@ -32,6 +38,13 @@ pub enum ResetScope {
|
|||||||
VectorOnly,
|
VectorOnly,
|
||||||
/// Wipe only the config dir.
|
/// Wipe only the config dir.
|
||||||
ConfigOnly,
|
ConfigOnly,
|
||||||
|
/// Purge stored docs that are outside the current walker scope (no
|
||||||
|
/// filesystem paths are removed). Filesystem existence is NOT checked —
|
||||||
|
/// anything the current walker would not visit is considered an orphan.
|
||||||
|
/// The explicit complement to the conservative `sweep_deleted_files`
|
||||||
|
/// that runs during ingest (which leaves on-disk-but-out-of-scope docs
|
||||||
|
/// alone for data safety).
|
||||||
|
OrphansOnly,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Result of a successful wipe — emitted as `reset_report.v1` by the
|
/// Result of a successful wipe — emitted as `reset_report.v1` by the
|
||||||
@@ -41,6 +54,16 @@ pub struct ResetReport {
|
|||||||
pub scope: ResetScope,
|
pub scope: ResetScope,
|
||||||
pub removed_paths: Vec<PathBuf>,
|
pub removed_paths: Vec<PathBuf>,
|
||||||
pub embedding_rows_truncated: u64,
|
pub embedding_rows_truncated: u64,
|
||||||
|
/// Number of stored docs purged because they are outside the current
|
||||||
|
/// walker scope. Non-zero only when `scope == OrphansOnly`.
|
||||||
|
/// `#[serde(default)]` preserves back-compat with older callers that
|
||||||
|
/// do not include this field.
|
||||||
|
#[serde(default)]
|
||||||
|
pub orphans_purged: u32,
|
||||||
|
/// Paths of the orphaned docs that were purged. Sorted for deterministic
|
||||||
|
/// output. Non-empty only when `scope == OrphansOnly`.
|
||||||
|
#[serde(default)]
|
||||||
|
pub purged_paths: Vec<WorkspacePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compute the absolute on-disk paths a given scope will wipe, given a
|
/// Compute the absolute on-disk paths a given scope will wipe, given a
|
||||||
@@ -67,6 +90,10 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
|
|||||||
vec![vector_dir]
|
vec![vector_dir]
|
||||||
}
|
}
|
||||||
ResetScope::ConfigOnly => vec![cfg_dir],
|
ResetScope::ConfigOnly => vec![cfg_dir],
|
||||||
|
// OrphansOnly operates purely at the store level — no filesystem paths
|
||||||
|
// are removed. Return empty so `estimate_size_bytes` stays zero and
|
||||||
|
// the existing confirm UI path for directory wipes is skipped.
|
||||||
|
ResetScope::OrphansOnly => vec![],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -96,16 +123,82 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
|||||||
paths.iter().map(|p| walk(p)).sum()
|
paths.iter().map(|p| walk(p)).sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute the workspace paths stored in SQLite that are NOT visited by
|
||||||
|
/// the current walker scope (i.e. they are "orphans" — on disk but
|
||||||
|
/// outside the configured include/exclude rules, or from a sub-directory
|
||||||
|
/// that has since been removed from the workspace).
|
||||||
|
///
|
||||||
|
/// Does NOT check filesystem existence — `OrphansOnly` is the explicit
|
||||||
|
/// "I know what I'm doing" variant; callers that want the conservative
|
||||||
|
/// fs-aware sweep should use `sweep_deleted_files` inside ingest.
|
||||||
|
///
|
||||||
|
/// Returns the list sorted for deterministic output. Called twice by the
|
||||||
|
/// CLI path (once for the confirm UI preview, once inside `execute`);
|
||||||
|
/// the double scan is acceptable for a rare destructive operation.
|
||||||
|
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||||
|
use kebab_core::DocumentStore as _;
|
||||||
|
use kebab_source_fs::FsSourceConnector;
|
||||||
|
use kebab_core::SourceScope;
|
||||||
|
|
||||||
|
let store = kebab_store_sqlite::SqliteStore::open(cfg)
|
||||||
|
.context("enumerate_orphans: open SqliteStore")?;
|
||||||
|
|
||||||
|
let stored = store
|
||||||
|
.all_workspace_paths()
|
||||||
|
.context("enumerate_orphans: all_workspace_paths")?;
|
||||||
|
|
||||||
|
if stored.is_empty() {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the same SourceScope the CLI's ingest path uses: root from
|
||||||
|
// config, exclude list from config, no include override (full scope).
|
||||||
|
let root = cfg.resolve_workspace_root();
|
||||||
|
let scope = SourceScope {
|
||||||
|
root: root.clone(),
|
||||||
|
exclude: cfg.workspace.exclude.clone(),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let connector = FsSourceConnector::new(cfg)
|
||||||
|
.context("enumerate_orphans: build FsSourceConnector")?;
|
||||||
|
let (assets, _skips) = connector
|
||||||
|
.scan_with_skips(&scope)
|
||||||
|
.context("enumerate_orphans: scan workspace")?;
|
||||||
|
|
||||||
|
let scanned: HashSet<WorkspacePath> = assets
|
||||||
|
.into_iter()
|
||||||
|
.map(|a| a.workspace_path)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut orphans: Vec<WorkspacePath> = stored
|
||||||
|
.into_iter()
|
||||||
|
.filter(|p| !scanned.contains(p))
|
||||||
|
.collect();
|
||||||
|
orphans.sort_by(|a, b| a.0.cmp(&b.0));
|
||||||
|
Ok(orphans)
|
||||||
|
}
|
||||||
|
|
||||||
/// Wipe every path from `enumerate_paths(scope, cfg)`. For
|
/// Wipe every path from `enumerate_paths(scope, cfg)`. For
|
||||||
/// `ResetScope::VectorOnly`, also truncates the SQLite
|
/// `ResetScope::VectorOnly`, also truncates the SQLite
|
||||||
/// `embedding_records` table so the store doesn't point at the Lance
|
/// `embedding_records` table so the store doesn't point at the Lance
|
||||||
/// rows we just removed off-disk.
|
/// rows we just removed off-disk.
|
||||||
///
|
///
|
||||||
|
/// For `ResetScope::OrphansOnly`, no filesystem directories are removed.
|
||||||
|
/// Instead the store is reconciled: stored docs outside the current walker
|
||||||
|
/// scope are purged from SQLite (+ vector store when configured). The
|
||||||
|
/// caller is expected to have already shown the confirm UI using
|
||||||
|
/// `enumerate_orphans`.
|
||||||
|
///
|
||||||
/// Idempotent: a missing path is treated as already-removed (success).
|
/// Idempotent: a missing path is treated as already-removed (success).
|
||||||
/// Returns a `ResetReport` listing exactly what was removed (paths that
|
/// Returns a `ResetReport` listing exactly what was removed (paths that
|
||||||
/// existed before the call) so `--json` callers see the truth, not the
|
/// existed before the call) so `--json` callers see the truth, not the
|
||||||
/// request.
|
/// request.
|
||||||
pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||||
|
if matches!(scope, ResetScope::OrphansOnly) {
|
||||||
|
return execute_orphans_only(cfg);
|
||||||
|
}
|
||||||
|
|
||||||
let paths = enumerate_paths(scope, cfg);
|
let paths = enumerate_paths(scope, cfg);
|
||||||
let mut removed = Vec::new();
|
let mut removed = Vec::new();
|
||||||
|
|
||||||
@@ -128,9 +221,100 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
|||||||
scope,
|
scope,
|
||||||
removed_paths: removed,
|
removed_paths: removed,
|
||||||
embedding_rows_truncated,
|
embedding_rows_truncated,
|
||||||
|
orphans_purged: 0,
|
||||||
|
purged_paths: Vec::new(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
|
||||||
|
/// current walker scope without touching any filesystem directory.
|
||||||
|
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
|
||||||
|
let orphans = enumerate_orphans(cfg)
|
||||||
|
.context("execute_orphans_only: enumerate orphans")?;
|
||||||
|
|
||||||
|
if orphans.is_empty() {
|
||||||
|
return Ok(ResetReport {
|
||||||
|
scope: ResetScope::OrphansOnly,
|
||||||
|
removed_paths: Vec::new(),
|
||||||
|
embedding_rows_truncated: 0,
|
||||||
|
orphans_purged: 0,
|
||||||
|
purged_paths: Vec::new(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let store = std::sync::Arc::new(
|
||||||
|
kebab_store_sqlite::SqliteStore::open(cfg)
|
||||||
|
.context("execute_orphans_only: open SqliteStore")?,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Open vector store if configured. Mirror the same guard the ingest
|
||||||
|
// path uses: only construct when the provider is not "none" and dims > 0.
|
||||||
|
let vector_store: Option<kebab_store_vector::LanceVectorStore> =
|
||||||
|
open_vector_store_if_configured(cfg, store.clone())?;
|
||||||
|
|
||||||
|
let mut purged_paths: Vec<WorkspacePath> = Vec::new();
|
||||||
|
|
||||||
|
for path in &orphans {
|
||||||
|
let chunk_ids = kebab_store_sqlite::purge_deleted_workspace_path(&store, path)
|
||||||
|
.with_context(|| format!("execute_orphans_only: purge {}", path.0))?;
|
||||||
|
|
||||||
|
if let Some(ref vs) = vector_store {
|
||||||
|
if !chunk_ids.is_empty() {
|
||||||
|
use kebab_core::VectorStore as _;
|
||||||
|
if let Err(e) = vs.delete_by_chunk_ids(&chunk_ids) {
|
||||||
|
tracing::warn!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %path.0,
|
||||||
|
count = chunk_ids.len(),
|
||||||
|
error = %e,
|
||||||
|
"reset --orphans-only: vector delete failed; SQLite side already cleaned"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
target: "kebab-app",
|
||||||
|
path = %path.0,
|
||||||
|
"reset --orphans-only: purged orphan document"
|
||||||
|
);
|
||||||
|
purged_paths.push(path.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let orphans_purged = u32::try_from(purged_paths.len()).unwrap_or(u32::MAX);
|
||||||
|
|
||||||
|
Ok(ResetReport {
|
||||||
|
scope: ResetScope::OrphansOnly,
|
||||||
|
removed_paths: Vec::new(),
|
||||||
|
embedding_rows_truncated: 0,
|
||||||
|
orphans_purged,
|
||||||
|
purged_paths,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Open the Lance vector store if the configured embedding provider is
|
||||||
|
/// active (non-"none", dimensions > 0). Returns `None` for lexical-only
|
||||||
|
/// configs. Mirrors the guard in `App::vector`.
|
||||||
|
fn open_vector_store_if_configured(
|
||||||
|
cfg: &Config,
|
||||||
|
store: std::sync::Arc<kebab_store_sqlite::SqliteStore>,
|
||||||
|
) -> Result<Option<kebab_store_vector::LanceVectorStore>> {
|
||||||
|
if cfg.models.embedding.provider == "none" || cfg.models.embedding.dimensions == 0 {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
match kebab_store_vector::LanceVectorStore::new(cfg, store) {
|
||||||
|
Ok(vs) => Ok(Some(vs)),
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
target: "kebab-app",
|
||||||
|
error = %e,
|
||||||
|
"reset --orphans-only: could not open vector store; skipping vector delete"
|
||||||
|
);
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Open the SQLite store at the configured path and run
|
/// Open the SQLite store at the configured path and run
|
||||||
/// `truncate_embedding_records`. Returns the count of truncated rows
|
/// `truncate_embedding_records`. Returns the count of truncated rows
|
||||||
/// (the helper itself reports `DELETE` rowcount). If the SQLite file
|
/// (the helper itself reports `DELETE` rowcount). If the SQLite file
|
||||||
@@ -200,4 +384,14 @@ mod tests {
|
|||||||
let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]);
|
let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]);
|
||||||
assert_eq!(bytes, 5 + 6);
|
assert_eq!(bytes, 5 + 6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `enumerate_paths` must yield no filesystem paths for `OrphansOnly`.
#[test]
fn enumerate_orphans_only_returns_empty_paths() {
    let default_cfg = Config::defaults();
    let listed = enumerate_paths(ResetScope::OrphansOnly, &default_cfg);
    assert!(
        listed.is_empty(),
        "OrphansOnly must return empty vec from enumerate_paths"
    );
}
|
||||||
}
|
}
|
||||||
|
|||||||
178
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
178
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
//! Dogfood: auto-purge stored docs for filesystem-deleted files.
|
||||||
|
//!
|
||||||
|
//! Two tests:
|
||||||
|
//!
|
||||||
|
//! 1. `file_deletion_auto_purge` — ingest 2 files, delete one, re-ingest.
|
||||||
|
//! The re-ingest must report `purged_deleted_files = 1`, the deleted
|
||||||
|
//! file must no longer appear in `list_docs`, and lexical search for
|
||||||
|
//! its unique content must return no hits.
|
||||||
|
//!
|
||||||
|
//! 2. `include_scope_narrowing_does_not_purge` — ingest 2 files under a
|
||||||
|
//! wide glob, narrow the walker scope to only one file, re-ingest.
|
||||||
|
//! The narrowed ingest must NOT purge the out-of-scope file because
|
||||||
|
//! the file is still on disk (just excluded from this run). Protects
|
||||||
|
//! users against accidental data loss via config edits.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use common::TestEnv;
|
||||||
|
use kebab_app::ingest_with_config_opts;
|
||||||
|
use kebab_app::IngestOpts;
|
||||||
|
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
|
||||||
|
|
||||||
|
/// Helper: open the store via `TestEnv` and run `list_documents`.
|
||||||
|
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||||
|
use kebab_store_sqlite::SqliteStore;
|
||||||
|
let store = SqliteStore::open(&env.config).unwrap();
|
||||||
|
store.run_migrations().unwrap();
|
||||||
|
store
|
||||||
|
.list_documents(&DocFilter::default())
|
||||||
|
.unwrap()
|
||||||
|
.into_iter()
|
||||||
|
.map(|d| d.doc_path.0)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A file removed from disk between two ingests must be purged from the
/// store: it disappears from the document list and from lexical search,
/// while the surviving file stays listed.
#[test]
fn file_deletion_auto_purge() {
    let env = TestEnv::lexical_only();

    // Seed the workspace with two Rust sources; only `b.rs` is deleted later.
    let a_path = env.workspace_root.join("a.rs");
    let b_path = env.workspace_root.join("b.rs");
    for (path, body) in [
        (&a_path, "// file a\nfn alpha() {}\n"),
        (&b_path, "// file b\nfn bravo() {}\n"),
    ] {
        std::fs::write(path, body).unwrap();
    }

    // Both ingest runs use the same config, scope, flag and options.
    let run_ingest = || {
        ingest_with_config_opts(
            env.config.clone(),
            env.scope(),
            false,
            IngestOpts::default(),
        )
    };

    // First run: `>=` rather than `==` because the workspace may already
    // contain fixture files besides the two written above.
    let first = run_ingest().expect("first ingest must succeed");
    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );
    assert_eq!(first.errors, 0, "no errors on first ingest: {first:?}");

    // Remove one file from the filesystem, then ingest again.
    std::fs::remove_file(&b_path).expect("remove b.rs");
    let second = run_ingest().expect("second ingest must succeed");

    assert_eq!(
        second.purged_deleted_files, 1,
        "exactly 1 file should be purged: {second:?}"
    );
    assert_eq!(second.new, 0, "no new docs after deletion: {second:?}");
    assert_eq!(second.updated, 0, "no updated docs: {second:?}");
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The store listing must have dropped b.rs and kept a.rs.
    let doc_paths = list_doc_paths(&env);
    assert!(
        doc_paths.iter().all(|p| p != "b.rs"),
        "b.rs must be gone from list_docs; got: {doc_paths:?}"
    );
    assert!(
        doc_paths.iter().any(|p| p == "a.rs"),
        "a.rs must still be in list_docs; got: {doc_paths:?}"
    );

    // "bravo" only occurred in the deleted file, so lexical search is empty.
    let app = env.app();
    let hits = app
        .search(SearchQuery {
            text: "bravo".to_string(),
            mode: SearchMode::Lexical,
            k: 10,
            filters: kebab_core::SearchFilters::default(),
        })
        .expect("search must not error");
    assert!(
        hits.is_empty(),
        "search for deleted file's content must return no hits; got: {hits:?}"
    );
}
|
||||||
|
|
||||||
|
/// Narrowing the ingest scope must not purge a stored doc whose file is
/// still on disk; only real filesystem deletions may trigger auto-purge.
#[test]
fn include_scope_narrowing_does_not_purge() {
    let env = TestEnv::lexical_only();

    // Two Rust sources that both stay on disk for the whole test.
    let a_path = env.workspace_root.join("a_narrow.rs");
    let b_path = env.workspace_root.join("b_narrow.rs");
    std::fs::write(&a_path, "// narrow a\nfn alpha_narrow() {}\n").unwrap();
    std::fs::write(&b_path, "// narrow b\nfn bravo_narrow() {}\n").unwrap();

    // Scope over the workspace root with one include glob and the
    // config's exclude list.
    let scope_including = |pattern: &str| SourceScope {
        root: env.workspace_root.clone(),
        include: vec![pattern.to_string()],
        exclude: env.config.workspace.exclude.clone(),
    };

    // Wide scope: every `.rs` file is in scope, so both files are ingested.
    let first = ingest_with_config_opts(
        env.config.clone(),
        scope_including("**/*.rs"),
        false,
        IngestOpts::default(),
    )
    .expect("first ingest (wide) must succeed");
    assert!(
        first.new >= 2,
        "expected at least 2 new docs: {first:?}"
    );
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );

    // Narrow scope: only a_narrow.rs is included; b_narrow.rs remains on
    // disk but is outside this run's walker scope.
    let second = ingest_with_config_opts(
        env.config.clone(),
        scope_including("a_narrow.rs"),
        false,
        IngestOpts::default(),
    )
    .expect("second ingest (narrow) must succeed");

    // CRITICAL: b_narrow.rs is still on disk, so it must NOT be purged.
    assert_eq!(
        second.purged_deleted_files, 0,
        "scope-narrowing must NOT purge on-disk files; got: {second:?}"
    );
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The out-of-scope doc is still stored ...
    let doc_paths = list_doc_paths(&env);
    assert!(
        doc_paths.iter().any(|p| p == "b_narrow.rs"),
        "b_narrow.rs must still be in list_docs after scope narrowing; got: {doc_paths:?}"
    );
    // ... and its file is untouched on disk.
    assert!(
        b_path.exists(),
        "b_narrow.rs must still be on disk (we didn't delete it)"
    );
}
|
||||||
141
crates/kebab-app/tests/reset_orphans.rs
Normal file
141
crates/kebab-app/tests/reset_orphans.rs
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
//! Integration test for `kebab reset --orphans-only`.
|
||||||
|
//!
|
||||||
|
//! Verifies that stored docs outside the current walker scope are purged
|
||||||
|
//! from the store without removing any files from the filesystem.
|
||||||
|
//!
|
||||||
|
//! Test outline:
|
||||||
|
//! 1. Ingest 3 .rs files (a.rs, b.rs, c.rs) — all New.
|
||||||
|
//! 2. Narrow the walker scope by adding b.rs and c.rs to the config
|
||||||
|
//! `exclude` list; both files stay on disk but fall outside the scope.
|
||||||
|
//! 3. Run `execute(ResetScope::OrphansOnly, &cfg)` — report must show
|
||||||
|
//! `orphans_purged == 2` and `purged_paths` contains b.rs + c.rs.
|
||||||
|
//! 4. `list docs` must show only a.rs.
|
||||||
|
//! 5. b.rs and c.rs must still exist on disk (no filesystem removal).
|
||||||
|
//! 6. Second reset → `orphans_purged == 0` (idempotent).
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use common::TestEnv;
|
||||||
|
use kebab_app::IngestOpts;
|
||||||
|
use kebab_app::reset::{ResetScope, execute};
|
||||||
|
use kebab_core::{DocFilter, DocumentStore, SourceScope};
|
||||||
|
|
||||||
|
/// Open the SqliteStore and list all `workspace_path` values.
|
||||||
|
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||||
|
use kebab_store_sqlite::SqliteStore;
|
||||||
|
let store = SqliteStore::open(&env.config).unwrap();
|
||||||
|
store.run_migrations().unwrap();
|
||||||
|
store
|
||||||
|
.list_documents(&DocFilter::default())
|
||||||
|
.unwrap()
|
||||||
|
.into_iter()
|
||||||
|
.map(|d| d.doc_path.0)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `execute(ResetScope::OrphansOnly, ..)` must purge stored docs that the
/// narrowed config no longer covers, leave their files on disk, and report
/// nothing to purge when run a second time.
#[test]
fn reset_orphans_only_purges_out_of_scope_docs() {
    let env = TestEnv::lexical_only();

    // Write three .rs files into the workspace.
    let a_path = env.workspace_root.join("a.rs");
    let b_path = env.workspace_root.join("b.rs");
    let c_path = env.workspace_root.join("c.rs");
    std::fs::write(&a_path, "// file a\nfn alpha() {}\n").unwrap();
    std::fs::write(&b_path, "// file b\nfn bravo() {}\n").unwrap();
    std::fs::write(&c_path, "// file c\nfn charlie() {}\n").unwrap();

    // Ingest all three with a wide scope.
    let wide_scope = SourceScope {
        root: env.workspace_root.clone(),
        include: vec!["**/*.rs".to_string()],
        exclude: env.config.workspace.exclude.clone(),
    };
    let first = kebab_app::ingest_with_config_opts(
        env.config.clone(),
        wide_scope,
        false,
        IngestOpts::default(),
    )
    .expect("first ingest must succeed");
    // The fixture workspace may contain other .rs files, so only assert
    // that at least our three files were ingested as new docs.
    assert!(first.new >= 3, "expected at least 3 new docs: {first:?}");
    assert_eq!(first.errors, 0, "no errors on first ingest");

    // Narrow the scope through the config's exclude list: per the original
    // note, `kebab_config::WorkspaceCfg` has no `include` field (removed in
    // p9-fb-25). The list is replaced wholesale, so b.rs and c.rs are the
    // only excluded paths; both files remain on disk.
    let mut narrow_cfg = env.config.clone();
    narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];

    // Run orphans-only reset.
    let report = execute(ResetScope::OrphansOnly, &narrow_cfg)
        .expect("orphans-only reset must succeed");

    assert_eq!(
        report.orphans_purged, 2,
        "expected 2 orphans purged (b.rs + c.rs): {report:?}"
    );

    // Sort before comparing so the check does not depend on report order.
    let mut purged: Vec<String> = report
        .purged_paths
        .iter()
        .map(|p| p.0.clone())
        .collect();
    purged.sort();
    assert_eq!(
        purged,
        vec!["b.rs".to_string(), "c.rs".to_string()],
        "purged_paths must list b.rs and c.rs in sorted order: {purged:?}"
    );

    // The store must have dropped b.rs and c.rs and kept a.rs (fixture
    // docs not excluded by the narrow config may also still be listed).
    let doc_paths = list_doc_paths(&env);
    assert!(
        !doc_paths.iter().any(|p| p == "b.rs"),
        "b.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
    );
    assert!(
        !doc_paths.iter().any(|p| p == "c.rs"),
        "c.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
    );
    assert!(
        doc_paths.iter().any(|p| p == "a.rs"),
        "a.rs must still be in store; got: {doc_paths:?}"
    );

    // Orphans-only performs no file removal: both files must still exist.
    assert!(
        b_path.exists(),
        "b.rs must still be on disk after orphans-only reset"
    );
    assert!(
        c_path.exists(),
        "c.rs must still be on disk after orphans-only reset"
    );

    // Second reset must be idempotent: nothing left to purge.
    let second = execute(ResetScope::OrphansOnly, &narrow_cfg)
        .expect("second orphans-only reset must succeed");
    assert_eq!(
        second.orphans_purged, 0,
        "second reset must be idempotent (orphans_purged == 0): {second:?}"
    );
    assert!(
        second.purged_paths.is_empty(),
        "second reset purged_paths must be empty: {:?}",
        second.purged_paths
    );
}
|
||||||
176
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
176
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
//! Regression test for the twin-file fetch_span media-type lookup bug.
|
||||||
|
//!
|
||||||
|
//! Twin files (identical content at different workspace paths) share one
|
||||||
|
//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
|
||||||
|
//! `fetch_span` implementation called
|
||||||
|
//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
|
||||||
|
//! media type was PDF/audio (and therefore reject span fetch). For a twin
|
||||||
|
//! file that lookup could silently return the *other* twin's asset row if
|
||||||
|
//! `assets.workspace_path` had been overwritten on the most recent ingest of
|
||||||
|
//! the sibling — making the media-type branch decision incorrect.
|
||||||
|
//!
|
||||||
|
//! Fix: `fetch_span` now uses the 2-step lookup
|
||||||
|
//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
|
||||||
|
//! so the result is always anchored to the requesting document, not
|
||||||
|
//! whichever twin last updated `assets.workspace_path`.
|
||||||
|
//!
|
||||||
|
//! This test builds a twin-file scenario (two .md files at different paths
|
||||||
|
//! with identical content), ingests both, then calls `fetch_span` on each
|
||||||
|
//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
|
||||||
|
//! row's workspace_path happened to point at the wrong twin the span could
|
||||||
|
//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
|
||||||
|
//! conversely allow span on a PDF twin by accident. After the fix, the
|
||||||
|
//! lookup is always doc-specific.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use common::TestEnv;
|
||||||
|
use kebab_app::ingest_with_config;
|
||||||
|
use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
|
||||||
|
|
||||||
|
/// Span fetch must succeed for both twin documents (identical content at
/// two workspace paths, one shared asset), before and after a re-ingest.
#[test]
fn twin_files_fetch_span_uses_correct_asset() {
    let env = TestEnv::lexical_only();

    // Identical multi-line markdown written at two different paths.
    let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";
    let dir_a = env.workspace_root.join("src_a");
    let dir_b = env.workspace_root.join("src_b");
    std::fs::create_dir_all(&dir_a).unwrap();
    std::fs::create_dir_all(&dir_b).unwrap();
    std::fs::write(dir_a.join("note.md"), content).unwrap();
    std::fs::write(dir_b.join("note.md"), content).unwrap();

    // Ingest everything (fixture workspace plus the two twins).
    let report = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");

    // Both twin paths must be reported, each as New.
    let items = report.items.as_ref().expect("items must be present");
    let twin_items: Vec<_> = items
        .iter()
        .filter(|i| {
            let reported = i.doc_path.0.as_str();
            reported.ends_with("src_a/note.md") || reported.ends_with("src_b/note.md")
        })
        .collect();
    assert_eq!(
        twin_items.len(),
        2,
        "exactly 2 twin items expected; items={items:?}"
    );
    for item in &twin_items {
        assert_eq!(
            item.kind,
            IngestItemKind::New,
            "each twin must be New; item={item:?}"
        );
    }

    let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
    store.run_migrations().unwrap();

    // Take the workspace paths exactly as the ingest report spells them;
    // matching by suffix keeps the test independent of how the workspace
    // root is represented.
    let reported_path = |suffix: &str| {
        items
            .iter()
            .find(|i| i.doc_path.0.ends_with(suffix))
            .map(|i| kebab_core::WorkspacePath(i.doc_path.0.clone()))
    };
    let path_a = reported_path("src_a/note.md")
        .expect("src_a/note.md must appear in ingest report");
    let path_b = reported_path("src_b/note.md")
        .expect("src_b/note.md must appear in ingest report");

    let doc_a = store
        .get_document_by_workspace_path(&path_a)
        .expect("get_document_by_workspace_path path_a")
        .expect("doc_a must exist after ingest");
    let doc_b = store
        .get_document_by_workspace_path(&path_b)
        .expect("get_document_by_workspace_path path_b")
        .expect("doc_b must exist after ingest");

    // Same content hash, so both documents point at one asset.
    assert_eq!(
        doc_a.source_asset_id, doc_b.source_asset_id,
        "twin files must share one asset_id"
    );

    // Span query starting at line 1 for the given document.
    let span = |doc_id, line_end| FetchQuery::Span {
        doc_id,
        line_start: 1,
        line_end,
    };

    let app = env.app();

    let result_a = app
        .fetch(span(doc_a.doc_id.clone(), 2), FetchOpts::default())
        .expect("fetch_span on twin A must succeed for a markdown file");
    assert_eq!(result_a.kind, FetchKind::Span);
    assert!(
        matches!(result_a.text.as_deref(), Some(t) if !t.is_empty()),
        "span text for twin A must not be empty"
    );

    let result_b = app
        .fetch(span(doc_b.doc_id.clone(), 2), FetchOpts::default())
        .expect("fetch_span on twin B must succeed for a markdown file");
    assert_eq!(result_b.kind, FetchKind::Span);
    assert!(
        matches!(result_b.text.as_deref(), Some(t) if !t.is_empty()),
        "span text for twin B must not be empty"
    );

    // Re-ingest to provoke the asset.workspace_path flip-flop that the
    // original bug depended on, then fetch again through a fresh App.
    let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("second ingest must succeed");
    assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");

    let app2 = env.app();
    app2.fetch(span(doc_a.doc_id.clone(), 3), FetchOpts::default())
        .expect("fetch_span on twin A after flip-flop must still succeed");
    app2.fetch(span(doc_b.doc_id.clone(), 3), FetchOpts::default())
        .expect("fetch_span on twin B after flip-flop must still succeed");
}
|
||||||
@@ -275,6 +275,14 @@ enum Cmd {
|
|||||||
#[arg(long, group = "reset_scope")]
|
#[arg(long, group = "reset_scope")]
|
||||||
config_only: bool,
|
config_only: bool,
|
||||||
|
|
||||||
|
/// Purge stored docs that are outside the current walker scope
|
||||||
|
/// (config narrowing / removed sub-directory). No filesystem paths
|
||||||
|
/// are removed — this is purely a store-level reconciliation.
|
||||||
|
/// Filesystem existence is NOT checked; anything the current walker
|
||||||
|
/// would not visit is considered an orphan and removed from the store.
|
||||||
|
#[arg(long, group = "reset_scope")]
|
||||||
|
orphans_only: bool,
|
||||||
|
|
||||||
/// Skip the interactive confirm. Required in non-interactive
|
/// Skip the interactive confirm. Required in non-interactive
|
||||||
/// contexts (CI, pipes).
|
/// contexts (CI, pipes).
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -595,14 +603,20 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
|||||||
println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
|
println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
|
||||||
} else {
|
} else {
|
||||||
let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
|
let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
|
||||||
|
let purged_suffix = if report.purged_deleted_files > 0 {
|
||||||
|
format!(" purged {}", report.purged_deleted_files)
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
println!(
|
println!(
|
||||||
"scanned {} new {} updated {} skipped {}{} errors {} ({} ms)",
|
"scanned {} new {} updated {} skipped {}{} errors {}{} ({} ms)",
|
||||||
report.scanned,
|
report.scanned,
|
||||||
report.new,
|
report.new,
|
||||||
report.updated,
|
report.updated,
|
||||||
report.skipped,
|
report.skipped,
|
||||||
skipped_breakdown,
|
skipped_breakdown,
|
||||||
report.errors,
|
report.errors,
|
||||||
|
purged_suffix,
|
||||||
report.duration_ms
|
report.duration_ms
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -1088,6 +1102,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
|||||||
data_only: _,
|
data_only: _,
|
||||||
vector_only,
|
vector_only,
|
||||||
config_only,
|
config_only,
|
||||||
|
orphans_only,
|
||||||
yes,
|
yes,
|
||||||
} => {
|
} => {
|
||||||
use kebab_app::ResetScope;
|
use kebab_app::ResetScope;
|
||||||
@@ -1101,11 +1116,50 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
|||||||
ResetScope::VectorOnly
|
ResetScope::VectorOnly
|
||||||
} else if *config_only {
|
} else if *config_only {
|
||||||
ResetScope::ConfigOnly
|
ResetScope::ConfigOnly
|
||||||
|
} else if *orphans_only {
|
||||||
|
ResetScope::OrphansOnly
|
||||||
} else {
|
} else {
|
||||||
ResetScope::DataOnly
|
ResetScope::DataOnly
|
||||||
};
|
};
|
||||||
|
|
||||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||||
|
|
||||||
|
if matches!(scope, ResetScope::OrphansOnly) {
|
||||||
|
// OrphansOnly: confirm UI shows orphan count + sample paths
|
||||||
|
// rather than on-disk directory sizes.
|
||||||
|
let orphan_paths = kebab_app::enumerate_orphans(&cfg)?;
|
||||||
|
|
||||||
|
if !*yes {
|
||||||
|
use std::io::IsTerminal;
|
||||||
|
if !std::io::stdin().is_terminal() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"reset --orphans-only is destructive and stdin is non-interactive — pass --yes to proceed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if !confirm_orphans_only(&orphan_paths)? {
|
||||||
|
if !cli.quiet {
|
||||||
|
eprintln!("aborted.");
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let report = kebab_app::reset::execute(scope, &cfg)?;
|
||||||
|
if cli.json {
|
||||||
|
println!("{}", serde_json::to_string(&wire::wire_reset(&report))?);
|
||||||
|
} else {
|
||||||
|
if report.orphans_purged > 0 {
|
||||||
|
println!("orphans purged: {}", report.orphans_purged);
|
||||||
|
for p in &report.purged_paths {
|
||||||
|
println!(" - {}", p.0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
println!("no orphaned docs found — store is already in sync with walker scope");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let paths = kebab_app::reset::enumerate_paths(scope, &cfg);
|
let paths = kebab_app::reset::enumerate_paths(scope, &cfg);
|
||||||
let bytes = kebab_app::reset::estimate_size_bytes(&paths);
|
let bytes = kebab_app::reset::estimate_size_bytes(&paths);
|
||||||
|
|
||||||
@@ -1444,6 +1498,46 @@ fn confirm_destructive(
|
|||||||
Ok(matches!(s.as_str(), "y" | "yes"))
|
Ok(matches!(s.as_str(), "y" | "yes"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Confirm prompt for `--orphans-only`: shows the orphan count + a
|
||||||
|
/// sample of up to 5 paths so the user knows what will be purged before
|
||||||
|
/// committing. No filesystem paths are removed — only store records.
|
||||||
|
fn confirm_orphans_only(
|
||||||
|
orphan_paths: &[kebab_core::WorkspacePath],
|
||||||
|
) -> anyhow::Result<bool> {
|
||||||
|
use std::io::Write;
|
||||||
|
let n = orphan_paths.len();
|
||||||
|
let mut out = std::io::stderr().lock();
|
||||||
|
|
||||||
|
if n == 0 {
|
||||||
|
writeln!(out, "no orphaned docs found — nothing to purge.")?;
|
||||||
|
out.flush()?;
|
||||||
|
// Nothing to do; treat as confirmed so the caller can emit the
|
||||||
|
// "no orphans" report without prompting.
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
let sample: Vec<&str> = orphan_paths
|
||||||
|
.iter()
|
||||||
|
.take(5)
|
||||||
|
.map(|p| p.0.as_str())
|
||||||
|
.collect();
|
||||||
|
let sample_str = sample.join(", ");
|
||||||
|
let ellipsis = if n > 5 { ", …" } else { "" };
|
||||||
|
|
||||||
|
writeln!(
|
||||||
|
out,
|
||||||
|
"Purge {n} stored doc(s) outside the current walker scope? (no filesystem paths removed)"
|
||||||
|
)?;
|
||||||
|
writeln!(out, " sample: {sample_str}{ellipsis}")?;
|
||||||
|
write!(out, "[y/N] ")?;
|
||||||
|
out.flush()?;
|
||||||
|
|
||||||
|
let mut line = String::new();
|
||||||
|
std::io::stdin().read_line(&mut line)?;
|
||||||
|
let s = line.trim().to_ascii_lowercase();
|
||||||
|
Ok(matches!(s.as_str(), "y" | "yes"))
|
||||||
|
}
|
||||||
|
|
||||||
/// p9-fb-35: human-friendly plain output for `kebab fetch`.
|
/// p9-fb-35: human-friendly plain output for `kebab fetch`.
|
||||||
fn render_fetch_plain(r: &kebab_core::FetchResult) {
|
fn render_fetch_plain(r: &kebab_core::FetchResult) {
|
||||||
println!("# {} ({})", r.doc_path.0, format_kind(r.kind));
|
println!("# {} ({})", r.doc_path.0, format_kind(r.kind));
|
||||||
|
|||||||
@@ -260,6 +260,7 @@ mod tests {
|
|||||||
skipped_generated: 0,
|
skipped_generated: 0,
|
||||||
skipped_size_exceeded: 0,
|
skipped_size_exceeded: 0,
|
||||||
skip_examples: SkipExamples::default(),
|
skip_examples: SkipExamples::default(),
|
||||||
|
purged_deleted_files: 0,
|
||||||
items: None,
|
items: None,
|
||||||
};
|
};
|
||||||
let v = wire_ingest(&r);
|
let v = wire_ingest(&r);
|
||||||
@@ -364,6 +365,8 @@ mod tests {
|
|||||||
scope: kebab_app::ResetScope::DataOnly,
|
scope: kebab_app::ResetScope::DataOnly,
|
||||||
removed_paths: vec![std::path::PathBuf::from("/tmp/x")],
|
removed_paths: vec![std::path::PathBuf::from("/tmp/x")],
|
||||||
embedding_rows_truncated: 0,
|
embedding_rows_truncated: 0,
|
||||||
|
orphans_purged: 0,
|
||||||
|
purged_paths: vec![],
|
||||||
};
|
};
|
||||||
let v = wire_reset(&r);
|
let v = wire_reset(&r);
|
||||||
assert_eq!(schema_of(&v), Some("reset_report.v1"));
|
assert_eq!(schema_of(&v), Some("reset_report.v1"));
|
||||||
|
|||||||
@@ -47,6 +47,12 @@ pub struct IngestReport {
|
|||||||
/// p10-1A-1: sample file paths per skip category (≤ 5 each).
|
/// p10-1A-1: sample file paths per skip category (≤ 5 each).
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub skip_examples: SkipExamples,
|
pub skip_examples: SkipExamples,
|
||||||
|
/// Dogfood: docs whose on-disk file was deleted since the last ingest
|
||||||
|
/// and were therefore removed from the store. Additive field — older
|
||||||
|
/// wire consumers that pre-date this field read it as 0 via
|
||||||
|
/// `#[serde(default)]`.
|
||||||
|
#[serde(default)]
|
||||||
|
pub purged_deleted_files: u32,
|
||||||
/// `None` ↔ wire `items: null` (`--summary-only`).
|
/// `None` ↔ wire `items: null` (`--summary-only`).
|
||||||
pub items: Option<Vec<IngestItem>>,
|
pub items: Option<Vec<IngestItem>>,
|
||||||
}
|
}
|
||||||
@@ -136,6 +142,7 @@ mod tests {
|
|||||||
builtin_blacklist: vec!["node_modules/x.js".into()],
|
builtin_blacklist: vec!["node_modules/x.js".into()],
|
||||||
gitignore: vec![],
|
gitignore: vec![],
|
||||||
},
|
},
|
||||||
|
purged_deleted_files: 0,
|
||||||
items: None,
|
items: None,
|
||||||
};
|
};
|
||||||
let v = serde_json::to_value(&r).unwrap();
|
let v = serde_json::to_value(&r).unwrap();
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use serde_json::Value;
|
|||||||
use crate::asset::{RawAsset, WorkspacePath};
|
use crate::asset::{RawAsset, WorkspacePath};
|
||||||
use crate::chunk::Chunk;
|
use crate::chunk::Chunk;
|
||||||
use crate::document::{Block, CanonicalDocument};
|
use crate::document::{Block, CanonicalDocument};
|
||||||
use crate::ids::{ChunkId, DocumentId};
|
use crate::ids::{AssetId, ChunkId, DocumentId};
|
||||||
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||||
use crate::media::MediaType;
|
use crate::media::MediaType;
|
||||||
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
||||||
@@ -161,10 +161,23 @@ pub trait DocumentStore {
|
|||||||
fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
|
fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
|
||||||
fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
|
fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
|
||||||
fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
|
fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
|
||||||
|
/// Look up an asset row by its `asset_id` (PRIMARY KEY = blake3
|
||||||
|
/// content hash). Twin-file safe: asset_id is PK so there is
|
||||||
|
/// exactly one row per unique content hash, regardless of how many
|
||||||
|
/// `documents` rows share it. Use this instead of
|
||||||
|
/// `get_asset_by_workspace_path` when you already have a
|
||||||
|
/// `CanonicalDocument` (which carries `source_asset_id`).
|
||||||
|
fn get_asset(&self, id: &AssetId) -> anyhow::Result<Option<RawAsset>>;
|
||||||
|
|
||||||
/// p9-fb-23: look up an asset row by its workspace path. Used by
|
/// p9-fb-23: look up an asset row by its workspace path. Used by
|
||||||
/// the incremental-ingest skip path to compare the freshly
|
/// the incremental-ingest skip path to compare the freshly
|
||||||
/// computed blake3 checksum against what's already in SQLite. The
|
/// computed blake3 checksum against what's already in SQLite. The
|
||||||
/// schema enforces a unique workspace_path per asset.
|
/// schema enforces a unique workspace_path per asset.
|
||||||
|
///
|
||||||
|
/// NOTE: for twin files (identical content at different paths),
|
||||||
|
/// `assets.workspace_path` is "last-registered path" — it
|
||||||
|
/// flip-flops on every ingest. Prefer `get_asset` (by asset_id)
|
||||||
|
/// when you have a `CanonicalDocument.source_asset_id`.
|
||||||
fn get_asset_by_workspace_path(
|
fn get_asset_by_workspace_path(
|
||||||
&self,
|
&self,
|
||||||
path: &WorkspacePath,
|
path: &WorkspacePath,
|
||||||
@@ -183,6 +196,16 @@ pub trait DocumentStore {
|
|||||||
&self,
|
&self,
|
||||||
path: &WorkspacePath,
|
path: &WorkspacePath,
|
||||||
) -> anyhow::Result<Option<CanonicalDocument>>;
|
) -> anyhow::Result<Option<CanonicalDocument>>;
|
||||||
|
|
||||||
|
/// Return every `workspace_path` stored in the `documents` table.
|
||||||
|
///
|
||||||
|
/// Used by the post-walker sweep in `kebab-app::ingest` to detect
|
||||||
|
/// documents whose source file has been deleted from the filesystem.
|
||||||
|
/// The set difference `(stored - scanned)` yields orphan candidates;
|
||||||
|
/// each candidate is then existence-checked on disk so that
|
||||||
|
/// out-of-scope files (config narrowing) are NOT purged — only truly
|
||||||
|
/// absent files trigger the purge.
|
||||||
|
fn all_workspace_paths(&self) -> anyhow::Result<Vec<WorkspacePath>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait VectorStore {
|
pub trait VectorStore {
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
|
|||||||
match ext.as_str() {
|
match ext.as_str() {
|
||||||
"rs" => Some("rust"),
|
"rs" => Some("rust"),
|
||||||
"py" | "pyi" => Some("python"),
|
"py" | "pyi" => Some("python"),
|
||||||
"ts" | "tsx" => Some("typescript"),
|
"ts" | "tsx" | "mts" | "cts" => Some("typescript"),
|
||||||
"js" | "mjs" | "cjs" | "jsx" => Some("javascript"),
|
"js" | "mjs" | "cjs" | "jsx" => Some("javascript"),
|
||||||
"go" => Some("go"),
|
"go" => Some("go"),
|
||||||
"java" => Some("java"),
|
"java" => Some("java"),
|
||||||
@@ -82,7 +82,7 @@ pub fn module_path_for_python(workspace_path: &str) -> String {
|
|||||||
/// (no slash replacement, no source-root strip). See plan §Task C.
|
/// (no slash replacement, no source-root strip). See plan §Task C.
|
||||||
pub fn module_path_for_tsjs(workspace_path: &str) -> String {
|
pub fn module_path_for_tsjs(workspace_path: &str) -> String {
|
||||||
let p = workspace_path;
|
let p = workspace_path;
|
||||||
for ext in [".tsx", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
|
for ext in [".tsx", ".mts", ".cts", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
|
||||||
if let Some(stripped) = p.strip_suffix(ext) {
|
if let Some(stripped) = p.strip_suffix(ext) {
|
||||||
return stripped.to_string();
|
return stripped.to_string();
|
||||||
}
|
}
|
||||||
@@ -110,7 +110,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn module_path_for_tsjs_keeps_slashes_and_strips_ext() {
|
fn module_path_for_tsjs_keeps_slashes_and_strips_ext() {
|
||||||
for ext in ["ts", "tsx", "js", "jsx", "mjs", "cjs"] {
|
for ext in ["ts", "tsx", "mts", "cts", "js", "jsx", "mjs", "cjs"] {
|
||||||
let p = format!("src/search/retriever/Retriever.{ext}");
|
let p = format!("src/search/retriever/Retriever.{ext}");
|
||||||
assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever");
|
assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -173,8 +173,9 @@ impl Extractor for TypescriptAstExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Select the tree-sitter grammar based on the workspace path's
|
/// Select the tree-sitter grammar based on the workspace path's
|
||||||
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.d.ts`,
|
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.mts`,
|
||||||
/// missing extension) → TypeScript grammar.
|
/// `.cts`, `.d.ts`, missing extension) → TypeScript grammar (the JSX-
|
||||||
|
/// agnostic variants all share one grammar in tree-sitter-typescript 0.23).
|
||||||
fn select_grammar(workspace_path: &str) -> tree_sitter::Language {
|
fn select_grammar(workspace_path: &str) -> tree_sitter::Language {
|
||||||
if workspace_path.ends_with(".tsx") {
|
if workspace_path.ends_with(".tsx") {
|
||||||
tree_sitter_typescript::LANGUAGE_TSX.into()
|
tree_sitter_typescript::LANGUAGE_TSX.into()
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ fn known_extensions_map_to_canonical_identifiers() {
|
|||||||
("foo.pyi", Some("python")),
|
("foo.pyi", Some("python")),
|
||||||
("foo.ts", Some("typescript")),
|
("foo.ts", Some("typescript")),
|
||||||
("foo.tsx", Some("typescript")),
|
("foo.tsx", Some("typescript")),
|
||||||
|
("foo.mts", Some("typescript")), // ESM TS — same grammar
|
||||||
|
("foo.cts", Some("typescript")), // CommonJS TS — same grammar
|
||||||
("foo.js", Some("javascript")),
|
("foo.js", Some("javascript")),
|
||||||
("foo.mjs", Some("javascript")),
|
("foo.mjs", Some("javascript")),
|
||||||
("foo.cjs", Some("javascript")),
|
("foo.cjs", Some("javascript")),
|
||||||
|
|||||||
@@ -19,7 +19,9 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
|
|||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
match ext.as_str() {
|
match ext.as_str() {
|
||||||
"md" => MediaType::Markdown,
|
// Markdown + MDX (markdown + JSX, treated as plain markdown — the
|
||||||
|
// JSX islands are folded into raw passthrough by the md parser).
|
||||||
|
"md" | "mdx" => MediaType::Markdown,
|
||||||
"pdf" => MediaType::Pdf,
|
"pdf" => MediaType::Pdf,
|
||||||
|
|
||||||
"png" => MediaType::Image(ImageType::Png),
|
"png" => MediaType::Image(ImageType::Png),
|
||||||
@@ -40,7 +42,8 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
|
|||||||
|
|
||||||
// p10-1B: Python / TS / JS AST chunkers active.
|
// p10-1B: Python / TS / JS AST chunkers active.
|
||||||
"py" | "pyi" => MediaType::Code("python".into()),
|
"py" | "pyi" => MediaType::Code("python".into()),
|
||||||
"ts" | "tsx" => MediaType::Code("typescript".into()),
|
// .mts / .cts are TypeScript ESM / CommonJS variants — same grammar.
|
||||||
|
"ts" | "tsx" | "mts" | "cts" => MediaType::Code("typescript".into()),
|
||||||
"js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()),
|
"js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()),
|
||||||
|
|
||||||
// Empty string (no extension) and any other extension: bucket as
|
// Empty string (no extension) and any other extension: bucket as
|
||||||
@@ -102,6 +105,20 @@ mod tests {
|
|||||||
assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into()));
|
assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ts_variants_mts_cts() {
|
||||||
|
// .mts / .cts are TypeScript ESM / CommonJS — same grammar as .ts.
|
||||||
|
assert_eq!(media_type_for(Path::new("a/b.mts")), MediaType::Code("typescript".into()));
|
||||||
|
assert_eq!(media_type_for(Path::new("a/b.cts")), MediaType::Code("typescript".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn mdx_routes_to_markdown() {
|
||||||
|
// MDX is markdown with JSX islands; the md parser folds the JSX
|
||||||
|
// through as raw passthrough.
|
||||||
|
assert_eq!(media_type_for(Path::new("docs/page.mdx")), MediaType::Markdown);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn unknown_and_missing_extension() {
|
fn unknown_and_missing_extension() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
@@ -56,5 +56,6 @@
|
|||||||
"skipped_kebabignore": 0,
|
"skipped_kebabignore": 0,
|
||||||
"skipped_size_exceeded": 0,
|
"skipped_size_exceeded": 0,
|
||||||
"unchanged": 0,
|
"unchanged": 0,
|
||||||
|
"purged_deleted_files": 0,
|
||||||
"updated": 1
|
"updated": 1
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -264,6 +264,28 @@ impl kebab_core::DocumentStore for SqliteStore {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_asset(
|
||||||
|
&self,
|
||||||
|
id: &kebab_core::AssetId,
|
||||||
|
) -> Result<Option<kebab_core::RawAsset>> {
|
||||||
|
let conn = self.lock_conn();
|
||||||
|
let result = conn.query_row(
|
||||||
|
r#"SELECT
|
||||||
|
asset_id, source_uri, workspace_path, media_type,
|
||||||
|
byte_len, checksum, storage_kind, storage_path,
|
||||||
|
discovered_at
|
||||||
|
FROM assets
|
||||||
|
WHERE asset_id = ?"#,
|
||||||
|
rusqlite::params![id.0.as_str()],
|
||||||
|
asset_from_row,
|
||||||
|
);
|
||||||
|
match result {
|
||||||
|
Ok(asset) => Ok(Some(asset)),
|
||||||
|
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
|
||||||
|
Err(e) => Err(e.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn get_asset_by_workspace_path(
|
fn get_asset_by_workspace_path(
|
||||||
&self,
|
&self,
|
||||||
path: &kebab_core::WorkspacePath,
|
path: &kebab_core::WorkspacePath,
|
||||||
@@ -352,6 +374,22 @@ impl kebab_core::DocumentStore for SqliteStore {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn all_workspace_paths(&self) -> Result<Vec<kebab_core::WorkspacePath>> {
|
||||||
|
let conn = self.lock_conn();
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare("SELECT workspace_path FROM documents")
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
let rows = stmt
|
||||||
|
.query_map([], |r| r.get::<_, String>(0))
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
let mut out = Vec::new();
|
||||||
|
for row in rows {
|
||||||
|
let path = row.map_err(StoreError::from)?;
|
||||||
|
out.push(kebab_core::WorkspacePath(path));
|
||||||
|
}
|
||||||
|
Ok(out)
|
||||||
|
}
|
||||||
|
|
||||||
fn list_documents(
|
fn list_documents(
|
||||||
&self,
|
&self,
|
||||||
filter: &kebab_core::DocFilter,
|
filter: &kebab_core::DocFilter,
|
||||||
@@ -616,7 +654,8 @@ fn rows_optional<T>(err: rusqlite::Error) -> rusqlite::Result<Option<T>> {
|
|||||||
|
|
||||||
/// Reconstruct a [`kebab_core::RawAsset`] from one `assets` row.
|
/// Reconstruct a [`kebab_core::RawAsset`] from one `assets` row.
|
||||||
/// Row mapper for `RawAsset`. Column names are self-documenting; the
|
/// Row mapper for `RawAsset`. Column names are self-documenting; the
|
||||||
/// SELECT in [`DocumentStore::get_asset_by_workspace_path`] must include
|
/// SELECTs in [`DocumentStore::get_asset`] and
|
||||||
|
/// [`DocumentStore::get_asset_by_workspace_path`] must both include
|
||||||
/// all nine columns by their schema names.
|
/// all nine columns by their schema names.
|
||||||
fn asset_from_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<kebab_core::RawAsset> {
|
fn asset_from_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<kebab_core::RawAsset> {
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|||||||
@@ -35,4 +35,4 @@ pub use error::StoreError;
|
|||||||
pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow};
|
pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow};
|
||||||
pub use fts::rebuild_chunks_fts;
|
pub use fts::rebuild_chunks_fts;
|
||||||
pub use jobs::IngestRunRow;
|
pub use jobs::IngestRunRow;
|
||||||
pub use store::{CountSummary, NotIndexed, SqliteStore};
|
pub use store::{CountSummary, NotIndexed, SqliteStore, purge_deleted_workspace_path};
|
||||||
|
|||||||
@@ -540,10 +540,132 @@ pub(crate) fn purge_orphan_at_workspace_path(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Purge all stored data for a document whose on-disk file has been
|
||||||
|
/// deleted (as opposed to content-changed, which is handled by
|
||||||
|
/// `purge_orphan_at_workspace_path`).
|
||||||
|
///
|
||||||
|
/// Returns the `chunk_id`s that were associated with the document so
|
||||||
|
/// the caller can issue a matching `VectorStore::delete_by_chunk_ids`
|
||||||
|
/// on the LanceDB side.
|
||||||
|
///
|
||||||
|
/// Deletion order:
|
||||||
|
/// 1. Collect chunk_ids (before cascade removes them).
|
||||||
|
/// 2. DELETE the `documents` row → CASCADE clears `blocks`, `chunks`,
|
||||||
|
/// `embedding_records`.
|
||||||
|
/// 3. DELETE the `assets` row **only if no other document still
|
||||||
|
/// references it** (twin-file protection — `assets` can be shared
|
||||||
|
/// across identical-content files via the blake3 PK).
|
||||||
|
/// 4. If the asset was `storage_kind = 'copied'`, best-effort delete
|
||||||
|
/// the on-disk byte file at `storage_path`.
|
||||||
|
///
|
||||||
|
/// Returns `Ok(vec![])` when no document exists at `workspace_path`
|
||||||
|
/// (idempotent — caller doesn't need to pre-check).
|
||||||
|
pub fn purge_deleted_workspace_path(
|
||||||
|
store: &SqliteStore,
|
||||||
|
workspace_path: &kebab_core::WorkspacePath,
|
||||||
|
) -> anyhow::Result<Vec<kebab_core::ChunkId>> {
|
||||||
|
let conn = store.lock_conn();
|
||||||
|
|
||||||
|
// Look up the document + its asset_id.
|
||||||
|
let doc_row: Option<(String, String)> = conn
|
||||||
|
.query_row(
|
||||||
|
"SELECT doc_id, asset_id FROM documents WHERE workspace_path = ?",
|
||||||
|
rusqlite::params![workspace_path.0],
|
||||||
|
|r| Ok((r.get(0)?, r.get(1)?)),
|
||||||
|
)
|
||||||
|
.optional()
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
|
||||||
|
let Some((doc_id, asset_id)) = doc_row else {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
};
|
||||||
|
|
||||||
|
// 1. Collect chunk_ids before CASCADE removes them.
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare("SELECT chunk_id FROM chunks WHERE doc_id = ?")
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
let rows = stmt
|
||||||
|
.query_map(rusqlite::params![doc_id], |r| r.get::<_, String>(0))
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
let chunk_ids: Vec<kebab_core::ChunkId> = rows
|
||||||
|
.map(|r| r.map(kebab_core::ChunkId))
|
||||||
|
.collect::<rusqlite::Result<Vec<_>>>()
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
drop(stmt);
|
||||||
|
|
||||||
|
// 2. DELETE the document row (CASCADE clears blocks / chunks /
|
||||||
|
// embedding_records via the FK constraints in V001).
|
||||||
|
conn.execute(
|
||||||
|
"DELETE FROM documents WHERE doc_id = ?",
|
||||||
|
rusqlite::params![doc_id],
|
||||||
|
)
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
|
||||||
|
// 3. Delete the asset row only when no other document still
|
||||||
|
// references it (twin-file safety: two files with identical
|
||||||
|
// bytes share a single asset row via the blake3 PK).
|
||||||
|
let remaining_refs: i64 = conn
|
||||||
|
.query_row(
|
||||||
|
"SELECT COUNT(*) FROM documents WHERE asset_id = ?",
|
||||||
|
rusqlite::params![asset_id],
|
||||||
|
|r| r.get(0),
|
||||||
|
)
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
|
||||||
|
if remaining_refs == 0 {
|
||||||
|
// 4. Capture storage details before deleting the row.
|
||||||
|
let asset_storage: Option<(String, String)> = conn
|
||||||
|
.query_row(
|
||||||
|
"SELECT storage_kind, storage_path FROM assets WHERE asset_id = ?",
|
||||||
|
rusqlite::params![asset_id],
|
||||||
|
|r| Ok((r.get(0)?, r.get(1)?)),
|
||||||
|
)
|
||||||
|
.optional()
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
|
||||||
|
conn.execute(
|
||||||
|
"DELETE FROM assets WHERE asset_id = ?",
|
||||||
|
rusqlite::params![asset_id],
|
||||||
|
)
|
||||||
|
.map_err(StoreError::from)?;
|
||||||
|
|
||||||
|
// 5. Best-effort: remove the on-disk copied asset file.
|
||||||
|
if let Some((storage_kind, storage_path)) = asset_storage {
|
||||||
|
if storage_kind == "copied" {
|
||||||
|
let _ = std::fs::remove_file(&storage_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::debug!(
|
||||||
|
target: "kebab-store-sqlite",
|
||||||
|
workspace_path = %workspace_path.0,
|
||||||
|
doc_id = %doc_id,
|
||||||
|
chunk_count = chunk_ids.len(),
|
||||||
|
"purged deleted-file document from store"
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(chunk_ids)
|
||||||
|
}
|
||||||
|
|
||||||
/// UPSERT a row into `assets`. Used by both the `put_asset_with_bytes`
|
/// UPSERT a row into `assets`. Used by both the `put_asset_with_bytes`
|
||||||
/// path (which has bytes + computed `storage_kind/path`) and the
|
/// path (which has bytes + computed `storage_kind/path`) and the
|
||||||
/// `DocumentStore::put_asset` path (which only has the `RawAsset` and
|
/// `DocumentStore::put_asset` path (which only has the `RawAsset` and
|
||||||
/// reads `storage_kind/path` from `asset.stored`).
|
/// reads `storage_kind/path` from `asset.stored`).
|
||||||
|
///
|
||||||
|
/// **`assets.workspace_path` is "last-registered path" semantics for
|
||||||
|
/// twin files** (two source files with identical content share one
|
||||||
|
/// `assets` row keyed on `asset_id = blake3(content)`). Each ingest
|
||||||
|
/// of either twin overwrites `workspace_path` with whichever path was
|
||||||
|
/// seen most recently — this is intentional and correct after PR #146
|
||||||
|
/// made `try_skip_unchanged` document-centric (uses
|
||||||
|
/// `get_document_by_workspace_path`, not `get_asset_by_workspace_path`)
|
||||||
|
/// and PR #149 made `reset --orphans-only` document-centric too.
|
||||||
|
/// Do NOT "fix" the flip-flop by adding a UNIQUE constraint on
|
||||||
|
/// `workspace_path` in the `assets` table — twin de-dup is load-bearing.
|
||||||
|
/// When you need media_type for a known document, use the 2-step lookup
|
||||||
|
/// `get_document_by_workspace_path` → `doc.source_asset_id` →
|
||||||
|
/// `get_asset(asset_id)` so the result is twin-safe.
|
||||||
pub(crate) fn upsert_asset_row(
|
pub(crate) fn upsert_asset_row(
|
||||||
conn: &Connection,
|
conn: &Connection,
|
||||||
asset: &kebab_core::RawAsset,
|
asset: &kebab_core::RawAsset,
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ fn fixture_report() -> IngestReport {
|
|||||||
skipped_generated: 0,
|
skipped_generated: 0,
|
||||||
skipped_size_exceeded: 0,
|
skipped_size_exceeded: 0,
|
||||||
skip_examples: kebab_core::SkipExamples::default(),
|
skip_examples: kebab_core::SkipExamples::default(),
|
||||||
|
purged_deleted_files: 0,
|
||||||
items: Some(vec![
|
items: Some(vec![
|
||||||
IngestItem {
|
IngestItem {
|
||||||
kind: IngestItemKind::New,
|
kind: IngestItemKind::New,
|
||||||
|
|||||||
@@ -14,12 +14,18 @@
|
|||||||
"schema_version": { "const": "reset_report.v1" },
|
"schema_version": { "const": "reset_report.v1" },
|
||||||
"scope": {
|
"scope": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": ["all", "data_only", "vector_only", "config_only"]
|
"enum": ["all", "data_only", "vector_only", "config_only", "orphans_only"]
|
||||||
},
|
},
|
||||||
"removed_paths": {
|
"removed_paths": {
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": { "type": "string" }
|
"items": { "type": "string" }
|
||||||
},
|
},
|
||||||
"embedding_rows_truncated": { "type": "integer", "minimum": 0 }
|
"embedding_rows_truncated": { "type": "integer", "minimum": 0 },
|
||||||
|
"orphans_purged": { "type": "integer", "minimum": 0, "default": 0 },
|
||||||
|
"purged_paths": {
|
||||||
|
"type": "array",
|
||||||
|
"items": { "type": "string" },
|
||||||
|
"default": []
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user