From c3a48a88a54671265ec3d464f234fd3d50714651 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 18:23:42 +0000 Subject: [PATCH] =?UTF-8?q?feat(app):=20add=20reset=20module=20=E2=80=94?= =?UTF-8?q?=20scope,=20path=20enumeration,=20execute?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provides the wipe core for `kebab reset`. Mutually-exclusive ResetScope variants (All / DataOnly / VectorOnly / ConfigOnly), pure path enumeration for the confirm UI preview, byte-size estimator, and an execute helper that removes paths off-disk + truncates embedding_records when scope is VectorOnly. Plan deviation from the original spec (task 2): - Original `truncate_embeddings` helper opened SqliteStore via path and ran a separate COUNT query through `lock_conn` (private). Both APIs are unavailable from outside the crate, so the helper now opens the store via `SqliteStore::open(&Config)` and lets `truncate_embedding_records` (task 1) report the deleted count directly. - Skipped the XDG-env-overriding unit test from the original plan to avoid race conditions with sibling tests; the equivalent integration coverage moves up to the CLI tests in task 4 where each invocation runs in a fresh process. - Added an FS-touching unit test (`estimate_size_sums_file_lengths`) to cover the read-side of `estimate_size_bytes` against a tempdir. p9-fb-06 task 2. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 2 + crates/kebab-app/src/reset.rs | 203 ++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 crates/kebab-app/src/reset.rs diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 78c8a81..b7877a8 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -57,8 +57,10 @@ use kebab_source_fs::FsSourceConnector; mod app; pub mod doctor_signal; pub mod logging; +pub mod reset; pub use app::App; +pub use reset::{ResetReport, ResetScope}; /// Parser-version label persisted in `documents.parser_version` for /// every Markdown file ingested through the `kb-parse-md` pipeline. diff --git a/crates/kebab-app/src/reset.rs b/crates/kebab-app/src/reset.rs new file mode 100644 index 0000000..5247fed --- /dev/null +++ b/crates/kebab-app/src/reset.rs @@ -0,0 +1,203 @@ +//! `kebab reset` core — scope-driven path enumeration + wipe. +//! +//! The CLI (and any future TUI surface) calls `enumerate_paths(scope, &cfg)` +//! to compute exactly which on-disk paths the user has asked to remove, +//! presents that list for confirmation, then calls `execute(scope, &cfg)` +//! to actually remove them. Splitting the read step (enumerate) from the +//! write step (execute) is what lets the confirm UI show a faithful +//! preview without having to re-derive the path set. +//! +//! `--vector-only` additionally truncates `embedding_records` in SQLite +//! so the next `kebab ingest` re-embeds cleanly without orphan rows. + +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; + +use kebab_config::{Config, expand_path}; + +/// What the user asked to remove. Mutually exclusive — picked by the CLI +/// from a clap `ArgGroup`. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ResetScope {
+    /// Wipe config + data + cache + state (all four XDG dirs).
+    All,
+    /// Wipe data + cache + state. Config is preserved so the next run
+    /// behaves the same. Default when the user passes `--data-only`.
+    DataOnly,
+    /// Wipe only the Lance vector_dir off-disk AND truncate the matching
+    /// `embedding_records` rows in SQLite. Documents / chunks survive.
+    VectorOnly,
+    /// Wipe only the config dir.
+    ConfigOnly,
+}
+
+/// Result of a successful wipe — emitted as `reset_report.v1` by the
+/// CLI's `--json` mode and used by the human-mode summary line.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ResetReport {
+    /// The scope that was executed.
+    pub scope: ResetScope,
+    /// Paths that were actually removed by the call (paths that were
+    /// already absent are not listed).
+    pub removed_paths: Vec<PathBuf>,
+    /// Number of `embedding_records` rows deleted; `0` for every scope
+    /// other than `ResetScope::VectorOnly`.
+    pub embedding_rows_truncated: u64,
+}
+
+/// Compute the absolute on-disk paths a given scope will wipe, given a
+/// loaded `Config`. Pure — does NOT touch the filesystem.
+///
+/// `--all` returns all four XDG paths in a stable order (config, data,
+/// cache, state). `--vector-only` returns the resolved `storage.vector_dir`.
+/// Order is preserved across calls so the confirm UI is deterministic.
+///
+/// `cfg` is only consulted for `ResetScope::VectorOnly`; every other scope
+/// is derived from the `Config::xdg_*` helpers alone.
+pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
+    // The config directory is the parent of `xdg_config_path()` (presumably
+    // the config *file*). When that path has no parent this falls back to
+    // an empty `PathBuf` rather than failing.
+    let cfg_dir = Config::xdg_config_path()
+        .parent()
+        .map(PathBuf::from)
+        .unwrap_or_default();
+    let data_dir = Config::xdg_data_dir();
+    let cache_dir = Config::xdg_cache_dir();
+    let state_dir = Config::xdg_state_dir();
+
+    match scope {
+        ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
+        ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
+        ResetScope::VectorOnly => {
+            // NOTE(review): `vector_dir` is expanded against the XDG data
+            // dir, while `truncate_embeddings` locates the SQLite file via
+            // `cfg.storage.data_dir` — confirm both resolve to the same
+            // place when `storage.data_dir` is customised.
+            let vector_dir =
+                expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
+            vec![vector_dir]
+        }
+        ResetScope::ConfigOnly => vec![cfg_dir],
+    }
+}
+
+/// Best-effort byte size of a directory tree (returns 0 on any I/O error
+/// — this is for the confirm UI, not accounting).
+/// Skips broken symlinks instead of bubbling errors so a half-broken cache
+/// still gets summed.
+///
+/// Only regular files are counted. Symlinks are never followed (the entry's
+/// own file type is inspected), so a link to a directory cannot cause a
+/// cycle. A path in `paths` that cannot be read as a directory — missing,
+/// unreadable, or a plain file — contributes 0.
+pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
+    fn walk(p: &std::path::Path) -> u64 {
+        let mut total = 0u64;
+        let entries = match std::fs::read_dir(p) {
+            Ok(it) => it,
+            Err(_) => return 0,
+        };
+        // `flatten()` drops entries that failed to read; best-effort only.
+        for e in entries.flatten() {
+            let ft = match e.file_type() {
+                Ok(t) => t,
+                Err(_) => continue,
+            };
+            if ft.is_dir() {
+                total += walk(&e.path());
+            } else if ft.is_file() {
+                total += e.metadata().map(|m| m.len()).unwrap_or(0);
+            }
+        }
+        total
+    }
+    paths.iter().map(|p| walk(p)).sum()
+}
+
+/// Wipe every path from `enumerate_paths(scope, cfg)`. For
+/// `ResetScope::VectorOnly`, also truncates the SQLite
+/// `embedding_records` table so the store doesn't point at the Lance
+/// rows we just removed off-disk.
+///
+/// Idempotent: a missing path is treated as already-removed (success).
+/// Returns a `ResetReport` listing exactly what was removed (paths that
+/// existed before the call) so `--json` callers see the truth, not the
+/// request.
+///
+/// # Errors
+///
+/// Fails on the first path that exists but cannot be removed (the error
+/// is annotated with `remove <path>`), or when truncating
+/// `embedding_records` fails. Paths removed before the failure stay
+/// removed.
+pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
+    let paths = enumerate_paths(scope, cfg);
+    let mut removed = Vec::new();
+
+    for p in paths {
+        // `enumerate_paths` can fall back to an empty path for the config
+        // dir; never hand that to the remover.
+        if p.as_os_str().is_empty() {
+            continue;
+        }
+        // Attempt the removal directly instead of probing with `exists()`
+        // first: the probe races with concurrent deletion, and it follows
+        // symlinks, so a dangling link would be skipped instead of wiped.
+        match std::fs::remove_dir_all(&p) {
+            Ok(()) => removed.push(p),
+            // Already gone — idempotent success, but not reported as removed.
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => {
+                return Err(e).with_context(|| format!("remove {}", p.display()));
+            }
+        }
+    }
+
+    // Exhaustive on purpose: adding a scope forces a decision here.
+    let embedding_rows_truncated = match scope {
+        ResetScope::VectorOnly => truncate_embeddings(cfg)?,
+        ResetScope::All | ResetScope::DataOnly | ResetScope::ConfigOnly => 0,
+    };
+
+    Ok(ResetReport {
+        scope,
+        removed_paths: removed,
+        embedding_rows_truncated,
+    })
+}
+
+/// Open the SQLite store at the configured path and run
+/// `truncate_embedding_records`. Returns the count of truncated rows
+/// (the helper itself reports `DELETE` rowcount). If the SQLite file
+/// does not exist (e.g. user has never ingested), returns 0 — not an
+/// error.
+fn truncate_embeddings(cfg: &Config) -> Result<u64> {
+    // Probe for the database file first so a never-ingested install is a
+    // no-op rather than an open of a missing database.
+    // NOTE(review): assumes `SqliteStore::open(cfg)` resolves the same
+    // `<storage.data_dir>/kebab.sqlite` location probed here — confirm.
+    let data_dir = expand_path(&cfg.storage.data_dir, "");
+    let sqlite_path = data_dir.join("kebab.sqlite");
+    if !sqlite_path.exists() {
+        return Ok(0);
+    }
+    let store = kebab_store_sqlite::SqliteStore::open(cfg)
+        .context("open SqliteStore for truncate_embedding_records")?;
+    store.truncate_embedding_records()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Default config with `storage.vector_dir` overridden.
+    fn cfg_with_vector_dir(s: &str) -> Config {
+        let mut c = Config::defaults();
+        c.storage.vector_dir = s.to_string();
+        c
+    }
+
+    #[test]
+    fn enumerate_data_only_excludes_config_dir() {
+        let cfg = Config::defaults();
+        let paths = enumerate_paths(ResetScope::DataOnly, &cfg);
+        let cfg_dir = Config::xdg_config_path()
+            .parent()
+            .map(PathBuf::from)
+            .unwrap_or_default();
+        assert!(!paths.contains(&cfg_dir));
+    }
+
+    #[test]
+    fn enumerate_vector_only_returns_resolved_vector_dir() {
+        let cfg = cfg_with_vector_dir("{data_dir}/lancedb");
+        let paths = enumerate_paths(ResetScope::VectorOnly, &cfg);
+        assert_eq!(paths.len(), 1);
+        // Component-wise comparison: independent of the path separator.
+        assert!(paths[0].ends_with("lancedb"), "got: {}", paths[0].display());
+    }
+
+    #[test]
+    fn enumerate_all_has_four_distinct_paths() {
+        let cfg = Config::defaults();
+        let paths = enumerate_paths(ResetScope::All, &cfg);
+        assert_eq!(paths.len(), 4);
+        let unique: std::collections::HashSet<_> = paths.iter().collect();
+        assert_eq!(unique.len(), 4);
+    }
+
+    #[test]
+    fn estimate_size_returns_zero_on_missing_dir() {
+        assert_eq!(estimate_size_bytes(&[PathBuf::from("/nonexistent/xyz")]), 0);
+    }
+
+    #[test]
+    fn estimate_size_sums_file_lengths() {
+        let dir = tempfile::tempdir().unwrap();
+        std::fs::write(dir.path().join("a"), b"hello").unwrap();
+        std::fs::create_dir(dir.path().join("nested")).unwrap();
+        std::fs::write(dir.path().join("nested/b"), b"world!").unwrap();
+        let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]);
+        assert_eq!(bytes, 5 + 6);
+    }
+}