Files
kebab/crates/kebab-app/src/reset.rs
altair823 685007789a style: cargo fmt --all (round 4 ingest log feature follow-up)
Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이
test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all
적용. 모든 import alphabetical reorder + line wrapping 정합.

추가 untracked artifact 동시 commit:
- docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT)
- docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT)

workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 04:18:40 +00:00

392 lines
14 KiB
Rust

//! `kebab reset` core — scope-driven path enumeration + wipe.
//!
//! The CLI (and any future TUI surface) calls `enumerate_paths(scope, &cfg)`
//! to compute exactly which on-disk paths the user has asked to remove,
//! presents that list for confirmation, then calls `execute(scope, &cfg)`
//! to actually remove them. Splitting the read step (enumerate) from the
//! write step (execute) is what lets the confirm UI show a faithful
//! preview without having to re-derive the path set.
//!
//! `--vector-only` additionally truncates `embedding_records` in SQLite
//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
//!
//! `--orphans-only` purges stored docs that are outside the current walker
//! scope (config narrowing / removed sub-directory). No filesystem paths are
//! removed — this is purely a store-level reconciliation.
use std::collections::HashSet;
use std::path::PathBuf;
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use kebab_config::{Config, expand_path};
use kebab_core::WorkspacePath;
/// What the user asked to remove. Mutually exclusive — picked by the CLI
/// from a clap `ArgGroup`.
///
/// Serialized with snake_case variant names (`all`, `data_only`,
/// `vector_only`, `config_only`, `orphans_only`) because of the
/// `rename_all` attribute below.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ResetScope {
/// Wipe config + data + cache + state (all four XDG dirs).
All,
/// Wipe data + cache + state. Config is preserved so the next run
/// behaves the same. Default when the user passes `--data-only`.
DataOnly,
/// Wipe only the Lance vector_dir off-disk (the resolved
/// `storage.vector_dir`) AND truncate the matching `embedding_records`
/// rows in SQLite. Documents / chunks survive.
VectorOnly,
/// Wipe only the config dir.
ConfigOnly,
/// Purge stored docs that are outside the current walker scope (no
/// filesystem paths are removed). Filesystem existence is NOT checked —
/// anything the current walker would not visit is considered an orphan.
/// The explicit complement to the conservative `sweep_deleted_files`
/// that runs during ingest (which leaves on-disk-but-out-of-scope docs
/// alone for data safety).
OrphansOnly,
}
/// Result of a successful wipe — emitted as `reset_report.v1` by the
/// CLI's `--json` mode and used by the human-mode summary line.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ResetReport {
/// The scope that was executed.
pub scope: ResetScope,
/// Filesystem paths that existed and were removed by this run. Paths
/// that were already missing are not listed. Always empty for
/// `OrphansOnly`.
pub removed_paths: Vec<PathBuf>,
/// Row count reported by the `embedding_records` truncation. Only
/// populated when `scope == VectorOnly`; zero for every other scope.
pub embedding_rows_truncated: u64,
/// Number of stored docs purged because they are outside the current
/// walker scope. Non-zero only when `scope == OrphansOnly`.
/// `#[serde(default)]` preserves back-compat with older callers that
/// do not include this field.
#[serde(default)]
pub orphans_purged: u32,
/// Paths of the orphaned docs that were purged. Sorted for deterministic
/// output. Non-empty only when `scope == OrphansOnly`.
#[serde(default)]
pub purged_paths: Vec<WorkspacePath>,
}
/// Resolve the on-disk locations that `scope` would wipe under `cfg`.
///
/// This only computes paths: nothing is checked for existence or removed
/// here, which is what lets the confirm UI preview exactly what `execute`
/// will later act on.
///
/// The order is fixed per scope so repeated calls render identically:
/// - `All`: config dir, data dir, cache dir, state dir.
/// - `DataOnly`: data dir, cache dir, state dir.
/// - `VectorOnly`: the single resolved `storage.vector_dir`.
/// - `ConfigOnly`: the config dir.
/// - `OrphansOnly`: empty — that scope works at the store level only.
pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
    // The config *directory* is the parent of the config file path; fall
    // back to an empty path when the file path has no parent.
    let config_home = Config::xdg_config_path()
        .parent()
        .map_or_else(PathBuf::new, PathBuf::from);
    let data_home = Config::xdg_data_dir();
    let cache_home = Config::xdg_cache_dir();
    let state_home = Config::xdg_state_dir();

    match scope {
        // Store-level reconciliation: no directories to list, so size
        // estimation stays at zero and the directory confirm path is skipped.
        ResetScope::OrphansOnly => Vec::new(),
        ResetScope::ConfigOnly => vec![config_home],
        ResetScope::VectorOnly => {
            // `vector_dir` is expanded against the data dir.
            let data_home_text = data_home.to_string_lossy();
            vec![expand_path(&cfg.storage.vector_dir, &data_home_text)]
        }
        ResetScope::DataOnly => vec![data_home, cache_home, state_home],
        ResetScope::All => vec![config_home, data_home, cache_home, state_home],
    }
}
/// Best-effort total size, in bytes, of the regular files found under
/// each directory in `paths`.
///
/// This feeds the confirm UI, not accounting, so every I/O failure is
/// absorbed rather than reported:
/// - a root or sub-directory that cannot be listed (missing, unreadable,
///   or not a directory) contributes 0;
/// - an entry whose type cannot be determined is skipped;
/// - a file whose metadata cannot be read counts as 0 bytes.
///
/// Symlink entries inside a tree are neither followed nor counted,
/// because the entry type is inspected without traversing links.
pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
    // Explicit work list instead of recursion: every directory still to be
    // listed sits here until it is popped.
    let mut pending: Vec<PathBuf> = paths.to_vec();
    let mut total_bytes = 0u64;

    while let Some(dir) = pending.pop() {
        if let Ok(listing) = std::fs::read_dir(&dir) {
            // `flatten` drops entries that failed to read.
            for entry in listing.flatten() {
                match entry.file_type() {
                    Ok(kind) if kind.is_dir() => pending.push(entry.path()),
                    Ok(kind) if kind.is_file() => {
                        total_bytes += entry.metadata().map_or(0, |meta| meta.len());
                    }
                    // Symlinks, special files, and unreadable types.
                    _ => {}
                }
            }
        }
    }

    total_bytes
}
/// Compute the workspace paths stored in SQLite that are NOT visited by
/// the current walker scope (i.e. they are "orphans" — on disk but
/// outside the configured include/exclude rules, or from a sub-directory
/// that has since been removed from the workspace).
///
/// Does NOT check filesystem existence — `OrphansOnly` is the explicit
/// "I know what I'm doing" variant; callers that want the conservative
/// fs-aware sweep should use `sweep_deleted_files` inside ingest.
///
/// Returns the list sorted for deterministic output. Called twice by the
/// CLI path (once for the confirm UI preview, once inside `execute`);
/// the double scan is acceptable for a rare destructive operation.
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
use kebab_core::DocumentStore as _;
use kebab_core::SourceScope;
use kebab_source_fs::FsSourceConnector;
let store = kebab_store_sqlite::SqliteStore::open(cfg)
.context("enumerate_orphans: open SqliteStore")?;
let stored = store
.all_workspace_paths()
.context("enumerate_orphans: all_workspace_paths")?;
if stored.is_empty() {
return Ok(Vec::new());
}
// Build the same SourceScope the CLI's ingest path uses: root from
// config, exclude list from config, no include override (full scope).
let root = cfg.resolve_workspace_root();
let scope = SourceScope {
root: root.clone(),
exclude: cfg.workspace.exclude.clone(),
..Default::default()
};
let connector =
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
let (assets, _skips) = connector
.scan_with_skips(&scope)
.context("enumerate_orphans: scan workspace")?;
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
let mut orphans: Vec<WorkspacePath> = stored
.into_iter()
.filter(|p| !scanned.contains(p))
.collect();
orphans.sort_by(|a, b| a.0.cmp(&b.0));
Ok(orphans)
}
/// Wipe every path from `enumerate_paths(scope, cfg)`. For
/// `ResetScope::VectorOnly`, also truncates the SQLite
/// `embedding_records` table so the store doesn't point at the Lance
/// rows we just removed off-disk.
///
/// For `ResetScope::OrphansOnly`, no filesystem directories are removed.
/// Instead the store is reconciled: stored docs outside the current walker
/// scope are purged from SQLite (+ vector store when configured). The
/// caller is expected to have already shown the confirm UI using
/// `enumerate_orphans`.
///
/// Idempotent: a missing path is treated as already-removed (success).
/// Returns a `ResetReport` listing exactly what was removed (paths that
/// existed before the call) so `--json` callers see the truth, not the
/// request.
pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
if matches!(scope, ResetScope::OrphansOnly) {
return execute_orphans_only(cfg);
}
let paths = enumerate_paths(scope, cfg);
let mut removed = Vec::new();
for p in &paths {
if !p.exists() {
continue;
}
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
removed.push(p.clone());
}
let embedding_rows_truncated = if matches!(scope, ResetScope::VectorOnly) {
truncate_embeddings(cfg)?
} else {
0
};
Ok(ResetReport {
scope,
removed_paths: removed,
embedding_rows_truncated,
orphans_purged: 0,
purged_paths: Vec::new(),
})
}
/// Run the `OrphansOnly` scope: purge stored docs that fall outside the
/// current walker scope, leaving every filesystem directory untouched.
///
/// Each orphan is purged from SQLite first; the chunk ids returned by that
/// purge are then deleted from the vector store when one is open. A failed
/// vector delete is logged as a warning and does not abort the run, since
/// the SQLite side is already cleaned at that point.
///
/// # Errors
/// Fails if orphan enumeration fails, if the SQLite store cannot be
/// opened, or if purging any orphan from SQLite fails.
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
    // Start from the "nothing purged" report and fill it in as we go.
    let mut report = ResetReport {
        scope: ResetScope::OrphansOnly,
        removed_paths: Vec::new(),
        embedding_rows_truncated: 0,
        orphans_purged: 0,
        purged_paths: Vec::new(),
    };

    let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
    if orphans.is_empty() {
        return Ok(report);
    }

    let store = std::sync::Arc::new(
        kebab_store_sqlite::SqliteStore::open(cfg)
            .context("execute_orphans_only: open SqliteStore")?,
    );
    // `None` for lexical-only configs or when the vector store can't open.
    let vector_store = open_vector_store_if_configured(cfg, std::sync::Arc::clone(&store))?;

    for orphan in orphans {
        let chunk_ids = kebab_store_sqlite::purge_deleted_workspace_path(&store, &orphan)
            .with_context(|| format!("execute_orphans_only: purge {}", orphan.0))?;

        match vector_store.as_ref() {
            Some(vectors) if !chunk_ids.is_empty() => {
                use kebab_core::VectorStore as _;
                if let Err(e) = vectors.delete_by_chunk_ids(&chunk_ids) {
                    tracing::warn!(
                        target: "kebab-app",
                        path = %orphan.0,
                        count = chunk_ids.len(),
                        error = %e,
                        "reset --orphans-only: vector delete failed; SQLite side already cleaned"
                    );
                }
            }
            // No vector store, or no chunks to delete for this document.
            _ => {}
        }

        tracing::info!(
            target: "kebab-app",
            path = %orphan.0,
            "reset --orphans-only: purged orphan document"
        );
        report.purged_paths.push(orphan);
    }

    // Saturate rather than fail if the count does not fit in a u32.
    report.orphans_purged = u32::try_from(report.purged_paths.len()).unwrap_or(u32::MAX);
    Ok(report)
}
/// Try to open the Lance vector store for configs that use embeddings.
///
/// Returns `Ok(None)` when the embedding provider is `"none"` or the
/// configured dimension count is zero (lexical-only configs). Also
/// returns `Ok(None)` — after logging a warning — when the store fails to
/// open, so the caller simply skips vector deletes. This function never
/// returns `Err` itself. Intended to mirror the guard in `App::vector`
/// (not visible from this file).
fn open_vector_store_if_configured(
    cfg: &Config,
    store: std::sync::Arc<kebab_store_sqlite::SqliteStore>,
) -> Result<Option<kebab_store_vector::LanceVectorStore>> {
    let embedding = &cfg.models.embedding;
    let embeddings_enabled = embedding.provider != "none" && embedding.dimensions != 0;
    if !embeddings_enabled {
        return Ok(None);
    }

    let opened = kebab_store_vector::LanceVectorStore::new(cfg, store);
    if let Err(e) = &opened {
        tracing::warn!(
            target: "kebab-app",
            error = %e,
            "reset --orphans-only: could not open vector store; skipping vector delete"
        );
    }
    // An open failure was already logged above; downgrade it to "no store".
    Ok(opened.ok())
}
/// Clear the `embedding_records` table and return the count reported by
/// `truncate_embedding_records`.
///
/// If `kebab.sqlite` is not present under the configured data dir (e.g.
/// the user has never ingested), there is nothing to truncate: returns
/// `Ok(0)` without opening the store.
///
/// # Errors
/// Fails if the store cannot be opened or the truncation itself fails.
fn truncate_embeddings(cfg: &Config) -> Result<u64> {
    let db_file = expand_path(&cfg.storage.data_dir, "").join("kebab.sqlite");
    if db_file.exists() {
        let store = kebab_store_sqlite::SqliteStore::open(cfg)
            .context("open SqliteStore for truncate_embedding_records")?;
        store.truncate_embedding_records()
    } else {
        Ok(0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config with `storage.vector_dir` replaced by `s`.
    fn cfg_with_vector_dir(s: &str) -> Config {
        let mut cfg = Config::defaults();
        cfg.storage.vector_dir = String::from(s);
        cfg
    }

    /// `DataOnly` must never list the config directory.
    #[test]
    fn enumerate_data_only_excludes_config_dir() {
        let cfg = Config::defaults();
        let listed = enumerate_paths(ResetScope::DataOnly, &cfg);
        let config_dir = Config::xdg_config_path()
            .parent()
            .map(PathBuf::from)
            .unwrap_or_default();
        assert!(!listed.contains(&config_dir));
    }

    /// `VectorOnly` yields exactly one path: the expanded vector dir.
    #[test]
    fn enumerate_vector_only_returns_resolved_vector_dir() {
        let cfg = cfg_with_vector_dir("{data_dir}/lancedb");
        let listed = enumerate_paths(ResetScope::VectorOnly, &cfg);
        assert_eq!(listed.len(), 1);
        let s = listed[0].to_string_lossy().into_owned();
        assert!(s.ends_with("/lancedb"), "got: {s}");
    }

    /// `All` lists four paths and none of them repeat.
    #[test]
    fn enumerate_all_has_four_distinct_paths() {
        let cfg = Config::defaults();
        let listed = enumerate_paths(ResetScope::All, &cfg);
        assert_eq!(listed.len(), 4);
        let mut distinct = listed.clone();
        distinct.sort();
        distinct.dedup();
        assert_eq!(distinct.len(), 4);
    }

    /// A directory that does not exist contributes zero bytes.
    #[test]
    fn estimate_size_returns_zero_on_missing_dir() {
        let missing = [PathBuf::from("/nonexistent/xyz")];
        assert_eq!(estimate_size_bytes(&missing), 0);
    }

    /// File sizes are summed across the root and a nested directory.
    #[test]
    fn estimate_size_sums_file_lengths() {
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path();
        std::fs::write(root.join("a"), b"hello").unwrap();
        std::fs::create_dir(root.join("nested")).unwrap();
        std::fs::write(root.join("nested/b"), b"world!").unwrap();
        let total = estimate_size_bytes(&[root.to_path_buf()]);
        assert_eq!(total, 5 + 6);
    }

    /// `OrphansOnly` is store-level only, so it lists no directories.
    #[test]
    fn enumerate_orphans_only_returns_empty_paths() {
        let cfg = Config::defaults();
        let listed = enumerate_paths(ResetScope::OrphansOnly, &cfg);
        assert!(
            listed.is_empty(),
            "OrphansOnly must return empty vec from enumerate_paths"
        );
    }
}