diff --git a/HANDOFF.md b/HANDOFF.md index a21f566..82443ca 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -40,6 +40,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. - **P9-2 jump_to_citation workspace_root** — spec literal 의 `jump_to_citation(citation, editor_env)` 가 workspace_root 인자 누락. citation.path 가 workspace 상대라 editor 호출 시 절대 경로 필요 → `workspace_root: &Path` 인자 추가. 동일하게 `render_search` generic 도 P9-1 과 같은 사유로 제거. - **P9-3 e/j/k 키 의 \"input empty\" 분기** — spec 의 `e=toggle explain` / `j=k=scroll` 이 typing 과 충돌 (\"explain\" / \"javascript\" 같은 단어 입력 깨짐). input 이 비어 있을 때만 command 키로 동작 — vim \"command vs insert\" 컨벤션 변형. 사용자가 텍스트 입력 시 모든 알파벳 정상 통과. - **P9-4 enter_inspect helper + Search `i` 키** — spec 의 진입 경로 (Library Enter → Doc inspect, Search `i` → Chunk inspect) 를 한 helper 로 묶음. `InspectTarget` enum (`Doc(DocumentId) | Chunk(ChunkId)`), `return_to: Pane` 가 Esc 시 원래 pane 으로 복귀. `c` 키가 모든 section (metadata / provenance / blocks / spans / text / embeddings) 일괄 collapse/expand — spec 의 \"focus 기반 selective collapse\" 는 v1 단순화. +- **2026-05-02 P9 도그푸딩 후속 (p9-fb-06)** — `kebab reset --all|--data-only|--vector-only|--config-only [--yes]` 추가. TTY 가 아니면 `--yes` 필수 (silent destruction 금지). `--vector-only` 가 SQLite `embedding_records` 도 함께 truncate (off-disk Lance dir 만 wipe 시 orphan 방지). 도그푸딩 막힘 강도 1위 (수동 4 경로 `rm -rf` 부담) 해소. spec: `tasks/p9/p9-fb-06-data-reset-command.md`, plan: `docs/superpowers/plans/2026-05-02-p9-fb-06-reset-command.md`. ## 다음 task 후보 diff --git a/README.md b/README.md index 3712a12..9dfc14f 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin ke 업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`. -제거는 `cargo uninstall kebab-cli`. 
이 명령은 binary 만 지우고 워크스페이스 데이터 (`~/.local/share/kebab/`, `~/.config/kebab/`) 는 그대로 남는다 — 데이터까지 정리하려면 `rm -rf ~/.local/share/kebab ~/.config/kebab ~/.cache/kebab ~/.local/state/kebab`. +제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed) 등. ## Quick start @@ -77,6 +77,7 @@ kebab doctor | `kebab ask ""` | RAG 답변 + 근거 인용. 근거 부족 시 거절. Ollama 필요 | | `kebab doctor` | 설정/모델/DB 헬스 체크 | | `kebab tui` | Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중) | +| `kebab reset [--all / --data-only / --vector-only / --config-only] [--yes]` | XDG 데이터 wipe. **Irreversible.** TTY 면 confirm prompt, 아니면 `--yes` 필수. `--vector-only` 는 SQLite `embedding_records` 도 함께 truncate (orphan 방지) | | `kebab eval run / compare` | golden query 회귀 측정 | 모든 명령에 `--json` 플래그. 출력은 frozen wire schema v1 (`schema_version` 항상 포함, 예: `ingest_report.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`). diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 78c8a81..b7877a8 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -57,8 +57,10 @@ use kebab_source_fs::FsSourceConnector; mod app; pub mod doctor_signal; pub mod logging; +pub mod reset; pub use app::App; +pub use reset::{ResetReport, ResetScope}; /// Parser-version label persisted in `documents.parser_version` for /// every Markdown file ingested through the `kb-parse-md` pipeline. diff --git a/crates/kebab-app/src/reset.rs b/crates/kebab-app/src/reset.rs new file mode 100644 index 0000000..5247fed --- /dev/null +++ b/crates/kebab-app/src/reset.rs @@ -0,0 +1,203 @@ +//! `kebab reset` core — scope-driven path enumeration + wipe. +//! +//! The CLI (and any future TUI surface) calls `enumerate_paths(scope, &cfg)` +//! 
to compute exactly which on-disk paths the user has asked to remove,
+//! presents that list for confirmation, then calls `execute(scope, &cfg)`
+//! to actually remove them. Splitting the read step (enumerate) from the
+//! write step (execute) is what lets the confirm UI show a faithful
+//! preview without having to re-derive the path set.
+//!
+//! `--vector-only` additionally truncates `embedding_records` in SQLite
+//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
+
+use std::path::PathBuf;
+
+use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+
+use kebab_config::{Config, expand_path};
+
+/// What the user asked to remove. Mutually exclusive — picked by the CLI
+/// from a clap `ArgGroup`. Serialized in snake_case (`all`, `data_only`,
+/// `vector_only`, `config_only`).
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ResetScope {
+    /// Wipe config + data + cache + state (all four XDG dirs).
+    All,
+    /// Wipe data + cache + state. Config is preserved so the next run
+    /// behaves the same. The CLI resolves to this scope for an explicit
+    /// `--data-only` and also when no scope flag is passed at all.
+    DataOnly,
+    /// Wipe only the Lance vector_dir off-disk AND truncate the matching
+    /// `embedding_records` rows in SQLite. Documents / chunks survive.
+    VectorOnly,
+    /// Wipe only the config dir.
+    ConfigOnly,
+}
+
+/// Result of a successful wipe — emitted as `reset_report.v1` by the
+/// CLI's `--json` mode and used by the human-mode summary line.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ResetReport {
+    /// Scope that was executed.
+    pub scope: ResetScope,
+    /// Paths that were actually removed. Paths that were already missing
+    /// when the wipe ran are omitted.
+    pub removed_paths: Vec<PathBuf>,
+    /// Rows deleted from `embedding_records`; 0 for every scope other
+    /// than `VectorOnly`.
+    pub embedding_rows_truncated: u64,
+}
+
+/// Compute the absolute on-disk paths a given scope will wipe, given a
+/// loaded `Config`. Pure — does NOT touch the filesystem.
+///
+/// `--all` returns all four XDG paths in a stable order (config, data,
+/// cache, state). `--vector-only` returns the resolved `storage.vector_dir`.
+/// Order is preserved across calls so the confirm UI is deterministic.
+pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
+    // The config dir is the parent of the config file path. If that path
+    // has no parent this degrades to an empty `PathBuf` instead of failing.
+    let cfg_dir = Config::xdg_config_path()
+        .parent()
+        .map(PathBuf::from)
+        .unwrap_or_default();
+    let data_dir = Config::xdg_data_dir();
+    let cache_dir = Config::xdg_cache_dir();
+    let state_dir = Config::xdg_state_dir();
+
+    match scope {
+        ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
+        ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
+        ResetScope::VectorOnly => {
+            // NOTE(review): `vector_dir` is expanded against the XDG data
+            // dir here, while `truncate_embeddings` resolves
+            // `cfg.storage.data_dir` — confirm both point at the same
+            // store when `storage.data_dir` is overridden.
+            let vector_dir =
+                expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
+            vec![vector_dir]
+        }
+        ResetScope::ConfigOnly => vec![cfg_dir],
+    }
+}
+
+/// Best-effort byte size of the given directory trees, for the confirm UI
+/// only (not accounting). A directory that cannot be read contributes 0
+/// instead of an error, so a half-broken cache still gets summed.
+///
+/// Symlinks are not followed: `DirEntry::file_type` reports the link
+/// itself, which is neither a dir nor a regular file here, so linked
+/// trees and broken links are skipped. A path in `paths` that is not a
+/// readable directory (missing, or a plain file) also counts as 0.
+pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
+    // Recursive sum of regular-file lengths below `p`.
+    fn walk(p: &std::path::Path) -> u64 {
+        let mut total = 0u64;
+        let entries = match std::fs::read_dir(p) {
+            Ok(it) => it,
+            Err(_) => return 0,
+        };
+        // `flatten()` drops entries whose directory read failed.
+        for e in entries.flatten() {
+            let ft = match e.file_type() {
+                Ok(t) => t,
+                Err(_) => continue,
+            };
+            if ft.is_dir() {
+                total += walk(&e.path());
+            } else if ft.is_file() {
+                total += e.metadata().map(|m| m.len()).unwrap_or(0);
+            }
+        }
+        total
+    }
+    paths.iter().map(|p| walk(p)).sum()
+}
+
+/// Wipe every path from `enumerate_paths(scope, cfg)`. For
+/// `ResetScope::VectorOnly`, also truncates the SQLite
+/// `embedding_records` table so the store doesn't point at the Lance
+/// rows we just removed off-disk.
+///
+/// Idempotent: a missing path is treated as already-removed (success).
+/// Returns a `ResetReport` listing exactly what was removed (paths that
+/// existed before the call) so `--json` callers see the truth, not the
+/// request.
+pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
+    let paths = enumerate_paths(scope, cfg);
+    let mut removed = Vec::new();
+
+    for p in &paths {
+        // `enumerate_paths` can fall back to an empty path for the config
+        // dir; never hand that to `remove_dir_all`.
+        if p.as_os_str().is_empty() {
+            continue;
+        }
+        // Attempt the removal directly instead of pre-checking `exists()`:
+        // `exists()` also answers `false` on metadata errors (e.g.
+        // permission denied), which would silently skip a directory that
+        // is still on disk and report the wipe as successful.
+        match std::fs::remove_dir_all(p) {
+            Ok(()) => removed.push(p.clone()),
+            // Already gone — idempotent success, left out of the report.
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => {
+                return Err(e).with_context(|| format!("remove {}", p.display()));
+            }
+        }
+    }
+
+    // Only the vector-only scope keeps the SQLite file around, so it is
+    // the only one that has rows to truncate.
+    let embedding_rows_truncated = if matches!(scope, ResetScope::VectorOnly) {
+        truncate_embeddings(cfg)?
+    } else {
+        0
+    };
+
+    Ok(ResetReport {
+        scope,
+        removed_paths: removed,
+        embedding_rows_truncated,
+    })
+}
+
+/// Open the SQLite store at the configured path and run
+/// `truncate_embedding_records`. Returns the count of truncated rows
+/// (the helper itself reports `DELETE` rowcount). If the SQLite file
+/// does not exist (e.g. user has never ingested), returns 0 — not an
+/// error.
+fn truncate_embeddings(cfg: &Config) -> Result<u64> {
+    let data_dir = expand_path(&cfg.storage.data_dir, "");
+    let sqlite_path = data_dir.join("kebab.sqlite");
+    if !sqlite_path.exists() {
+        return Ok(0);
+    }
+    let store = kebab_store_sqlite::SqliteStore::open(cfg)
+        .context("open SqliteStore for truncate_embedding_records")?;
+    store.truncate_embedding_records()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn cfg_with_vector_dir(s: &str) -> Config {
+        let mut c = Config::defaults();
+        c.storage.vector_dir = s.to_string();
+        c
+    }
+
+    #[test]
+    fn enumerate_data_only_excludes_config_dir() {
+        let cfg = Config::defaults();
+        let paths = enumerate_paths(ResetScope::DataOnly, &cfg);
+        let cfg_dir = Config::xdg_config_path()
+            .parent()
+            .map(PathBuf::from)
+            .unwrap_or_default();
+        assert!(!paths.contains(&cfg_dir));
+    }
+
+    #[test]
+    fn enumerate_vector_only_returns_resolved_vector_dir() {
+        let cfg = cfg_with_vector_dir("{data_dir}/lancedb");
+        let paths = enumerate_paths(ResetScope::VectorOnly, &cfg);
+        assert_eq!(paths.len(), 1);
+        let s = paths[0].to_string_lossy().into_owned();
+        assert!(s.ends_with("/lancedb"), "got: {s}");
+    }
+
+    #[test]
+    fn
enumerate_all_has_four_distinct_paths() { + let cfg = Config::defaults(); + let paths = enumerate_paths(ResetScope::All, &cfg); + assert_eq!(paths.len(), 4); + let unique: std::collections::HashSet<_> = paths.iter().collect(); + assert_eq!(unique.len(), 4); + } + + #[test] + fn estimate_size_returns_zero_on_missing_dir() { + assert_eq!(estimate_size_bytes(&[PathBuf::from("/nonexistent/xyz")]), 0); + } + + #[test] + fn estimate_size_sums_file_lengths() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("a"), b"hello").unwrap(); + std::fs::create_dir(dir.path().join("nested")).unwrap(); + std::fs::write(dir.path().join("nested/b"), b"world!").unwrap(); + let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]); + assert_eq!(bytes, 5 + 6); + } +} diff --git a/crates/kebab-cli/Cargo.toml b/crates/kebab-cli/Cargo.toml index e7eed98..9818589 100644 --- a/crates/kebab-cli/Cargo.toml +++ b/crates/kebab-cli/Cargo.toml @@ -30,3 +30,6 @@ kebab-tui = { path = "../kebab-tui" } anyhow = { workspace = true } serde_json = { workspace = true } clap = { version = "4", features = ["derive"] } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 76e0e02..4c01037 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -99,6 +99,35 @@ enum Cmd { seed: Option, }, + /// Wipe XDG data dirs (and optionally the Lance vector store) so the + /// workspace can be re-initialised. **Irreversible.** Without + /// `--yes`, prompts on TTY; aborts in non-interactive contexts. + Reset { + /// Wipe config + data + cache + state. Implies losing + /// `config.toml` — re-run `kebab init` afterwards. + #[arg(long, group = "reset_scope")] + all: bool, + + /// Default. Wipe data + cache + state. Config is preserved. + #[arg(long, group = "reset_scope")] + data_only: bool, + + /// Wipe only the Lance vector store + truncate + /// `embedding_records`. 
SQLite documents / chunks survive so the + /// next `kebab ingest` re-embeds without re-parsing. + #[arg(long, group = "reset_scope")] + vector_only: bool, + + /// Wipe only the config dir. + #[arg(long, group = "reset_scope")] + config_only: bool, + + /// Skip the interactive confirm. Required in non-interactive + /// contexts (CI, pipes). + #[arg(long)] + yes: bool, + }, + /// Health check. Doctor, @@ -380,6 +409,64 @@ fn run(cli: &Cli) -> anyhow::Result<()> { Ok(()) } + Cmd::Reset { + all, + data_only: _, + vector_only, + config_only, + yes, + } => { + use kebab_app::ResetScope; + // `--data-only` explicit OR no scope flag at all → DataOnly. + // The `data_only: _` binding above is intentional — clap's + // `group = "reset_scope"` already enforces mutual exclusion, + // so the flag's presence does not change the resolved scope. + let scope = if *all { + ResetScope::All + } else if *vector_only { + ResetScope::VectorOnly + } else if *config_only { + ResetScope::ConfigOnly + } else { + ResetScope::DataOnly + }; + + let cfg = kebab_config::Config::load(cli.config.as_deref())?; + let paths = kebab_app::reset::enumerate_paths(scope, &cfg); + let bytes = kebab_app::reset::estimate_size_bytes(&paths); + + if !*yes { + use std::io::IsTerminal; + if !std::io::stdin().is_terminal() { + anyhow::bail!( + "reset is destructive and stdin is non-interactive — pass --yes to proceed" + ); + } + if !confirm_destructive(scope, &paths, bytes)? 
{ + eprintln!("aborted."); + return Ok(()); + } + } + + let report = kebab_app::reset::execute(scope, &cfg)?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_reset(&report))?); + } else { + println!( + "removed {} path(s); embedding_rows_truncated={}", + report.removed_paths.len(), + report.embedding_rows_truncated + ); + for p in &report.removed_paths { + println!(" - {}", p.display()); + } + if matches!(scope, ResetScope::All | ResetScope::ConfigOnly) { + println!("hint: run `kebab init` to recreate config.toml"); + } + } + Ok(()) + } + Cmd::Doctor => { let report = kebab_app::doctor_with_config_path(cli.config.as_deref())?; if cli.json { @@ -496,3 +583,29 @@ fn run(cli: &Cli) -> anyhow::Result<()> { } } +/// Minimal stdin/stdout confirm prompt for destructive ops. No new dep — +/// uses stdlib `IsTerminal` (the caller is expected to have already +/// short-circuited the non-TTY case). Returns `Ok(true)` only when the +/// user types `y` / `Y` / `yes`. Empty input or anything else → `false` +/// (safe default). +fn confirm_destructive( + scope: kebab_app::ResetScope, + paths: &[std::path::PathBuf], + bytes: u64, +) -> anyhow::Result { + use std::io::Write; + let mut out = std::io::stderr().lock(); + writeln!(out, "kebab reset ({:?}): about to remove", scope)?; + for p in paths { + writeln!(out, " - {}", p.display())?; + } + writeln!(out, "estimated total: {} bytes", bytes)?; + write!(out, "Proceed? [y/N] ")?; + out.flush()?; + + let mut line = String::new(); + std::io::stdin().read_line(&mut line)?; + let s = line.trim().to_ascii_lowercase(); + Ok(matches!(s.as_str(), "y" | "yes")) +} + diff --git a/crates/kebab-cli/src/wire.rs b/crates/kebab-cli/src/wire.rs index fd1493a..b6ca688 100644 --- a/crates/kebab-cli/src/wire.rs +++ b/crates/kebab-cli/src/wire.rs @@ -108,6 +108,12 @@ pub fn wire_doctor(d: &DoctorReport) -> Value { tag_object(v, "doctor.v1") } +/// Wrap a [`kebab_app::ResetReport`] as `reset_report.v1`. 
+pub fn wire_reset(r: &kebab_app::ResetReport) -> Value {
+    let v = serde_json::to_value(r).expect("ResetReport serializes");
+    tag_object(v, "reset_report.v1")
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -172,4 +178,23 @@ mod tests {
         let tagged = tag_object(v, "x.v1");
         assert_eq!(schema_of(&tagged), Some("x.v1"));
     }
+
+    #[test]
+    fn reset_wrapper_tags_schema_version_and_serializes_scope() {
+        let r = kebab_app::ResetReport {
+            scope: kebab_app::ResetScope::DataOnly,
+            removed_paths: vec![std::path::PathBuf::from("/tmp/x")],
+            embedding_rows_truncated: 0,
+        };
+        let v = wire_reset(&r);
+        assert_eq!(schema_of(&v), Some("reset_report.v1"));
+        assert_eq!(v.get("scope").and_then(Value::as_str), Some("data_only"));
+        assert_eq!(
+            v.get("embedding_rows_truncated").and_then(Value::as_u64),
+            Some(0)
+        );
+        let paths = v.get("removed_paths").and_then(Value::as_array).unwrap();
+        assert_eq!(paths.len(), 1);
+        assert_eq!(paths[0].as_str(), Some("/tmp/x"));
+    }
 }
diff --git a/crates/kebab-cli/tests/reset_cli.rs b/crates/kebab-cli/tests/reset_cli.rs
new file mode 100644
index 0000000..d45ff8f
--- /dev/null
+++ b/crates/kebab-cli/tests/reset_cli.rs
@@ -0,0 +1,151 @@
+//! Integration coverage for `kebab reset` — exercises the binary
+//! end-to-end against a tempdir-rooted XDG layout. Each test runs the
+//! built `kebab` bin in a fresh subprocess so the per-process XDG env
+//! overrides don't bleed into sibling tests.
+
+use std::process::Command;
+
+fn kebab_bin() -> std::path::PathBuf {
+    // Cargo sets `CARGO_BIN_EXE_<name>` while compiling integration tests
+    // and builds the bin target before they run. Unlike a hand-built
+    // `target/debug/kebab` path, this also holds with `--release`, a
+    // custom `CARGO_TARGET_DIR`, `--target <triple>`, or an `.exe` suffix.
+    std::path::PathBuf::from(env!("CARGO_BIN_EXE_kebab"))
+}
+
+#[test]
+fn reset_data_only_yes_removes_data_dir_and_keeps_config() {
+    let tmp = tempfile::tempdir().unwrap();
+    let xdg_cfg = tmp.path().join("cfg");
+    let xdg_data = tmp.path().join("data");
+    let xdg_cache = tmp.path().join("cache");
+    let xdg_state = tmp.path().join("state");
+    std::fs::create_dir_all(xdg_cfg.join("kebab")).unwrap();
+    std::fs::create_dir_all(xdg_data.join("kebab")).unwrap();
+    std::fs::create_dir_all(xdg_cache.join("kebab")).unwrap();
+    std::fs::create_dir_all(xdg_state.join("kebab")).unwrap();
+    // No `config.toml` written — Config::load(None) falls back to
+    // defaults when the file is absent (see kebab-config). The marker
+    // file under cfg/kebab/ is what we assert survives.
+    std::fs::write(xdg_cfg.join("kebab/marker"), b"cfg").unwrap();
+    std::fs::write(xdg_data.join("kebab/marker"), b"data").unwrap();
+
+    let out = Command::new(kebab_bin())
+        .args(["reset", "--data-only", "--yes"])
+        .env("XDG_CONFIG_HOME", &xdg_cfg)
+        .env("XDG_DATA_HOME", &xdg_data)
+        .env("XDG_CACHE_HOME", &xdg_cache)
+        .env("XDG_STATE_HOME", &xdg_state)
+        .output()
+        .unwrap();
+    assert!(
+        out.status.success(),
+        "stderr: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+
+    assert!(!xdg_data.join("kebab").exists(), "data dir should be gone");
+    assert!(!xdg_cache.join("kebab").exists(), "cache dir should be gone");
+    assert!(!xdg_state.join("kebab").exists(), "state dir should be gone");
+    assert!(xdg_cfg.join("kebab/marker").exists(), "config dir preserved");
+}
+
+#[test]
+fn reset_no_yes_in_non_tty_aborts_with_exit_2() {
+    let tmp = tempfile::tempdir().unwrap();
+    let xdg_data = tmp.path().join("data");
+    std::fs::create_dir_all(xdg_data.join("kebab")).unwrap();
+    std::fs::write(xdg_data.join("kebab/marker"), b"d").unwrap();
+
+    let out = Command::new(kebab_bin())
+        .args(["reset",
"--data-only"]) + .env("XDG_CONFIG_HOME", tmp.path().join("cfg")) + .env("XDG_DATA_HOME", &xdg_data) + .env("XDG_CACHE_HOME", tmp.path().join("cache")) + .env("XDG_STATE_HOME", tmp.path().join("state")) + .output() + .unwrap(); + + // Non-TTY (Command::output gives no tty) without --yes must abort. + assert!(!out.status.success(), "expected abort, got success"); + let code = out.status.code().unwrap_or(-1); + assert_eq!(code, 2, "expected exit 2 (generic error), got {code}"); + assert!( + xdg_data.join("kebab").exists(), + "data dir must survive an aborted reset" + ); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stderr.contains("non-interactive") || stderr.contains("--yes"), + "expected refusal hint in stderr, got: {stderr}" + ); +} + +#[test] +fn reset_data_only_yes_json_emits_reset_report_v1() { + let tmp = tempfile::tempdir().unwrap(); + let xdg_data = tmp.path().join("data"); + std::fs::create_dir_all(xdg_data.join("kebab")).unwrap(); + std::fs::write(xdg_data.join("kebab/marker"), b"d").unwrap(); + + let out = Command::new(kebab_bin()) + .args(["--json", "reset", "--data-only", "--yes"]) + .env("XDG_CONFIG_HOME", tmp.path().join("cfg")) + .env("XDG_DATA_HOME", &xdg_data) + .env("XDG_CACHE_HOME", tmp.path().join("cache")) + .env("XDG_STATE_HOME", tmp.path().join("state")) + .output() + .unwrap(); + assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr)); + + let v: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); + assert_eq!( + v.get("schema_version").and_then(|s| s.as_str()), + Some("reset_report.v1") + ); + assert_eq!(v.get("scope").and_then(|s| s.as_str()), Some("data_only")); + // The data dir was created beforehand and must show up in the + // report. The cache dir was NOT created, so it must be omitted — + // proving idempotency ("missing path is treated as already + // removed"). 
The state dir may or may not appear: kebab-app's + // logging init creates the state dir as a side-effect, which is + // tolerated. We assert the strict invariant (data in, cache out) + // and let the state dir be either way. + let paths: Vec = v + .get("removed_paths") + .and_then(|a| a.as_array()) + .expect("removed_paths must be a JSON array") + .iter() + .filter_map(|s| s.as_str().map(str::to_owned)) + .collect(); + assert!( + paths.iter().any(|p| p.contains("/data/kebab")), + "data dir must be reported as removed, got: {paths:?}" + ); + assert!( + !paths.iter().any(|p| p.contains("/cache/kebab")), + "cache dir was never created and must be omitted, got: {paths:?}" + ); +} + +#[test] +fn reset_mutually_exclusive_scope_flags_rejected() { + // clap's `group = "reset_scope"` should reject --all and + // --data-only together. The bin must exit nonzero with a clap + // usage error before touching any path. + let tmp = tempfile::tempdir().unwrap(); + let out = Command::new(kebab_bin()) + .args(["reset", "--all", "--data-only", "--yes"]) + .env("XDG_CONFIG_HOME", tmp.path().join("cfg")) + .env("XDG_DATA_HOME", tmp.path().join("data")) + .env("XDG_CACHE_HOME", tmp.path().join("cache")) + .env("XDG_STATE_HOME", tmp.path().join("state")) + .output() + .unwrap(); + assert!(!out.status.success()); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stderr.contains("cannot be used") || stderr.contains("conflicts"), + "expected clap conflict error, got: {stderr}" + ); +} diff --git a/crates/kebab-store-sqlite/src/embeddings.rs b/crates/kebab-store-sqlite/src/embeddings.rs index 0ef76a2..40b19d0 100644 --- a/crates/kebab-store-sqlite/src/embeddings.rs +++ b/crates/kebab-store-sqlite/src/embeddings.rs @@ -132,6 +132,22 @@ impl SqliteStore { tx.commit().map_err(StoreError::from)?; Ok(()) } + + /// Wipe every row from `embedding_records`, returning the count of + /// rows that were removed. 
Called by `kebab reset --vector-only` so + /// SQLite cannot point at a Lance row that the reset just removed + /// off-disk. + /// + /// The function does NOT cascade to `chunks` or `documents` — those + /// are kept so the next `kebab ingest` re-embeds the existing chunk + /// set without re-parsing. + pub fn truncate_embedding_records(&self) -> Result { + let conn = self.lock_conn(); + let n = conn + .execute("DELETE FROM embedding_records", []) + .context("DELETE FROM embedding_records")?; + Ok(n as u64) + } } #[cfg(test)] diff --git a/crates/kebab-store-sqlite/tests/truncate_embeddings.rs b/crates/kebab-store-sqlite/tests/truncate_embeddings.rs new file mode 100644 index 0000000..a24fc3e --- /dev/null +++ b/crates/kebab-store-sqlite/tests/truncate_embeddings.rs @@ -0,0 +1,119 @@ +//! `truncate_embedding_records` wipes every row regardless of status. +//! +//! Used by `kebab reset --vector-only` to keep SQLite in sync after the +//! Lance vector store is deleted off-disk. The helper is exposed at the +//! integration-test boundary so consumers (kebab-app's reset module) can +//! verify its semantics without reaching into private store internals. + +use kebab_config::Config; +use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore}; +use rusqlite::params; +use tempfile::TempDir; +use time::OffsetDateTime; + +fn config_for(tmp: &TempDir) -> Config { + let mut c = Config::defaults(); + c.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + c +} + +fn open_store(tmp: &TempDir) -> SqliteStore { + let cfg = config_for(tmp); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + store +} + +/// Seed an asset + document + chunk so an `embedding_records` row inserted +/// against `chunk_id` does not violate the chunks FK. Mirrors the helper +/// used by the in-crate `embeddings::tests` module — copied here because +/// integration tests cannot reach the private `seed_chunk` from outside +/// the crate. 
+fn seed_chunk(store: &SqliteStore, chunk_id: &str) { + let conn = store.read_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, ?, ?, 'reference', '/tmp/x', ?)", + params![ + "0123456789abcdef0123456789abcdef", + "file:///tmp/x", + "x.md", + "{}", + 0_i64, + "deadbeef", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, NULL, 'fs', 'unverified', 'v1', 1, 1, '{}', '{}', ?, ?)", + params![ + "fedcba9876543210fedcba9876543210", + "0123456789abcdef0123456789abcdef", + "x.md", + "1970-01-01T00:00:00Z", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)", + params![ + chunk_id, + "fedcba9876543210fedcba9876543210", + "1970-01-01T00:00:00Z" + ], + ) + .unwrap(); +} + +fn count_rows(store: &SqliteStore) -> i64 { + let conn = store.read_conn(); + conn.query_row("SELECT COUNT(*) FROM embedding_records", [], |r| r.get(0)) + .unwrap() +} + +#[test] +fn truncate_removes_all_rows_and_returns_count() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let chunk = "11112222333344445555666677778888"; + seed_chunk(&store, chunk); + + let row = EmbeddingRecordRow { + embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(), + chunk_id: chunk.to_string(), + model_id: "test-model".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "chunk_embeddings_test_model_4".to_string(), + created_at: 
OffsetDateTime::now_utc(), + }; + store + .put_embedding_records_pending(std::slice::from_ref(&row)) + .unwrap(); + assert_eq!(count_rows(&store), 1); + + let removed = store.truncate_embedding_records().unwrap(); + assert_eq!(removed, 1); + assert_eq!(count_rows(&store), 0); +} + +#[test] +fn truncate_on_empty_table_is_noop() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let removed = store.truncate_embedding_records().unwrap(); + assert_eq!(removed, 0); + assert_eq!(count_rows(&store), 0); +} diff --git a/docs/wire-schema/v1/reset_report.schema.json b/docs/wire-schema/v1/reset_report.schema.json new file mode 100644 index 0000000..ab34d8c --- /dev/null +++ b/docs/wire-schema/v1/reset_report.schema.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/reset_report.schema.json", + "title": "ResetReport v1", + "description": "Result of `kebab reset` — what scope was requested and what was actually removed off-disk. 
A path that did not exist before the call is omitted (the wipe is idempotent).", + "type": "object", + "required": [ + "schema_version", + "scope", + "removed_paths", + "embedding_rows_truncated" + ], + "properties": { + "schema_version": { "const": "reset_report.v1" }, + "scope": { + "type": "string", + "enum": ["all", "data_only", "vector_only", "config_only"] + }, + "removed_paths": { + "type": "array", + "items": { "type": "string" } + }, + "embedding_rows_truncated": { "type": "integer", "minimum": 0 } + } +} diff --git a/tasks/p9/p9-fb-06-data-reset-command.md b/tasks/p9/p9-fb-06-data-reset-command.md index e1a575f..5828a70 100644 --- a/tasks/p9/p9-fb-06-data-reset-command.md +++ b/tasks/p9/p9-fb-06-data-reset-command.md @@ -3,7 +3,7 @@ phase: P9 component: kebab-cli + kebab-app task_id: p9-fb-06 title: "kebab reset / nuke command" -status: planned +status: in_progress depends_on: [] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md