From 35c987df1c96f5db9cbd2559bdbe257db01e2134 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 06:17:47 +0000 Subject: [PATCH] =?UTF-8?q?feat(app):=20log=20retention=20=E2=80=94=20keep?= =?UTF-8?q?=5Frecent=5Fruns=20+=20retention=5Fdays=20(Enhancement=204)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoggingCfg gains two fields with serde defaults: keep_recent_runs (default 100, top-N file retention) and retention_days (default 30, time-based retention for both ndjson files and the SQLite mirror). IngestLogWriter::open now runs cleanup_old_logs before creating a new ingest-*.ndjson — delete iff (idx >= keep_recent) OR (modified <= cutoff). ingest_with_config_opts also calls SqliteStore::prune_pdf_ocr_events(retention_days) at ingest start so the SQLite mirror tracks the same retention window. Backward compat (AC-9): both new fields use #[serde(default = ...)], so a pre-v0.20.x config with only [logging] ingest_log_enabled + ingest_log_dir parses unchanged. kebab init writes the new defaults automatically via Config::default() -> toml::to_string_pretty (AC-12). docs/SMOKE.md config example synced. Closure r1 F5: explicit OR-on-stale comment inside cleanup_old_logs. Co-Authored-By: Claude Opus 4.7 --- crates/kebab-app/Cargo.toml | 1 + crates/kebab-app/src/ingest_log.rs | 123 ++++++++++++++++++ crates/kebab-app/src/lib.rs | 9 ++ crates/kebab-app/tests/ingest_log_smoke.rs | 2 + .../tests/pdf_ocr_events_insert_smoke.rs | 1 + crates/kebab-config/src/lib.rs | 20 ++- .../kebab-config/tests/logging_roundtrip.rs | 25 ++++ docs/SMOKE.md | 6 + 8 files changed, 186 insertions(+), 1 deletion(-) diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index c14c6ad..3964c12 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -72,6 +72,7 @@ rusqlite = { workspace = true } [dev-dependencies] rusqlite = { workspace = true } +filetime = "0.2" tempfile = { workspace = true } # Image-pipeline integration tests use wiremock to stub Ollama for OCR # / caption HTTP calls. Async runtime to host the mock server only; diff --git a/crates/kebab-app/src/ingest_log.rs b/crates/kebab-app/src/ingest_log.rs index 9288f30..311a97d 100644 --- a/crates/kebab-app/src/ingest_log.rs +++ b/crates/kebab-app/src/ingest_log.rs @@ -29,6 +29,10 @@ impl IngestLogWriter { let run_id = generate_run_id(); let log_dir = expand_log_dir(&cfg.ingest_log_dir); std::fs::create_dir_all(&log_dir)?; + // Cleanup before creating the new file (non-critical: warn on error). + if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) { + tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}"); + } let path = log_dir.join(format!("ingest-{run_id}.ndjson")); let file = BufWriter::new(File::create(&path)?); Ok(Some(Self { @@ -218,10 +222,69 @@ pub(crate) fn percentiles( (Some(p50), Some(p90), Some(p99), Some(max)) } +/// Delete old ingest log files from `log_dir`. +/// +/// **Retention rule (§3.4 OR-on-stale semantics):** +/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff). +/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition +/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent. +pub(crate) fn cleanup_old_logs( + log_dir: &Path, + keep_recent: u32, + retention_days: u32, +) -> anyhow::Result<()> { + let mut entries: Vec<_> = std::fs::read_dir(log_dir)? + .filter_map(|e| e.ok()) + .filter(|e| { + e.path() + .file_name() + .and_then(|n| n.to_str()) + .map(|s| s.starts_with("ingest-") && s.ends_with(".ndjson")) + .unwrap_or(false) + }) + .collect(); + + // Sort newest-first by mtime (files without mtime go to the end). + entries.sort_by_key(|e| { + std::cmp::Reverse( + e.metadata() + .ok() + .and_then(|m| m.modified().ok()), + ) + }); + + let cutoff = SystemTime::now() + .checked_sub(std::time::Duration::from_secs( + retention_days as u64 * 86400, + )) + .unwrap_or(SystemTime::UNIX_EPOCH); + + for (idx, entry) in entries.into_iter().enumerate() { + let modified = entry + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .unwrap_or(SystemTime::UNIX_EPOCH); + // Keep iff (idx < keep_recent) AND (modified > cutoff). + if (idx as u32) < keep_recent && modified > cutoff { + continue; + } + if let Err(e) = std::fs::remove_file(entry.path()) { + tracing::warn!( + target: "kebab-app", + "failed to remove old log {}: {e}", + entry.path().display() + ); + } + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; use kebab_config::LoggingCfg; + use std::time::SystemTime; use tempfile::TempDir; #[test] @@ -254,6 +317,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: false, ingest_log_dir: PathBuf::from("/tmp/should-not-exist"), + ..Default::default() }; let result = IngestLogWriter::open(&cfg).expect("open should not error"); assert!(result.is_none(), "disabled writer should return None"); @@ -265,6 +329,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: tmp.path().to_path_buf(), + ..Default::default() }; let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); @@ -315,6 +380,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: tmp.path().to_path_buf(), + ..Default::default() }; let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); @@ -333,4 +399,61 @@ mod tests { "file should have at least 1 line after drop" ); } + + /// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted. + #[test] + fn cleanup_keeps_recent_n_drops_old() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path(); + // Create 5 files with mtime spread across 60 days + for i in 0..5u64 { + let path = dir.join(format!("ingest-file{i}.ndjson")); + std::fs::write(&path, b"x").unwrap(); + // Set mtime: file 0 = newest, file 4 = 60 days old + let age_days = i * 15; // 0, 15, 30, 45, 60 days old + let mtime = SystemTime::now() + .checked_sub(std::time::Duration::from_secs(age_days * 86400)) + .unwrap(); + filetime::set_file_mtime( + &path, + filetime::FileTime::from_system_time(mtime), + ) + .unwrap(); + } + // keep_recent=3, retention_days=90 (no time-based deletion) + cleanup_old_logs(dir, 3, 90).unwrap(); + let remaining: Vec<_> = std::fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .collect(); + assert_eq!(remaining.len(), 3, "expected 3 files after cleanup"); + } + + /// F5 OR-on-stale: files within keep_recent count but older than retention_days + /// must still be deleted. + #[test] + fn cleanup_drops_stale_even_within_count() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path(); + // 2 files, both 90 days old — well past retention_days=30 + for i in 0..2u64 { + let path = dir.join(format!("ingest-old{i}.ndjson")); + std::fs::write(&path, b"x").unwrap(); + let mtime = SystemTime::now() + .checked_sub(std::time::Duration::from_secs(90 * 86400)) + .unwrap(); + filetime::set_file_mtime( + &path, + filetime::FileTime::from_system_time(mtime), + ) + .unwrap(); + } + // keep_recent=10 (both within count) but retention_days=30 → both stale + cleanup_old_logs(dir, 10, 30).unwrap(); + let remaining: Vec<_> = std::fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .collect(); + assert_eq!(remaining.len(), 0, "stale files must be deleted even within keep_recent"); + } } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 97fbeab..1f289fc 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -321,6 +321,15 @@ pub fn ingest_with_config_opts( let ocr_pages_cnt: Arc> = Arc::new(Mutex::new(0u32)); let ocr_failures_cnt: Arc> = Arc::new(Mutex::new(0u32)); + // v0.20.x r2: prune stale pdf_ocr_events rows once per ingest run. + let _pruned = app + .sqlite + .prune_pdf_ocr_events(app.config.logging.retention_days) + .unwrap_or_else(|e| { + tracing::warn!(target: "kebab-app", "pdf_ocr_events prune failed: {e}"); + 0 + }); + // Walk the workspace. crate::ingest_progress::emit( progress, diff --git a/crates/kebab-app/tests/ingest_log_smoke.rs b/crates/kebab-app/tests/ingest_log_smoke.rs index cf152fa..6cc69aa 100644 --- a/crates/kebab-app/tests/ingest_log_smoke.rs +++ b/crates/kebab-app/tests/ingest_log_smoke.rs @@ -28,6 +28,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con cfg.logging = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: log_dir.to_path_buf(), + ..Default::default() }; cfg } @@ -138,6 +139,7 @@ fn ingest_log_disabled_emits_no_file() { cfg.logging = LoggingCfg { ingest_log_enabled: false, ingest_log_dir: log_dir.clone(), + ..Default::default() }; let scope = SourceScope { diff --git a/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs index 8808473..9967e0a 100644 --- a/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs +++ b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs @@ -58,6 +58,7 @@ async fn ingest_dual_write_doc_id_matches_ndjson() { env.config.logging = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: log_dir.clone(), + ..Default::default() }; // Copy scanned PDF into workspace diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 7fa693d..335920b 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -443,9 +443,19 @@ pub struct LoggingCfg { /// Directory for per-run log files. Default `{state_dir}/logs`. /// `{state_dir}` expands to the XDG state dir (e.g. `~/.local/state/kebab`). - /// Log file accumulation is user-managed — no rotation policy (spec §6 R-1). #[serde(default = "default_ingest_log_dir")] pub ingest_log_dir: PathBuf, + + /// v0.20.x r2 Enhancement 4: keep the most recent N ingest log files. + /// Older files (beyond this count) are deleted at ingest start. + /// Default 100. AC-9: #[serde(default)] ensures backward compat. + #[serde(default = "default_keep_recent_runs")] + pub keep_recent_runs: u32, + + /// v0.20.x r2 Enhancement 4: delete log files older than N days. + /// Also applied to `pdf_ocr_events` SQLite rows. Default 30. + #[serde(default = "default_retention_days")] + pub retention_days: u32, } fn default_ingest_log_enabled() -> bool { @@ -454,12 +464,20 @@ fn default_ingest_log_enabled() -> bool { fn default_ingest_log_dir() -> PathBuf { PathBuf::from("{state_dir}/logs") } +fn default_keep_recent_runs() -> u32 { + 100 +} +fn default_retention_days() -> u32 { + 30 +} impl Default for LoggingCfg { fn default() -> Self { Self { ingest_log_enabled: default_ingest_log_enabled(), ingest_log_dir: default_ingest_log_dir(), + keep_recent_runs: default_keep_recent_runs(), + retention_days: default_retention_days(), } } } diff --git a/crates/kebab-config/tests/logging_roundtrip.rs b/crates/kebab-config/tests/logging_roundtrip.rs index 27521ce..8525cc2 100644 --- a/crates/kebab-config/tests/logging_roundtrip.rs +++ b/crates/kebab-config/tests/logging_roundtrip.rs @@ -42,3 +42,28 @@ fn pre_v020_config_without_logging_section_gets_defaults() { assert!(w.logging.ingest_log_enabled); assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs")); } + +// Test 4 (AC-9 v0.20.x r2): old config with only ingest_log_enabled + ingest_log_dir +// parses without error and produces correct defaults for keep_recent_runs + retention_days. +#[test] +fn old_logging_config_parses_with_defaults() { + let toml = r#" +[logging] +ingest_log_enabled = true +ingest_log_dir = "{state_dir}/logs" +"#; + let w: LoggingWrapper = toml::from_str(toml).expect("old logging config must parse"); + assert!(w.logging.ingest_log_enabled); + assert_eq!( + w.logging.ingest_log_dir, + PathBuf::from("{state_dir}/logs") + ); + assert_eq!( + w.logging.keep_recent_runs, 100, + "keep_recent_runs must default to 100" + ); + assert_eq!( + w.logging.retention_days, 30, + "retention_days must default to 30" + ); +} diff --git a/docs/SMOKE.md b/docs/SMOKE.md index b727246..24d261f 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -149,6 +149,12 @@ skip_generated_header = true max_file_bytes = 262144 max_file_lines = 5000 extra_skip_globs = [] # 사용자 추가 skip 패턴 (gitignore syntax) + +[logging] +ingest_log_enabled = true +ingest_log_dir = "{state_dir}/logs" +keep_recent_runs = 100 # v0.20.x r2: 최근 N 개 run log 파일 보존 +retention_days = 30 # v0.20.x r2: N일 이상 된 log / OCR 이벤트 자동 삭제 ``` `KEBAB_*` 환경변수로 override 가능 (`KEBAB_MODELS_LLM_MODEL=gemma4:26b kebab …` 등). 자세한 키 목록은 `crates/kebab-config/src/lib.rs` 의 `apply_env` 매치 암. `KEBAB_READONLY=1` — write-path 비활성화 (CI 안전망). `KEBAB_PROGRESS=plain` — non-TTY 환경에서 진행 상황을 plain 한 줄씩 stderr 출력 (spinner 대신).