feat(app): log retention — keep_recent_runs + retention_days (Enhancement 4)

LoggingCfg gains two fields with serde defaults: keep_recent_runs
(default 100, top-N file retention) and retention_days (default 30,
time-based retention for both ndjson files and the SQLite mirror).

IngestLogWriter::open now runs cleanup_old_logs before creating a new
ingest-*.ndjson — delete iff (idx >= keep_recent) OR (modified <=
cutoff). ingest_with_config_opts also calls
SqliteStore::prune_pdf_ocr_events(retention_days) at ingest start so
the SQLite mirror tracks the same retention window.

Backward compat (AC-9): both new fields use #[serde(default = ...)],
so a pre-v0.20.x config with only [logging] ingest_log_enabled +
ingest_log_dir parses unchanged. kebab init writes the new defaults
automatically via Config::default() -> toml::to_string_pretty (AC-12).

docs/SMOKE.md config example synced.

Closure r1 F5: explicit OR-on-stale comment inside cleanup_old_logs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 06:17:47 +00:00
parent d9ec7b8dc3
commit 35c987df1c
8 changed files with 186 additions and 1 deletions

View File

@@ -72,6 +72,7 @@ rusqlite = { workspace = true }
[dev-dependencies]
rusqlite = { workspace = true }
filetime = "0.2"
tempfile = { workspace = true }
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
# / caption HTTP calls. Async runtime to host the mock server only;

View File

@@ -29,6 +29,10 @@ impl IngestLogWriter {
let run_id = generate_run_id();
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
std::fs::create_dir_all(&log_dir)?;
// Cleanup before creating the new file (non-critical: warn on error).
if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) {
tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}");
}
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
let file = BufWriter::new(File::create(&path)?);
Ok(Some(Self {
@@ -218,10 +222,69 @@ pub(crate) fn percentiles(
(Some(p50), Some(p90), Some(p99), Some(max))
}
/// Delete old ingest log files from `log_dir`.
///
/// **Retention rule (§3.4 OR-on-stale semantics):**
/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff).
/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition
/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent.
pub(crate) fn cleanup_old_logs(
log_dir: &Path,
keep_recent: u32,
retention_days: u32,
) -> anyhow::Result<()> {
let mut entries: Vec<_> = std::fs::read_dir(log_dir)?
.filter_map(|e| e.ok())
.filter(|e| {
e.path()
.file_name()
.and_then(|n| n.to_str())
.map(|s| s.starts_with("ingest-") && s.ends_with(".ndjson"))
.unwrap_or(false)
})
.collect();
// Sort newest-first by mtime (files without mtime go to the end).
entries.sort_by_key(|e| {
std::cmp::Reverse(
e.metadata()
.ok()
.and_then(|m| m.modified().ok()),
)
});
let cutoff = SystemTime::now()
.checked_sub(std::time::Duration::from_secs(
retention_days as u64 * 86400,
))
.unwrap_or(SystemTime::UNIX_EPOCH);
for (idx, entry) in entries.into_iter().enumerate() {
let modified = entry
.metadata()
.ok()
.and_then(|m| m.modified().ok())
.unwrap_or(SystemTime::UNIX_EPOCH);
// Keep iff (idx < keep_recent) AND (modified > cutoff).
if (idx as u32) < keep_recent && modified > cutoff {
continue;
}
if let Err(e) = std::fs::remove_file(entry.path()) {
tracing::warn!(
target: "kebab-app",
"failed to remove old log {}: {e}",
entry.path().display()
);
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use kebab_config::LoggingCfg;
use std::time::SystemTime;
use tempfile::TempDir;
#[test]
@@ -254,6 +317,7 @@ mod tests {
let cfg = LoggingCfg {
ingest_log_enabled: false,
ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
..Default::default()
};
let result = IngestLogWriter::open(&cfg).expect("open should not error");
assert!(result.is_none(), "disabled writer should return None");
@@ -265,6 +329,7 @@ mod tests {
let cfg = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: tmp.path().to_path_buf(),
..Default::default()
};
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
let path = writer.path().to_path_buf();
@@ -315,6 +380,7 @@ mod tests {
let cfg = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: tmp.path().to_path_buf(),
..Default::default()
};
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
let path = writer.path().to_path_buf();
@@ -333,4 +399,61 @@ mod tests {
"file should have at least 1 line after drop"
);
}
/// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted.
#[test]
fn cleanup_keeps_recent_n_drops_old() {
let tmp = TempDir::new().unwrap();
let dir = tmp.path();
// Create 5 files with mtime spread across 60 days
for i in 0..5u64 {
let path = dir.join(format!("ingest-file{i}.ndjson"));
std::fs::write(&path, b"x").unwrap();
// Set mtime: file 0 = newest, file 4 = 60 days old
let age_days = i * 15; // 0, 15, 30, 45, 60 days old
let mtime = SystemTime::now()
.checked_sub(std::time::Duration::from_secs(age_days * 86400))
.unwrap();
filetime::set_file_mtime(
&path,
filetime::FileTime::from_system_time(mtime),
)
.unwrap();
}
// keep_recent=3, retention_days=90 (no time-based deletion)
cleanup_old_logs(dir, 3, 90).unwrap();
let remaining: Vec<_> = std::fs::read_dir(dir)
.unwrap()
.filter_map(|e| e.ok())
.collect();
assert_eq!(remaining.len(), 3, "expected 3 files after cleanup");
}
/// F5 OR-on-stale: files within keep_recent count but older than retention_days
/// must still be deleted.
#[test]
fn cleanup_drops_stale_even_within_count() {
let tmp = TempDir::new().unwrap();
let dir = tmp.path();
// 2 files, both 90 days old — well past retention_days=30
for i in 0..2u64 {
let path = dir.join(format!("ingest-old{i}.ndjson"));
std::fs::write(&path, b"x").unwrap();
let mtime = SystemTime::now()
.checked_sub(std::time::Duration::from_secs(90 * 86400))
.unwrap();
filetime::set_file_mtime(
&path,
filetime::FileTime::from_system_time(mtime),
)
.unwrap();
}
// keep_recent=10 (both within count) but retention_days=30 → both stale
cleanup_old_logs(dir, 10, 30).unwrap();
let remaining: Vec<_> = std::fs::read_dir(dir)
.unwrap()
.filter_map(|e| e.ok())
.collect();
assert_eq!(remaining.len(), 0, "stale files must be deleted even within keep_recent");
}
}

View File

@@ -321,6 +321,15 @@ pub fn ingest_with_config_opts(
let ocr_pages_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
let ocr_failures_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
// v0.20.x r2: prune stale pdf_ocr_events rows once per ingest run.
let _pruned = app
.sqlite
.prune_pdf_ocr_events(app.config.logging.retention_days)
.unwrap_or_else(|e| {
tracing::warn!(target: "kebab-app", "pdf_ocr_events prune failed: {e}");
0
});
// Walk the workspace.
crate::ingest_progress::emit(
progress,

View File

@@ -28,6 +28,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
cfg.logging = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: log_dir.to_path_buf(),
..Default::default()
};
cfg
}
@@ -138,6 +139,7 @@ fn ingest_log_disabled_emits_no_file() {
cfg.logging = LoggingCfg {
ingest_log_enabled: false,
ingest_log_dir: log_dir.clone(),
..Default::default()
};
let scope = SourceScope {

View File

@@ -58,6 +58,7 @@ async fn ingest_dual_write_doc_id_matches_ndjson() {
env.config.logging = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: log_dir.clone(),
..Default::default()
};
// Copy scanned PDF into workspace

View File

@@ -443,9 +443,19 @@ pub struct LoggingCfg {
/// Directory for per-run log files. Default `{state_dir}/logs`.
/// `{state_dir}` expands to the XDG state dir (e.g. `~/.local/state/kebab`).
/// Log file accumulation is user-managed — no rotation policy (spec §6 R-1).
#[serde(default = "default_ingest_log_dir")]
pub ingest_log_dir: PathBuf,
/// v0.20.x r2 Enhancement 4: keep the most recent N ingest log files.
/// Older files (beyond this count) are deleted at ingest start.
/// Default 100. AC-9: #[serde(default)] ensures backward compat.
#[serde(default = "default_keep_recent_runs")]
pub keep_recent_runs: u32,
/// v0.20.x r2 Enhancement 4: delete log files older than N days.
/// Also applied to `pdf_ocr_events` SQLite rows. Default 30.
#[serde(default = "default_retention_days")]
pub retention_days: u32,
}
fn default_ingest_log_enabled() -> bool {
@@ -454,12 +464,20 @@ fn default_ingest_log_enabled() -> bool {
fn default_ingest_log_dir() -> PathBuf {
PathBuf::from("{state_dir}/logs")
}
fn default_keep_recent_runs() -> u32 {
100
}
fn default_retention_days() -> u32 {
30
}
impl Default for LoggingCfg {
fn default() -> Self {
Self {
ingest_log_enabled: default_ingest_log_enabled(),
ingest_log_dir: default_ingest_log_dir(),
keep_recent_runs: default_keep_recent_runs(),
retention_days: default_retention_days(),
}
}
}

View File

@@ -42,3 +42,28 @@ fn pre_v020_config_without_logging_section_gets_defaults() {
assert!(w.logging.ingest_log_enabled);
assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs"));
}
// Test 4 (AC-9 v0.20.x r2): old config with only ingest_log_enabled + ingest_log_dir
// parses without error and produces correct defaults for keep_recent_runs + retention_days.
#[test]
fn old_logging_config_parses_with_defaults() {
let toml = r#"
[logging]
ingest_log_enabled = true
ingest_log_dir = "{state_dir}/logs"
"#;
let w: LoggingWrapper = toml::from_str(toml).expect("old logging config must parse");
assert!(w.logging.ingest_log_enabled);
assert_eq!(
w.logging.ingest_log_dir,
PathBuf::from("{state_dir}/logs")
);
assert_eq!(
w.logging.keep_recent_runs, 100,
"keep_recent_runs must default to 100"
);
assert_eq!(
w.logging.retention_days, 30,
"retention_days must default to 30"
);
}

View File

@@ -149,6 +149,12 @@ skip_generated_header = true
max_file_bytes = 262144
max_file_lines = 5000
extra_skip_globs = [] # 사용자 추가 skip 패턴 (gitignore syntax)
[logging]
ingest_log_enabled = true
ingest_log_dir = "{state_dir}/logs"
keep_recent_runs = 100 # v0.20.x r2: 최근 N 개 run log 파일 보존
retention_days = 30 # v0.20.x r2: N일 이상 된 log / OCR 이벤트 자동 삭제
```
`KEBAB_*` 환경변수로 override 가능 (`KEBAB_MODELS_LLM_MODEL=gemma4:26b kebab …` 등). 자세한 키 목록은 `crates/kebab-config/src/lib.rs``apply_env` 매치 암. `KEBAB_READONLY=1` — write-path 비활성화 (CI 안전망). `KEBAB_PROGRESS=plain` — non-TTY 환경에서 진행 상황을 plain 한 줄씩 stderr 출력 (spinner 대신).