feat(app): log retention — keep_recent_runs + retention_days (Enhancement 4)
LoggingCfg gains two fields with serde defaults: keep_recent_runs (default 100, top-N file retention) and retention_days (default 30, time-based retention for both ndjson files and the SQLite mirror). IngestLogWriter::open now runs cleanup_old_logs before creating a new ingest-*.ndjson — delete iff (idx >= keep_recent) OR (modified <= cutoff). ingest_with_config_opts also calls SqliteStore::prune_pdf_ocr_events(retention_days) at ingest start so the SQLite mirror tracks the same retention window. Backward compat (AC-9): both new fields use #[serde(default = ...)], so a pre-v0.20.x config with only [logging] ingest_log_enabled + ingest_log_dir parses unchanged. kebab init writes the new defaults automatically via Config::default() -> toml::to_string_pretty (AC-12). docs/SMOKE.md config example synced. Closure r1 F5: explicit OR-on-stale comment inside cleanup_old_logs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -72,6 +72,7 @@ rusqlite = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
rusqlite = { workspace = true }
|
||||
filetime = "0.2"
|
||||
tempfile = { workspace = true }
|
||||
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
|
||||
# / caption HTTP calls. Async runtime to host the mock server only;
|
||||
|
||||
@@ -29,6 +29,10 @@ impl IngestLogWriter {
|
||||
let run_id = generate_run_id();
|
||||
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
// Cleanup before creating the new file (non-critical: warn on error).
|
||||
if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) {
|
||||
tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}");
|
||||
}
|
||||
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
|
||||
let file = BufWriter::new(File::create(&path)?);
|
||||
Ok(Some(Self {
|
||||
@@ -218,10 +222,69 @@ pub(crate) fn percentiles(
|
||||
(Some(p50), Some(p90), Some(p99), Some(max))
|
||||
}
|
||||
|
||||
/// Delete old ingest log files from `log_dir`.
|
||||
///
|
||||
/// **Retention rule (§3.4 OR-on-stale semantics):**
|
||||
/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff).
|
||||
/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition
|
||||
/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent.
|
||||
pub(crate) fn cleanup_old_logs(
|
||||
log_dir: &Path,
|
||||
keep_recent: u32,
|
||||
retention_days: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entries: Vec<_> = std::fs::read_dir(log_dir)?
|
||||
.filter_map(|e| e.ok())
|
||||
.filter(|e| {
|
||||
e.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.map(|s| s.starts_with("ingest-") && s.ends_with(".ndjson"))
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort newest-first by mtime (files without mtime go to the end).
|
||||
entries.sort_by_key(|e| {
|
||||
std::cmp::Reverse(
|
||||
e.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok()),
|
||||
)
|
||||
});
|
||||
|
||||
let cutoff = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(
|
||||
retention_days as u64 * 86400,
|
||||
))
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
|
||||
for (idx, entry) in entries.into_iter().enumerate() {
|
||||
let modified = entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
// Keep iff (idx < keep_recent) AND (modified > cutoff).
|
||||
if (idx as u32) < keep_recent && modified > cutoff {
|
||||
continue;
|
||||
}
|
||||
if let Err(e) = std::fs::remove_file(entry.path()) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"failed to remove old log {}: {e}",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_config::LoggingCfg;
|
||||
use std::time::SystemTime;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
@@ -254,6 +317,7 @@ mod tests {
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: false,
|
||||
ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
|
||||
..Default::default()
|
||||
};
|
||||
let result = IngestLogWriter::open(&cfg).expect("open should not error");
|
||||
assert!(result.is_none(), "disabled writer should return None");
|
||||
@@ -265,6 +329,7 @@ mod tests {
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: tmp.path().to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
|
||||
let path = writer.path().to_path_buf();
|
||||
@@ -315,6 +380,7 @@ mod tests {
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: tmp.path().to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
|
||||
let path = writer.path().to_path_buf();
|
||||
@@ -333,4 +399,61 @@ mod tests {
|
||||
"file should have at least 1 line after drop"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted.
|
||||
#[test]
|
||||
fn cleanup_keeps_recent_n_drops_old() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let dir = tmp.path();
|
||||
// Create 5 files with mtime spread across 60 days
|
||||
for i in 0..5u64 {
|
||||
let path = dir.join(format!("ingest-file{i}.ndjson"));
|
||||
std::fs::write(&path, b"x").unwrap();
|
||||
// Set mtime: file 0 = newest, file 4 = 60 days old
|
||||
let age_days = i * 15; // 0, 15, 30, 45, 60 days old
|
||||
let mtime = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(age_days * 86400))
|
||||
.unwrap();
|
||||
filetime::set_file_mtime(
|
||||
&path,
|
||||
filetime::FileTime::from_system_time(mtime),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
// keep_recent=3, retention_days=90 (no time-based deletion)
|
||||
cleanup_old_logs(dir, 3, 90).unwrap();
|
||||
let remaining: Vec<_> = std::fs::read_dir(dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.collect();
|
||||
assert_eq!(remaining.len(), 3, "expected 3 files after cleanup");
|
||||
}
|
||||
|
||||
/// F5 OR-on-stale: files within keep_recent count but older than retention_days
|
||||
/// must still be deleted.
|
||||
#[test]
|
||||
fn cleanup_drops_stale_even_within_count() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let dir = tmp.path();
|
||||
// 2 files, both 90 days old — well past retention_days=30
|
||||
for i in 0..2u64 {
|
||||
let path = dir.join(format!("ingest-old{i}.ndjson"));
|
||||
std::fs::write(&path, b"x").unwrap();
|
||||
let mtime = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(90 * 86400))
|
||||
.unwrap();
|
||||
filetime::set_file_mtime(
|
||||
&path,
|
||||
filetime::FileTime::from_system_time(mtime),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
// keep_recent=10 (both within count) but retention_days=30 → both stale
|
||||
cleanup_old_logs(dir, 10, 30).unwrap();
|
||||
let remaining: Vec<_> = std::fs::read_dir(dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.collect();
|
||||
assert_eq!(remaining.len(), 0, "stale files must be deleted even within keep_recent");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -321,6 +321,15 @@ pub fn ingest_with_config_opts(
|
||||
let ocr_pages_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
|
||||
let ocr_failures_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
|
||||
|
||||
// v0.20.x r2: prune stale pdf_ocr_events rows once per ingest run.
|
||||
let _pruned = app
|
||||
.sqlite
|
||||
.prune_pdf_ocr_events(app.config.logging.retention_days)
|
||||
.unwrap_or_else(|e| {
|
||||
tracing::warn!(target: "kebab-app", "pdf_ocr_events prune failed: {e}");
|
||||
0
|
||||
});
|
||||
|
||||
// Walk the workspace.
|
||||
crate::ingest_progress::emit(
|
||||
progress,
|
||||
|
||||
@@ -28,6 +28,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
cfg
|
||||
}
|
||||
@@ -138,6 +139,7 @@ fn ingest_log_disabled_emits_no_file() {
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: false,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scope = SourceScope {
|
||||
|
||||
@@ -58,6 +58,7 @@ async fn ingest_dual_write_doc_id_matches_ndjson() {
|
||||
env.config.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Copy scanned PDF into workspace
|
||||
|
||||
@@ -443,9 +443,19 @@ pub struct LoggingCfg {
|
||||
|
||||
/// Directory for per-run log files. Default `{state_dir}/logs`.
|
||||
/// `{state_dir}` expands to the XDG state dir (e.g. `~/.local/state/kebab`).
|
||||
/// Log file accumulation is user-managed — no rotation policy (spec §6 R-1).
|
||||
#[serde(default = "default_ingest_log_dir")]
|
||||
pub ingest_log_dir: PathBuf,
|
||||
|
||||
/// v0.20.x r2 Enhancement 4: keep the most recent N ingest log files.
|
||||
/// Older files (beyond this count) are deleted at ingest start.
|
||||
/// Default 100. AC-9: #[serde(default)] ensures backward compat.
|
||||
#[serde(default = "default_keep_recent_runs")]
|
||||
pub keep_recent_runs: u32,
|
||||
|
||||
/// v0.20.x r2 Enhancement 4: delete log files older than N days.
|
||||
/// Also applied to `pdf_ocr_events` SQLite rows. Default 30.
|
||||
#[serde(default = "default_retention_days")]
|
||||
pub retention_days: u32,
|
||||
}
|
||||
|
||||
fn default_ingest_log_enabled() -> bool {
|
||||
@@ -454,12 +464,20 @@ fn default_ingest_log_enabled() -> bool {
|
||||
fn default_ingest_log_dir() -> PathBuf {
|
||||
PathBuf::from("{state_dir}/logs")
|
||||
}
|
||||
fn default_keep_recent_runs() -> u32 {
|
||||
100
|
||||
}
|
||||
fn default_retention_days() -> u32 {
|
||||
30
|
||||
}
|
||||
|
||||
impl Default for LoggingCfg {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
ingest_log_enabled: default_ingest_log_enabled(),
|
||||
ingest_log_dir: default_ingest_log_dir(),
|
||||
keep_recent_runs: default_keep_recent_runs(),
|
||||
retention_days: default_retention_days(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,3 +42,28 @@ fn pre_v020_config_without_logging_section_gets_defaults() {
|
||||
assert!(w.logging.ingest_log_enabled);
|
||||
assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs"));
|
||||
}
|
||||
|
||||
// Test 4 (AC-9 v0.20.x r2): old config with only ingest_log_enabled + ingest_log_dir
|
||||
// parses without error and produces correct defaults for keep_recent_runs + retention_days.
|
||||
#[test]
|
||||
fn old_logging_config_parses_with_defaults() {
|
||||
let toml = r#"
|
||||
[logging]
|
||||
ingest_log_enabled = true
|
||||
ingest_log_dir = "{state_dir}/logs"
|
||||
"#;
|
||||
let w: LoggingWrapper = toml::from_str(toml).expect("old logging config must parse");
|
||||
assert!(w.logging.ingest_log_enabled);
|
||||
assert_eq!(
|
||||
w.logging.ingest_log_dir,
|
||||
PathBuf::from("{state_dir}/logs")
|
||||
);
|
||||
assert_eq!(
|
||||
w.logging.keep_recent_runs, 100,
|
||||
"keep_recent_runs must default to 100"
|
||||
);
|
||||
assert_eq!(
|
||||
w.logging.retention_days, 30,
|
||||
"retention_days must default to 30"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -149,6 +149,12 @@ skip_generated_header = true
|
||||
max_file_bytes = 262144
|
||||
max_file_lines = 5000
|
||||
extra_skip_globs = [] # 사용자 추가 skip 패턴 (gitignore syntax)
|
||||
|
||||
[logging]
|
||||
ingest_log_enabled = true
|
||||
ingest_log_dir = "{state_dir}/logs"
|
||||
keep_recent_runs = 100 # v0.20.x r2: 최근 N 개 run log 파일 보존
|
||||
retention_days = 30 # v0.20.x r2: N일 이상 된 log / OCR 이벤트 자동 삭제
|
||||
```
|
||||
|
||||
`KEBAB_*` 환경변수로 override 가능 (`KEBAB_MODELS_LLM_MODEL=gemma4:26b kebab …` 등). 자세한 키 목록은 `crates/kebab-config/src/lib.rs` 의 `apply_env` 매치 암. `KEBAB_READONLY=1` — write-path 비활성화 (CI 안전망). `KEBAB_PROGRESS=plain` — non-TTY 환경에서 진행 상황을 plain 한 줄씩 stderr 출력 (spinner 대신).
|
||||
|
||||
Reference in New Issue
Block a user