diff --git a/Cargo.lock b/Cargo.lock index 4815907..7f463b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4133,6 +4133,7 @@ dependencies = [ "base64 0.22.1", "blake3", "dirs 5.0.1", + "filetime", "ignore", "image", "kebab-chunk", diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index e3f55ce..80cedcd 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -1147,10 +1147,7 @@ impl App { } #[doc(hidden)] - pub fn inspect_ocr_stats_with_config( - &self, - _cfg: &kebab_config::Config, - ) -> Result { + pub fn inspect_ocr_stats_with_config(&self, _cfg: &kebab_config::Config) -> Result { use crate::ingest_log::percentiles; let conn = self.sqlite.read_conn(); @@ -1189,9 +1186,7 @@ impl App { let mut by_engine = std::collections::BTreeMap::new(); { let mut stmt = conn - .prepare( - "SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine", - ) + .prepare("SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine") .context("prepare engine query")?; let rows = stmt .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?))) @@ -1215,17 +1210,14 @@ impl App { LIMIT 10", ) .context("prepare by_doc query")?; - stmt.query_map( - [], - |r| { - Ok(OcrStatsByDoc { - doc_id: r.get(0)?, - failure_count: r.get(1)?, - success_count: r.get(2)?, - p90_ms: None, // per-doc p90 deferred (open question #3) - }) - }, - ) + stmt.query_map([], |r| { + Ok(OcrStatsByDoc { + doc_id: r.get(0)?, + failure_count: r.get(1)?, + success_count: r.get(2)?, + p90_ms: None, // per-doc p90 deferred (open question #3) + }) + }) .context("query by_doc")? .filter_map(|r| r.ok()) .collect() diff --git a/crates/kebab-app/src/ingest_log.rs b/crates/kebab-app/src/ingest_log.rs index 311a97d..83f9cd3 100644 --- a/crates/kebab-app/src/ingest_log.rs +++ b/crates/kebab-app/src/ingest_log.rs @@ -206,9 +206,7 @@ impl IngestSummary { /// Simple percentile extraction on a sorted copy of `samples`. /// Returns `(p50, p90, p99, max)`. All `None` when samples is empty. /// p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses p50/p90/max only. -pub(crate) fn percentiles( - samples: &[u64], -) -> (Option, Option, Option, Option) { +pub(crate) fn percentiles(samples: &[u64]) -> (Option, Option, Option, Option) { if samples.is_empty() { return (None, None, None, None); } @@ -245,13 +243,7 @@ pub(crate) fn cleanup_old_logs( .collect(); // Sort newest-first by mtime (files without mtime go to the end). - entries.sort_by_key(|e| { - std::cmp::Reverse( - e.metadata() - .ok() - .and_then(|m| m.modified().ok()), - ) - }); + entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok()))); let cutoff = SystemTime::now() .checked_sub(std::time::Duration::from_secs( @@ -414,11 +406,7 @@ mod tests { let mtime = SystemTime::now() .checked_sub(std::time::Duration::from_secs(age_days * 86400)) .unwrap(); - filetime::set_file_mtime( - &path, - filetime::FileTime::from_system_time(mtime), - ) - .unwrap(); + filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap(); } // keep_recent=3, retention_days=90 (no time-based deletion) cleanup_old_logs(dir, 3, 90).unwrap(); @@ -442,11 +430,7 @@ mod tests { let mtime = SystemTime::now() .checked_sub(std::time::Duration::from_secs(90 * 86400)) .unwrap(); - filetime::set_file_mtime( - &path, - filetime::FileTime::from_system_time(mtime), - ) - .unwrap(); + filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap(); } // keep_recent=10 (both within count) but retention_days=30 → both stale cleanup_old_logs(dir, 10, 30).unwrap(); @@ -454,6 +438,10 @@ mod tests { .unwrap() .filter_map(|e| e.ok()) .collect(); - assert_eq!(remaining.len(), 0, "stale files must be deleted even within keep_recent"); + assert_eq!( + remaining.len(), + 0, + "stale files must be deleted even within keep_recent" + ); } } diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs index bb002f6..ba375d1 100644 --- a/crates/kebab-app/src/pdf_ocr_apply.rs +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -190,10 +190,9 @@ where kind: ProvenanceKind::Warning, note: Some(note), }); - let (image_width, image_height) = - extract_image_dimensions(&page_image_bytes) - .map(|(w, h)| (Some(w), Some(h))) - .unwrap_or((None, None)); + let (image_width, image_height) = extract_image_dimensions(&page_image_bytes) + .map(|(w, h)| (Some(w), Some(h))) + .unwrap_or((None, None)); emit_progress(PdfOcrProgress::Finished { page: page_num, ms: start.elapsed().as_millis() as u64, diff --git a/crates/kebab-app/tests/ocr_inspect_smoke.rs b/crates/kebab-app/tests/ocr_inspect_smoke.rs index eba7174..31e1bf4 100644 --- a/crates/kebab-app/tests/ocr_inspect_smoke.rs +++ b/crates/kebab-app/tests/ocr_inspect_smoke.rs @@ -111,7 +111,10 @@ fn ocr_failures_corpus_wide() { assert_eq!(result.schema_version, "ocr_failures.v1"); assert!(result.failure_count >= 1, "expected at least 1 failure"); - assert!(!result.failures.is_empty(), "failures list must be non-empty"); + assert!( + !result.failures.is_empty(), + "failures list must be non-empty" + ); } /// AC-5: `inspect_ocr_failures` with doc_id filter returns matching rows. diff --git a/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs index 9967e0a..4ef52f3 100644 --- a/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs +++ b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs @@ -66,8 +66,7 @@ async fn ingest_dual_write_doc_id_matches_ndjson() { std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF"); // Run ingest - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest"); + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest"); // Read ndjson log let log_files: Vec<_> = std::fs::read_dir(&log_dir) diff --git a/crates/kebab-config/tests/logging_roundtrip.rs b/crates/kebab-config/tests/logging_roundtrip.rs index 8525cc2..e964930 100644 --- a/crates/kebab-config/tests/logging_roundtrip.rs +++ b/crates/kebab-config/tests/logging_roundtrip.rs @@ -54,10 +54,7 @@ ingest_log_dir = "{state_dir}/logs" "#; let w: LoggingWrapper = toml::from_str(toml).expect("old logging config must parse"); assert!(w.logging.ingest_log_enabled); - assert_eq!( - w.logging.ingest_log_dir, - PathBuf::from("{state_dir}/logs") - ); + assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs")); assert_eq!( w.logging.keep_recent_runs, 100, "keep_recent_runs must default to 100" diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index ffb7f28..437cc02 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -1018,11 +1018,19 @@ impl SqliteStore { ms, chars, success, reason, ocr_engine) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", rusqlite::params![ - run_id, ts, doc_id, doc_path, page, - image_byte_size, image_width, image_height, - ms, chars, + run_id, + ts, + doc_id, + doc_path, + page, + image_byte_size, + image_width, + image_height, + ms, + chars, if success { 1i32 } else { 0i32 }, - reason, ocr_engine + reason, + ocr_engine ], )?; Ok(()) @@ -1034,8 +1042,7 @@ impl SqliteStore { /// means "delete everything older than now" (i.e. all past rows). pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result { use time::format_description::well_known::Rfc3339; - let cutoff = time::OffsetDateTime::now_utc() - - time::Duration::days(retention_days as i64); + let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64); let cutoff_ts = cutoff .format(&Rfc3339) .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()); diff --git a/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs index 294aaea..2db3cdb 100644 --- a/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs +++ b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs @@ -25,7 +25,11 @@ fn v008_pdf_ocr_events_table_exists() { ) .optional() }); - assert_eq!(name.as_deref(), Some("pdf_ocr_events"), "pdf_ocr_events table must exist after V008"); + assert_eq!( + name.as_deref(), + Some("pdf_ocr_events"), + "pdf_ocr_events table must exist after V008" + ); } /// AC-8: insert 2 rows with different timestamps; prune with retention_days=0 @@ -74,9 +78,7 @@ fn record_and_prune_pdf_ocr_event() { // prune with retention_days=0 → cutoff=now → deletes any row with ts < now. // The 1970 row should be deleted; the 2099 row survives. - let pruned = store - .prune_pdf_ocr_events(0) - .expect("prune"); + let pruned = store.prune_pdf_ocr_events(0).expect("prune"); assert_eq!(pruned, 1, "should have deleted exactly 1 old row"); // Verify only the future row remains