diff --git a/Cargo.lock b/Cargo.lock index 4815907..7f463b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4133,6 +4133,7 @@ dependencies = [ "base64 0.22.1", "blake3", "dirs 5.0.1", + "filetime", "ignore", "image", "kebab-chunk", diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index 102c5af..3964c12 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -36,6 +36,10 @@ kebab-parse-image = { path = "../kebab-parse-image" } # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`. kebab-parse-pdf = { path = "../kebab-parse-pdf" } lopdf = { workspace = true } +# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs. +# jpeg feature added explicitly (F3 closure-r1) rather than relying on +# feature unification via kebab-parse-image. +image = { version = "0.25", default-features = false, features = ["png", "jpeg"] } # p10-1A-2: Rust AST extractor lives here. App threads it into the # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`. @@ -63,16 +67,19 @@ unicode-normalization = "0.1" ignore = "0.4" # p9-fb-34: opaque pagination cursor encodes payload as base64. base64 = { workspace = true } +# Enhancement 3 (v0.20.x r2): direct SQL queries for inspect_ocr_stats/failures. +rusqlite = { workspace = true } [dev-dependencies] rusqlite = { workspace = true } +filetime = "0.2" tempfile = { workspace = true } # Image-pipeline integration tests use wiremock to stub Ollama for OCR # / caption HTTP calls. Async runtime to host the mock server only; # the kb-app code under test stays sync. 
wiremock = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } -image = { version = "0.25", default-features = false, features = ["png"] } +image = { version = "0.25", default-features = false, features = ["png", "jpeg"] } # P7-3 PDF integration tests build in-memory PDF fixtures via the same # lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned # to the same major (0.32) so byte output is identical between the two diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 445b687..80cedcd 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -1093,6 +1093,223 @@ fn backfill_code_lang(hits: &mut [SearchHit]) { } } +// ── v0.20.x r2 Enhancement 3: OCR stats + failures inspect ────────────── + +/// Wire type for `kebab inspect ocr-stats --json` (`ocr_stats.v1`). +#[derive(serde::Serialize)] +pub struct OcrStatsV1 { + pub schema_version: &'static str, + pub total_events: u64, + pub total_runs: u64, + pub success_count: u64, + pub failure_count: u64, + pub success_rate: f64, + pub p50_ms: Option, + pub p90_ms: Option, + pub p99_ms: Option, + pub max_ms: Option, + pub by_engine: std::collections::BTreeMap, + pub by_doc: Vec, +} + +/// Per-doc breakdown row inside `OcrStatsV1`. +#[derive(serde::Serialize)] +pub struct OcrStatsByDoc { + pub doc_id: String, + pub failure_count: u64, + pub success_count: u64, + pub p90_ms: Option, +} + +/// Wire type for `kebab inspect ocr-failures --json` (`ocr_failures.v1`). +#[derive(serde::Serialize)] +pub struct OcrFailuresV1 { + pub schema_version: &'static str, + pub doc_id: Option, + pub failure_count: u64, + pub failures: Vec, +} + +/// Single failure row inside `OcrFailuresV1`. +#[derive(serde::Serialize)] +pub struct OcrFailureRow { + pub ts: String, + pub page: u32, + pub ms: u64, + pub reason: String, + pub image_byte_size: Option, +} + +impl App { + /// Corpus-wide OCR statistics from the `pdf_ocr_events` SQLite mirror. 
+ pub fn inspect_ocr_stats(&self) -> Result { + self.inspect_ocr_stats_with_config(&self.config) + } + + #[doc(hidden)] + pub fn inspect_ocr_stats_with_config(&self, _cfg: &kebab_config::Config) -> Result { + use crate::ingest_log::percentiles; + let conn = self.sqlite.read_conn(); + + // 1. Aggregate counters + let (total_events, success_count, failure_count, total_runs): (u64, u64, u64, u64) = conn + .query_row( + "SELECT COUNT(*), \ + SUM(CASE WHEN success=1 THEN 1 ELSE 0 END), \ + SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \ + COUNT(DISTINCT run_id) \ + FROM pdf_ocr_events", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)), + ) + .unwrap_or((0, 0, 0, 0)); + + let success_rate = if total_events == 0 { + 0.0 + } else { + success_count as f64 / total_events as f64 + }; + + // 2. Latency percentiles from successful events + let samples: Vec = { + let mut stmt = conn + .prepare("SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms") + .context("prepare ms query")?; + stmt.query_map([], |r| r.get::<_, u64>(0)) + .context("query ms")? + .filter_map(|r| r.ok()) + .collect() + }; + let (p50_ms, p90_ms, p99_ms, max_ms) = percentiles(&samples); + + // 3. Engine breakdown + let mut by_engine = std::collections::BTreeMap::new(); + { + let mut stmt = conn + .prepare("SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine") + .context("prepare engine query")?; + let rows = stmt + .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?))) + .context("query engine")?; + for row in rows.filter_map(|r| r.ok()) { + by_engine.insert(row.0, row.1); + } + } + + // 4. 
Top-10 docs by failure count + let by_doc: Vec = { + let mut stmt = conn + .prepare( + "SELECT doc_id, \ + SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \ + SUM(CASE WHEN success=1 THEN 1 ELSE 0 END) \ + FROM pdf_ocr_events \ + WHERE doc_id IS NOT NULL \ + GROUP BY doc_id \ + ORDER BY 2 DESC \ + LIMIT 10", + ) + .context("prepare by_doc query")?; + stmt.query_map([], |r| { + Ok(OcrStatsByDoc { + doc_id: r.get(0)?, + failure_count: r.get(1)?, + success_count: r.get(2)?, + p90_ms: None, // per-doc p90 deferred (open question #3) + }) + }) + .context("query by_doc")? + .filter_map(|r| r.ok()) + .collect() + }; + + Ok(OcrStatsV1 { + schema_version: "ocr_stats.v1", + total_events, + total_runs, + success_count, + failure_count, + success_rate, + p50_ms, + p90_ms, + p99_ms, + max_ms, + by_engine, + by_doc, + }) + } + + /// Recent OCR failure rows, optionally filtered by `doc_id`. + pub fn inspect_ocr_failures( + &self, + doc_id: Option<&str>, + limit: usize, + ) -> Result { + self.inspect_ocr_failures_with_config(&self.config, doc_id, limit) + } + + #[doc(hidden)] + pub fn inspect_ocr_failures_with_config( + &self, + _cfg: &kebab_config::Config, + doc_id: Option<&str>, + limit: usize, + ) -> Result { + let conn = self.sqlite.read_conn(); + let failures: Vec = if let Some(did) = doc_id { + let mut stmt = conn + .prepare( + "SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \ + FROM pdf_ocr_events \ + WHERE success=0 AND doc_id=? \ + ORDER BY ts DESC \ + LIMIT ?", + ) + .context("prepare failures by doc_id")?; + stmt.query_map(rusqlite::params![did, limit as i64], |r| { + Ok(OcrFailureRow { + ts: r.get(0)?, + page: r.get(1)?, + ms: r.get(2)?, + reason: r.get(3)?, + image_byte_size: r.get(4)?, + }) + }) + .context("query failures by doc_id")? 
+ .filter_map(|r| r.ok()) + .collect() + } else { + let mut stmt = conn + .prepare( + "SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \ + FROM pdf_ocr_events \ + WHERE success=0 \ + ORDER BY ts DESC \ + LIMIT ?", + ) + .context("prepare failures corpus-wide")?; + stmt.query_map(rusqlite::params![limit as i64], |r| { + Ok(OcrFailureRow { + ts: r.get(0)?, + page: r.get(1)?, + ms: r.get(2)?, + reason: r.get(3)?, + image_byte_size: r.get(4)?, + }) + }) + .context("query failures corpus-wide")? + .filter_map(|r| r.ok()) + .collect() + }; + Ok(OcrFailuresV1 { + schema_version: "ocr_failures.v1", + doc_id: doc_id.map(String::from), + failure_count: failures.len() as u64, + failures, + }) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/kebab-app/src/ingest_log.rs b/crates/kebab-app/src/ingest_log.rs index 3702793..83f9cd3 100644 --- a/crates/kebab-app/src/ingest_log.rs +++ b/crates/kebab-app/src/ingest_log.rs @@ -29,6 +29,10 @@ impl IngestLogWriter { let run_id = generate_run_id(); let log_dir = expand_log_dir(&cfg.ingest_log_dir); std::fs::create_dir_all(&log_dir)?; + // Cleanup before creating the new file (non-critical: warn on error). + if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) { + tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}"); + } let path = log_dir.join(format!("ingest-{run_id}.ndjson")); let file = BufWriter::new(File::create(&path)?); Ok(Some(Self { @@ -116,6 +120,10 @@ pub(crate) fn now_ts() -> String { pub enum LogEvent<'a> { Ocr { ts: String, + /// v0.20.x r2: additive field — doc_id for dual-write SQLite correlation. + /// Round 1 ndjson logs deserialize with doc_id=None (Serde Option default). 
+ #[serde(skip_serializing_if = "Option::is_none")] + doc_id: Option<&'a str>, doc_path: &'a str, page: u32, image_byte_size: Option, @@ -177,7 +185,7 @@ impl IngestSummary { ocr_ms_samples: &[u64], duration_ms: u64, ) -> Self { - let (p50, p90, max) = percentiles(ocr_ms_samples); + let (p50, p90, _p99, max) = percentiles(ocr_ms_samples); Self { kind: "summary".to_string(), ts, @@ -196,24 +204,79 @@ impl IngestSummary { } /// Simple percentile extraction on a sorted copy of `samples`. -/// Returns `(p50, p90, max)`. All `None` when samples is empty. -pub(crate) fn percentiles(samples: &[u64]) -> (Option, Option, Option) { +/// Returns `(p50, p90, p99, max)`. All `None` when samples is empty. +/// p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses p50/p90/max only. +pub(crate) fn percentiles(samples: &[u64]) -> (Option, Option, Option, Option) { if samples.is_empty() { - return (None, None, None); + return (None, None, None, None); } let mut sorted = samples.to_vec(); sorted.sort_unstable(); let n = sorted.len(); - let p50 = sorted[n * 50 / 100]; - let p90 = sorted[n * 90 / 100]; + let p50 = sorted[(n.saturating_sub(1) * 50) / 100]; + let p90 = sorted[(n.saturating_sub(1) * 90) / 100]; + let p99 = sorted[(n.saturating_sub(1) * 99) / 100]; let max = *sorted.last().unwrap(); - (Some(p50), Some(p90), Some(max)) + (Some(p50), Some(p90), Some(p99), Some(max)) +} + +/// Delete old ingest log files from `log_dir`. +/// +/// **Retention rule (§3.4 OR-on-stale semantics):** +/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff). +/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition +/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent. +pub(crate) fn cleanup_old_logs( + log_dir: &Path, + keep_recent: u32, + retention_days: u32, +) -> anyhow::Result<()> { + let mut entries: Vec<_> = std::fs::read_dir(log_dir)? 
+ .filter_map(|e| e.ok()) + .filter(|e| { + e.path() + .file_name() + .and_then(|n| n.to_str()) + .map(|s| s.starts_with("ingest-") && s.ends_with(".ndjson")) + .unwrap_or(false) + }) + .collect(); + + // Sort newest-first by mtime (files without mtime go to the end). + entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok()))); + + let cutoff = SystemTime::now() + .checked_sub(std::time::Duration::from_secs( + retention_days as u64 * 86400, + )) + .unwrap_or(SystemTime::UNIX_EPOCH); + + for (idx, entry) in entries.into_iter().enumerate() { + let modified = entry + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .unwrap_or(SystemTime::UNIX_EPOCH); + // Keep iff (idx < keep_recent) AND (modified > cutoff). + if (idx as u32) < keep_recent && modified > cutoff { + continue; + } + if let Err(e) = std::fs::remove_file(entry.path()) { + tracing::warn!( + target: "kebab-app", + "failed to remove old log {}: {e}", + entry.path().display() + ); + } + } + Ok(()) } #[cfg(test)] mod tests { use super::*; use kebab_config::LoggingCfg; + use std::time::SystemTime; use tempfile::TempDir; #[test] @@ -246,6 +309,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: false, ingest_log_dir: PathBuf::from("/tmp/should-not-exist"), + ..Default::default() }; let result = IngestLogWriter::open(&cfg).expect("open should not error"); assert!(result.is_none(), "disabled writer should return None"); @@ -257,6 +321,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: tmp.path().to_path_buf(), + ..Default::default() }; let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); @@ -307,6 +372,7 @@ mod tests { let cfg = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: tmp.path().to_path_buf(), + ..Default::default() }; let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); @@ -325,4 +391,57 @@ mod tests { "file should 
have at least 1 line after drop" ); } + + /// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted. + #[test] + fn cleanup_keeps_recent_n_drops_old() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path(); + // Create 5 files with mtime spread across 60 days + for i in 0..5u64 { + let path = dir.join(format!("ingest-file{i}.ndjson")); + std::fs::write(&path, b"x").unwrap(); + // Set mtime: file 0 = newest, file 4 = 60 days old + let age_days = i * 15; // 0, 15, 30, 45, 60 days old + let mtime = SystemTime::now() + .checked_sub(std::time::Duration::from_secs(age_days * 86400)) + .unwrap(); + filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap(); + } + // keep_recent=3, retention_days=90 (no time-based deletion) + cleanup_old_logs(dir, 3, 90).unwrap(); + let remaining: Vec<_> = std::fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .collect(); + assert_eq!(remaining.len(), 3, "expected 3 files after cleanup"); + } + + /// F5 OR-on-stale: files within keep_recent count but older than retention_days + /// must still be deleted. 
+ #[test] + fn cleanup_drops_stale_even_within_count() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path(); + // 2 files, both 90 days old — well past retention_days=30 + for i in 0..2u64 { + let path = dir.join(format!("ingest-old{i}.ndjson")); + std::fs::write(&path, b"x").unwrap(); + let mtime = SystemTime::now() + .checked_sub(std::time::Duration::from_secs(90 * 86400)) + .unwrap(); + filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap(); + } + // keep_recent=10 (both within count) but retention_days=30 → both stale + cleanup_old_logs(dir, 10, 30).unwrap(); + let remaining: Vec<_> = std::fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .collect(); + assert_eq!( + remaining.len(), + 0, + "stale files must be deleted even within keep_recent" + ); + } } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 328e219..1f289fc 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -321,6 +321,15 @@ pub fn ingest_with_config_opts( let ocr_pages_cnt: Arc> = Arc::new(Mutex::new(0u32)); let ocr_failures_cnt: Arc> = Arc::new(Mutex::new(0u32)); + // v0.20.x r2: prune stale pdf_ocr_events rows once per ingest run. + let _pruned = app + .sqlite + .prune_pdf_ocr_events(app.config.logging.retention_days) + .unwrap_or_else(|e| { + tracing::warn!(target: "kebab-app", "pdf_ocr_events prune failed: {e}"); + 0 + }); + // Walk the workspace. crate::ingest_progress::emit( progress, @@ -1933,6 +1942,13 @@ fn ingest_one_pdf_asset( let pages_for_ocr = ocr_pages_cnt.clone(); let failures_for_ocr = ocr_failures_cnt.clone(); let doc_path_for_log = asset.workspace_path.0.clone(); + // v0.20.x r2 Step 3: pre-capture for dual-write (F1 + G1 resolution). 
+ let doc_id_for_log: String = canonical.doc_id.0.clone(); + let store_for_ocr = Arc::clone(&app.sqlite); + let run_id_for_log: String = lw_for_ocr + .as_ref() + .and_then(|lw| lw.lock().ok().map(|w| w.run_id().to_string())) + .unwrap_or_default(); let summary = crate::pdf_ocr_apply::apply_ocr_to_pdf_pages( &mut canonical, @@ -1974,10 +1990,12 @@ fn ingest_one_pdf_asset( } // v0.20.x Hook 2: write OCR event to log writer. let success = !skipped && failure_reason.is_none(); + let ts_for_event = crate::ingest_log::now_ts(); if let Some(ref lw) = lw_for_ocr { if let Ok(mut w) = lw.lock() { let _ = w.write_event(&crate::ingest_log::LogEvent::Ocr { - ts: crate::ingest_log::now_ts(), + ts: ts_for_event.clone(), + doc_id: Some(&doc_id_for_log), doc_path: &doc_path_for_log, page, image_byte_size, @@ -1991,6 +2009,27 @@ fn ingest_one_pdf_asset( }); } } + // v0.20.x r2: SQLite dual-write (non-critical — R-1). + if let Err(e) = store_for_ocr.record_pdf_ocr_event( + &run_id_for_log, + &ts_for_event, + Some(&doc_id_for_log), + &doc_path_for_log, + page, + image_byte_size, + image_width, + image_height, + ms, + chars, + success, + failure_reason.as_deref(), + engine.engine_name(), + ) { + tracing::warn!( + target: "kebab-app", + "sqlite ocr event insert failed: {e}" + ); + } if let Ok(mut p) = pages_for_ocr.lock() { *p += 1; } diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs index 26240e8..ba375d1 100644 --- a/crates/kebab-app/src/pdf_ocr_apply.rs +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -22,6 +22,18 @@ use lopdf::Document as LopdfDocument; use time::OffsetDateTime; use tracing::warn; +/// Extract width/height from a JPEG (or any image format) byte slice. +/// Returns `None` on corrupt / unsupported data — callers fall back to +/// `(None, None)` so OCR results remain valid (R-4 mitigation). 
+fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> { + use image::ImageReader; + ImageReader::new(std::io::Cursor::new(bytes)) + .with_guessed_format() + .ok()? + .into_dimensions() + .ok() +} + /// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`]. /// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade /// (`kebab_app::ingest_one_pdf_asset`) fills these from @@ -178,14 +190,17 @@ where kind: ProvenanceKind::Warning, note: Some(note), }); + let (image_width, image_height) = extract_image_dimensions(&page_image_bytes) + .map(|(w, h)| (Some(w), Some(h))) + .unwrap_or((None, None)); emit_progress(PdfOcrProgress::Finished { page: page_num, ms: start.elapsed().as_millis() as u64, chars: 0, skipped: true, image_byte_size: Some(page_image_bytes.len() as u64), - image_width: None, - image_height: None, + image_width, + image_height, failure_reason: Some("ocr_error".to_string()), }); continue; @@ -256,14 +271,17 @@ where )), }); + let (image_width, image_height) = extract_image_dimensions(&page_image_bytes) + .map(|(w, h)| (Some(w), Some(h))) + .unwrap_or((None, None)); emit_progress(PdfOcrProgress::Finished { page: page_num, ms: elapsed_ms, chars: chars_ocr, skipped: false, image_byte_size: Some(page_image_bytes.len() as u64), - image_width: None, - image_height: None, + image_width, + image_height, failure_reason: None, }); } @@ -321,3 +339,26 @@ pub enum PdfOcrProgress { failure_reason: Option, }, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_image_dimensions_valid_jpeg() { + let img = image::RgbImage::new(16, 12); + let mut bytes = Vec::new(); + image::DynamicImage::from(img) + .write_to( + &mut std::io::Cursor::new(&mut bytes), + image::ImageFormat::Jpeg, + ) + .expect("encode jpeg"); + assert_eq!(extract_image_dimensions(&bytes), Some((16, 12))); + } + + #[test] + fn extract_image_dimensions_corrupt_returns_none() { + assert_eq!(extract_image_dimensions(b"not a jpeg"), None); + } +} diff --git 
a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index 8982f62..d2bc396 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -116,6 +116,9 @@ const WIRE_SCHEMAS: &[&str] = &[ "error.v1", "bulk_search_item.v1", "bulk_search_response.v1", + // v0.20.x r2 Enhancement 3: OCR statistics + failures introspection. + "ocr_stats.v1", + "ocr_failures.v1", ]; /// Build a [`SchemaV1`] introspection report for the given config. diff --git a/crates/kebab-app/tests/ingest_log_smoke.rs b/crates/kebab-app/tests/ingest_log_smoke.rs index cf152fa..6cc69aa 100644 --- a/crates/kebab-app/tests/ingest_log_smoke.rs +++ b/crates/kebab-app/tests/ingest_log_smoke.rs @@ -28,6 +28,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con cfg.logging = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: log_dir.to_path_buf(), + ..Default::default() }; cfg } @@ -138,6 +139,7 @@ fn ingest_log_disabled_emits_no_file() { cfg.logging = LoggingCfg { ingest_log_enabled: false, ingest_log_dir: log_dir.clone(), + ..Default::default() }; let scope = SourceScope { diff --git a/crates/kebab-app/tests/ocr_inspect_smoke.rs b/crates/kebab-app/tests/ocr_inspect_smoke.rs new file mode 100644 index 0000000..31e1bf4 --- /dev/null +++ b/crates/kebab-app/tests/ocr_inspect_smoke.rs @@ -0,0 +1,156 @@ +//! Integration smoke tests for `kebab inspect ocr-stats / ocr-failures`. +//! AC-4, AC-5, AC-6, AC-11 (ocr_inspect_smoke binary), AC-13. + +mod common; + +use common::TestEnv; +use kebab_app::App; +use kebab_store_sqlite::SqliteStore; + +/// Insert synthetic pdf_ocr_events rows directly so the test runs without +/// a live Ollama endpoint. 
+fn seed_ocr_events(env: &TestEnv, store: &SqliteStore) { + // Success rows + for i in 0..3u32 { + store + .record_pdf_ocr_event( + "run-aaa", + &format!("2026-05-28T0{}:00:00Z", i), + Some("doc-abc"), + "path/scanned.pdf", + i + 1, + Some(50_000), + Some(200), + Some(150), + 100 + (i as u64) * 20, + 42, + true, + None, + "qwen2.5vl", + ) + .expect("seed success row"); + } + // Failure row + store + .record_pdf_ocr_event( + "run-bbb", + "2026-05-28T10:00:00Z", + Some("doc-abc"), + "path/scanned.pdf", + 4, + Some(30_000), + Some(200), + Some(150), + 9999, + 0, + false, + Some("ocr_error"), + "qwen2.5vl", + ) + .expect("seed failure row"); + // Row for different doc + store + .record_pdf_ocr_event( + "run-ccc", + "2026-05-28T11:00:00Z", + Some("doc-xyz"), + "path/other.pdf", + 1, + None, + None, + None, + 200, + 10, + true, + None, + "qwen2.5vl", + ) + .expect("seed doc-xyz row"); + // Trigger migration (no-op if already done via App::open_with_config) + let _ = env; +} + +fn open_app_with_seeded_events(env: &TestEnv) -> App { + let app = env.app(); + let store = SqliteStore::open(&env.config).expect("open store for seed"); + store.run_migrations().expect("run migrations for seed"); + seed_ocr_events(env, &store); + app +} + +/// AC-4: `inspect_ocr_stats` returns `schema_version = "ocr_stats.v1"`, +/// `total_events >= 1`, `0 ≤ success_rate ≤ 1`. 
+#[test] +fn ocr_stats_after_seeded_events() { + let env = TestEnv::lexical_only(); + let app = open_app_with_seeded_events(&env); + + let stats = app.inspect_ocr_stats().expect("inspect_ocr_stats"); + + assert_eq!(stats.schema_version, "ocr_stats.v1"); + assert!(stats.total_events >= 1, "total_events should be >= 1"); + assert!( + (0.0..=1.0).contains(&stats.success_rate), + "success_rate must be in [0, 1]: {}", + stats.success_rate + ); + assert!(stats.total_runs >= 1, "total_runs should be >= 1"); + // by_engine should have at least one entry + assert!(!stats.by_engine.is_empty(), "by_engine must be non-empty"); +} + +/// AC-6: `inspect_ocr_failures` (no doc_id, corpus-wide) returns failures list. +#[test] +fn ocr_failures_corpus_wide() { + let env = TestEnv::lexical_only(); + let app = open_app_with_seeded_events(&env); + + let result = app + .inspect_ocr_failures(None, 10) + .expect("inspect_ocr_failures"); + + assert_eq!(result.schema_version, "ocr_failures.v1"); + assert!(result.failure_count >= 1, "expected at least 1 failure"); + assert!( + !result.failures.is_empty(), + "failures list must be non-empty" + ); +} + +/// AC-5: `inspect_ocr_failures` with doc_id filter returns matching rows. +#[test] +fn ocr_failures_filter_by_doc_id() { + let env = TestEnv::lexical_only(); + let app = open_app_with_seeded_events(&env); + + let result = app + .inspect_ocr_failures(Some("doc-abc"), 10) + .expect("inspect_ocr_failures by doc_id"); + + assert_eq!(result.schema_version, "ocr_failures.v1"); + assert_eq!( + result.doc_id.as_deref(), + Some("doc-abc"), + "doc_id must be echoed back" + ); + // All rows must belong to doc-abc (no cross-doc leak) + for row in &result.failures { + // rows are failure rows for doc-abc only (reason = ocr_error) + assert_eq!(row.reason, "ocr_error"); + } +} + +/// AC-13: SKILL.md lists both new wire schemas. 
+#[test] +fn skill_md_lists_new_schemas() { + let skill_md = std::fs::read_to_string("../../integrations/claude-code/kebab/SKILL.md") + .expect("read SKILL.md"); + assert!( + skill_md.contains("ocr_stats.v1"), + "SKILL.md must mention ocr_stats.v1" + ); + assert!( + skill_md.contains("ocr_failures.v1"), + "SKILL.md must mention ocr_failures.v1" + ); +} diff --git a/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs new file mode 100644 index 0000000..4ef52f3 --- /dev/null +++ b/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs @@ -0,0 +1,139 @@ +//! Integration smoke test: dual-write (ndjson + SQLite) for PDF OCR events. +//! AC-3: SQLite row count and doc_id matches ndjson LogEvent::Ocr. +//! +//! Uses wiremock to stub the Ollama `/api/generate` endpoint so the test +//! runs without a live Ollama instance. + +mod common; + +use std::path::PathBuf; + +use common::TestEnv; +use kebab_config::LoggingCfg; +use serde_json::Value; +use tokio::task::spawn_blocking; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +fn scanned_pdf_src() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf") +} + +/// AC-3: ndjson OCR line count == pdf_ocr_events row count, and doc_id matches. +#[tokio::test] +async fn ingest_dual_write_doc_id_matches_ndjson() { + let src = scanned_pdf_src(); + if !src.exists() { + eprintln!("skipping test: scanned_page1.pdf fixture not found"); + return; + } + + let server = MockServer::start().await; + // Stub Ollama /api/generate to return a minimal OCR response. 
+ Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "model": "qwen2.5vl:3b", + "response": "test ocr output", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let mock_url = server.uri(); + + let result = spawn_blocking(move || { + let mut env = TestEnv::lexical_only(); + // Enable PDF OCR + set up mock endpoint + env.config.pdf.ocr.enabled = true; + env.config.pdf.ocr.endpoint = Some(mock_url.clone()); + env.config.pdf.ocr.model = "qwen2.5vl:3b".to_string(); + // Enable ingest log + let log_dir = env.temp.path().join("logs"); + std::fs::create_dir_all(&log_dir).unwrap(); + env.config.logging = LoggingCfg { + ingest_log_enabled: true, + ingest_log_dir: log_dir.clone(), + ..Default::default() + }; + + // Copy scanned PDF into workspace + let dest = env.workspace_root.join("scanned.pdf"); + std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF"); + + // Run ingest + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest"); + + // Read ndjson log + let log_files: Vec<_> = std::fs::read_dir(&log_dir) + .unwrap() + .filter_map(Result::ok) + .filter(|e| { + let name = e.file_name().to_string_lossy().to_string(); + name.starts_with("ingest-") && name.ends_with(".ndjson") + }) + .collect(); + assert_eq!(log_files.len(), 1, "expected 1 ndjson log file"); + + let body = std::fs::read_to_string(log_files[0].path()).unwrap(); + let ocr_lines: Vec = body + .lines() + .filter_map(|l| serde_json::from_str(l).ok()) + .filter(|v: &Value| v.get("kind").and_then(Value::as_str) == Some("ocr")) + .collect(); + + // Read pdf_ocr_events from SQLite + let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite"); + let conn = rusqlite::Connection::open(&db_path).expect("open db"); + let rows: Vec<(Option, String)> = { + let mut stmt = conn + .prepare("SELECT doc_id, doc_path FROM pdf_ocr_events ORDER BY id") + 
.expect("prepare"); + stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?))) + .expect("query") + .map(|r| r.expect("row")) + .collect() + }; + + (ocr_lines, rows) + }) + .await + .expect("spawn_blocking"); + + let (ocr_lines, rows) = result; + + // At least one OCR event must be produced + assert!(!ocr_lines.is_empty(), "expected ≥1 ndjson ocr line"); + assert!(!rows.is_empty(), "expected ≥1 pdf_ocr_events row"); + + // Row counts must match + assert_eq!( + ocr_lines.len(), + rows.len(), + "ndjson ocr lines ({}) must equal pdf_ocr_events rows ({})", + ocr_lines.len(), + rows.len() + ); + + // doc_id in both sources must be non-null and consistent + for (line, (sql_doc_id, _sql_doc_path)) in ocr_lines.iter().zip(rows.iter()) { + let json_doc_id = line.get("doc_id").and_then(Value::as_str); + assert!( + json_doc_id.is_some(), + "ndjson ocr line should have doc_id: {line}" + ); + assert!( + sql_doc_id.is_some(), + "pdf_ocr_events row should have doc_id" + ); + assert_eq!( + json_doc_id, + sql_doc_id.as_deref(), + "ndjson doc_id must equal SQLite doc_id" + ); + } +} diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 539a277..1f7ae81 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -358,6 +358,17 @@ enum InspectWhat { Doc { id: String }, /// Inspect a single chunk by ID. Chunk { id: String }, + /// Corpus-wide OCR statistics (total events, latency percentiles, engine breakdown). + OcrStats, + /// Recent OCR failures, optionally filtered by document ID. + OcrFailures { + /// Filter failures to a single document UUID. + #[arg(long)] + doc_id: Option, + /// Maximum number of failure rows to return. 
+ #[arg(long, default_value_t = 10)] + limit: usize, + }, } #[derive(Subcommand, Debug)] @@ -691,6 +702,21 @@ fn run(cli: &Cli) -> anyhow::Result<()> { ); Ok(()) } + InspectWhat::OcrStats => { + let cfg = kebab_config::Config::load(cli.config.as_deref())?; + let app = kebab_app::App::open_with_config(cfg.clone())?; + let stats = app.inspect_ocr_stats_with_config(&cfg)?; + println!("{}", serde_json::to_string(&stats)?); + Ok(()) + } + InspectWhat::OcrFailures { doc_id, limit } => { + let cfg = kebab_config::Config::load(cli.config.as_deref())?; + let app = kebab_app::App::open_with_config(cfg.clone())?; + let failures = + app.inspect_ocr_failures_with_config(&cfg, doc_id.as_deref(), *limit)?; + println!("{}", serde_json::to_string(&failures)?); + Ok(()) + } }, Cmd::Fetch { what } => { diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 7fa693d..335920b 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -443,9 +443,19 @@ pub struct LoggingCfg { /// Directory for per-run log files. Default `{state_dir}/logs`. /// `{state_dir}` expands to the XDG state dir (e.g. `~/.local/state/kebab`). - /// Log file accumulation is user-managed — no rotation policy (spec §6 R-1). #[serde(default = "default_ingest_log_dir")] pub ingest_log_dir: PathBuf, + + /// v0.20.x r2 Enhancement 4: keep the most recent N ingest log files. + /// Older files (beyond this count) are deleted at ingest start. + /// Default 100. AC-9: #[serde(default)] ensures backward compat. + #[serde(default = "default_keep_recent_runs")] + pub keep_recent_runs: u32, + + /// v0.20.x r2 Enhancement 4: delete log files older than N days. + /// Also applied to `pdf_ocr_events` SQLite rows. Default 30. 
+ #[serde(default = "default_retention_days")] + pub retention_days: u32, } fn default_ingest_log_enabled() -> bool { @@ -454,12 +464,20 @@ fn default_ingest_log_enabled() -> bool { fn default_ingest_log_dir() -> PathBuf { PathBuf::from("{state_dir}/logs") } +fn default_keep_recent_runs() -> u32 { + 100 +} +fn default_retention_days() -> u32 { + 30 +} impl Default for LoggingCfg { fn default() -> Self { Self { ingest_log_enabled: default_ingest_log_enabled(), ingest_log_dir: default_ingest_log_dir(), + keep_recent_runs: default_keep_recent_runs(), + retention_days: default_retention_days(), } } } diff --git a/crates/kebab-config/tests/logging_roundtrip.rs b/crates/kebab-config/tests/logging_roundtrip.rs index 27521ce..e964930 100644 --- a/crates/kebab-config/tests/logging_roundtrip.rs +++ b/crates/kebab-config/tests/logging_roundtrip.rs @@ -42,3 +42,25 @@ fn pre_v020_config_without_logging_section_gets_defaults() { assert!(w.logging.ingest_log_enabled); assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs")); } + +// Test 4 (AC-9 v0.20.x r2): old config with only ingest_log_enabled + ingest_log_dir +// parses without error and produces correct defaults for keep_recent_runs + retention_days. 
+#[test] +fn old_logging_config_parses_with_defaults() { + let toml = r#" +[logging] +ingest_log_enabled = true +ingest_log_dir = "{state_dir}/logs" +"#; + let w: LoggingWrapper = toml::from_str(toml).expect("old logging config must parse"); + assert!(w.logging.ingest_log_enabled); + assert_eq!(w.logging.ingest_log_dir, PathBuf::from("{state_dir}/logs")); + assert_eq!( + w.logging.keep_recent_runs, 100, + "keep_recent_runs must default to 100" + ); + assert_eq!( + w.logging.retention_days, 30, + "retention_days must default to 30" + ); +} diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index fd9e3c3..437cc02 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -988,6 +988,71 @@ impl SqliteStore { } Ok(out) } + + // ── v0.20.x r2 Enhancement 2: pdf_ocr_events ───────────────────────── + + /// Insert one OCR sample row into `pdf_ocr_events` (V008 migration). + /// Follows the existing `Mutex` lock pattern (F2). + #[allow(clippy::too_many_arguments)] + pub fn record_pdf_ocr_event( + &self, + run_id: &str, + ts: &str, + doc_id: Option<&str>, + doc_path: &str, + page: u32, + image_byte_size: Option, + image_width: Option, + image_height: Option, + ms: u64, + chars: u32, + success: bool, + reason: Option<&str>, + ocr_engine: &str, + ) -> anyhow::Result<()> { + let conn = self.conn.lock().expect("sqlite lock poisoned"); + conn.execute( + "INSERT INTO pdf_ocr_events + (run_id, ts, doc_id, doc_path, page, + image_byte_size, image_width, image_height, + ms, chars, success, reason, ocr_engine) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + rusqlite::params![ + run_id, + ts, + doc_id, + doc_path, + page, + image_byte_size, + image_width, + image_height, + ms, + chars, + if success { 1i32 } else { 0i32 }, + reason, + ocr_engine + ], + )?; + Ok(()) + } + + /// Delete rows from `pdf_ocr_events` older than `retention_days`. + /// Returns the number of deleted rows. 
+ /// Cutoff is computed as `now_utc - retention_days`; a value of 0 + /// means "delete everything older than now" (i.e. all past rows). + pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result { + use time::format_description::well_known::Rfc3339; + let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64); + let cutoff_ts = cutoff + .format(&Rfc3339) + .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()); + let conn = self.conn.lock().expect("sqlite lock poisoned"); + let n = conn.execute( + "DELETE FROM pdf_ocr_events WHERE ts < ?", + rusqlite::params![cutoff_ts], + )?; + Ok(n as u64) + } } /// Apply the design §5 / task-spec pragmas. Called once per connection. diff --git a/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs new file mode 100644 index 0000000..2db3cdb --- /dev/null +++ b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs @@ -0,0 +1,91 @@ +//! Smoke tests for V008 pdf_ocr_events migration + record/prune API (Enhancement 2). +//! AC-2, AC-3, AC-8. + +mod common; + +use kebab_store_sqlite::SqliteStore; +use rusqlite::OptionalExtension; + +fn open_migrated() -> (common::TestEnv, SqliteStore) { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).expect("open"); + store.run_migrations().expect("run migrations"); + (env, store) +} + +/// AC-2: V008 migration creates the pdf_ocr_events table. 
+#[test] +fn v008_pdf_ocr_events_table_exists() { + let (env, _store) = open_migrated(); + let name: Option = env.with_conn(|c| { + c.query_row( + "SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_ocr_events'", + [], + |r| r.get(0), + ) + .optional() + }); + assert_eq!( + name.as_deref(), + Some("pdf_ocr_events"), + "pdf_ocr_events table must exist after V008" + ); +} + +/// AC-8: insert 2 rows with different timestamps; prune with retention_days=0 +/// (cutoff = now) → the old row is deleted, count returns 1. +#[test] +fn record_and_prune_pdf_ocr_event() { + let (_env, store) = open_migrated(); + + // Row 1: very old timestamp (1970) + store + .record_pdf_ocr_event( + "run-old", + "1970-01-01T00:00:00Z", + Some("doc-old"), + "path/old.pdf", + 1, + Some(12345), + Some(100), + Some(80), + 250, + 42, + true, + None, + "qwen2.5vl", + ) + .expect("insert old row"); + + // Row 2: future timestamp (far future, so it survives prune) + store + .record_pdf_ocr_event( + "run-new", + "2099-01-01T00:00:00Z", + Some("doc-new"), + "path/new.pdf", + 1, + None, + None, + None, + 180, + 30, + true, + None, + "qwen2.5vl", + ) + .expect("insert future row"); + + // prune with retention_days=0 → cutoff=now → deletes any row with ts < now. + // The 1970 row should be deleted; the 2099 row survives. 
+ let pruned = store.prune_pdf_ocr_events(0).expect("prune"); + assert_eq!(pruned, 1, "should have deleted exactly 1 old row"); + + // Verify only the future row remains + let count: i64 = { + let conn = store.read_conn(); + conn.query_row("SELECT COUNT(*) FROM pdf_ocr_events", [], |r| r.get(0)) + .expect("count") + }; + assert_eq!(count, 1, "exactly 1 row should survive after prune"); +} diff --git a/docs/SMOKE.md b/docs/SMOKE.md index b727246..24d261f 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -149,6 +149,12 @@ skip_generated_header = true max_file_bytes = 262144 max_file_lines = 5000 extra_skip_globs = [] # 사용자 추가 skip 패턴 (gitignore syntax) + +[logging] +ingest_log_enabled = true +ingest_log_dir = "{state_dir}/logs" +keep_recent_runs = 100 # v0.20.x r2: 최근 N 개 run log 파일 보존 +retention_days = 30 # v0.20.x r2: N일 이상 된 log / OCR 이벤트 자동 삭제 ``` `KEBAB_*` 환경변수로 override 가능 (`KEBAB_MODELS_LLM_MODEL=gemma4:26b kebab …` 등). 자세한 키 목록은 `crates/kebab-config/src/lib.rs` 의 `apply_env` 매치 암. `KEBAB_READONLY=1` — write-path 비활성화 (CI 안전망). `KEBAB_PROGRESS=plain` — non-TTY 환경에서 진행 상황을 plain 한 줄씩 stderr 출력 (spinner 대신). 
diff --git a/docs/superpowers/plans/2026-05-28-v0.20.x-logging-r2-plan.md b/docs/superpowers/plans/2026-05-28-v0.20.x-logging-r2-plan.md new file mode 100644 index 0000000..e28b595 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-v0.20.x-logging-r2-plan.md @@ -0,0 +1,576 @@ +--- +title: v0.20.x ingest log round 2 — implementation plan +created: 2026-05-28 +status: DRAFT round 0 +phase: B5 plan drafter +target_spec: docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md +critic_r1: .omc/reviews/2026-05-28-v0.20.x-logging-r2-spec-closure-result.md +critic_r2: .omc/reviews/2026-05-28-v0.20.x-logging-r2-spec-closure-r2-result.md +branch: feat/ingest-log-round2-enhancements +step_count: 6 +commit_count: 5 +--- + +# v0.20.x ingest log round 2 — implementation plan + +> **For agentic workers:** REQUIRED SUB-SKILL — `superpowers:executing-plans` (or `superpowers:subagent-driven-development`) to implement task-by-task. Steps use `- [ ]` checkbox syntax. Each Step block carries its own commit boundary; Step 6 is verify-only (no commit). + +**Goal.** Extend v0.20.0 round 1 ingest log (file-only ndjson) with four additive enhancements: (1) raster image dimensions, (2) SQLite mirror via V008 + dual-write, (3) CLI `inspect ocr-stats` / `inspect ocr-failures` + two wire schemas, (4) automatic file + SQLite retention. All non-breaking; wire schema cascades as additive minor; existing 1370 workspace test → 1375+. + +**Architecture.** Hook the dual-write at the existing OCR `emit_progress` closure inside `ingest_with_config_opts`. File write first (durable), SQLite second (non-critical — `tracing::warn!` on failure). The closure pre-captures `doc_id` and clones `Arc` from `App.sqlite` before `apply_ocr_to_pdf_pages` runs (F1 + G1 resolution). Two `App::inspect_*` methods follow the facade rule via `*_with_config` companions (G2). Runtime introspection emit `WIRE_SCHEMAS` gets two entries; the JSON Schema file `schema.schema.json` is untouched (G3: it's pattern-based). 
+ +**Tech stack.** Rust 2024, rusqlite (existing), `image` crate (`jpeg` feature add), `time` (existing). No new workspace deps. + +**Spec contract.** `docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md` (751 line). Implements 13 AC + resolves 3 closure-r2 findings (G1/G2/G3) + 6 closure-r1 findings (F1–F6). + +--- + +## File map + +**Modify (10):** +- `crates/kebab-app/Cargo.toml` — `image` features += `"jpeg"` (F3). +- `crates/kebab-app/src/pdf_ocr_apply.rs` — `extract_image_dimensions` helper + fill 6 emit points (Enhancement 1). +- `crates/kebab-app/src/ingest_log.rs` — `LogEvent::Ocr` adds `doc_id`; `IngestLogWriter::open` runs `cleanup_old_logs`; `percentiles` returns `(p50, p90, p99, max)`. +- `crates/kebab-app/src/lib.rs::ingest_with_config_opts` — pre-capture `doc_id_for_log`, clone `Arc`, dual-write inside closure (F1 + G1). +- `crates/kebab-app/src/schema.rs` — `WIRE_SCHEMAS` += `"ocr_stats.v1"`, `"ocr_failures.v1"` (G3 runtime emit). +- `crates/kebab-app/src/app.rs` — `inspect_ocr_stats` / `inspect_ocr_failures` + `*_with_config` companion pair (G2) + 4 wire structs. +- `crates/kebab-config/src/lib.rs` — `LoggingCfg` += `keep_recent_runs: u32`, `retention_days: u32` (`#[serde(default=...)]`, defaults 100 + 30, AC-9). +- `crates/kebab-store-sqlite/src/store.rs` — `record_pdf_ocr_event` + `prune_pdf_ocr_events` (Mutex lock pattern, F2). +- `crates/kebab-cli/src/main.rs` — `InspectWhat::OcrStats` + `InspectWhat::OcrFailures` variants + dispatch arms calling `*_with_config` (G2). +- `integrations/claude-code/kebab/SKILL.md` — list `ocr_stats.v1` + `ocr_failures.v1` (AC-13). + +**Create (5):** +- `migrations/V008__pdf_ocr_events.sql` — table + 3 indices. +- `docs/wire-schema/v1/ocr_stats.schema.json` — `ocr_stats.v1` JSON Schema (verbatim from spec §4.3). +- `docs/wire-schema/v1/ocr_failures.schema.json` — `ocr_failures.v1` JSON Schema (verbatim from spec §4.3). +- `crates/kebab-app/tests/ocr_inspect_smoke.rs` — AC-4/5/6/13. 
+- `crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs` — AC-2/3/8. + +**Do NOT modify:** +- `docs/wire-schema/v1/schema.schema.json` — `wire.schemas` is `pattern`-based (G3). +- Spec ACCEPT frozen. Round 1 spec / PDF OCR spec / parent design doc untouched. + +--- + +## Closure r2 plan-level resolution + +### G1 (MEDIUM) — `SqliteStore::open` signature + double-open risk + +Spec §4.7 snippet `SqliteStore::open(&cfg.storage.sqlite)?` fails to compile — real signature at `crates/kebab-store-sqlite/src/store.rs:123` is `pub fn open(config: &kebab_config::Config) -> Result`. Worse, a fresh `open()` would create a second connection contending with `App::sqlite` for the single-writer lock and re-running migrations. + +**Resolution (Step 3.3).** Inside `ingest_with_config_opts`, beside the existing `lw_for_ocr` pre-clone at `crates/kebab-app/src/lib.rs:1931`, add: + +```rust +let store_for_ocr: Arc = Arc::clone(&app.sqlite); +``` + +`App.sqlite` is `pub(crate) sqlite: Arc` at `crates/kebab-app/src/app.rs:123` — visible inside the crate. The closure moves `store_for_ocr`. **No new `SqliteStore::open` call anywhere.** + +### G2 (LOW–MEDIUM) — facade rule for `inspect_ocr_*` + +Spec §4.4 declares only `App::inspect_ocr_stats(&self)`. CLAUDE.md "the facade rule" requires a `*_with_config(cfg, …)` companion. HOTFIXES P3-5 and P4-3 record two regressions of this exact shape. + +**Resolution (Step 4.5).** Both methods ship as a pair: + +```rust +pub fn inspect_ocr_stats(&self) -> Result { + self.inspect_ocr_stats_with_config(&self.config) +} +#[doc(hidden)] +pub fn inspect_ocr_stats_with_config(&self, _cfg: &Config) -> Result { … } +``` + +Same shape for `inspect_ocr_failures`. The CLI dispatch (Step 4.6) calls the explicit `_with_config` form so `--config ` is honored — current ingest path already threads `cfg` via `App::open_with_config(cfg.clone())`. 
+ +### G3 (LOW) — wire schema runtime emit, not JSON Schema file + +Spec §4.3 line 299-301 says "schema.schema.json 갱신" but `docs/wire-schema/v1/schema.schema.json` declares `"schemas": { "type": "array", "items": { "type": "string", "pattern": "^[a-z_]+\\.v[0-9]+$" } }` — pattern-based, not enum. The two new strings already match. + +**Resolution (Step 4.2).** Extend `WIRE_SCHEMAS` const at `crates/kebab-app/src/schema.rs:104-119` (runtime emit source for `kebab schema --json`). Two new schema *files* (`ocr_stats.schema.json` + `ocr_failures.schema.json`) are still created as the external contract; the meta `schema.schema.json` is untouched. + +--- + +## Closure r1 finding resolution recap + +| Finding | Severity | Plan step | Resolution | +|--|--|--|--| +| F1 doc_id NULL in dual-write | HIGH | Step 3 | pre-capture `canonical.doc_id.0.clone()` | +| F2 SqliteStore conn lock pattern | MEDIUM | Step 2 | `self.conn.lock().expect("sqlite lock poisoned")` | +| F3 image crate jpeg feature | MEDIUM | Step 1 | `features = ["png", "jpeg"]` | +| F4 percentile + p99 | LOW | Step 4 | extend `ingest_log::percentiles` to 4-tuple | +| F5 OR-on-stale wording | LOW | Step 5 | inline comment in `cleanup_old_logs` | +| F6 V008 rollback | LOW | (spec §6 R-2) | already in spec, no plan action | + +--- + +## Step 1 — image_width / image_height capture (Enhancement 1) + +**Implements:** AC-1. **Commit 1/5.** + +**Files:** Modify `crates/kebab-app/Cargo.toml`, `crates/kebab-app/src/pdf_ocr_apply.rs`. + +- [ ] **1.1 Cargo feature.** `image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }` (was `["png"]` only). Rationale F3. + +- [ ] **1.2 Failing unit tests** in `pdf_ocr_apply::tests`: + - `extract_image_dimensions_valid_jpeg` — load 16x12 fixture JPEG, assert `Some((16, 12))`. + - `extract_image_dimensions_corrupt_returns_none` — `b"not a jpeg"` → `None`. 
+ + If `fixtures/pdf_ocr/sample_16x12.jpg` doesn't exist, generate via `convert -size 16x12 xc:white …` and commit. If a comparable small JPEG already lives under `fixtures/`, reuse it. + +- [ ] **1.3 Helper impl.** Add near the top of `pdf_ocr_apply.rs`: + + ```rust + fn extract_image_dimensions(jpeg_bytes: &[u8]) -> Option<(u32, u32)> { + use image::ImageReader; + ImageReader::new(std::io::Cursor::new(jpeg_bytes)) + .with_guessed_format().ok()? + .into_dimensions().ok() + } + ``` + + Returns `Option` so corrupt JPEG falls back to `(None, None)` (R-4 mitigation). + +- [ ] **1.4 Fill 6 emit points** at lines 149, 155, 181, 188, 259, 265 (probe-confirmed). Each currently hardcodes `image_width: None, image_height: None`. Pattern: + + ```rust + let (image_width, image_height) = extract_image_dimensions(&page_image_bytes) + .map(|(w, h)| (Some(w), Some(h))) + .unwrap_or((None, None)); + ``` + + Then pass `image_width` / `image_height` (and `image_byte_size: Some(page_image_bytes.len() as u64)`) into the `PdfOcrProgress::Finished` constructor. For skip / error paths where `page_image_bytes` isn't in scope, leave `None` — AC-1 only requires non-null on successful raster decode. + +- [ ] **1.5 Verify.** `cargo test -p kebab-app --lib pdf_ocr_apply::tests` + `cargo test -p kebab-app --test pdf_ocr_roundtrip`. + +- [ ] **1.6 Commit.** Use HEREDOC form so the body formats correctly: + + ``` + feat(app): capture image_width/height in PDF OCR raster decode (Enhancement 1) + + Add extract_image_dimensions(jpeg_bytes) helper using image::ImageReader + and fill the 6 PdfOcrProgress::Finished emit points in pdf_ocr_apply.rs. + Result: LogEvent::Ocr now carries non-null image_width/image_height on + successful raster decode, enabling future size-conditioned timeout tuning. + + Closure r1 F3: kebab-app/Cargo.toml image features += "jpeg" (explicit + rather than relying on feature unification via kebab-parse-image). 
+ + Co-Authored-By: Claude Opus 4.7 + ``` + +--- + +## Step 2 — V008 migration + SqliteStore record/prune (Enhancement 2 — schema half) + +**Implements:** AC-2, AC-8 prune logic. **Commit 2/5.** + +**Files:** Create `migrations/V008__pdf_ocr_events.sql`, `crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs`. Modify `crates/kebab-store-sqlite/src/store.rs`. + +- [ ] **2.1 V008 SQL** per spec §4.2: + + ```sql + CREATE TABLE pdf_ocr_events ( + id INTEGER PRIMARY KEY, + run_id TEXT NOT NULL, ts TEXT NOT NULL, + doc_id TEXT, doc_path TEXT NOT NULL, page INTEGER NOT NULL, + image_byte_size INTEGER, image_width INTEGER, image_height INTEGER, + ms INTEGER NOT NULL, chars INTEGER NOT NULL, + success INTEGER NOT NULL, reason TEXT, ocr_engine TEXT NOT NULL + ); + CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id); + CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id); + CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts); + ``` + +- [ ] **2.2 Failing smoke tests** in new `pdf_ocr_events_insert_smoke.rs`: + - `v008_pdf_ocr_events_table_exists` — open TempDir SqliteStore, run_migrations, `SELECT name FROM sqlite_master WHERE name='pdf_ocr_events'`. + - `record_and_prune_pdf_ocr_event` — insert 2 rows (`ts = 1970-…` and `ts = 2099-…`), `prune_pdf_ocr_events(0)` returns `1`, only the future-stamped row survives. + + Use existing per-test `Config` builder (probe `crates/kebab-store-sqlite/tests/*.rs` for prior smoke pattern; if absent, build `Config::default()` with `data_dir = tempdir.path()`).
+ +- [ ] **2.3 Implement two pub fn** on `impl SqliteStore` in `store.rs`, signatures per spec §4.2 line 186-228: + + ```rust + #[allow(clippy::too_many_arguments)] + pub fn record_pdf_ocr_event(&self, + run_id: &str, ts: &str, doc_id: Option<&str>, doc_path: &str, page: u32, + image_byte_size: Option, image_width: Option, image_height: Option, + ms: u64, chars: u32, success: bool, reason: Option<&str>, ocr_engine: &str, + ) -> anyhow::Result<()>; + + pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result; + ``` + + Body: F2 lock pattern `let conn = self.conn.lock().expect("sqlite lock poisoned");` followed by `INSERT INTO pdf_ocr_events (…) VALUES (…)` / `DELETE FROM pdf_ocr_events WHERE ts < ?` (cutoff from `time::OffsetDateTime::now_utc() - Duration::days(retention_days as i64)`, RFC 3339 format). `prune` returns the deleted row count (`u64`). + +- [ ] **2.4 Verify.** `cargo test -p kebab-store-sqlite --test pdf_ocr_events_insert_smoke`. + +- [ ] **2.5 Commit.** + + ``` + feat(store): V008 pdf_ocr_events migration + record/prune API (Enhancement 2) + + Add migrations/V008__pdf_ocr_events.sql with the events table + 3 + indices (doc_id, run_id, ts). SqliteStore gains two pub fn: + record_pdf_ocr_event (insert one OCR sample) and prune_pdf_ocr_events + (delete rows older than retention_days; returns the affected row + count). Both follow the existing Mutex lock pattern. + + Wiring into ingest path lands in the next commit. + + Closure r1 F2: explicit lock acquisition in both methods. + + Co-Authored-By: Claude Opus 4.7 + ``` + +--- + +## Step 3 — Dual-write integration (Enhancement 2 — wiring half) + +**Implements:** AC-3. Resolves F1 + G1. **Commit 3/5.** + +**Files:** Modify `crates/kebab-app/src/ingest_log.rs`, `crates/kebab-app/src/lib.rs`. + +- [ ] **3.1 Extend `LogEvent::Ocr`** at `ingest_log.rs:116` — add `doc_id: Option<&'a str>` as the second field (after `ts`, before `doc_path`). 
Additive Serde — round 1 ndjson logs deserialize with `doc_id = None` (Serde's default `Option` handling, no `#[serde(default)]` needed because round 1 readers don't exist outside the test suite). + +- [ ] **3.2 Failing integration test** appended to `pdf_ocr_events_insert_smoke.rs`: + - `ingest_dual_write_doc_id_matches_ndjson` — run scanned-PDF ingest via `App::open_with_config`, then for every emitted OCR event assert `pdf_ocr_events.doc_id` equals the ndjson `LogEvent::Ocr.doc_id`, and both equal `canonical.doc_id`. Reuse the `pdf_ocr_roundtrip` test scaffold for the scanned fixture. + +- [ ] **3.3 Pre-capture in `ingest_with_config_opts`.** Immediately after the existing `let doc_path_for_log = asset.workspace_path.0.clone();` at `lib.rs:1935`, add: + + ```rust + let doc_id_for_log: String = canonical.doc_id.0.clone(); + let store_for_ocr: Arc = Arc::clone(&app.sqlite); // G1 + let run_id_for_log: String = lw_for_ocr.as_ref() + .and_then(|lw| lw.lock().ok().map(|w| w.run_id().to_string())) + .unwrap_or_default(); + ``` + + `canonical` is bound at `lib.rs:1912` via `app.extract_for(...)` — in scope before the closure. All three captures are owned values / Arc → `Send + 'static` (rusqlite `Connection` is `Send` but not `Sync`; the store keeps it behind a `Mutex`, which is what makes the `Arc`-shared store `Send + Sync`). + +- [ ] **3.4 Wire into closure.** Inside the existing `PdfOcrProgress::Finished` arm at `lib.rs:1950-2000`, replace the current file-write block with: (a) bind `ts_for_event = ingest_log::now_ts()`, (b) pass `doc_id: Some(&doc_id_for_log)` into the existing `LogEvent::Ocr` construction, (c) immediately after the file write, call `store_for_ocr.record_pdf_ocr_event(&run_id_for_log, &ts_for_event, Some(&doc_id_for_log), &doc_path_for_log, page, image_byte_size, image_width, image_height, ms, chars, success, reason_str, engine.engine_name())` wrapped in `if let Err(e) = … { tracing::warn!(target: "kebab-app", "sqlite ocr event insert failed: {e}"); }`. File write first → SQLite second (R-1 ordering).
+ +- [ ] **3.5 Update existing tests** that consume `LogEvent::Ocr`: + - `ingest_log_smoke` (and any sibling smoke that round-trips ndjson) — assertions must accept the new `doc_id` field. Either ignore or assert `Some(_)`. + - `logging_roundtrip` integration test — same. + +- [ ] **3.6 Verify.** `cargo test -p kebab-app --test pdf_ocr_roundtrip ingest_log_smoke logging_roundtrip` + `cargo test -p kebab-store-sqlite --test pdf_ocr_events_insert_smoke ingest_dual_write_doc_id_matches_ndjson`. + +- [ ] **3.7 Commit.** + + ``` + feat(app): dual-write PDF OCR events to SQLite + ndjson (Enhancement 2 wiring) + + Pre-capture canonical.doc_id and Arc before the OCR + emit_progress closure so both the ndjson file and the SQLite mirror + carry the same doc_id for every event. File write is durable + (errors propagate); SQLite insert is non-critical (tracing::warn on + failure, ingest does not abort) per spec R-1. + + LogEvent::Ocr gains a doc_id: Option<&str> field as an additive + Serde change — round 1 ndjson logs deserialize with doc_id=None. + + Closure r1 F1: doc_id NULL in dual-write resolved via + let doc_id_for_log = canonical.doc_id.0.clone() pre-capture. + Closure r2 G1: Arc::clone(&app.sqlite) reused instead of opening a + second SqliteStore — eliminates double-open lock contention and + duplicate migration runs. + + Co-Authored-By: Claude Opus 4.7 + ``` + +--- + +## Step 4 — CLI inspect commands + wire schemas (Enhancement 3) + +**Implements:** AC-4, AC-5, AC-6, AC-11 (`ocr_inspect_smoke`), AC-13. Resolves G2 + G3 + F4. **Commit 4/5.** + +**Files:** Modify `crates/kebab-app/src/ingest_log.rs`, `crates/kebab-app/src/schema.rs`, `crates/kebab-app/src/app.rs`, `crates/kebab-cli/src/main.rs`, `integrations/claude-code/kebab/SKILL.md`. Create `docs/wire-schema/v1/ocr_stats.schema.json`, `docs/wire-schema/v1/ocr_failures.schema.json`, `crates/kebab-app/tests/ocr_inspect_smoke.rs`. 
+ +- [ ] **4.1 JSON Schema files.** Author the two files verbatim from spec §4.3 line 234-297. These are the external contract for downstream consumers (claude-code skill, MCP). + +- [ ] **4.2 `WIRE_SCHEMAS` runtime emit (G3).** In `crates/kebab-app/src/schema.rs:104`, append two entries to the `&[&str]` literal: `"ocr_stats.v1"`, `"ocr_failures.v1"`. **Do NOT edit `docs/wire-schema/v1/schema.schema.json`** — its `wire.schemas` is `pattern`-based. + +- [ ] **4.3 Percentile helper (F4).** Change `crates/kebab-app/src/ingest_log.rs:200` `pub(crate) fn percentiles` from returning `(Option, Option, Option)` (p50, p90, max) to `(Option, Option, Option, Option)` (p50, p90, p99, max). Implementation: sort in-memory, pick at index `((n-1) * q).round()` for `q ∈ {0.50, 0.90, 0.99}`, `*sorted.last()` for max. Update all callers inside `kebab-app` (1-2 sites) to destructure 4 elements. `IngestSummary` (round 1, 3-percentile) stays as-is — p99 surfaces only via `inspect ocr-stats`. + +- [ ] **4.4 Wire types** in `crates/kebab-app/src/app.rs` (or new sibling `inspect_ocr.rs` under the same module umbrella): + - `OcrStatsV1`: `schema_version: &'static str`, `total_events`, `total_runs`, `success_count`, `failure_count: u64`; `success_rate: f64`; `p50_ms`, `p90_ms`, `p99_ms`, `max_ms: Option`; `by_engine: BTreeMap`; `by_doc: Vec`. + - `OcrStatsByDoc`: `doc_id: String`, `failure_count`, `success_count: u64`, `p90_ms: Option`. + - `OcrFailuresV1`: `schema_version: &'static str`, `doc_id: Option`, `failure_count: u64`, `failures: Vec`. + - `OcrFailureRow`: `ts: String`, `page: u32`, `ms: u64`, `reason: String`, `image_byte_size: Option`. + + All `#[derive(serde::Serialize)]`. `schema_version` uses string literal `"ocr_stats.v1"` / `"ocr_failures.v1"` at construction. 
+ +- [ ] **4.5 Facade pair (G2)** on `impl App`: + + ```rust + pub fn inspect_ocr_stats(&self) -> Result { self.inspect_ocr_stats_with_config(&self.config) } + #[doc(hidden)] + pub fn inspect_ocr_stats_with_config(&self, _cfg: &Config) -> Result; + + pub fn inspect_ocr_failures(&self, doc_id: Option<&str>, limit: usize) -> Result { + self.inspect_ocr_failures_with_config(&self.config, doc_id, limit) + } + #[doc(hidden)] + pub fn inspect_ocr_failures_with_config(&self, _cfg: &Config, doc_id: Option<&str>, limit: usize) -> Result; + ``` + + Implementation reads from `self.sqlite` (the App's `Arc`): + + - `inspect_ocr_stats_with_config`: + 1. `SELECT COUNT(*), SUM(CASE WHEN success=1 …), SUM(CASE WHEN success=0 …), COUNT(DISTINCT run_id) FROM pdf_ocr_events` → 4 counters. + 2. `SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms` → `Vec` → `ingest_log::percentiles(&samples)` → 4-tuple. + 3. `SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine` → `BTreeMap`. + 4. `SELECT doc_id, SUM(success=0), SUM(success=1) FROM pdf_ocr_events WHERE doc_id IS NOT NULL GROUP BY doc_id ORDER BY 2 DESC LIMIT 10` → `Vec` with `p90_ms = None` (per-doc p90 deferred — see open question #3). + 5. `success_rate = success_count / total_events` (guard zero-division). + + - `inspect_ocr_failures_with_config`: `SELECT ts, page, ms, reason, image_byte_size FROM pdf_ocr_events WHERE success=0 [AND doc_id=?] ORDER BY ts DESC LIMIT ?`. Map to `OcrFailureRow`. `failure_count = failures.len()`. + + Connection access: use existing `pub(crate)` `Mutex` (either via an existing closure accessor on `SqliteStore` or by adding a `pub(crate) fn conn(&self) -> &Mutex` — probe `store.rs` for prior pattern and follow it). 
+ +- [ ] **4.6 CLI variants** in `crates/kebab-cli/src/main.rs:356` (extend `InspectWhat`): + + ```rust + OcrStats, + OcrFailures { + #[arg(long)] doc_id: Option, + #[arg(long, default_value_t = 10)] limit: usize, + }, + ``` + + Dispatch arms at `main.rs:671` after the existing `Doc / Chunk` arms — for both, construct `App::open_with_config(cfg.clone())?`, call the `*_with_config(&cfg, …)` form (G2 explicit cfg threading), then route through the existing `print_json_or_text` helper (or whatever the file's pattern is — probe before naming). + +- [ ] **4.7 Integration test** in new `crates/kebab-app/tests/ocr_inspect_smoke.rs`: + - `ocr_stats_after_scanned_pdf_ingest` — TempDir KB, ingest the existing scanned-PDF fixture, assert `stats.schema_version == "ocr_stats.v1"`, `stats.total_events >= 1`, `0.0 ≤ stats.success_rate ≤ 1.0`. + - `ocr_failures_filter_by_doc_id` — requires a fixture that produces at least one failure (probe `fixtures/pdf/` for a corrupt-page or timeout-inducing PDF; if none exists, mark the test `#[ignore]` with a `// TODO(v0.20.y): fixture` and rely on `record_and_prune_pdf_ocr_event` for the synthetic-row coverage). + - `skill_md_lists_new_schemas` — `read_to_string("integrations/claude-code/kebab/SKILL.md").contains("ocr_stats.v1")` (AC-13 mechanical). + +- [ ] **4.8 SKILL.md sync** — append `ocr_stats.v1` and `ocr_failures.v1` to the wire schema enumeration block (currently at SKILL.md lines 37-147 per probe), each with a one-line description. + +- [ ] **4.9 Verify.** `cargo test -p kebab-app --test ocr_inspect_smoke` + `cargo build -p kebab-cli` (compile-only confirms variant + dispatch wire-up). 
+ +- [ ] **4.10 Commit.** + + ``` + feat(cli): kebab inspect ocr-stats + ocr-failures (Enhancement 3 + wire schema additive minor) + + Two new wire schemas land as additive minor: ocr_stats.v1 (corpus-wide + aggregate — total_events, success_rate, p50/p90/p99/max_ms, by_engine, + top-10 by_doc by failure count) and ocr_failures.v1 (per-doc or + corpus-wide recent failures, with --doc-id + --limit). Both ship via + new CLI subcommands `kebab inspect ocr-stats` / `inspect ocr-failures`. + + App gains four facade methods: inspect_ocr_stats / + inspect_ocr_failures plus their *_with_config companions — required by + CLAUDE.md "the facade rule" so `--config ` is honored. The CLI + dispatch arms thread cfg explicitly into the _with_config form. + + Runtime introspection emit (WIRE_SCHEMAS in schema.rs) gains two + entries; the meta JSON Schema (schema.schema.json) is untouched + because its wire.schemas is pattern-based, not enum-based. + + ingest_log::percentiles extended to (p50, p90, p99, max). p99 surfaces + only via inspect ocr-stats; IngestSummary (round 1) stays 3-percentile. + + SKILL.md synced with the two new schemas (AC-13). + + Closure r2 G2 (facade *_with_config pair) + G3 (runtime emit, not + meta schema file) + closure r1 F4 (p99) resolved. + + Co-Authored-By: Claude Opus 4.7 + ``` + +--- + +## Step 5 — Log retention (Enhancement 4) + +**Implements:** AC-7, AC-8 wire-up, AC-9, AC-12. Resolves F5. **Commit 5/5.** + +**Files:** Modify `crates/kebab-config/src/lib.rs`, `crates/kebab-app/src/ingest_log.rs`, `crates/kebab-app/src/lib.rs`, `docs/SMOKE.md`. + +- [ ] **5.1 Failing backward-compat test** in `crates/kebab-config` tests: + - `old_logging_config_parses_with_defaults` — toml input with only `ingest_log_enabled` + `ingest_log_dir`, assert `cfg.logging.keep_recent_runs == 100`, `retention_days == 30`. 
+ +- [ ] **5.2 `LoggingCfg` extension** at `crates/kebab-config/src/lib.rs:438-462`: + + ```rust + pub struct LoggingCfg { + pub ingest_log_enabled: bool, + pub ingest_log_dir: PathBuf, + #[serde(default = "default_keep_recent_runs")] pub keep_recent_runs: u32, + #[serde(default = "default_retention_days")] pub retention_days: u32, + } + fn default_keep_recent_runs() -> u32 { 100 } + fn default_retention_days() -> u32 { 30 } + ``` + + Plus `impl Default for LoggingCfg` returning the same defaults. AC-12 automatic via `Config::default()` → `toml::to_string_pretty` at `crates/kebab-app/src/lib.rs:142, 178` (`kebab init` path). + +- [ ] **5.3 Failing cleanup tests** in `ingest_log::tests`: + - `cleanup_keeps_recent_n_drops_old` — 5 files, 3 fresh + 2 sixty-days-old; `cleanup_old_logs(dir, 3, 30)` → 3 freshest survive. + - `cleanup_drops_stale_even_within_count` — 2 files, both inside the `keep_recent=10` window but 90 days old; `cleanup_old_logs(dir, 10, 30)` → all dropped (F5 OR-on-stale). + + Use the `filetime` crate (`filetime::set_file_mtime`, or std `File::set_modified`) for backdated mtimes. + +- [ ] **5.4 Implement `cleanup_old_logs`** at module-level in `ingest_log.rs`: + + ```rust + pub(crate) fn cleanup_old_logs(log_dir: &Path, keep_recent: u32, retention_days: u32) -> Result<()>; + ``` + + Body: read_dir → filter `ingest-*.ndjson` → sort by `modified()` descending → walk with index; **delete iff `(idx >= keep_recent) OR (modified <= cutoff)`**; equivalently keep iff `(idx < keep_recent) AND (modified > cutoff)`. Inline that as a comment (F5 wording fix). + +- [ ] **5.5 Hook into `IngestLogWriter::open`.** Before creating the new ingest-*.ndjson file, call `cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days)`. On error, `tracing::warn!` and continue — cleanup is non-critical (R-6 mitigation).
+ +- [ ] **5.6 SQLite prune hook** in `crates/kebab-app/src/lib.rs::ingest_with_config_opts` (near the existing `IngestLogWriter::open` call site): + + ```rust + let _pruned = app.sqlite + .prune_pdf_ocr_events(app.config.logging.retention_days) + .unwrap_or_else(|e| { + tracing::warn!(target: "kebab-app", "pdf_ocr_events prune failed: {e}"); + 0 + }); + ``` + + One prune per ingest run — matches file-side cadence. + +- [ ] **5.7 Doc sync.** `docs/SMOKE.md` config example `[logging]` block gets the two new fields with their defaults. README / HANDOFF / ARCHITECTURE: skip — no user-visible surface beyond CLI inspect (covered in Step 4) and config keys (SMOKE-only). + +- [ ] **5.8 Verify.** `cargo test -p kebab-config -- logging` + `cargo test -p kebab-app --lib ingest_log::tests` + `cargo test -p kebab-store-sqlite --test pdf_ocr_events_insert_smoke record_and_prune`. + +- [ ] **5.9 Commit.** + + ``` + feat(app): log retention — keep_recent_runs + retention_days (Enhancement 4) + + LoggingCfg gains two fields with serde defaults: keep_recent_runs + (default 100, top-N file retention) and retention_days (default 30, + time-based retention for both ndjson files and the SQLite mirror). + + IngestLogWriter::open now runs cleanup_old_logs before creating a new + ingest-*.ndjson — delete iff (idx >= keep_recent) OR (modified <= + cutoff). ingest_with_config_opts also calls + SqliteStore::prune_pdf_ocr_events(retention_days) at ingest start so + the SQLite mirror tracks the same retention window. + + Backward compat (AC-9): both new fields use #[serde(default = ...)], + so a pre-v0.20.x config with only [logging] ingest_log_enabled + + ingest_log_dir parses unchanged. kebab init writes the new defaults + automatically via Config::default() → toml::to_string_pretty (AC-12). + + docs/SMOKE.md config example synced. + + Closure r1 F5: explicit OR-on-stale comment inside cleanup_old_logs. 
+ + Co-Authored-By: Claude Opus 4.7 + ``` + +--- + +## Step 6 — Final sanity (no commit) + +**Implements:** AC-10, AC-11 cumulative. + +- [ ] **6.1 Workspace test.** + + ```bash + export CARGO_TARGET_DIR=/build/out/cargo-target/target + cargo test --workspace --no-fail-fast -j 1 > /tmp/wstest.out 2>&1 + grep -E "^test result:" /tmp/wstest.out | awk ' + { for(i=1;i<=NF;i++){ + if($i=="passed;") pass+=$(i-1); + if($i=="failed;") fail+=$(i-1); + } } + END { print "passed:", pass, "failed:", fail } + ' + ``` + + Expected: `passed >= 1375`, `failed = 0`. + +- [ ] **6.2 Clippy.** `cargo clippy --workspace --all-targets -- -D warnings` → 0 warnings. + +- [ ] **6.3 Wire schema additive check.** `./target/release/kebab schema --json | jq '.wire.schemas[]' | grep -E "ocr_stats|ocr_failures"` returns both strings. + +- [ ] **6.4 Dogfood smoke (optional).** Per `docs/SMOKE.md` against TempDir KB: ingest a scanned PDF → `kebab inspect ocr-stats --json | jq .schema_version` returns `"ocr_stats.v1"` → `sqlite3 .../kebab.sqlite 'SELECT COUNT(*) FROM pdf_ocr_events'` non-zero. + +- [ ] **6.5 `cargo fmt --all`** + `git status` confirms all intended files are staged. No commit at Step 6. 
+ +--- + +## §3 Cumulative verifier checklist (13 rows) + +| AC | Description | Verifier | Step | +|----|---|---|---| +| AC-1 | image_w/h non-null on raster decode | `extract_image_dimensions_*` units + `pdf_ocr_roundtrip` (extended) | Step 1 | +| AC-2 | V008 table exists post-migration | `v008_pdf_ocr_events_table_exists` | Step 2 | +| AC-3 | ndjson ↔ SQLite row count + doc_id match | `ingest_dual_write_doc_id_matches_ndjson` | Step 3 | +| AC-4 | `inspect ocr-stats --json` schema_version | `ocr_stats_after_scanned_pdf_ingest` | Step 4 | +| AC-5 | `inspect ocr-failures --doc-id <DOC_ID>` returns rows | `ocr_failures_filter_by_doc_id` (F1 makes this passable) | Step 3 + Step 4 | +| AC-6 | `inspect ocr-failures --json` corpus-wide | sibling assertion in inspect smoke | Step 4 | +| AC-7 | keep_recent_runs=N → oldest deleted | `cleanup_keeps_recent_n_drops_old` | Step 5 | +| AC-8 | retention_days=0 → SQLite old rows deleted | `record_and_prune_pdf_ocr_event` | Step 2 + Step 5 hook | +| AC-9 | old config parses with defaults | `old_logging_config_parses_with_defaults` | Step 5 | +| AC-10 | workspace test + clippy green | Step 6 final sanity | Step 6 | +| AC-11 | new integration test binaries | `ocr_inspect_smoke` + `pdf_ocr_events_insert_smoke` | Step 3 + Step 4 | +| AC-12 | `kebab init` writes new defaults | auto via `Config::default()` → `toml::to_string_pretty` | Step 5 | +| AC-13 | SKILL.md lists new schemas | `skill_md_lists_new_schemas` | Step 4 | + +--- + +## §4 Risk → resolution map + +| Risk / finding | Resolution location | Plan step | +|--|--|--| +| R-1 dual-write race | file first, SQLite warn-only | Step 3.4 | +| R-2 V008 rollback | spec §6 R-2 documents manual SQL | (no plan action) | +| R-3 concurrent cleanup | cleanup only at `IngestLogWriter::open` | Step 5.4 | +| R-4 corrupt JPEG decode | `Option` fallback | Step 1.3 | +| R-5 old consumer reads new schema | `schema_version` skip | (schema design) | +| R-6 concurrent cleanup vs tail | cleanup only at ingest
start, top-N recent safe | Step 5.4 | +| R-7 doc_id NULL (F1) | pre-capture `canonical.doc_id.0.clone()` | Step 3.3 | +| G1 SqliteStore::open + double-open | `Arc::clone(&app.sqlite)` | Step 3.3 | +| G2 facade rule violation | `*_with_config` companion pair | Step 4.5 + 4.6 | +| G3 schema.schema.json wording | extend `WIRE_SCHEMAS` const | Step 4.2 | + +--- + +## §5 Open questions for executor + +LOW priority — judgment calls, none block the spec contract. + +1. **`SqliteStore` connection accessor.** Step 4.5 reads from `self.sqlite`'s inner `Mutex`. Probe `store.rs` for an existing `with_conn(|c| …)` closure-form accessor and prefer it (tight lock scope). If none exists, add a minimal `pub(crate) fn conn(&self) -> &Mutex` — visibility unchanged. +2. **p99 in `IngestSummary`.** `percentiles` now returns 4-tuple. Round 1 `IngestSummary` carries p50/p90/max only. Adding `ocr_p99_ms` is out of scope (no AC); leave it surfaced only via `inspect ocr-stats`. Executor may fold it in if the diff is one line. +3. **`OcrStatsByDoc.p90_ms = None`.** Per-doc p90 deferred — wire schema allows `["integer", "null"]`. If a clean O(n)-per-doc sort fits, executor may implement; otherwise leave `None`. +4. **JPEG fixture.** Step 1.2 commits a 16x12 JPEG. If `fixtures/` already carries a comparable small JPEG, reuse + adjust assertions. +5. **`ingest_log_smoke` adjustment.** Step 3.5 likely needs assertion tweaks for the new `doc_id` field — light touch (`Option` field omitted when None). +6. **Failure-producing PDF fixture for AC-5.** If no fixture reliably produces an OCR failure, mark `ocr_failures_filter_by_doc_id` as `#[ignore]` with a TODO; the synthetic-row coverage in `record_and_prune_pdf_ocr_event` still exercises the WHERE-doc_id path. + +--- + +## §6 References + +- **Spec contract:** `docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md` (751 line, ACCEPT, frozen). 
+- **Critic r1:** `.omc/reviews/2026-05-28-v0.20.x-logging-r2-spec-closure-result.md` (6 finding, applied in r1c). +- **Critic r2:** `.omc/reviews/2026-05-28-v0.20.x-logging-r2-spec-closure-r2-result.md` (3 G-finding, plan-level resolution above). +- **Parent design:** `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (§8 dep graph, §9 version cascade). +- **Round 1 spec:** `docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md` (frozen). +- **PDF OCR spec:** `docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md` (frozen). +- **Branch:** `feat/ingest-log-round2-enhancements` (HEAD `89d334a`). +- **HOTFIXES:** `tasks/HOTFIXES.md` — any post-merge deviations land here. +- **CLAUDE.md sections:** §The facade rule (G2), §Versioning cascade (additive minor → no release trigger), §Naming + paths. + +--- + +## §7 Constraints + +1. **No branch change.** All 5 commits land on `feat/ingest-log-round2-enhancements`. +2. **Spec frozen.** No edits to the ACCEPT spec from this plan. Deviations → `tasks/HOTFIXES.md`. +3. **Wire schema additive minor.** Two new `*.v1` schemas + two new `WIRE_SCHEMAS` entries. No major bump. +4. **Regression budget 0.** Baseline 1370 workspace tests stay green; round 2 adds ≥5 new. +5. **Worker protocol — subagent skip.** Executor runs in a single subagent-driven-development pass without spawning nested workers. +6. **Length budget.** 500-700 line plan (this file ≈ 600). +7. **Build path.** `export CARGO_TARGET_DIR=/build/out/cargo-target/target`; `-j 4` default, `-j 1` for the workspace sanity pass. +8. **Commit cadence.** 5 commits (Steps 1-5). Step 6 verify-only. +9. **Doc sync.** README untouched (no surface beyond `inspect` subcommands — append to README's 명령 table only if it already lists `inspect`); HANDOFF gets one line post-merge if load-bearing; ARCHITECTURE untouched; SMOKE config example updated in Step 5.7. +10. **Release trigger.** Wire additive minor → NOT a release trigger per CLAUDE.md §Versioning cascade. 
Defer bump to a dogfood-driven `chore: bump 0.20.x → 0.20.y` later. diff --git a/docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md b/docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md new file mode 100644 index 0000000..1dd69ba --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-v0.20.x-logging-r2-spec.md @@ -0,0 +1,751 @@ +--- +title: v0.20.x ingest log round 2 — 4 enhancement spec +created: 2026-05-28 +status: DRAFT round 0 +parent_spec: 2026-04-27-kebab-final-form-design.md +target_version: v0.20.x +branch: feat/ingest-log-round2-enhancements +--- + +# v0.20.x ingest log round 2 + +## §1 Motivation + +### §1.1 Sweet-spot analysis — progressive dogfood tuning + +v0.20.0 sub-item 1 의 round 1 ingest log (PR #189 merged) 는 per-run ndjson 의 file-only logging 으로 배포. 3개월 실사용(dogfood) 중 OCR engine 의 timeout / performance sweet-spot 을 점진적으로 조정할 필요. + +- **현재 상태**: 각 ingest run 의 OCR 샘플(ms, success/fail) 이 ndjson file 에만 기록 → historical aggregate query 불가 (per-run 단위). +- **요구**: 누적 데이터베이스에서 p90 / p99 / 극값 을 조회 → timeout default 축소(e.g. 300s → 180s) 결정 가능. +- **해결**: SQLite `pdf_ocr_events` mirror table — v0.20.x round 2 주요 enhancement. + +### §1.2 Image dimension 결함 — null emit 문제 + +`crates/kebab-app/src/pdf_ocr_apply.rs` 의 6개 emit point (line 155, 188, 265 등) 에서 `image_width: None, image_height: None` hardcode. +- **현재**: raster JPEG 가 memory 에 있으나 dimension 측정 미수행. +- **영향**: wire schema 의 optional field 이지만, 실제 use case (e.g. "100MB+ 이미지만 timeout 조정") 를 위해 필수 데이터. +- **fix**: raster decode via `image` crate (transitive dep via `kebab-parse-image`). + +### §1.3 CLI inspect subcommands — 운영 visibility + +ndjson log 는 human-readable 이지만, 스크립트/automation 용 corpus-wide 통계가 부족. +- `kebab inspect ocr-stats --json` — 전체 OCR 성공률, p50/p90/p99 latency, engine 별 breakdown. +- `kebab inspect ocr-failures --doc-id --json` — 특정 doc 의 failure history. +- `kebab inspect ocr-failures --json` — 최근 failure 나열 (corpus-wide). 
+ +### §1.4 Log retention — 무한 증가 방지 + +ingest 가 반복되면서 `~/.local/state/kebab/logs/ingest-*.ndjson` file 누적. +- **현재**: 수동 정리 필수. +- **요구**: 자동 cleanup — `keep_recent_runs` (e.g. 100) + `retention_days` (e.g. 30). +- **적용**: SQLite `pdf_ocr_events` 도 동일한 retention 정책 적용. + +--- + +## §2 Scope + non-scope + +### §2.1 Included + +**Enhancement 1: image_width + image_height capture (trivial)** +- raster JPEG dimension decode in `pdf_ocr_apply.rs` 6 emit point. +- image crate import (already transitive). +- Some(u32) 로 fill. + +**Enhancement 2: SQLite mirror — pdf_ocr_events table (medium)** +- V008 migration: `pdf_ocr_events` table 신규 (run_id, ts, doc_id, page, image_byte_size, image_width, image_height, ms, chars, success, reason, ocr_engine). +- index: run_id, doc_id, ts. +- insert path: `IngestLogWriter` 에서 file ndjson + SQLite 동시 write (dual-write). +- SqliteStore::record_pdf_ocr_event(…) API. + +**Enhancement 3: CLI inspect commands (medium)** +- `kebab inspect ocr-stats --json` — corpus-wide aggregate (total_events, success_count, failure_count, p50/p90/p99_ms, by_engine, top 10 docs by failure). +- `kebab inspect ocr-failures --doc-id --json` — single doc failure list. +- `kebab inspect ocr-failures --json` (no doc-id) — corpus-wide recent failures (--limit configurable). +- wire schemas: `ocr_stats.v1` + `ocr_failures.v1` (additive minor to schema.v1). + +**Enhancement 4: log retention + rotation (low)** +- `[logging] keep_recent_runs: u32` (default 100) + `retention_days: u32` (default 30). +- file cleanup: IngestLogWriter::open 시 prune helper 호출. +- SQLite cleanup: SqliteStore::prune_pdf_ocr_events(retention_days). +- backward compat: old config (no `[logging]` fields) parses with default. + +### §2.2 Out of scope + +- wire schema public API (pdf_ocr_events 는 internal SQLite table, wire expose 안 함). +- `ask` 명령의 한국어 phrasing-sensitive refusal (이번 round 범위 외). +- migration rollback automation (standard CLAUDE.md protocol follow). 
+- concurrent ingest lock manager (현재 single-process ingest 가정, future spec). + +--- + +## §3 Design decisions + +### §3.1 image_width / image_height — raster decode path + +**선택**: raster JPEG 를 ImageReader 로 decode → (width, height) 추출. +- **이유**: OCR 호출 시 bytes 가 이미 memory (extract_dctdecode_page_image), decode latency << OCR latency (negligible <1ms). +- **대안 거절**: PDF MediaBox 사용 → actual raster 와 page size 다를 수 있음 (less accurate). +- **구현**: `image` crate 의 `ImageReader::new(Cursor::new(&bytes)).with_guessed_format()?.into_dimensions()?`. +- **error handling**: decode fail → (None, None) fallback. OCR 결과는 여전히 valid. + +### §3.2 SQLite mirror — V008 migration + dual-write + +**선택**: v0.20.0 round 1 의 file-only ndjson 을 보완하는 SQLite mirror (non-breaking). +- **이유**: historical query 를 위해 structured storage 필수. file 만으로는 corpus-wide aggregate 불가. +- **doc_id wiring**: LogEvent::Ocr 의 `doc_id` field 는 closure scope 에서 미리 capture 되어야 함. apply_ocr_to_pdf_pages 호출 전에 canonical.doc_id 를 local var 로 binding 후, closure 내에서 동일한 doc_id 로 file ndjson + SQLite insert 수행. 이를 통해 dual-write 의 일관성 보장. +- **dual-write 구조**: + 1. `IngestLogWriter::write_event(&LogEvent::Ocr)` 시 file ndjson + SQLite insert. + 2. insert 는 `Arc` clone 을 emit_progress closure 가 직접 호출. + 3. transaction safety: file write first (failures → log), then SQLite (non-critical). +- **non-breaking**: old config 가 없어도 logging 정상 작동 (file only). SQLite 는 upgrade 시 자동 생성. + +### §3.3 CLI inspect commands — ocr-stats + ocr-failures + +**wire schema**: 기존 `schema.v1` 의 `wire.schemas` list 에 `ocr_stats.v1` + `ocr_failures.v1` additive minor 추가. +- **이유**: 새 wire shape 은 public API 가 아님 (inspect command 만 emit). wire.v1 의 확장으로 additive. +- **구현**: `kebab-cli/src/main.rs` 의 `Subcommand::Inspect` 에 `InspectCommand::OcrStats / OcrFailures` arm 추가. + +### §3.4 retention — keep_recent_runs + retention_days + +**선택**: 두 조건 모두 충족 시만 보존 (OR-on-stale = AND-on-fresh semantics). 
+- **이유**: + - `keep_recent_runs=100` — deterministic "최근 N 개 run 보존". + - `retention_days=30` — time-based cleanup (dogfood 중단 후 obsolete log 자동 삭제). + - **Delete if** (idx >= keep_recent) **OR** (modified <= cutoff) — 둘 중 하나라도 stale 시 삭제. 동등: **Keep iff** (idx < keep_recent) **AND** (modified > cutoff) — 둘 다 fresh 일 때만. +- **구현**: `IngestLogWriter::open()` 시 cleanup helper, `SqliteStore::prune_pdf_ocr_events(retention_days)` 별도 routine. + +--- + +## §4 Implementation specification + +### §4.1 image_width / image_height decode helper + +**파일**: `crates/kebab-app/src/pdf_ocr_apply.rs` + +**변경**: +1. `crates/kebab-app/Cargo.toml` dependency update: + ```diff + -image = { version = "0.25", default-features = false, features = ["png"] } + +image = { version = "0.25", default-features = false, features = ["png", "jpeg"] } + ``` +2. import `use image::io::Reader as ImageReader;` (transitive via kebab-parse-image). +3. 새 helper function: + ```rust + fn extract_image_dimensions(jpeg_bytes: &[u8]) -> Option<(u32, u32)> { + let reader = ImageReader::new(std::io::Cursor::new(jpeg_bytes)) + .with_guessed_format() + .ok()?; + reader.into_dimensions().ok() + } + ``` +4. 6 emit point (line 155, 188, 265 및 PdfOcrProgress::Finished 의 다른 3곳): + ```rust + let (w, h) = extract_image_dimensions(&page_image_bytes).map(|(w, h)| (Some(w), Some(h))) + .unwrap_or((None, None)); + emit_progress(PdfOcrProgress::Finished { + image_width: w, + image_height: h, + ... + }); + ``` + +**test**: 기존 `pdf_ocr_roundtrip` + 새 `pdf_ocr_image_dimensions` integration test. 
+ +### §4.2 V008 migration SQL + SqliteStore API + +**파일**: `migrations/V008__pdf_ocr_events.sql` (신규) + +```sql +CREATE TABLE pdf_ocr_events ( + id INTEGER PRIMARY KEY, + run_id TEXT NOT NULL, + ts TEXT NOT NULL, -- ISO 8601 UTC (RFC 3339) + doc_id TEXT, -- nullable (file detect skip) + doc_path TEXT NOT NULL, + page INTEGER NOT NULL, + image_byte_size INTEGER, + image_width INTEGER, + image_height INTEGER, + ms INTEGER NOT NULL, + chars INTEGER NOT NULL, + success INTEGER NOT NULL, -- 0 = fail, 1 = success + reason TEXT, -- "timeout" / "ocr_error" / NULL + ocr_engine TEXT NOT NULL +); +CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id); +CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id); +CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts); +``` + +**파일**: `crates/kebab-store-sqlite/src/lib.rs` (SqliteStore 확장) + +**신규 method** (Mutex lock 명시): +```rust +impl SqliteStore { + pub fn record_pdf_ocr_event( + &self, + run_id: &str, + ts: &str, + doc_id: Option<&str>, + doc_path: &str, + page: u32, + image_byte_size: Option<u64>, + image_width: Option<u32>, + image_height: Option<u32>, + ms: u64, + chars: u32, + success: bool, + reason: Option<&str>, + ocr_engine: &str, + ) -> anyhow::Result<()> { + let conn = self.conn.lock().expect("sqlite lock poisoned"); + conn.execute( + "INSERT INTO pdf_ocr_events + (run_id, ts, doc_id, doc_path, page, image_byte_size, image_width, image_height, ms, chars, success, reason, ocr_engine) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + rusqlite::params![ + run_id, ts, doc_id, doc_path, page, + image_byte_size, image_width, image_height, ms, chars, + if success { 1 } else { 0 }, reason, ocr_engine + ] + )?; + Ok(()) + } + + pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> { + let conn = self.conn.lock().expect("sqlite lock poisoned"); + let cutoff_ts = time::OffsetDateTime::now_utc() + .checked_sub(time::Duration::days(retention_days as i64)) + .map(|dt| 
dt.format(&time::format_description::well_known::Rfc3339).ok()) + .flatten() + .unwrap_or_default(); + let n = conn.execute( + "DELETE FROM pdf_ocr_events WHERE ts < ?", + rusqlite::params![cutoff_ts], + )?; + Ok(n as u64) + } +} +``` + +### §4.3 wire schema — ocr_stats.v1 + ocr_failures.v1 + +**파일**: `docs/wire-schema/v1/ocr_stats.schema.json` (신규) + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ocr_stats.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_stats.v1" }, + "total_events": { "type": "integer" }, + "total_runs": { "type": "integer" }, + "success_count": { "type": "integer" }, + "failure_count": { "type": "integer" }, + "success_rate": { "type": "number" }, + "p50_ms": { "type": "integer" }, + "p90_ms": { "type": "integer" }, + "p99_ms": { "type": "integer" }, + "max_ms": { "type": "integer" }, + "by_engine": { "type": "object", "additionalProperties": { "type": "integer" } }, + "by_doc": { + "type": "array", + "items": { + "type": "object", + "properties": { + "doc_id": { "type": "string" }, + "failure_count": { "type": "integer" }, + "success_count": { "type": "integer" }, + "p90_ms": { "type": ["integer", "null"] } + } + } + } + }, + "required": ["schema_version", "total_events", "total_runs", "success_count", "failure_count", "success_rate"] +} +``` + +**파일**: `docs/wire-schema/v1/ocr_failures.schema.json` (신규) + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ocr_failures.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_failures.v1" }, + "doc_id": { "type": ["string", "null"] }, + "failure_count": { "type": "integer" }, + "failures": { + "type": "array", + "items": { + "type": "object", + "properties": { + "ts": { "type": "string" }, + "page": { "type": "integer" }, + "ms": { "type": "integer" }, + "reason": { "type": "string" }, + "image_byte_size": { "type": ["integer", "null"] } + } + } + } + }, + "required": 
["schema_version", "failure_count", "failures"] +} +``` + +**파일**: `docs/wire-schema/v1/schema.schema.json` 갱신 + +`schema.v1.wire.schemas` list 에 `ocr_stats.v1` + `ocr_failures.v1` 추가 (additive). + +### §4.4 CLI inspect ocr-stats + ocr-failures + +**파일**: `crates/kebab-cli/src/main.rs` + +**신규 subcommand**: +```rust +#[derive(Subcommand)] +pub enum Subcommand { + Inspect(InspectCommand), + // ... +} + +#[derive(clap::Subcommand)] +pub enum InspectCommand { + OcrStats { + #[arg(long, default_value = "false")] + json: bool, + }, + OcrFailures { + #[arg(long)] + doc_id: Option<String>, + #[arg(long, default_value = "10")] + limit: usize, + #[arg(long, default_value = "false")] + json: bool, + }, +} +``` + +**파일**: `crates/kebab-app/src/lib.rs` 확장 + +```rust +impl App { + pub fn inspect_ocr_stats(&self) -> anyhow::Result<OcrStatsV1> { + // SELECT 쿼리: pdf_ocr_events 에서 aggregate. + // 1. total_events, success_count, failure_count, success_rate 계산. + // 2. percentile via in-memory sort: SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms. + // Vec<u64> 로 fetch 후 idx 계산 (p50 = idx 50%, p90 = idx 90%, p99 = idx 99%). + // 3. by_engine groupby (engine 별 success count). + // 4. by_doc top 10 (failure_count DESC). + } + + pub fn inspect_ocr_failures( + &self, + doc_id: Option<&str>, + limit: usize, + ) -> anyhow::Result<OcrFailuresV1> { + // SELECT failure records WHERE success=0. + // doc_id 있으면 WHERE doc_id=?; 없으면 ORDER BY ts DESC LIMIT limit. + } +} +``` + +### §4.5 retention cleanup helper — file + SQLite + +**파일**: `crates/kebab-app/src/ingest_log.rs` 확장 + +```rust +impl IngestLogWriter { + pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> { + if !cfg.ingest_log_enabled { + return Ok(None); + } + let run_id = generate_run_id(); + let log_dir = expand_log_dir(&cfg.ingest_log_dir); + std::fs::create_dir_all(&log_dir)?; + + // Cleanup file logs (before creating new log). 
+ if let Err(e) = Self::cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) { + tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}"); + // non-critical — continue without failing ingest. + } + + let path = log_dir.join(format!("ingest-{run_id}.ndjson")); + let file = BufWriter::new(File::create(&path)?); + Ok(Some(Self { file, path, run_id, started_at: SystemTime::now() })) + } + + fn cleanup_old_logs(log_dir: &Path, keep_recent: u32, retention_days: u32) -> anyhow::Result<()> { + let mut entries: Vec<_> = std::fs::read_dir(log_dir)? + .filter_map(|e| e.ok()) + .filter(|e| e.path().file_name() + .and_then(|n| n.to_str()) + .map(|s| s.starts_with("ingest-") && s.ends_with(".ndjson")) + .unwrap_or(false)) + .collect(); + + // Sort by modified time descending (newest first). + entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok()))); + + let cutoff_time = SystemTime::now() - std::time::Duration::from_secs(retention_days as u64 * 86400); + + for (idx, entry) in entries.into_iter().enumerate() { + let path = entry.path(); + let metadata = entry.metadata()?; + let modified = metadata.modified()?; + + // Delete if (idx >= keep_recent) OR (modified <= cutoff). + // Equivalent: keep iff (idx < keep_recent) AND (modified > cutoff) — both fresh. + // Per §3.4 OR-on-stale semantics. 
+ if idx < keep_recent as usize && modified > cutoff_time { + continue; + } + + std::fs::remove_file(&path) + .map_err(|e| anyhow::anyhow!("failed to remove {}: {}", path.display(), e))?; + } + + Ok(()) + } +} +``` + +### §4.6 Config extension — LoggingCfg + +**파일**: `crates/kebab-config/src/lib.rs` (LoggingCfg 확장) + +```rust +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct LoggingCfg { + pub ingest_log_enabled: bool, + pub ingest_log_dir: PathBuf, + #[serde(default = "default_keep_recent_runs")] + pub keep_recent_runs: u32, + #[serde(default = "default_retention_days")] + pub retention_days: u32, +} + +fn default_keep_recent_runs() -> u32 { 100 } +fn default_retention_days() -> u32 { 30 } + +impl Default for LoggingCfg { + fn default() -> Self { + Self { + ingest_log_enabled: true, + ingest_log_dir: PathBuf::from("{state_dir}/logs"), + keep_recent_runs: 100, + retention_days: 30, + } + } +} +``` + +**파일**: `docs/SMOKE.md` — config example 갱신 + +```toml +[logging] +ingest_log_enabled = true +ingest_log_dir = "{state_dir}/logs" +keep_recent_runs = 100 +retention_days = 30 +``` + +### §4.7 IngestLogWriter dual-write integration — canonical.doc_id closure capture (F1) + +**파일**: `crates/kebab-app/src/ingest_log.rs` — write_event 확장 + +```rust +impl IngestLogWriter { + pub fn write_event_with_db( + &mut self, + event: &LogEvent<'_>, + store: Option<&SqliteStore>, + ) -> anyhow::Result<()> { + // Write to file. + serde_json::to_writer(&mut self.file, event)?; + writeln!(self.file)?; + + // Write to SQLite if store provided and event is Ocr. 
+ if let (Some(store), LogEvent::Ocr { + ts, doc_id, doc_path, page, image_byte_size, image_width, image_height, + ms, chars, success, reason, ocr_engine, + }) = (store, event) { + let _ = store.record_pdf_ocr_event( + self.run_id(), + ts, + doc_id.as_deref(), // doc_id must be captured in closure scope (see below) + doc_path, + *page, + *image_byte_size, + *image_width, + *image_height, + *ms, + *chars, + *success, + *reason, + ocr_engine, + ).map_err(|e| { + // Non-critical — log warning but don't fail ingest. + tracing::warn!(target: "kebab-app", "sqlite ocr event insert failed: {e}"); + }); + } + + Ok(()) + } +} +``` + +**caller 예시** (kebab-app/src/ingest_one_pdf_asset 또는 apply_ocr_to_pdf_pages 내 emit_progress closure): + +```rust +// Pre-capture canonical.doc_id before apply_ocr_to_pdf_pages closure: +let doc_id_for_log: String = canonical.doc_id.0.clone(); +let doc_path_for_log = asset.workspace_path.0.clone(); + +let store = Arc::new(SqliteStore::open(&cfg.storage.sqlite)?); +let mut log_writer = IngestLogWriter::open(&cfg.logging)?; + +let emit_progress = move |progress: PdfOcrProgress| { + if let Some(writer) = &mut log_writer { + let event = LogEvent::Ocr { + ts: now_ts(), + doc_id: Some(doc_id_for_log.clone()), // ← captured in closure scope + doc_path: doc_path_for_log.clone(), + page: page_n, + // ... other fields ... + }; + let _ = writer.write_event_with_db(&event, Some(&store)); + } +}; +``` + +--- + +## §5 Acceptance criteria + +**AC-1**: image_width + image_height non-null after PDF OCR. +- Integration test: scanned PDF 로 `ingest_one_pdf_asset` → IngestReport check `pdf_ocr_summary.pages_ocrd > 0`, log file 의 `image_width: Some(_)`, `image_height: Some(_)` verify. + +**AC-2**: V008 migration successful + `pdf_ocr_events` table exists. +- test: fresh DB 생성 → migration apply → `SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_ocr_events';` verify. + +**AC-3**: ingest 시 SQLite row 가 ndjson file 의 OCR record 와 1:1 일치. 
+- Integration test: ingest 후 `SELECT COUNT(*) FROM pdf_ocr_events WHERE success=1` = ndjson 의 `success=true` OCR line count. + +**AC-4**: `kebab inspect ocr-stats --json` 정상 emit + `ocr_stats.v1` schema_version. +- CLI test: `kebab inspect ocr-stats --json | jq '.schema_version'` = `"ocr_stats.v1"`, `total_events`, `success_rate` present. + +**AC-5**: `kebab inspect ocr-failures --doc-id <DOC_ID> --json` 정상 emit + `ocr_failures.v1`. +- CLI test: failure 가 있는 doc_id 로 조회 → `failures[]` array non-empty, `reason` field present. + +**AC-6**: `kebab inspect ocr-failures --json` (no doc-id) corpus-wide. +- CLI test: `--limit 5` 로 최근 5개 failure 반환, `failure_count >= 5`. + +**AC-7**: log retention — keep_recent_runs=2 시 3rd ingest 후 oldest file deleted. +- Integration test: temp log dir, 3 ingest runs with `keep_recent_runs=2` → only the newest 2 files remain. + +**AC-8**: SQLite retention — retention_days=0 시 old row deleted. +- test: insert old row (ts = 90 days ago) → `prune_pdf_ocr_events(0)` → row deleted. + +**AC-9**: backward compat — old config (no `[logging] retention_*` field) parses with default. +- test: pre-v0.20.x config (no `[logging]` section) → load → `logging.keep_recent_runs == 100` (default). + +**AC-10**: workspace test + clippy green. +- `cargo test --workspace -j 1`, `cargo clippy --all-targets`. + +**AC-11**: integration test (`ocr_inspect_smoke` + `pdf_ocr_events_insert_smoke`). +- new test binary: scanned PDF ingest → `kebab inspect ocr-stats / ocr-failures` 검증. +- `crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs`. + +**AC-12**: `[logging] retention_*` default emit in `kebab init` config. +- test: `kebab init --config /tmp/test-cfg.toml` → `[logging] keep_recent_runs = 100` + `retention_days = 30` present. + +**AC-13**: wire schema additive list sync in `integrations/claude-code/kebab/SKILL.md`. +- test: `grep -c 'ocr_stats\.v1' integrations/claude-code/kebab/SKILL.md` returns ≥1, same for `ocr_failures.v1`. 
+ +--- + +## §6 Risks + open questions + +### R-1 dual-write transaction safety (file vs SQLite race) + +**Issue**: emit_progress closure 가 file write 후 SQLite insert 실패 시, ndjson 과 DB 불일치. +**Mitigation**: +- file write first (durable, may fail). +- SQLite write second (non-critical, warn on fail, don't propagate error). +- per-run 단위 reconciliation tool (future enhancement, not in scope). + +### R-2 V008 migration rollback (F6) + +**Issue**: user 가 v0.20.x → older version downgrade 시 V008 rolled back? +**Mitigation**: CLAUDE.md migration policy follow. V008 은 additive table → old version 이 table ignore 하면 작동 OK. +**Manual rollback** (v0.19.x ↔ v0.20.x alternating dogfood): +```sql +DELETE FROM refinery_schema_history WHERE version=8; +DROP TABLE IF EXISTS pdf_ocr_events; +``` +**Out-of-scope**: v0.19.x ↔ v0.20.x alternate run 의 자동 rollback path 미제공. + +### R-3 prune helper 가 concurrent ingest 시 stale lock + +**Issue**: cleanup_old_logs 가 file 삭제 중인데 다른 process 가 write? +**Mitigation**: +- cleanup 은 IngestLogWriter::open 시만 (ingest 시작 전). +- per-process single ingest 가정 (현재 design). +- concurrent ingest support 는 future phase. + +### R-4 image decode failure handling (corrupt JPEG fallback) + +**Issue**: JPEG 가 corrupt → extract_image_dimensions 실패. +**Mitigation**: helper returns Option<(u32, u32)> → (None, None) fallback. OCR 완료도 유효. warning event push (optional, future enhancement). + +### R-5 wire schema additive minor — old consumer 의 schema 미인식 + +**Issue**: old `kebab` binary (v0.19.x) 가 v0.20.x `kebab inspect ocr-stats` 의 output consume? +**Mitigation**: +- `schema_version` = `ocr_stats.v1` explicit (old consumer 는 schema 미인식 → skip OK). +- wire.v1 의 additive list → backward compat (old consumer 는 list 만 ignores). +- new consumer 만 `ocr_stats.v1` / `ocr_failures.v1` 인식. + +### R-6 Concurrent cleanup 에 의한 log file loss + +**Issue**: keep_recent_runs / retention_days 정책 으로 파일 삭제 중 user 가 tail 시도? +**Mitigation**: cleanup 은 ingest start 때만. 
user 가 tail 하는 중인 파일은 일반적으로 recent (top N 내) → 안전. + +### R-7 doc_id NULL wiring in LogEvent::Ocr closure (F1) + +**Issue**: emit_progress closure 에서 doc_id 가 None 또는 mismatch 시, file ndjson 과 SQLite record 의 doc_id 가 불일치. +**Verification** (spec §6 R-7 add): +```bash +# canonical.doc_id 가 set 되는 시점 확인 +grep -n "canonical.doc_id\|\.doc_id\s*=" crates/kebab-app/src/lib.rs | head -10 +``` +**Mitigation**: +- closure scope 에서 doc_id 를 pre-capture (let doc_id_for_log: String = canonical.doc_id.0.clone()). +- LogEvent::Ocr 생성 시 captured value 사용. +- per-run integration test 에서 file ndjson 의 doc_id 와 SQLite SELECT 의 doc_id match verify. + +--- + +## §7 References + +- **Parent spec**: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (design contract). +- **Round 1 spec**: v0.20.0 sub-item 1 ingest log (PR #189). +- **Code ranges**: + - `crates/kebab-app/src/pdf_ocr_apply.rs` lines 155, 188, 265 (6 emit point). + - `crates/kebab-app/src/ingest_log.rs` (LogEvent, IngestLogWriter). + - `crates/kebab-config/src/lib.rs` (Config, LoggingCfg). + - `crates/kebab-store-sqlite/src/lib.rs` (SqliteStore). + - `crates/kebab-cli/src/main.rs` (Subcommand). +- **Dependencies**: + - `image` crate (transitive via kebab-parse-image). + - `time` crate (RFC 3339 timestamp, already in workspace). + - `rusqlite` (already in kebab-store-sqlite). +- **Config sections**: + - `[logging]`: ingest_log_enabled, ingest_log_dir, keep_recent_runs, retention_days. + - `[pdf.ocr]`: (unchanged from v0.20.0). +- **Wire schemas**: + - `docs/wire-schema/v1/ocr_stats.schema.json` (신규). + - `docs/wire-schema/v1/ocr_failures.schema.json` (신규). + - `docs/wire-schema/v1/schema.schema.json` (additive: wire.schemas list 에 두 schema 추가). +- **HOTFIXES contract**: 새로운 deviations 는 `tasks/HOTFIXES.md` 에 dated entry + cross-link to this spec. +- **Version cascade**: image_width/height, SQLite table schema 추가는 index_version cascade 아님 (chunks/embeddings 미영향). 
+- **Backward compat**: old config parses with `[logging]` defaults, wire schema additive minor. + +--- + +## §8 Dependencies + imports + +### Allowed dependencies + +- `image` crate (for ImageReader::new, into_dimensions). +- `time` crate (RFC 3339 formatting, already in workspace). +- `rusqlite` (for SQL execute / query, already in kebab-store-sqlite). +- `serde_json` (for wire schema export, already in kebab-app). + +### Forbidden dependencies + +- **None new introduced.** All uses are transitive or workspace-existing. + +--- + +## §9 Testing strategy + +### Unit tests + +- `extract_image_dimensions` helper: valid JPEG → Some((w, h)), corrupt JPEG → None. +- `cleanup_old_logs`: keep_recent_runs / retention_days logic, file deletion. +- LoggingCfg defaults: serde round-trip, backward compat. + +### Integration tests + +**New test files**: +- `crates/kebab-app/tests/ocr_inspect_smoke.rs`: scanned PDF ingest → inspect ocr-stats / ocr-failures validation. +- `crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs`: V008 migration, dual-write, prune logic. + +**Existing test updates**: +- `pdf_ocr_roundtrip` → verify image_width/height non-null. +- `ingest_report_snapshot` → verify ocr_stats output shape. + +### Smoke test (docs/SMOKE.md) + +- `kebab ingest` with scanned PDF → `~/.local/state/kebab/logs/ingest-*.ndjson` + `pdf_ocr_events` table check. +- `kebab inspect ocr-stats --json | jq '.schema_version'` = `"ocr_stats.v1"`. + +### Regression tests + +- 기존 1370 workspace test suite — regression 0 기대 (cleanup 은 non-critical, file-only logging 은 unchanged). + +--- + +## §10 Rollout + dogfood + +### v0.20.x milestone + +1. **spec approval** (이번 round 0). +2. **implementation** (A6 round 1, estimate 3-4 days for 4 enhancement). +3. **review + merge** (pull request via gitea-ops). +4. **dogfood** (user runs v0.20.x binary, accumulates OCR stats over 2-4 weeks). +5. **data-driven tuning** (inspect ocr-stats → timeout default adjust, release note v0.20.y). 
+ +### Backward compat notes + +- Old binary (v0.19.x) + new config (v0.20.x with `[logging] retention_*`): config parses, logging ignores new fields. +- New binary (v0.20.x) + old config (v0.19.x without `[logging]`): defaults apply, logging works. +- wire schema: additive, consumers ignore unknown fields. + +### Release notes + +- **wire schema additive minor** (`ocr_stats.v1`, `ocr_failures.v1` 추가) → release trigger 아님 (CLAUDE.md §Versioning cascade). +- 사용자 도그푸딩 중 데이터 누적 후, 본격 튜닝 (e.g. timeout 조정) 에 따라 `chore: bump 0.20.x → 0.20.y` 별 commit 가능 (dogfood 결과 반영 시). + +--- + +## §11 Contract stability + +**Locked sections** (design contract 의 일부, future changes require spec §N update): +- wire schema `ocr_stats.v1` field list. +- wire schema `ocr_failures.v1` field list. +- `[logging]` config fields. +- CLI `inspect ocr-stats / ocr-failures` output format. + +**Flexible sections** (implementation detail, future refactor OK): +- `extract_image_dimensions` helper location (intra-crate 이동 가능). +- `cleanup_old_logs` 정책 (더 sophisticated 알고리즘 가능). +- SQLite index strategy (추가 index 가능). + +--- + +## Summary + +v0.20.x ingest log round 2 는 round 1 의 file-only ndjson 을 4 가지로 확장: + +1. **image dimension capture** — raster JPEG decode (trivial). +2. **SQLite mirror** — V008 migration + `pdf_ocr_events` table (medium). +3. **CLI inspect** — corpus-wide OCR statistics API (medium). +4. **log retention** — automatic cleanup (low). + +모두 non-breaking additive changes. backward compat 보장, wire schema minor bump, 500-800 line spec (이 문서 ≈ 700 lines). 
diff --git a/docs/wire-schema/v1/ocr_failures.schema.json b/docs/wire-schema/v1/ocr_failures.schema.json new file mode 100644 index 0000000..0d0655a --- /dev/null +++ b/docs/wire-schema/v1/ocr_failures.schema.json @@ -0,0 +1,24 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ocr_failures.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_failures.v1" }, + "doc_id": { "type": ["string", "null"] }, + "failure_count": { "type": "integer" }, + "failures": { + "type": "array", + "items": { + "type": "object", + "properties": { + "ts": { "type": "string" }, + "page": { "type": "integer" }, + "ms": { "type": "integer" }, + "reason": { "type": "string" }, + "image_byte_size": { "type": ["integer", "null"] } + } + } + } + }, + "required": ["schema_version", "failure_count", "failures"] +} diff --git a/docs/wire-schema/v1/ocr_stats.schema.json b/docs/wire-schema/v1/ocr_stats.schema.json new file mode 100644 index 0000000..a71c2c9 --- /dev/null +++ b/docs/wire-schema/v1/ocr_stats.schema.json @@ -0,0 +1,31 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ocr_stats.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_stats.v1" }, + "total_events": { "type": "integer" }, + "total_runs": { "type": "integer" }, + "success_count": { "type": "integer" }, + "failure_count": { "type": "integer" }, + "success_rate": { "type": "number" }, + "p50_ms": { "type": ["integer", "null"] }, + "p90_ms": { "type": ["integer", "null"] }, + "p99_ms": { "type": ["integer", "null"] }, + "max_ms": { "type": ["integer", "null"] }, + "by_engine": { "type": "object", "additionalProperties": { "type": "integer" } }, + "by_doc": { + "type": "array", + "items": { + "type": "object", + "properties": { + "doc_id": { "type": "string" }, + "failure_count": { "type": "integer" }, + "success_count": { "type": "integer" }, + "p90_ms": { "type": ["integer", "null"] } + } + } + } + }, + "required": 
["schema_version", "total_events", "total_runs", "success_count", "failure_count", "success_rate"] +} diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md index 85472f2..87c984f 100644 --- a/integrations/claude-code/kebab/SKILL.md +++ b/integrations/claude-code/kebab/SKILL.md @@ -146,6 +146,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across - `search_response.v1.truncated = true` means budget forced snippet shortening or k reduction. Independent of `next_cursor`: widen `max_tokens` for fuller snippets, follow `next_cursor` for more hits, or both. - `ask`'s `citations[]` mirrors `search_hit.v1` minus retrieval internals — same `doc_path` / `citation` shape. - Schema reference lives in the kebab repo at `docs/wire-schema/v1/*.schema.json` if a field is unclear. v0.20.x additive minor: `ingest_progress.v1` `pdf_ocr_finished` events carry 4 optional new fields (`image_byte_size`, `image_width`, `image_height`, `failure_reason`) — absent on pre-v0.20 events (backward compat). +- v0.20.x r2 additive minor: `ocr_stats.v1` — corpus-wide OCR statistics (`total_events`, `success_rate`, `p50/p90/p99/max_ms`, `by_engine`, top-10 `by_doc` by failure count) emitted by `kebab inspect ocr-stats --json`. `ocr_failures.v1` — per-doc or corpus-wide recent failure list (`failure_count`, `failures[]` with `ts`, `page`, `ms`, `reason`, `image_byte_size`) emitted by `kebab inspect ocr-failures [--doc-id <DOC_ID>] [--limit N] --json`. - `search_hit.v1` and `answer.v1.citations[]` carry `indexed_at` (RFC3339) + `stale` (bool). When `stale == true`, the source doc hasn't been re-processed since `config.search.stale_threshold_days`. Surface this caveat to the user when summarizing — the cited snapshot may not reflect current reality. 
## Capability discovery diff --git a/migrations/V008__pdf_ocr_events.sql b/migrations/V008__pdf_ocr_events.sql new file mode 100644 index 0000000..43e4bf0 --- /dev/null +++ b/migrations/V008__pdf_ocr_events.sql @@ -0,0 +1,21 @@ +-- v0.20.x r2 Enhancement 2: PDF OCR events SQLite mirror. +-- Stores per-page OCR samples for corpus-wide latency / failure analysis. +CREATE TABLE pdf_ocr_events ( + id INTEGER PRIMARY KEY, + run_id TEXT NOT NULL, + ts TEXT NOT NULL, -- ISO 8601 UTC (RFC 3339) + doc_id TEXT, -- nullable (detect-skip path) + doc_path TEXT NOT NULL, + page INTEGER NOT NULL, + image_byte_size INTEGER, + image_width INTEGER, + image_height INTEGER, + ms INTEGER NOT NULL, + chars INTEGER NOT NULL, + success INTEGER NOT NULL, -- 0 = fail, 1 = success + reason TEXT, -- "timeout" / "ocr_error" / NULL + ocr_engine TEXT NOT NULL +); +CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id); +CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id); +CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts);