From 6482bf13216349ee933708d20971c29e0124bde2 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 05:56:54 +0000 Subject: [PATCH] feat(store): V008 pdf_ocr_events migration + record/prune API (Enhancement 2) Add migrations/V008__pdf_ocr_events.sql with the events table + 3 indices (doc_id, run_id, ts). SqliteStore gains two pub fn: record_pdf_ocr_event (insert one OCR sample) and prune_pdf_ocr_events (delete rows older than retention_days; returns the affected row count). Both follow the existing Mutex lock pattern. Wiring into ingest path lands in the next commit. Closure r1 F2: explicit lock acquisition in both methods. Co-Authored-By: Claude Opus 4.7 --- crates/kebab-store-sqlite/src/store.rs | 58 ++++++++++++ .../tests/pdf_ocr_events_insert_smoke.rs | 89 +++++++++++++++++++ migrations/V008__pdf_ocr_events.sql | 21 +++++ 3 files changed, 168 insertions(+) create mode 100644 crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs create mode 100644 migrations/V008__pdf_ocr_events.sql diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index fd9e3c3..ffb7f28 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -988,6 +988,64 @@ impl SqliteStore { } Ok(out) } + + // ── v0.20.x r2 Enhancement 2: pdf_ocr_events ───────────────────────── + + /// Insert one OCR sample row into `pdf_ocr_events` (V008 migration); `success` is stored as the integer 1 or 0. + /// Follows the existing `Mutex` lock pattern (F2): locks `self.conn` for the insert and panics if that lock is poisoned; a failed insert is returned as `Err`. 
+ #[allow(clippy::too_many_arguments)] + pub fn record_pdf_ocr_event( + &self, + run_id: &str, + ts: &str, + doc_id: Option<&str>, + doc_path: &str, + page: u32, + image_byte_size: Option, + image_width: Option, + image_height: Option, + ms: u64, + chars: u32, + success: bool, + reason: Option<&str>, + ocr_engine: &str, + ) -> anyhow::Result<()> { + let conn = self.conn.lock().expect("sqlite lock poisoned"); + conn.execute( + "INSERT INTO pdf_ocr_events + (run_id, ts, doc_id, doc_path, page, + image_byte_size, image_width, image_height, + ms, chars, success, reason, ocr_engine) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + rusqlite::params![ + run_id, ts, doc_id, doc_path, page, + image_byte_size, image_width, image_height, + ms, chars, + if success { 1i32 } else { 0i32 }, + reason, ocr_engine + ], + )?; + Ok(()) + } + + /// Delete rows from `pdf_ocr_events` older than `retention_days` (rows whose `ts` is below the cutoff). + /// Returns the number of deleted rows; panics if the connection lock is poisoned. + /// Cutoff is computed as `now_utc - retention_days`, formatted as RFC 3339; a value of 0 + /// means "delete everything older than now" (i.e. all past rows). If formatting fails, the cutoff falls back to `1970-01-01T00:00:00Z`. NOTE(review): `ts < ?` compares against a string cutoff, so this presumably relies on stored `ts` values being RFC 3339 UTC; confirm against callers. + pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result { + use time::format_description::well_known::Rfc3339; + let cutoff = time::OffsetDateTime::now_utc() + - time::Duration::days(retention_days as i64); + let cutoff_ts = cutoff + .format(&Rfc3339) + .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()); + let conn = self.conn.lock().expect("sqlite lock poisoned"); + let n = conn.execute( + "DELETE FROM pdf_ocr_events WHERE ts < ?", + rusqlite::params![cutoff_ts], + )?; + Ok(n as u64) + } } /// Apply the design §5 / task-spec pragmas. Called once per connection. diff --git a/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs new file mode 100644 index 0000000..294aaea --- /dev/null +++ b/crates/kebab-store-sqlite/tests/pdf_ocr_events_insert_smoke.rs @@ -0,0 +1,89 @@ +//! 
Smoke tests for V008 pdf_ocr_events migration + record/prune API (Enhancement 2). +//! AC-2, AC-3, AC-8. + +mod common; + +use kebab_store_sqlite::SqliteStore; +use rusqlite::OptionalExtension; + +fn open_migrated() -> (common::TestEnv, SqliteStore) { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).expect("open"); + store.run_migrations().expect("run migrations"); + (env, store) +} + +/// AC-2: after running migrations, `sqlite_master` lists a table named `pdf_ocr_events` (created by V008). +#[test] +fn v008_pdf_ocr_events_table_exists() { + let (env, _store) = open_migrated(); + let name: Option = env.with_conn(|c| { + c.query_row( + "SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_ocr_events'", + [], + |r| r.get(0), + ) + .optional() + }); + assert_eq!(name.as_deref(), Some("pdf_ocr_events"), "pdf_ocr_events table must exist after V008"); +} + +/// AC-8: insert 2 rows with different timestamps (1970 and 2099); prune with retention_days=0 +/// (cutoff = now) → only the 1970 row is deleted, so prune returns 1 and one row remains. +#[test] +fn record_and_prune_pdf_ocr_event() { + let (_env, store) = open_migrated(); + + // Row 1: very old timestamp (1970), with every optional image field populated + store + .record_pdf_ocr_event( + "run-old", + "1970-01-01T00:00:00Z", + Some("doc-old"), + "path/old.pdf", + 1, + Some(12345), + Some(100), + Some(80), + 250, + 42, + true, + None, + "qwen2.5vl", + ) + .expect("insert old row"); + + // Row 2: far-future timestamp (2099) so it survives prune; optional image fields are left as None + store + .record_pdf_ocr_event( + "run-new", + "2099-01-01T00:00:00Z", + Some("doc-new"), + "path/new.pdf", + 1, + None, + None, + None, + 180, + 30, + true, + None, + "qwen2.5vl", + ) + .expect("insert future row"); + + // prune with retention_days=0 → cutoff=now → deletes any row with ts < now. + // The 1970 row should be deleted; the 2099 row survives (assumes the system clock reads between 1970 and 2099). 
+ let pruned = store + .prune_pdf_ocr_events(0) + .expect("prune"); + assert_eq!(pruned, 1, "should have deleted exactly 1 old row"); + + // Verify exactly one row is left (expected to be the 2099 one; only the row count is checked here) + let count: i64 = { + let conn = store.read_conn(); + conn.query_row("SELECT COUNT(*) FROM pdf_ocr_events", [], |r| r.get(0)) + .expect("count") + }; + assert_eq!(count, 1, "exactly 1 row should survive after prune"); +} diff --git a/migrations/V008__pdf_ocr_events.sql b/migrations/V008__pdf_ocr_events.sql new file mode 100644 index 0000000..43e4bf0 --- /dev/null +++ b/migrations/V008__pdf_ocr_events.sql @@ -0,0 +1,21 @@ +-- v0.20.x r2 Enhancement 2: PDF OCR events SQLite mirror. +-- Stores per-page OCR samples for corpus-wide latency / failure analysis. +CREATE TABLE pdf_ocr_events ( + id INTEGER PRIMARY KEY, + run_id TEXT NOT NULL, + ts TEXT NOT NULL, -- ISO 8601 UTC (RFC 3339); prune_pdf_ocr_events deletes rows with ts < cutoff + doc_id TEXT, -- nullable (detect-skip path) + doc_path TEXT NOT NULL, + page INTEGER NOT NULL, + image_byte_size INTEGER, -- nullable + image_width INTEGER, -- nullable + image_height INTEGER, -- nullable + ms INTEGER NOT NULL, + chars INTEGER NOT NULL, + success INTEGER NOT NULL, -- 0 = fail, 1 = success + reason TEXT, -- "timeout" / "ocr_error" / NULL + ocr_engine TEXT NOT NULL +); +CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id); +CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id); +CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts);