feat(store): V008 pdf_ocr_events migration + record/prune API (Enhancement 2)

Add migrations/V008__pdf_ocr_events.sql with the events table + 3
indices (doc_id, run_id, ts). SqliteStore gains two public methods:
record_pdf_ocr_event (insert one OCR sample) and prune_pdf_ocr_events
(delete rows older than retention_days; returns the affected row
count). Both follow the existing Mutex<Connection> lock pattern.

Wiring into ingest path lands in the next commit.

Closure r1 F2: explicit lock acquisition in both methods.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 05:56:54 +00:00
parent 5977c8cdf1
commit 6482bf1321
3 changed files with 168 additions and 0 deletions

View File

@@ -988,6 +988,64 @@ impl SqliteStore {
}
Ok(out)
}
// ── v0.20.x r2 Enhancement 2: pdf_ocr_events ─────────────────────────
/// Append a single OCR sample to the `pdf_ocr_events` table (V008 migration).
///
/// Every argument maps one-to-one onto the column of the same name.
/// `Option` arguments that are `None` are bound as SQL `NULL`, and
/// `success` is persisted as the integer `1` (true) or `0` (false).
/// The connection is taken through the existing `Mutex<Connection>`
/// lock pattern (F2).
///
/// # Errors
/// Propagates the `rusqlite` error if the `INSERT` fails.
///
/// # Panics
/// Panics with "sqlite lock poisoned" if the connection mutex is poisoned.
#[allow(clippy::too_many_arguments)]
pub fn record_pdf_ocr_event(
    &self,
    run_id: &str,
    ts: &str,
    doc_id: Option<&str>,
    doc_path: &str,
    page: u32,
    image_byte_size: Option<u64>,
    image_width: Option<u32>,
    image_height: Option<u32>,
    ms: u64,
    chars: u32,
    success: bool,
    reason: Option<&str>,
    ocr_engine: &str,
) -> anyhow::Result<()> {
    // The `success` column is an INTEGER: encode the flag as 0/1 up front.
    let success_flag = i32::from(success);
    let insert_sql = "INSERT INTO pdf_ocr_events
(run_id, ts, doc_id, doc_path, page,
image_byte_size, image_width, image_height,
ms, chars, success, reason, ocr_engine)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
    let guard = self.conn.lock().expect("sqlite lock poisoned");
    // Positional parameters: the order below must mirror the column list above.
    guard.execute(
        insert_sql,
        rusqlite::params![
            run_id,
            ts,
            doc_id,
            doc_path,
            page,
            image_byte_size,
            image_width,
            image_height,
            ms,
            chars,
            success_flag,
            reason,
            ocr_engine
        ],
    )?;
    Ok(())
}
/// Delete rows from `pdf_ocr_events` older than `retention_days`.
///
/// The cutoff is computed as `now_utc - retention_days`, formatted as an
/// RFC 3339 string and compared against the stored `ts` text with SQL `<`.
/// A value of 0 means "delete everything older than now" (i.e. all past
/// rows).
///
/// If the retention window is so large that the cutoff cannot be
/// represented or formatted as RFC 3339 (e.g. it falls before year 0),
/// no stored timestamp can be older than it, so nothing is deleted and
/// `Ok(0)` is returned instead of panicking or pruning with a made-up
/// cutoff.
///
/// # Returns
/// The number of deleted rows.
///
/// # Errors
/// Propagates the `rusqlite` error if the `DELETE` fails.
///
/// # Panics
/// Panics with "sqlite lock poisoned" if the connection mutex is poisoned.
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
    use time::format_description::well_known::Rfc3339;
    let window = time::Duration::days(i64::from(retention_days));
    // `OffsetDateTime - Duration` panics on overflow; `retention_days` is a
    // full u32 (millions of years), so subtract with an explicit check.
    let cutoff = match time::OffsetDateTime::now_utc().checked_sub(window) {
        Some(cutoff) => cutoff,
        None => return Ok(0),
    };
    // RFC 3339 formatting only fails here when the cutoff year is outside
    // the representable range, i.e. the cutoff predates every valid `ts`.
    let cutoff_ts = match cutoff.format(&Rfc3339) {
        Ok(formatted) => formatted,
        Err(_) => return Ok(0),
    };
    let conn = self.conn.lock().expect("sqlite lock poisoned");
    let n = conn.execute(
        "DELETE FROM pdf_ocr_events WHERE ts < ?",
        rusqlite::params![cutoff_ts],
    )?;
    Ok(n as u64)
}
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.

View File

@@ -0,0 +1,89 @@
//! Smoke tests for V008 pdf_ocr_events migration + record/prune API (Enhancement 2).
//! AC-2, AC-3, AC-8.
mod common;
use kebab_store_sqlite::SqliteStore;
use rusqlite::OptionalExtension;
/// Create a fresh `common::TestEnv`, open a `SqliteStore` from its config
/// and run all migrations on it. Both the environment and the store are
/// handed back so a test can use either one.
fn open_migrated() -> (common::TestEnv, SqliteStore) {
    let test_env = common::TestEnv::new();
    let migrated_store = SqliteStore::open(&test_env.config()).expect("open");
    migrated_store
        .run_migrations()
        .expect("run migrations");
    (test_env, migrated_store)
}
/// AC-2: after migrations have run, `sqlite_master` must list a table
/// named `pdf_ocr_events` (created by V008).
#[test]
fn v008_pdf_ocr_events_table_exists() {
    let (env, _store) = open_migrated();
    let lookup_sql =
        "SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_ocr_events'";
    // `.optional()` maps "no such row" to `None` so a missing table fails
    // the assertion below instead of surfacing as a query error.
    let found: Option<String> = env.with_conn(|conn| {
        conn.query_row(lookup_sql, [], |row| row.get(0)).optional()
    });
    assert_eq!(
        found.as_deref(),
        Some("pdf_ocr_events"),
        "pdf_ocr_events table must exist after V008"
    );
}
/// AC-8: insert 2 rows with different timestamps; prune with retention_days=0
/// (cutoff = now) → the old row is deleted, count returns 1, and the row
/// that survives is the future-dated one.
#[test]
fn record_and_prune_pdf_ocr_event() {
    let (_env, store) = open_migrated();
    // Row 1: very old timestamp (1970)
    store
        .record_pdf_ocr_event(
            "run-old",
            "1970-01-01T00:00:00Z",
            Some("doc-old"),
            "path/old.pdf",
            1,
            Some(12345),
            Some(100),
            Some(80),
            250,
            42,
            true,
            None,
            "qwen2.5vl",
        )
        .expect("insert old row");
    // Row 2: future timestamp (far future, so it survives prune)
    store
        .record_pdf_ocr_event(
            "run-new",
            "2099-01-01T00:00:00Z",
            Some("doc-new"),
            "path/new.pdf",
            1,
            None,
            None,
            None,
            180,
            30,
            true,
            None,
            "qwen2.5vl",
        )
        .expect("insert future row");
    // prune with retention_days=0 → cutoff=now → deletes any row with ts < now.
    // The 1970 row should be deleted; the 2099 row survives.
    let pruned = store
        .prune_pdf_ocr_events(0)
        .expect("prune");
    assert_eq!(pruned, 1, "should have deleted exactly 1 old row");
    // Verify only the future row remains
    let count: i64 = {
        let conn = store.read_conn();
        conn.query_row("SELECT COUNT(*) FROM pdf_ocr_events", [], |r| r.get(0))
            .expect("count")
    };
    assert_eq!(count, 1, "exactly 1 row should survive after prune");
    // Counts alone cannot tell `ts < cutoff` from an inverted comparison
    // (either way one row is deleted and one remains), so also pin WHICH
    // row survived.
    let survivor: String = {
        let conn = store.read_conn();
        conn.query_row("SELECT run_id FROM pdf_ocr_events", [], |r| r.get(0))
            .expect("surviving run_id")
    };
    assert_eq!(survivor, "run-new", "the future-dated row must be the survivor");
}

View File

@@ -0,0 +1,21 @@
-- v0.20.x r2 Enhancement 2: PDF OCR events SQLite mirror.
-- Stores per-page OCR samples for corpus-wide latency / failure analysis.
-- Rows are inserted by SqliteStore::record_pdf_ocr_event and removed by
-- SqliteStore::prune_pdf_ocr_events (DELETE ... WHERE ts < cutoff).
CREATE TABLE pdf_ocr_events (
id INTEGER PRIMARY KEY,
run_id TEXT NOT NULL,
-- NOTE: pruning compares `ts` as TEXT against an RFC 3339 cutoff string, so
-- writers should keep one consistent UTC format for the ordering to hold.
ts TEXT NOT NULL, -- ISO 8601 UTC (RFC 3339)
doc_id TEXT, -- nullable (detect-skip path)
doc_path TEXT NOT NULL,
page INTEGER NOT NULL,
-- The three image_* columns are nullable; the insert API passes them as Option.
image_byte_size INTEGER,
image_width INTEGER,
image_height INTEGER,
ms INTEGER NOT NULL, -- presumably OCR duration in milliseconds (TODO confirm)
chars INTEGER NOT NULL, -- presumably number of characters extracted (TODO confirm)
success INTEGER NOT NULL, -- 0 = fail, 1 = success
reason TEXT, -- "timeout" / "ocr_error" / NULL
ocr_engine TEXT NOT NULL
);
-- Secondary indices for lookups by document and by run.
CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id);
CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id);
-- Index on ts; the retention prune filters on `ts < cutoff`.
CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts);