feat(store): V008 pdf_ocr_events migration + record/prune API (Enhancement 2)
Add migrations/V008__pdf_ocr_events.sql with the events table + 3 indices (doc_id, run_id, ts). SqliteStore gains two pub fn: record_pdf_ocr_event (insert one OCR sample) and prune_pdf_ocr_events (delete rows older than retention_days; returns the affected row count). Both follow the existing Mutex<Connection> lock pattern. Wiring into ingest path lands in the next commit. Closure r1 F2: explicit lock acquisition in both methods. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -988,6 +988,64 @@ impl SqliteStore {
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
// ── v0.20.x r2 Enhancement 2: pdf_ocr_events ─────────────────────────
|
||||
|
||||
/// Insert one OCR sample row into `pdf_ocr_events` (V008 migration).
|
||||
/// Follows the existing `Mutex<Connection>` lock pattern (F2).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn record_pdf_ocr_event(
|
||||
&self,
|
||||
run_id: &str,
|
||||
ts: &str,
|
||||
doc_id: Option<&str>,
|
||||
doc_path: &str,
|
||||
page: u32,
|
||||
image_byte_size: Option<u64>,
|
||||
image_width: Option<u32>,
|
||||
image_height: Option<u32>,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
success: bool,
|
||||
reason: Option<&str>,
|
||||
ocr_engine: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let conn = self.conn.lock().expect("sqlite lock poisoned");
|
||||
conn.execute(
|
||||
"INSERT INTO pdf_ocr_events
|
||||
(run_id, ts, doc_id, doc_path, page,
|
||||
image_byte_size, image_width, image_height,
|
||||
ms, chars, success, reason, ocr_engine)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
|
||||
rusqlite::params![
|
||||
run_id, ts, doc_id, doc_path, page,
|
||||
image_byte_size, image_width, image_height,
|
||||
ms, chars,
|
||||
if success { 1i32 } else { 0i32 },
|
||||
reason, ocr_engine
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete rows from `pdf_ocr_events` older than `retention_days`.
|
||||
/// Returns the number of deleted rows.
|
||||
/// Cutoff is computed as `now_utc - retention_days`; a value of 0
|
||||
/// means "delete everything older than now" (i.e. all past rows).
|
||||
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let cutoff = time::OffsetDateTime::now_utc()
|
||||
- time::Duration::days(retention_days as i64);
|
||||
let cutoff_ts = cutoff
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());
|
||||
let conn = self.conn.lock().expect("sqlite lock poisoned");
|
||||
let n = conn.execute(
|
||||
"DELETE FROM pdf_ocr_events WHERE ts < ?",
|
||||
rusqlite::params![cutoff_ts],
|
||||
)?;
|
||||
Ok(n as u64)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the design §5 / task-spec pragmas. Called once per connection.
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
//! Smoke tests for V008 pdf_ocr_events migration + record/prune API (Enhancement 2).
|
||||
//! AC-2, AC-3, AC-8.
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
use rusqlite::OptionalExtension;
|
||||
|
||||
fn open_migrated() -> (common::TestEnv, SqliteStore) {
|
||||
let env = common::TestEnv::new();
|
||||
let store = SqliteStore::open(&env.config()).expect("open");
|
||||
store.run_migrations().expect("run migrations");
|
||||
(env, store)
|
||||
}
|
||||
|
||||
/// AC-2: after the migrations have run, `sqlite_master` lists a table named
/// `pdf_ocr_events`.
#[test]
fn v008_pdf_ocr_events_table_exists() {
    const LOOKUP_TABLE_SQL: &str =
        "SELECT name FROM sqlite_master WHERE type='table' AND name='pdf_ocr_events'";

    // `_store` is kept bound so the migrated store lives for the whole test.
    let (env, _store) = open_migrated();

    // `optional()` maps "no such row" to `None` instead of an error, so a
    // missing table surfaces as a failed assertion below.
    let found: Option<String> = env.with_conn(|conn| {
        conn.query_row(LOOKUP_TABLE_SQL, [], |row| row.get(0))
            .optional()
    });

    assert_eq!(
        found.as_deref(),
        Some("pdf_ocr_events"),
        "pdf_ocr_events table must exist after V008"
    );
}
|
||||
|
||||
/// AC-8: record two samples, one dated 1970 and one dated 2099, then prune
/// with `retention_days = 0` (cutoff = now). Exactly one row (the 1970 one)
/// must be reported as deleted, and exactly one row must remain.
#[test]
fn record_and_prune_pdf_ocr_event() {
    // `_env` stays bound so the test environment outlives the store calls.
    let (_env, store) = open_migrated();
    let engine = "qwen2.5vl";

    // Sample 1: epoch timestamp, every optional image field populated.
    let old_insert = store.record_pdf_ocr_event(
        "run-old",
        "1970-01-01T00:00:00Z",
        Some("doc-old"),
        "path/old.pdf",
        1,
        Some(12345),
        Some(100),
        Some(80),
        250,
        42,
        true,
        None,
        engine,
    );
    old_insert.expect("insert old row");

    // Sample 2: far-future timestamp so it is newer than any "now" cutoff;
    // the optional image fields are left empty.
    let future_insert = store.record_pdf_ocr_event(
        "run-new",
        "2099-01-01T00:00:00Z",
        Some("doc-new"),
        "path/new.pdf",
        1,
        None,
        None,
        None,
        180,
        30,
        true,
        None,
        engine,
    );
    future_insert.expect("insert future row");

    // retention_days = 0 puts the cutoff at "now": the 1970 row is older
    // than it and goes, the 2099 row is not and stays.
    let deleted = store.prune_pdf_ocr_events(0).expect("prune");
    assert_eq!(deleted, 1, "should have deleted exactly 1 old row");

    // Count what is left; the read connection is scoped to this block so it
    // is released before the final assertion.
    let remaining: i64 = {
        let reader = store.read_conn();
        reader
            .query_row("SELECT COUNT(*) FROM pdf_ocr_events", [], |row| row.get(0))
            .expect("count")
    };
    assert_eq!(remaining, 1, "exactly 1 row should survive after prune");
}
|
||||
21
migrations/V008__pdf_ocr_events.sql
Normal file
21
migrations/V008__pdf_ocr_events.sql
Normal file
@@ -0,0 +1,21 @@
|
||||
-- v0.20.x r2 Enhancement 2: PDF OCR events SQLite mirror.
-- One row per OCR sample, kept for corpus-wide latency / failure analysis.
CREATE TABLE pdf_ocr_events (
    id              INTEGER PRIMARY KEY,
    run_id          TEXT    NOT NULL,
    -- ISO 8601 UTC timestamp (RFC 3339).
    ts              TEXT    NOT NULL,
    -- Nullable: not set on the detect-skip path.
    doc_id          TEXT,
    doc_path        TEXT    NOT NULL,
    page            INTEGER NOT NULL,
    -- Optional image metadata.
    image_byte_size INTEGER,
    image_width     INTEGER,
    image_height    INTEGER,
    -- NOTE(review): presumably the OCR latency in milliseconds; confirm.
    ms              INTEGER NOT NULL,
    -- NOTE(review): presumably the number of characters produced; confirm.
    chars           INTEGER NOT NULL,
    -- 0 = fail, 1 = success.
    success         INTEGER NOT NULL,
    -- "timeout" / "ocr_error" / NULL.
    reason          TEXT,
    ocr_engine      TEXT    NOT NULL
);

-- Lookup indices: per document, per run, and by timestamp.
CREATE INDEX idx_pdf_ocr_events_doc_id ON pdf_ocr_events(doc_id);
CREATE INDEX idx_pdf_ocr_events_run_id ON pdf_ocr_events(run_id);
CREATE INDEX idx_pdf_ocr_events_ts ON pdf_ocr_events(ts);
|
||||
Reference in New Issue
Block a user