diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index b124318..c14c6ad 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -67,6 +67,8 @@ unicode-normalization = "0.1" ignore = "0.4" # p9-fb-34: opaque pagination cursor encodes payload as base64. base64 = { workspace = true } +# Enhancement 3 (v0.20.x r2): direct SQL queries for inspect_ocr_stats/failures. +rusqlite = { workspace = true } [dev-dependencies] rusqlite = { workspace = true } diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 445b687..e3f55ce 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -1093,6 +1093,231 @@ fn backfill_code_lang(hits: &mut [SearchHit]) { } } +// ── v0.20.x r2 Enhancement 3: OCR stats + failures inspect ────────────── + +/// Wire type for `kebab inspect ocr-stats --json` (`ocr_stats.v1`). +#[derive(serde::Serialize)] +pub struct OcrStatsV1 { + pub schema_version: &'static str, + pub total_events: u64, + pub total_runs: u64, + pub success_count: u64, + pub failure_count: u64, + pub success_rate: f64, + pub p50_ms: Option, + pub p90_ms: Option, + pub p99_ms: Option, + pub max_ms: Option, + pub by_engine: std::collections::BTreeMap, + pub by_doc: Vec, +} + +/// Per-doc breakdown row inside `OcrStatsV1`. +#[derive(serde::Serialize)] +pub struct OcrStatsByDoc { + pub doc_id: String, + pub failure_count: u64, + pub success_count: u64, + pub p90_ms: Option, +} + +/// Wire type for `kebab inspect ocr-failures --json` (`ocr_failures.v1`). +#[derive(serde::Serialize)] +pub struct OcrFailuresV1 { + pub schema_version: &'static str, + pub doc_id: Option, + pub failure_count: u64, + pub failures: Vec, +} + +/// Single failure row inside `OcrFailuresV1`. 
+#[derive(serde::Serialize)] +pub struct OcrFailureRow { + pub ts: String, + pub page: u32, + pub ms: u64, + pub reason: String, + pub image_byte_size: Option, +} + +impl App { + /// Corpus-wide OCR statistics from the `pdf_ocr_events` SQLite mirror. + pub fn inspect_ocr_stats(&self) -> Result { + self.inspect_ocr_stats_with_config(&self.config) + } + + #[doc(hidden)] + pub fn inspect_ocr_stats_with_config( + &self, + _cfg: &kebab_config::Config, + ) -> Result { + use crate::ingest_log::percentiles; + let conn = self.sqlite.read_conn(); + + // 1. Aggregate counters + let (total_events, success_count, failure_count, total_runs): (u64, u64, u64, u64) = conn + .query_row( + "SELECT COUNT(*), \ + SUM(CASE WHEN success=1 THEN 1 ELSE 0 END), \ + SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \ + COUNT(DISTINCT run_id) \ + FROM pdf_ocr_events", + [], + |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)), + ) + .unwrap_or((0, 0, 0, 0)); + + let success_rate = if total_events == 0 { + 0.0 + } else { + success_count as f64 / total_events as f64 + }; + + // 2. Latency percentiles from successful events + let samples: Vec = { + let mut stmt = conn + .prepare("SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms") + .context("prepare ms query")?; + stmt.query_map([], |r| r.get::<_, u64>(0)) + .context("query ms")? + .filter_map(|r| r.ok()) + .collect() + }; + let (p50_ms, p90_ms, p99_ms, max_ms) = percentiles(&samples); + + // 3. Engine breakdown + let mut by_engine = std::collections::BTreeMap::new(); + { + let mut stmt = conn + .prepare( + "SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine", + ) + .context("prepare engine query")?; + let rows = stmt + .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?))) + .context("query engine")?; + for row in rows.filter_map(|r| r.ok()) { + by_engine.insert(row.0, row.1); + } + } + + // 4. 
Top-10 docs by failure count + let by_doc: Vec = { + let mut stmt = conn + .prepare( + "SELECT doc_id, \ + SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \ + SUM(CASE WHEN success=1 THEN 1 ELSE 0 END) \ + FROM pdf_ocr_events \ + WHERE doc_id IS NOT NULL \ + GROUP BY doc_id \ + ORDER BY 2 DESC \ + LIMIT 10", + ) + .context("prepare by_doc query")?; + stmt.query_map( + [], + |r| { + Ok(OcrStatsByDoc { + doc_id: r.get(0)?, + failure_count: r.get(1)?, + success_count: r.get(2)?, + p90_ms: None, // per-doc p90 deferred (open question #3) + }) + }, + ) + .context("query by_doc")? + .filter_map(|r| r.ok()) + .collect() + }; + + Ok(OcrStatsV1 { + schema_version: "ocr_stats.v1", + total_events, + total_runs, + success_count, + failure_count, + success_rate, + p50_ms, + p90_ms, + p99_ms, + max_ms, + by_engine, + by_doc, + }) + } + + /// Recent OCR failure rows, optionally filtered by `doc_id`. + pub fn inspect_ocr_failures( + &self, + doc_id: Option<&str>, + limit: usize, + ) -> Result { + self.inspect_ocr_failures_with_config(&self.config, doc_id, limit) + } + + #[doc(hidden)] + pub fn inspect_ocr_failures_with_config( + &self, + _cfg: &kebab_config::Config, + doc_id: Option<&str>, + limit: usize, + ) -> Result { + let conn = self.sqlite.read_conn(); + let failures: Vec = if let Some(did) = doc_id { + let mut stmt = conn + .prepare( + "SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \ + FROM pdf_ocr_events \ + WHERE success=0 AND doc_id=? \ + ORDER BY ts DESC \ + LIMIT ?", + ) + .context("prepare failures by doc_id")?; + stmt.query_map(rusqlite::params![did, limit as i64], |r| { + Ok(OcrFailureRow { + ts: r.get(0)?, + page: r.get(1)?, + ms: r.get(2)?, + reason: r.get(3)?, + image_byte_size: r.get(4)?, + }) + }) + .context("query failures by doc_id")? 
+ .filter_map(|r| r.ok()) + .collect() + } else { + let mut stmt = conn + .prepare( + "SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \ + FROM pdf_ocr_events \ + WHERE success=0 \ + ORDER BY ts DESC \ + LIMIT ?", + ) + .context("prepare failures corpus-wide")?; + stmt.query_map(rusqlite::params![limit as i64], |r| { + Ok(OcrFailureRow { + ts: r.get(0)?, + page: r.get(1)?, + ms: r.get(2)?, + reason: r.get(3)?, + image_byte_size: r.get(4)?, + }) + }) + .context("query failures corpus-wide")? + .filter_map(|r| r.ok()) + .collect() + }; + Ok(OcrFailuresV1 { + schema_version: "ocr_failures.v1", + doc_id: doc_id.map(String::from), + failure_count: failures.len() as u64, + failures, + }) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/kebab-app/src/ingest_log.rs b/crates/kebab-app/src/ingest_log.rs index 4e5d4fe..9288f30 100644 --- a/crates/kebab-app/src/ingest_log.rs +++ b/crates/kebab-app/src/ingest_log.rs @@ -181,7 +181,7 @@ impl IngestSummary { ocr_ms_samples: &[u64], duration_ms: u64, ) -> Self { - let (p50, p90, max) = percentiles(ocr_ms_samples); + let (p50, p90, _p99, max) = percentiles(ocr_ms_samples); Self { kind: "summary".to_string(), ts, @@ -200,18 +200,22 @@ impl IngestSummary { } /// Simple percentile extraction on a sorted copy of `samples`. -/// Returns `(p50, p90, max)`. All `None` when samples is empty. -pub(crate) fn percentiles(samples: &[u64]) -> (Option, Option, Option) { +/// Returns `(p50, p90, p99, max)`. All `None` when samples is empty. +/// p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses p50/p90/max only. 
/// Extracts latency percentiles from `samples`.
///
/// Works on a sorted copy, so the input may be in any order. Each percentile
/// `p` is read at index `(n - 1) * p / 100` (integer division, i.e. rounded
/// down), which always lands inside the slice.
///
/// Returns `(p50, p90, p99, max)`; every element is `None` when `samples`
/// is empty. p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses
/// p50/p90/max only.
pub(crate) fn percentiles(
    samples: &[u64],
) -> (Option<u64>, Option<u64>, Option<u64>, Option<u64>) {
    if samples.is_empty() {
        return (None, None, None, None);
    }
    let mut sorted = samples.to_vec();
    sorted.sort_unstable();
    // Non-empty is guaranteed by the guard above, so `len() - 1` cannot underflow.
    let last = sorted.len() - 1;
    let at = |pct: usize| sorted[last * pct / 100];
    (Some(at(50)), Some(at(90)), Some(at(99)), Some(sorted[last]))
}
+fn seed_ocr_events(env: &TestEnv, store: &SqliteStore) { + // Success rows + for i in 0..3u32 { + store + .record_pdf_ocr_event( + "run-aaa", + &format!("2026-05-28T0{}:00:00Z", i), + Some("doc-abc"), + "path/scanned.pdf", + i + 1, + Some(50_000), + Some(200), + Some(150), + 100 + (i as u64) * 20, + 42, + true, + None, + "qwen2.5vl", + ) + .expect("seed success row"); + } + // Failure row + store + .record_pdf_ocr_event( + "run-bbb", + "2026-05-28T10:00:00Z", + Some("doc-abc"), + "path/scanned.pdf", + 4, + Some(30_000), + Some(200), + Some(150), + 9999, + 0, + false, + Some("ocr_error"), + "qwen2.5vl", + ) + .expect("seed failure row"); + // Row for different doc + store + .record_pdf_ocr_event( + "run-ccc", + "2026-05-28T11:00:00Z", + Some("doc-xyz"), + "path/other.pdf", + 1, + None, + None, + None, + 200, + 10, + true, + None, + "qwen2.5vl", + ) + .expect("seed doc-xyz row"); + // Trigger migration (no-op if already done via App::open_with_config) + let _ = env; +} + +fn open_app_with_seeded_events(env: &TestEnv) -> App { + let app = env.app(); + let store = SqliteStore::open(&env.config).expect("open store for seed"); + store.run_migrations().expect("run migrations for seed"); + seed_ocr_events(env, &store); + app +} + +/// AC-4: `inspect_ocr_stats` returns `schema_version = "ocr_stats.v1"`, +/// `total_events >= 1`, `0 ≤ success_rate ≤ 1`. 
/// AC-4: `inspect_ocr_stats` returns `schema_version = "ocr_stats.v1"` and
/// counters that account for every seeded row.
///
/// Counts are asserted as lower bounds (the seed writes 4 successes, 1
/// failure, 3 run ids, all on engine `qwen2.5vl`) so the test keeps passing
/// if the fixture ever records additional events of its own.
#[test]
fn ocr_stats_after_seeded_events() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let stats = app.inspect_ocr_stats().expect("inspect_ocr_stats");

    assert_eq!(stats.schema_version, "ocr_stats.v1");
    assert!(stats.total_events >= 5, "seed writes 5 events");
    assert!(stats.success_count >= 4, "seed writes 4 successes");
    assert!(stats.failure_count >= 1, "seed writes 1 failure");
    assert_eq!(
        stats.success_count + stats.failure_count,
        stats.total_events,
        "every event is either a success or a failure"
    );
    assert!(stats.total_runs >= 3, "seed uses 3 distinct run ids");
    assert!(
        (0.0..=1.0).contains(&stats.success_rate),
        "success_rate must be in [0, 1]: {}",
        stats.success_rate
    );
    let expected_rate = stats.success_count as f64 / stats.total_events as f64;
    assert!(
        (stats.success_rate - expected_rate).abs() < 1e-12,
        "success_rate must equal success_count / total_events"
    );

    // Successful samples exist, so every percentile is populated and ordered.
    let p50 = stats.p50_ms.expect("p50_ms present");
    let p90 = stats.p90_ms.expect("p90_ms present");
    let p99 = stats.p99_ms.expect("p99_ms present");
    let max = stats.max_ms.expect("max_ms present");
    assert!(
        p50 <= p90 && p90 <= p99 && p99 <= max,
        "percentiles must be monotonic: {p50} {p90} {p99} {max}"
    );

    let seeded_engine = stats.by_engine.get("qwen2.5vl").copied().unwrap_or(0);
    assert!(seeded_engine >= 5, "by_engine must count the seeded engine");
    assert!(
        stats.by_doc.iter().any(|d| d.doc_id == "doc-abc" && d.failure_count >= 1),
        "by_doc must list doc-abc with its failure"
    );
}

/// AC-6: `inspect_ocr_failures` (no doc_id, corpus-wide) returns failures list.
#[test]
fn ocr_failures_corpus_wide() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let result = app
        .inspect_ocr_failures(None, 10)
        .expect("inspect_ocr_failures");

    assert_eq!(result.schema_version, "ocr_failures.v1");
    assert!(result.doc_id.is_none(), "no filter was supplied");
    assert!(result.failure_count >= 1, "expected at least 1 failure");
    assert_eq!(
        result.failure_count,
        result.failures.len() as u64,
        "failure_count must match the returned rows"
    );
    assert!(
        result.failures.iter().any(|row| row.reason == "ocr_error"),
        "seeded failure must be listed"
    );

    // The limit is honoured: asking for zero rows yields none.
    let none = app
        .inspect_ocr_failures(None, 0)
        .expect("inspect_ocr_failures with limit 0");
    assert!(none.failures.is_empty(), "limit 0 must return no rows");
}

/// AC-5: `inspect_ocr_failures` with doc_id filter returns matching rows.
#[test]
fn ocr_failures_filter_by_doc_id() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let result = app
        .inspect_ocr_failures(Some("doc-abc"), 10)
        .expect("inspect_ocr_failures by doc_id");

    assert_eq!(result.schema_version, "ocr_failures.v1");
    assert_eq!(
        result.doc_id.as_deref(),
        Some("doc-abc"),
        "doc_id must be echoed back"
    );
    // Without this the per-row loop below would pass vacuously.
    assert!(
        !result.failures.is_empty(),
        "doc-abc has a seeded failure and must return it"
    );
    for row in &result.failures {
        // rows are failure rows for doc-abc only (reason = ocr_error)
        assert_eq!(row.reason, "ocr_error");
    }
    assert!(
        result.failures.iter().any(|row| row.page == 4 && row.ms == 9999),
        "seeded failure (page 4, 9999 ms) must be present"
    );

    // Cross-doc check: doc-xyz only has a success row, so doc-abc's failure
    // must not leak into its result.
    let other = app
        .inspect_ocr_failures(Some("doc-xyz"), 10)
        .expect("inspect_ocr_failures for doc-xyz");
    assert_eq!(other.failure_count, 0, "doc-xyz has no failures");
    assert!(other.failures.is_empty(), "doc-xyz must return no rows");
}
/// AC-13: the Claude Code skill document names both new wire schemas.
///
/// The path is relative to the process working directory.
#[test]
fn skill_md_lists_new_schemas() {
    let contents = std::fs::read_to_string("../../integrations/claude-code/kebab/SKILL.md")
        .expect("read SKILL.md");
    let mentions = |schema: &str| contents.contains(schema);

    assert!(mentions("ocr_stats.v1"), "SKILL.md must mention ocr_stats.v1");
    assert!(
        mentions("ocr_failures.v1"),
        "SKILL.md must mention ocr_failures.v1"
    );
}
"http://json-schema.org/draft-07/schema#", + "title": "ocr_failures.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_failures.v1" }, + "doc_id": { "type": ["string", "null"] }, + "failure_count": { "type": "integer" }, + "failures": { + "type": "array", + "items": { + "type": "object", + "properties": { + "ts": { "type": "string" }, + "page": { "type": "integer" }, + "ms": { "type": "integer" }, + "reason": { "type": "string" }, + "image_byte_size": { "type": ["integer", "null"] } + } + } + } + }, + "required": ["schema_version", "failure_count", "failures"] +} diff --git a/docs/wire-schema/v1/ocr_stats.schema.json b/docs/wire-schema/v1/ocr_stats.schema.json new file mode 100644 index 0000000..a71c2c9 --- /dev/null +++ b/docs/wire-schema/v1/ocr_stats.schema.json @@ -0,0 +1,31 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ocr_stats.v1", + "type": "object", + "properties": { + "schema_version": { "const": "ocr_stats.v1" }, + "total_events": { "type": "integer" }, + "total_runs": { "type": "integer" }, + "success_count": { "type": "integer" }, + "failure_count": { "type": "integer" }, + "success_rate": { "type": "number" }, + "p50_ms": { "type": ["integer", "null"] }, + "p90_ms": { "type": ["integer", "null"] }, + "p99_ms": { "type": ["integer", "null"] }, + "max_ms": { "type": ["integer", "null"] }, + "by_engine": { "type": "object", "additionalProperties": { "type": "integer" } }, + "by_doc": { + "type": "array", + "items": { + "type": "object", + "properties": { + "doc_id": { "type": "string" }, + "failure_count": { "type": "integer" }, + "success_count": { "type": "integer" }, + "p90_ms": { "type": ["integer", "null"] } + } + } + } + }, + "required": ["schema_version", "total_events", "total_runs", "success_count", "failure_count", "success_rate"] +} diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md index 85472f2..87c984f 100644 --- 
a/integrations/claude-code/kebab/SKILL.md +++ b/integrations/claude-code/kebab/SKILL.md @@ -146,6 +146,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across - `search_response.v1.truncated = true` means budget forced snippet shortening or k reduction. Independent of `next_cursor`: widen `max_tokens` for fuller snippets, follow `next_cursor` for more hits, or both. - `ask`'s `citations[]` mirrors `search_hit.v1` minus retrieval internals — same `doc_path` / `citation` shape. - Schema reference lives in the kebab repo at `docs/wire-schema/v1/*.schema.json` if a field is unclear. v0.20.x additive minor: `ingest_progress.v1` `pdf_ocr_finished` events carry 4 optional new fields (`image_byte_size`, `image_width`, `image_height`, `failure_reason`) — absent on pre-v0.20 events (backward compat). +- v0.20.x r2 additive minor: `ocr_stats.v1` — corpus-wide OCR statistics (`total_events`, `success_rate`, `p50/p90/p99/max_ms`, `by_engine`, top-10 `by_doc` by failure count) emitted by `kebab inspect ocr-stats --json`. `ocr_failures.v1` — per-doc or corpus-wide recent failure list (`failure_count`, `failures[]` with `ts`, `page`, `ms`, `reason`, `image_byte_size`) emitted by `kebab inspect ocr-failures [--doc-id <DOC_ID>] [--limit N] --json`. - `search_hit.v1` and `answer.v1.citations[]` carry `indexed_at` (RFC3339) + `stale` (bool). When `stale == true`, the source doc hasn't been re-processed since `config.search.stale_threshold_days`. Surface this caveat to the user when summarizing — the cited snapshot may not reflect current reality. ## Capability discovery