From 8cc4e6d56314da298305abeb8b8f61e448c22104 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 4 Jun 2026 08:22:47 +0000 Subject: [PATCH] =?UTF-8?q?fix(ocr):=20T10/T11=20=E2=80=94=20unclip=20edge?= =?UTF-8?q?-offset=20(CER=200.26=E2=86=920.005)=20+=20e2e=20gate=20+=20err?= =?UTF-8?q?or=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause found at T11 e2e: unclip_rect pushed corners radially from the centroid. For a wide/short text box the diagonal is near-horizontal, so the box barely grew in height and clipped character tops (ㄷ→ㄴ, 다→나). Rewrote unclip as a proper per-edge polygon offset along the rect's own (u,v) axes — height and width each grow by 2*distance, matching PaddleOCR pyclipper. Result (synthetic-ocr-bench, real inference): mean gate CER 0.2585 → 0.0049 (clean_paragraph/korean_heavy/numbers_table/tech_terms = 0.0), beating the 0.976 PoC baseline. Big page 3.9s < 5s. T10: dict-length-mismatch construction error + undecodable-bytes recognize error. T11 e2e: tests/paddle_e2e.rs CER<=0.05 gate (skips cleanly when assets absent). 
Co-Authored-By: Claude Opus 4.8 --- crates/kebab-parse-image/src/paddle_onnx.rs | 59 ++++++-- crates/kebab-parse-image/tests/paddle_e2e.rs | 146 +++++++++++++++++++ 2 files changed, 191 insertions(+), 14 deletions(-) create mode 100644 crates/kebab-parse-image/tests/paddle_e2e.rs diff --git a/crates/kebab-parse-image/src/paddle_onnx.rs b/crates/kebab-parse-image/src/paddle_onnx.rs index df4aba5..e519866 100644 --- a/crates/kebab-parse-image/src/paddle_onnx.rs +++ b/crates/kebab-parse-image/src/paddle_onnx.rs @@ -718,21 +718,32 @@ fn unclip_rect(rect: &RotRect, ratio: f32) -> [(f32, f32); 4] { return rect.corners; } let distance = area * ratio / perimeter; - // expand around centroid - let cx = rect.corners.iter().map(|p| p.0).sum::<f32>() / 4.0; - let cy = rect.corners.iter().map(|p| p.1).sum::<f32>() / 4.0; - let mut out = rect.corners; - for p in &mut out { - let dx = p.0 - cx; - let dy = p.1 - cy; + // Offset every EDGE outward by `distance` (PaddleOCR pyclipper polygon + // offset): width and height each grow by 2*distance. A naive radial + // push-from-centroid is WRONG for text boxes — a wide/short box has an + // almost-horizontal diagonal, so radial expansion barely grows the height + // and clips character tops/bottoms (ㄷ→ㄴ, ascenders lost). We instead + // expand along the rect's own (u, v) axes recovered from its ordered + // corners (c0=min_u,min_v; c1=max_u,min_v; c2=max_u,max_v; c3=min_u,max_v). 
+ let c = &rect.corners; + let unit = |dx: f32, dy: f32| -> (f32, f32) { let len = (dx * dx + dy * dy).sqrt(); - if len > 1e-6 { - // push each corner outward along its diagonal by `distance` - p.0 += dx / len * distance; - p.1 += dy / len * distance; - } - } - out + if len > 1e-6 { (dx / len, dy / len) } else { (0.0, 0.0) } + }; + let u = unit(c[1].0 - c[0].0, c[1].1 - c[0].1); // +u (along width) + let v = unit(c[3].0 - c[0].0, c[3].1 - c[0].1); // +v (along height) + let off = |p: (f32, f32), su: f32, sv: f32| -> (f32, f32) { + ( + p.0 + su * distance * u.0 + sv * distance * v.0, + p.1 + su * distance * u.1 + sv * distance * v.1, + ) + }; + [ + off(c[0], -1.0, -1.0), + off(c[1], 1.0, -1.0), + off(c[2], 1.0, 1.0), + off(c[3], -1.0, 1.0), + ] } // ── crop + rectify ─────────────────────────────────────────────────────────── @@ -834,6 +845,26 @@ mod tests { assert!((hi - 20.0).abs() < 1e-3, "long side {hi}"); } + #[test] + fn dict_length_mismatch_is_construction_error() { + // T10: a dict whose line count != DICT_LINES must fail at construction + // (before loading the ONNX sessions) rather than mis-decoding silently. + use std::io::Write; + let dir = tempfile::tempdir().unwrap(); + let dict_path = dir.path().join("bad_dict.txt"); + let mut f = std::fs::File::create(&dict_path).unwrap(); + writeln!(f, "a\nb\nc").unwrap(); // 3 lines, not DICT_LINES + let paths = ModelPaths { + det: dir.path().join("unused_det.onnx"), + rec: dir.path().join("unused_rec.onnx"), + dict: dict_path, + }; + let err = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600) + .expect_err("dict mismatch must error"); + let msg = format!("{err:#}"); + assert!(msg.contains("dict has 3 lines"), "unexpected error: {msg}"); + } + #[test] fn model_paths_from_config_uses_overrides() { // T7: unset overrides → bundled default asset paths. 
diff --git a/crates/kebab-parse-image/tests/paddle_e2e.rs b/crates/kebab-parse-image/tests/paddle_e2e.rs new file mode 100644 index 0000000..36f8e08 --- /dev/null +++ b/crates/kebab-parse-image/tests/paddle_e2e.rs @@ -0,0 +1,146 @@ +//! T11 e2e accuracy gate for the paddle-onnx OCR engine. +//! +//! Runs the full `OnnxPaddleOcr` pipeline (det → rectify → rec → CTC) over the +//! synthetic OCR benchmark fixtures and asserts the mean character error rate +//! (CER) over the clean text set is `<= 0.05`, matching the spec gate. +//! +//! Model assets come from `KEBAB_TEST_OCR_MODEL_DIR` (default: the crate's +//! bundled `assets/paddleocr-onnx/`). Fixtures come from +//! `KEBAB_TEST_OCR_FIXTURE_DIR` (default: the dogfood corpus). If either is +//! absent the test skips with a warning rather than failing — CI without the +//! large models / fixtures stays green (plan T0/M4). + +use std::collections::HashMap; +use std::path::PathBuf; + +use kebab_parse_image::{ModelPaths, OcrEngine, OnnxPaddleOcr}; + +/// Collapse all whitespace runs to a single space + trim — matches the Python +/// `score_lib.norm` so the Rust gate and the bench harness agree. +fn norm(s: &str) -> String { + s.split_whitespace().collect::<Vec<_>>().join(" ") +} + +/// Character error rate = Levenshtein(gt, pred) / len(gt), both normalized. 
+fn cer(gt: &str, pred: &str) -> f64 { + let g: Vec<char> = norm(gt).chars().collect(); + let p: Vec<char> = norm(pred).chars().collect(); + if g.is_empty() { + return if p.is_empty() { 0.0 } else { 1.0 }; + } + let (m, n) = (g.len(), p.len()); + let mut prev: Vec<usize> = (0..=n).collect(); + for i in 1..=m { + let mut cur = vec![i; n + 1]; + for j in 1..=n { + let cost = if g[i - 1] == p[j - 1] { 0 } else { 1 }; + cur[j] = (prev[j] + 1).min(cur[j - 1] + 1).min(prev[j - 1] + cost); + } + prev = cur; + } + prev[n] as f64 / m as f64 +} + +fn fixture_dir() -> PathBuf { + std::env::var("KEBAB_TEST_OCR_FIXTURE_DIR") + .map(PathBuf::from) + .unwrap_or_else(|_| { + PathBuf::from("/build/dogfood/corpus/images/synthetic-ocr-bench") + }) +} + +/// T10: undecodable image bytes must surface as an error (the kebab-app caller +/// then skips the asset + records provenance), not panic or return garbage. +#[test] +fn paddle_onnx_decode_failure_is_error() { + let paths = ModelPaths::from_default_dir(); + if !paths.det.exists() || !paths.rec.exists() || !paths.dict.exists() { + eprintln!("SKIP paddle_onnx_decode_failure_is_error: model assets not found"); + return; + } + let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600).unwrap(); + let err = engine + .recognize(b"not a real image", None) + .expect_err("garbage bytes must fail to decode"); + let msg = format!("{err:#}"); + assert!(msg.contains("decoding image"), "unexpected error: {msg}"); +} + +#[test] +fn paddle_onnx_cer_gate() { + let paths = ModelPaths::from_default_dir(); + if !paths.det.exists() || !paths.rec.exists() || !paths.dict.exists() { + eprintln!( + "SKIP paddle_onnx_cer_gate: model assets not found (det={}). 
\ Set KEBAB_TEST_OCR_MODEL_DIR or place assets/paddleocr-onnx/.", + paths.det.display() + ); + return; + } + let fdir = fixture_dir(); + let gt_path = fdir.join("gt.json"); + if !gt_path.exists() { + eprintln!( + "SKIP paddle_onnx_cer_gate: fixtures not found at {}", + fdir.display() + ); + return; + } + + let gt: HashMap<String, String> = + serde_json::from_str(&std::fs::read_to_string(&gt_path).unwrap()).unwrap(); + + let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600) + .expect("build OnnxPaddleOcr from bundled assets"); + + // "clean" set used for the gate — the standard, well-formed text fixtures. + // low_contrast / small_dense are intentionally hard and tracked but not + // part of the hard gate. + let gate_set = [ + "clean_paragraph.png", + "title_body.png", + "tech_terms.png", + "korean_heavy.png", + "numbers_table.png", + ]; + + let mut gate_cers = Vec::new(); + let mut names: Vec<&String> = gt.keys().collect(); + names.sort(); + println!("\n=== paddle-onnx CER per fixture ==="); + for name in names { + let img_path = fdir.join(name); + if !img_path.exists() { + continue; + } + let bytes = std::fs::read(&img_path).unwrap(); + let t0 = std::time::Instant::now(); + let out = engine.recognize(&bytes, None).expect("recognize"); + let dt = t0.elapsed(); + let c = cer(&gt[name], &out.joined); + if std::env::var("KEBAB_OCR_DUMP").is_ok() { + println!(" GT [{name}]: {:?}", norm(&gt[name])); + println!(" OUT [{name}]: {:?}", norm(&out.joined)); + } + let gated = gate_set.contains(&name.as_str()); + println!( + "{:<22} CER={:.4} {} ({} regions, {} ms)", + name, + c, + if gated { "[gate]" } else { " " }, + out.regions.len(), + dt.as_millis() + ); + if gated { + gate_cers.push(c); + } + } + + assert!(!gate_cers.is_empty(), "no gate fixtures were scored"); + let mean = gate_cers.iter().sum::<f64>() / gate_cers.len() as f64; + println!("=== mean gate CER = {mean:.4} (threshold 0.05) ===\n"); + assert!( + mean <= 0.05, + "paddle-onnx mean CER {mean:.4} exceeds 0.05 gate" + ); +}