fix(ocr): T10/T11 — unclip edge-offset (CER 0.26→0.005) + e2e gate + error tests
Root cause found at T11 e2e: unclip_rect pushed corners radially from the centroid. For a wide/short text box the diagonal is near-horizontal, so the box barely grew in height and clipped character tops (ㄷ→ㄴ, 다→나). Rewrote unclip as a proper per-edge polygon offset along the rect's own (u,v) axes — height and width each grow by 2*distance, matching PaddleOCR pyclipper. Result (synthetic-ocr-bench, real inference): mean gate CER 0.2585 → 0.0049 (clean_paragraph/korean_heavy/numbers_table/tech_terms = 0.0), beating the 0.976 PoC baseline. Big page 3.9s < 5s. T10: dict-length-mismatch construction error + undecodable-bytes recognize error. T11 e2e: tests/paddle_e2e.rs CER<=0.05 gate (skips cleanly when assets absent). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -718,21 +718,32 @@ fn unclip_rect(rect: &RotRect, ratio: f32) -> [(f32, f32); 4] {
|
|||||||
return rect.corners;
|
return rect.corners;
|
||||||
}
|
}
|
||||||
let distance = area * ratio / perimeter;
|
let distance = area * ratio / perimeter;
|
||||||
// expand around centroid
|
// Offset every EDGE outward by `distance` (PaddleOCR pyclipper polygon
|
||||||
let cx = rect.corners.iter().map(|p| p.0).sum::<f32>() / 4.0;
|
// offset): width and height each grow by 2*distance. A naive radial
|
||||||
let cy = rect.corners.iter().map(|p| p.1).sum::<f32>() / 4.0;
|
// push-from-centroid is WRONG for text boxes — a wide/short box has an
|
||||||
let mut out = rect.corners;
|
// almost-horizontal diagonal, so radial expansion barely grows the height
|
||||||
for p in &mut out {
|
// and clips character tops/bottoms (ㄷ→ㄴ, ascenders lost). We instead
|
||||||
let dx = p.0 - cx;
|
// expand along the rect's own (u, v) axes recovered from its ordered
|
||||||
let dy = p.1 - cy;
|
// corners (c0=min_u,min_v; c1=max_u,min_v; c2=max_u,max_v; c3=min_u,max_v).
|
||||||
|
let c = &rect.corners;
|
||||||
|
let unit = |dx: f32, dy: f32| -> (f32, f32) {
|
||||||
let len = (dx * dx + dy * dy).sqrt();
|
let len = (dx * dx + dy * dy).sqrt();
|
||||||
if len > 1e-6 {
|
if len > 1e-6 { (dx / len, dy / len) } else { (0.0, 0.0) }
|
||||||
// push each corner outward along its diagonal by `distance`
|
};
|
||||||
p.0 += dx / len * distance;
|
let u = unit(c[1].0 - c[0].0, c[1].1 - c[0].1); // +u (along width)
|
||||||
p.1 += dy / len * distance;
|
let v = unit(c[3].0 - c[0].0, c[3].1 - c[0].1); // +v (along height)
|
||||||
}
|
let off = |p: (f32, f32), su: f32, sv: f32| -> (f32, f32) {
|
||||||
}
|
(
|
||||||
out
|
p.0 + su * distance * u.0 + sv * distance * v.0,
|
||||||
|
p.1 + su * distance * u.1 + sv * distance * v.1,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
[
|
||||||
|
off(c[0], -1.0, -1.0),
|
||||||
|
off(c[1], 1.0, -1.0),
|
||||||
|
off(c[2], 1.0, 1.0),
|
||||||
|
off(c[3], -1.0, 1.0),
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── crop + rectify ───────────────────────────────────────────────────────────
|
// ── crop + rectify ───────────────────────────────────────────────────────────
|
||||||
@@ -834,6 +845,26 @@ mod tests {
|
|||||||
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
|
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn dict_length_mismatch_is_construction_error() {
    // T10: a dict whose line count != DICT_LINES must fail at construction
    // (before loading the ONNX sessions) rather than mis-decoding silently.
    // The det/rec paths below deliberately point at files that do not exist,
    // so the test only passes if the dict check is what produces the error.
    let dir = tempfile::tempdir().expect("create temp dir for bad dict");
    let dict_path = dir.path().join("bad_dict.txt");
    // Three newline-terminated entries — far fewer than DICT_LINES.
    std::fs::write(&dict_path, "a\nb\nc\n").expect("write bad dict file");

    let paths = ModelPaths {
        det: dir.path().join("unused_det.onnx"),
        rec: dir.path().join("unused_rec.onnx"),
        dict: dict_path,
    };
    let err = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600)
        .expect_err("dict mismatch must error");

    // `{:#}` prints the whole error chain on one line, so the substring
    // match works no matter which layer mentions the line count.
    let msg = format!("{err:#}");
    assert!(msg.contains("dict has 3 lines"), "unexpected error: {msg}");
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn model_paths_from_config_uses_overrides() {
|
fn model_paths_from_config_uses_overrides() {
|
||||||
// T7: unset overrides → bundled default asset paths.
|
// T7: unset overrides → bundled default asset paths.
|
||||||
|
|||||||
146
crates/kebab-parse-image/tests/paddle_e2e.rs
Normal file
146
crates/kebab-parse-image/tests/paddle_e2e.rs
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
//! T11 e2e accuracy gate for the paddle-onnx OCR engine.
|
||||||
|
//!
|
||||||
|
//! Runs the full `OnnxPaddleOcr` pipeline (det → rectify → rec → CTC) over the
|
||||||
|
//! synthetic OCR benchmark fixtures and asserts the mean character error rate
|
||||||
|
//! (CER) over the clean text set is `<= 0.05`, matching the spec gate.
|
||||||
|
//!
|
||||||
|
//! Model assets come from `KEBAB_TEST_OCR_MODEL_DIR` (default: the crate's
|
||||||
|
//! bundled `assets/paddleocr-onnx/`). Fixtures come from
|
||||||
|
//! `KEBAB_TEST_OCR_FIXTURE_DIR` (default: the dogfood corpus). If either is
|
||||||
|
//! absent the test skips with a warning rather than failing — CI without the
|
||||||
|
//! large models / fixtures stays green (plan T0/M4).
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use kebab_parse_image::{ModelPaths, OcrEngine, OnnxPaddleOcr};
|
||||||
|
|
||||||
|
/// Collapse every run of whitespace to a single ASCII space and trim both ends.
///
/// Intended to match the Python `score_lib.norm` so the Rust gate and the
/// bench harness agree on the text being scored.
fn norm(s: &str) -> String {
    // `split_whitespace` never yields empty tokens and ignores leading and
    // trailing whitespace, so re-joining with one space is the whole job.
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
|
||||||
|
|
||||||
|
/// Character error rate = Levenshtein(gt, pred) / len(gt), both normalized.
|
||||||
|
fn cer(gt: &str, pred: &str) -> f64 {
|
||||||
|
let g: Vec<char> = norm(gt).chars().collect();
|
||||||
|
let p: Vec<char> = norm(pred).chars().collect();
|
||||||
|
if g.is_empty() {
|
||||||
|
return if p.is_empty() { 0.0 } else { 1.0 };
|
||||||
|
}
|
||||||
|
let (m, n) = (g.len(), p.len());
|
||||||
|
let mut prev: Vec<usize> = (0..=n).collect();
|
||||||
|
for i in 1..=m {
|
||||||
|
let mut cur = vec![i; n + 1];
|
||||||
|
for j in 1..=n {
|
||||||
|
let cost = if g[i - 1] == p[j - 1] { 0 } else { 1 };
|
||||||
|
cur[j] = (prev[j] + 1).min(cur[j - 1] + 1).min(prev[j - 1] + cost);
|
||||||
|
}
|
||||||
|
prev = cur;
|
||||||
|
}
|
||||||
|
prev[n] as f64 / m as f64
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Directory the benchmark fixtures are read from.
///
/// Uses `KEBAB_TEST_OCR_FIXTURE_DIR` when it is set, otherwise the hard-coded
/// default corpus location. The variable is read with `var_os`, so an
/// override that is not valid UTF-8 is still honoured rather than silently
/// replaced by the default.
fn fixture_dir() -> PathBuf {
    std::env::var_os("KEBAB_TEST_OCR_FIXTURE_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from("/build/dogfood/corpus/images/synthetic-ocr-bench"))
}
|
||||||
|
|
||||||
|
/// T10: undecodable image bytes must surface as an error (the kebab-app caller
/// then skips the asset + records provenance), not panic or return garbage.
///
/// Skips (returns early with a note on stderr) when any model asset is missing.
#[test]
fn paddle_onnx_decode_failure_is_error() {
    let paths = ModelPaths::from_default_dir();
    let assets_present = paths.det.exists() && paths.rec.exists() && paths.dict.exists();
    if !assets_present {
        eprintln!("SKIP paddle_onnx_decode_failure_is_error: model assets not found");
        return;
    }

    let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600)
        .expect("build OnnxPaddleOcr from model assets");
    let err = engine
        .recognize(b"not a real image", None)
        .expect_err("garbage bytes must fail to decode");

    // `{:#}` flattens the error chain so the context string is searchable.
    let msg = format!("{err:#}");
    assert!(msg.contains("decoding image"), "unexpected error: {msg}");
}
|
||||||
|
|
||||||
|
/// T11 accuracy gate: runs the engine over every fixture named in `gt.json`,
/// prints a per-fixture CER line, and asserts that the mean CER over the
/// "clean" gate set is at most `0.05`.
///
/// Skips (returns early with a note on stderr) when the model assets or the
/// fixture directory's `gt.json` are missing. Set `KEBAB_OCR_DUMP` to also
/// print the normalized ground truth and prediction for each fixture.
#[test]
fn paddle_onnx_cer_gate() {
    let paths = ModelPaths::from_default_dir();
    if !paths.det.exists() || !paths.rec.exists() || !paths.dict.exists() {
        eprintln!(
            "SKIP paddle_onnx_cer_gate: model assets not found (det={}). \
             Set KEBAB_TEST_OCR_MODEL_DIR or place assets/paddleocr-onnx/.",
            paths.det.display()
        );
        return;
    }
    let fdir = fixture_dir();
    let gt_path = fdir.join("gt.json");
    if !gt_path.exists() {
        eprintln!(
            "SKIP paddle_onnx_cer_gate: fixtures not found at {}",
            fdir.display()
        );
        return;
    }

    // gt.json maps fixture file name -> expected text.
    let gt_raw = std::fs::read_to_string(&gt_path).expect("read gt.json");
    let gt: HashMap<String, String> = serde_json::from_str(&gt_raw).expect("parse gt.json");

    let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600)
        .expect("build OnnxPaddleOcr from bundled assets");

    // "clean" set used for the gate — the standard, well-formed text fixtures.
    // low_contrast / small_dense are intentionally hard and tracked but not
    // part of the hard gate.
    let gate_set = [
        "clean_paragraph.png",
        "title_body.png",
        "tech_terms.png",
        "korean_heavy.png",
        "numbers_table.png",
    ];

    // Read once: the environment does not change while the loop runs.
    let dump = std::env::var("KEBAB_OCR_DUMP").is_ok();

    // Sort by fixture name so the report order is stable across runs
    // (HashMap iteration order is not).
    let mut fixtures: Vec<(&String, &String)> = gt.iter().collect();
    fixtures.sort_by(|a, b| a.0.cmp(b.0));

    let mut gate_cers = Vec::new();
    println!("\n=== paddle-onnx CER per fixture ===");
    for (name, expected) in fixtures {
        let img_path = fdir.join(name);
        if !img_path.exists() {
            // Listed in gt.json but no image on disk: not scored.
            eprintln!("  (no image for {name}, not scored)");
            continue;
        }
        let bytes = std::fs::read(&img_path).expect("read fixture image");
        let t0 = std::time::Instant::now();
        let out = engine.recognize(&bytes, None).expect("recognize");
        let dt = t0.elapsed();
        let c = cer(expected, &out.joined);
        if dump {
            println!(" GT [{name}]: {:?}", norm(expected));
            println!(" OUT [{name}]: {:?}", norm(&out.joined));
        }
        let gated = gate_set.contains(&name.as_str());
        println!(
            "{:<22} CER={:.4} {} ({} regions, {} ms)",
            name,
            c,
            if gated { "[gate]" } else { " " },
            out.regions.len(),
            dt.as_millis()
        );
        if gated {
            gate_cers.push(c);
        }
    }

    assert!(!gate_cers.is_empty(), "no gate fixtures were scored");
    let mean = gate_cers.iter().sum::<f64>() / gate_cers.len() as f64;
    println!("=== mean gate CER = {mean:.4} (threshold 0.05) ===\n");
    assert!(
        mean <= 0.05,
        "paddle-onnx mean CER {mean:.4} exceeds 0.05 gate"
    );
}
|
||||||
Reference in New Issue
Block a user