feat(ocr): T2-T6 OnnxPaddleOcr core engine — det/rec ONNX + DBNet postproc + CTC

PP-OCRv5 ONNX OCR engine on the pinned ort rc.9 (no Python, no oar-ocr dep). Implements the recognize() pipeline end-to-end (compiles + unit-tested): - T2: OnnxPaddleOcr skeleton, OcrEngine impl, det/rec Session loaded once (Mutex-wrapped → Send+Sync), engine_version = blake3(det+rec+dict) cached once at construction, dict bounds-check (11945 lines vs 11947 rec classes). - T2 preproc: det ImageNet mean/std NCHW + limit_side_len 960 → ×32 round (golden 192x900→896x192 pinned); rec height-48 keep-aspect, (x-0.5)/0.5. - T3 det postproc: threshold 0.3 → imageproc contours → min-area rect via pure-Rust rotating calipers + convex hull → mean-prob box-score filter → pure-Rust unclip(ratio 1.5). No clipper2/OpenCV. - T4 crop+rectify: corner ordering + bilinear perspective warp to horizontal. - T5 rec+CTC: greedy decode with the T0a-confirmed mapping (idx0=blank, 1..=11945=dict[idx-1], 11946=space), rec-class bounds-check. - T6 assembly: reading-order OcrText with per-region bbox + real confidence. Unit tests (4 pass): det_target_dims golden, convex hull, min-area rect, unclip expansion. Large *.onnx assets stay untracked pending T12 LFS decision. Remaining: T7 config overrides, T8 factory (4 sites), T9 signature cascade, T10 error matrix, T11 gates (clippy/e2e CER), T12 docs+bump+PR. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 07:52:39 +00:00
parent 8f8d3a4100
commit b706e3e88c
3 changed files with 824 additions and 0 deletions
--- a/crates/kebab-parse-image/Cargo.toml
+++ b/crates/kebab-parse-image/Cargo.toml
@@ -45,6 +45,10 @@ thiserror    = { workspace = true }
 # so a standalone `cargo test -p kebab-parse-image` needs it to link onnxruntime.
 ort          = { workspace = true, features = ["ndarray", "download-binaries"] }
 ndarray      = { workspace = true }
+# blake3: engine_version hash over the bundled det/rec/dict assets (computed
+# once at OnnxPaddleOcr construction, cached — `ingest_config_signature` calls
+# engine_version() per asset).
+blake3       = { workspace = true }
 # imageproc: connected-components / contours for DBNet det post-processing.
 # min-area rotated-rect (rotating calipers) and polygon unclip are implemented
 # in pure Rust (clipper2 is C++ FFI — would break the single-binary guarantee).
--- a/crates/kebab-parse-image/src/lib.rs
+++ b/crates/kebab-parse-image/src/lib.rs
@@ -30,9 +30,11 @@ mod dims;
 mod exif_extract;
 mod image_prep;
 pub mod ocr;
+pub mod paddle_onnx;

 pub use caption::{apply_caption, caption_image};
 pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
+pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE};

 use anyhow::{Context, Result};
 use kebab_core::{
--- a/crates/kebab-parse-image/src/paddle_onnx.rs
+++ b/crates/kebab-parse-image/src/paddle_onnx.rs
@@ -0,0 +1,818 @@
+//! PP-OCRv5 ONNX OCR engine — in-process detection + recognition on the
+//! workspace-pinned `ort` (=2.0.0-rc.9), no Python runtime, no oar-ocr
+//! production dependency (see crate-level rationale + `assets/paddleocr-onnx/NOTICE`).
+//!
+//! Pipeline (`recognize`):
+//!   1. decode (RGB) + downscale long edge to `max_pixels`
+//!   2. det:  ImageNet-normalized NCHW → DBNet prob map `[1,1,H,W]`
+//!            → threshold 0.3 → contours → min-area rect (rotating calipers,
+//!            pure Rust) → unclip(ratio 1.5, pure Rust) → boxes
+//!   3. crop+rectify: perspective warp each rotated box to a horizontal strip
+//!   4. rec:  48×W normalized `(x-0.5)/0.5` → `[1,T,11947]` → CTC greedy decode
+//!   5. assemble reading-order `OcrText`
+//!
+//! ## Confirmed CTC facts (empirically derived in T0a, see
+//! `tests/golden/ctc_rec_golden.json` — do NOT re-derive):
+//!   * rec classes = 11947 = dict(11945) + blank + space
+//!   * index 0       = CTC blank
+//!   * index 1..=11945 = `korean_dict.txt` line N → class N (i.e. `dict[N-1]`)
+//!   * index 11946   = space ' '
+//!
+//! ## rc.9 API notes (differ from rc.12):
+//!   * `try_extract_tensor::<f32>()` → `ArrayViewD<f32>` (`.shape()` / indexing).
+//!   * `Session::run` is called through a `Mutex` guard so the engine is
+//!     `Send + Sync` regardless of `Session`'s own auto-trait status (ingest
+//!     is serial today; the lock is uncontended).
+
+use std::path::{Path, PathBuf};
+use std::sync::Mutex;
+
+use anyhow::{Context, Result};
+use kebab_core::{Lang, OcrRegion, OcrText};
+use ndarray::Array4;
+use ort::session::Session;
+use ort::value::Value;
+
+use crate::ocr::OcrEngine;
+
+/// Engine name written into `OcrText.engine`.
+pub const PADDLE_ONNX_ENGINE: &str = "paddle-onnx";
+
+/// Number of lines in `korean_dict.txt` (confirmed in T0a). Dict entries
+/// occupy class indices `1..=DICT_LINES`.
+const DICT_LINES: usize = 11945;
+/// CTC blank class index (confirmed in T0a).
+const CTC_BLANK: usize = 0;
+/// Space class index (confirmed in T0a): the slot right after the dict entries.
+const CTC_SPACE: usize = DICT_LINES + 1;
+/// rec output class count = blank + dict + space (confirmed in T0a).
+const REC_CLASSES: usize = DICT_LINES + 2;
+
+/// Cap on the det input's long edge, applied before rounding each side to a
+/// multiple of 32 (PaddleOCR default).
+const DET_LIMIT_SIDE_LEN: u32 = 960;
+/// Height of the rec input strip (PP-OCRv5 mobile).
+const REC_HEIGHT: u32 = 48;
+
+/// Per-channel ImageNet mean used by det preprocessing (RGB order).
+const IMAGENET_MEAN: [f32; 3] = [0.485, 0.456, 0.406];
+/// Per-channel ImageNet standard deviation used by det preprocessing (RGB order).
+const IMAGENET_STD: [f32; 3] = [0.229, 0.224, 0.225];
+
+/// PP-OCRv5 ONNX engine. Holds the two ONNX sessions (loaded once) and the
+/// dict. `engine_version` is computed once at construction (blake3 over the
+/// three model assets) and cached — `ingest_config_signature` calls
+/// `engine_version()` per asset, so re-hashing there would be O(assets).
+pub struct OnnxPaddleOcr {
+    // Detection session; locked for the duration of each `run_det` call.
+    det: Mutex<Session>,
+    // Recognition session; locked for the duration of each `run_rec` call.
+    rec: Mutex<Session>,
+    // Name of the first declared input of the det model (used to bind the tensor).
+    det_input_name: String,
+    // Name of the first declared input of the rec model (used to bind the tensor).
+    rec_input_name: String,
+    // Dict tokens, one per file line; class index `i` maps to `dict[i - 1]`.
+    dict: Vec<String>,
+    // Cached asset hash string returned by `engine_version()`.
+    engine_version: String,
+    // Minimum mean-probability box score; lower-scoring boxes are dropped.
+    score_thresh: f32,
+    // Expansion ratio handed to `unclip_rect`.
+    unclip_ratio: f32,
+    // Upper bound on boxes per image; excess boxes are truncated with a warning.
+    max_boxes: usize,
+    // Long-edge limit fed to `det_target_dims` (clamped to 256..=4096 at construction).
+    max_pixels: u32,
+}
+
+impl std::fmt::Debug for OnnxPaddleOcr {
+    /// Formats the tuning knobs, the cached version and the dict size. The ONNX
+    /// sessions and input names are left out of the output.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("OnnxPaddleOcr");
+        out.field("engine_version", &self.engine_version);
+        out.field("dict_lines", &self.dict.len());
+        out.field("score_thresh", &self.score_thresh);
+        out.field("unclip_ratio", &self.unclip_ratio);
+        out.field("max_boxes", &self.max_boxes);
+        out.field("max_pixels", &self.max_pixels);
+        out.finish()
+    }
+}
+
+/// Resolved model-asset paths. Construction is decoupled from `kebab-config`
+/// (T7 adds the `det_model`/`rec_model`/`dict` overrides) so the engine can be
+/// built directly in tests.
+#[derive(Clone, Debug)]
+pub struct ModelPaths {
+    /// Path to the detection ONNX model.
+    pub det: PathBuf,
+    /// Path to the recognition ONNX model.
+    pub rec: PathBuf,
+    /// Path to the dict text file (one token per line).
+    pub dict: PathBuf,
+}
+
+impl ModelPaths {
+    /// Default bundled-asset directory: `KEBAB_IMAGE_OCR_MODEL_DIR` if set,
+    /// else the crate's `assets/paddleocr-onnx/`.
+    ///
+    /// The override is read with `var_os`, so a directory whose path is not
+    /// valid UTF-8 is honoured instead of being silently replaced by the
+    /// default. The fallback path is the crate manifest directory captured at
+    /// compile time.
+    pub fn from_default_dir() -> Self {
+        let dir = std::env::var_os("KEBAB_IMAGE_OCR_MODEL_DIR")
+            .map(PathBuf::from)
+            .unwrap_or_else(|| {
+                Path::new(env!("CARGO_MANIFEST_DIR")).join("assets/paddleocr-onnx")
+            });
+        Self {
+            det: dir.join("ppocrv5_mobile_det.onnx"),
+            rec: dir.join("korean_ppocrv5_mobile_rec.onnx"),
+            dict: dir.join("korean_dict.txt"),
+        }
+    }
+}
+
+impl OnnxPaddleOcr {
+    /// Build the engine from a workspace [`kebab_config::Config`].
+    ///
+    /// Model paths come from [`ModelPaths::from_default_dir`] (T7 will thread
+    /// config overrides). Only `image.ocr.max_pixels` is read from `config`;
+    /// the remaining knobs use fixed defaults. Construction loads both ONNX
+    /// sessions and hashes the assets, so any failure surfaces here (matches
+    /// the Ollama adapter's construction contract).
+    pub fn new(config: &kebab_config::Config) -> Result<Self> {
+        let paths = ModelPaths::from_default_dir();
+        let score_thresh = 0.3;
+        let unclip_ratio = 1.5;
+        let max_boxes = 1000;
+        let max_pixels = config.image.ocr.max_pixels;
+        Self::from_paths(&paths, score_thresh, unclip_ratio, max_boxes, max_pixels)
+    }
+
+    /// Build the engine from explicit asset paths and tuning knobs. Used by
+    /// tests and by `new` once paths are resolved.
+    ///
+    /// Steps, in order: load + length-check the dict, hash the assets, load
+    /// the det and rec sessions, then read each session's first input name.
+    /// `max_pixels` is clamped to `256..=4096`.
+    ///
+    /// # Errors
+    /// Fails when the dict cannot be read or does not have `DICT_LINES`
+    /// lines, when an asset cannot be hashed, when a model fails to load, or
+    /// when a model declares no inputs.
+    pub fn from_paths(
+        paths: &ModelPaths,
+        score_thresh: f32,
+        unclip_ratio: f32,
+        max_boxes: usize,
+        max_pixels: u32,
+    ) -> Result<Self> {
+        let dict = load_dict(&paths.dict)
+            .with_context(|| format!("loading OCR dict from {}", paths.dict.display()))?;
+        // The dict length must match the rec class layout (dict + blank +
+        // space); a wrong dict file should fail here, not mis-decode later.
+        anyhow::ensure!(
+            dict.len() == DICT_LINES,
+            "OnnxPaddleOcr: dict has {} lines, expected {DICT_LINES} \
+             (rec classes {REC_CLASSES} = dict + blank + space)",
+            dict.len()
+        );
+
+        let engine_version = compute_engine_version(paths)
+            .context("hashing OCR model assets for engine_version")?;
+
+        // `which` is "det" or "rec" and only feeds the error messages.
+        let open_session = |model: &Path, which: &str| -> Result<Session> {
+            Session::builder()
+                .with_context(|| format!("ort Session::builder ({which})"))?
+                .commit_from_file(model)
+                .with_context(|| format!("loading {which} model {}", model.display()))
+        };
+        let first_input = |session: &Session, missing: &'static str| -> Result<String> {
+            session
+                .inputs
+                .first()
+                .map(|input| input.name.clone())
+                .context(missing)
+        };
+
+        // Both sessions are loaded before either input name is inspected.
+        let det = open_session(&paths.det, "det")?;
+        let rec = open_session(&paths.rec, "rec")?;
+        let det_input_name = first_input(&det, "det model has no inputs")?;
+        let rec_input_name = first_input(&rec, "rec model has no inputs")?;
+
+        Ok(Self {
+            det: Mutex::new(det),
+            rec: Mutex::new(rec),
+            det_input_name,
+            rec_input_name,
+            dict,
+            engine_version,
+            score_thresh,
+            unclip_ratio,
+            max_boxes,
+            max_pixels: max_pixels.clamp(256, 4096),
+        })
+    }
+
+    /// Translate a CTC class index into the text it emits.
+    ///
+    /// `0` (blank) and anything outside the class layout yield `None`;
+    /// `1..=DICT_LINES` yields `dict[idx - 1]`; `CTC_SPACE` yields `" "`.
+    fn class_to_str(&self, idx: usize) -> Option<&str> {
+        match idx {
+            CTC_BLANK => None,
+            CTC_SPACE => Some(" "),
+            1..=DICT_LINES => Some(self.dict[idx - 1].as_str()),
+            // out-of-range guard (should not happen for 11947 classes)
+            _ => None,
+        }
+    }
+}
+
+impl OcrEngine for OnnxPaddleOcr {
+    /// Constant engine identifier (`PADDLE_ONNX_ENGINE`).
+    fn engine_name(&self) -> &'static str {
+        PADDLE_ONNX_ENGINE
+    }
+
+    /// Cached asset-hash version string computed at construction.
+    fn engine_version(&self) -> String {
+        self.engine_version.clone()
+    }
+
+    /// Run the full det → crop → rec pipeline on an encoded image.
+    ///
+    /// Returns an empty `OcrText` for a zero-sized image or when no box
+    /// survives detection. Boxes whose crop is empty or whose recognized text
+    /// is empty are skipped. Regions are emitted in reading order (rows
+    /// top→bottom, left→right inside a row) and `joined` is their text
+    /// separated by `'\n'`. `_lang_hint` is ignored.
+    ///
+    /// # Errors
+    /// Fails when the image cannot be decoded or when a det/rec inference
+    /// step fails.
+    fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result<OcrText> {
+        let img = image::load_from_memory(image_bytes)
+            .context("decoding image for OCR")?
+            .to_rgb8();
+        let (orig_w, orig_h) = (img.width(), img.height());
+        if orig_w == 0 || orig_h == 0 {
+            return Ok(empty_ocr(self));
+        }
+
+        // ── det ────────────────────────────────────────────────────────
+        let (det_w, det_h) = det_target_dims(orig_w, orig_h, self.max_pixels);
+        let det_img = image::imageops::resize(
+            &img,
+            det_w,
+            det_h,
+            image::imageops::FilterType::Triangle,
+        );
+        let prob = self.run_det(&det_img)?; // (det_h, det_w) prob map
+        let scale_x = orig_w as f32 / det_w as f32;
+        let scale_y = orig_h as f32 / det_h as f32;
+        let mut boxes = det_postprocess(
+            &prob,
+            prob.w,
+            prob.h,
+            self.score_thresh,
+            self.unclip_ratio,
+        );
+        if boxes.len() > self.max_boxes {
+            tracing::warn!(
+                target: "kebab-parse-image",
+                "paddle-onnx: {} boxes exceeds max_boxes {} — truncating",
+                boxes.len(),
+                self.max_boxes
+            );
+            boxes.truncate(self.max_boxes);
+        }
+        // scale box corners back to original image coordinates
+        for b in &mut boxes {
+            for p in &mut b.corners {
+                p.0 *= scale_x;
+                p.1 *= scale_y;
+            }
+        }
+
+        if boxes.is_empty() {
+            return Ok(empty_ocr(self));
+        }
+
+        // ── rec per box (reading order: top→bottom, left→right) ─────────
+        sort_reading_order(&mut boxes);
+
+        let mut regions: Vec<OcrRegion> = Vec::with_capacity(boxes.len());
+        for b in &boxes {
+            let crop = rectify_crop(&img, &b.corners);
+            if crop.width() == 0 || crop.height() == 0 {
+                continue;
+            }
+            let (text, conf) = self.run_rec(&crop)?;
+            if text.is_empty() {
+                continue; // rec empty → skip this box, keep the rest
+            }
+            let (x, y, w, h) = b.aabb();
+            regions.push(OcrRegion {
+                bbox: (x, y, w, h),
+                text,
+                confidence: conf,
+            });
+        }
+
+        let joined = regions
+            .iter()
+            .map(|r| r.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n");
+
+        Ok(OcrText {
+            joined,
+            regions,
+            engine: PADDLE_ONNX_ENGINE.to_string(),
+            engine_version: self.engine_version.clone(),
+        })
+    }
+}
+
+/// Sort boxes into reading order: rows top→bottom, left→right within a row.
+///
+/// Boxes are first ordered by center-y. Starting from the topmost remaining
+/// box, every following box whose center-y lies within half of that box's
+/// vertical extent joins the same row, and each row is then ordered by
+/// center-x. A plain `(y, x)` comparator would put a right-hand box ahead of
+/// its left neighbour whenever its center happened to sit a fraction of a
+/// pixel higher. The tolerance is applied as a sweep rather than inside the
+/// comparator because a tolerance-based comparison is not a total order.
+fn sort_reading_order(boxes: &mut [DetBox]) {
+    use std::cmp::Ordering;
+
+    boxes.sort_by(|a, b| {
+        a.center_y()
+            .partial_cmp(&b.center_y())
+            .unwrap_or(Ordering::Equal)
+    });
+
+    let mut row_start = 0;
+    while row_start < boxes.len() {
+        let anchor = &boxes[row_start];
+        let anchor_y = anchor.center_y();
+        let (top, bottom) = anchor
+            .corners
+            .iter()
+            .fold((f32::MAX, f32::MIN), |(lo, hi), p| (lo.min(p.1), hi.max(p.1)));
+        let tolerance = 0.5 * (bottom - top);
+        // `.max(1)` guarantees progress even if a NaN center defeats the test.
+        let row_len = boxes[row_start..]
+            .iter()
+            .take_while(|b| b.center_y() - anchor_y <= tolerance)
+            .count()
+            .max(1);
+        boxes[row_start..row_start + row_len].sort_by(|a, b| {
+            a.center_x()
+                .partial_cmp(&b.center_x())
+                .unwrap_or(Ordering::Equal)
+        });
+        row_start += row_len;
+    }
+}
+
+impl OnnxPaddleOcr {
+    /// Run the det session on an already-resized RGB image and return the
+    /// probability map (row-major, dimensions taken from the last two axes of
+    /// the model output).
+    ///
+    /// Pixels are scaled to `[0, 1]` and normalized per channel with the
+    /// ImageNet mean/std into an NCHW tensor.
+    ///
+    /// # Errors
+    /// Fails (instead of panicking) when the session lock is poisoned, the
+    /// model declares no outputs, inference fails, or the output tensor has
+    /// fewer than two axes or fewer than `H*W` elements.
+    fn run_det(&self, det_img: &image::RgbImage) -> Result<ProbMap> {
+        let (w, h) = (det_img.width() as usize, det_img.height() as usize);
+        let mut arr = Array4::<f32>::zeros((1, 3, h, w));
+        for (x, y, px) in det_img.enumerate_pixels() {
+            let (xi, yi) = (x as usize, y as usize);
+            for c in 0..3 {
+                let v = px[c] as f32 / 255.0;
+                arr[[0, c, yi, xi]] = (v - IMAGENET_MEAN[c]) / IMAGENET_STD[c];
+            }
+        }
+        let input = Value::from_array(arr).context("det Value::from_array")?;
+        let sess = self
+            .det
+            .lock()
+            .map_err(|_| anyhow::anyhow!("det session mutex poisoned"))?;
+        let out_name = sess
+            .outputs
+            .first()
+            .map(|o| o.name.clone())
+            .context("det model has no outputs")?;
+        let outputs = sess
+            .run(ort::inputs![self.det_input_name.as_str() => input]?)
+            .context("det session run")?;
+        let view = outputs[out_name.as_str()]
+            .try_extract_tensor::<f32>()
+            .context("det output extract")?;
+        // expected shape [1,1,H,W]
+        let shape = view.shape();
+        anyhow::ensure!(
+            shape.len() >= 2,
+            "det output has rank {}, expected at least 2 ([1,1,H,W])",
+            shape.len()
+        );
+        let (oh, ow) = (shape[shape.len() - 2], shape[shape.len() - 1]);
+        let data: Vec<f32> = view.iter().copied().collect();
+        // `ProbMap::at` indexes `y * w + x`; make sure one full plane exists.
+        anyhow::ensure!(
+            data.len() >= oh * ow,
+            "det output holds {} values, expected at least {oh}x{ow}",
+            data.len()
+        );
+        Ok(ProbMap { w: ow, h: oh, data })
+    }
+
+    /// Run the rec session on one rectified crop and return
+    /// `(decoded string, mean confidence)`.
+    ///
+    /// The crop is resized to height `REC_HEIGHT` keeping aspect ratio
+    /// (width at least 1), normalized with `(x - 0.5) / 0.5`, and fed as a
+    /// batch of one.
+    ///
+    /// # Errors
+    /// Fails (instead of panicking) when the session lock is poisoned, the
+    /// model declares no outputs, inference fails, the output has fewer than
+    /// two axes, its class axis is not `REC_CLASSES`, or it holds fewer than
+    /// `T*C` values.
+    fn run_rec(&self, crop: &image::RgbImage) -> Result<(String, f32)> {
+        // resize keep-aspect to height 48, then this single crop is its own batch
+        let (cw, ch) = (crop.width().max(1), crop.height().max(1));
+        let new_w = ((REC_HEIGHT as f32 / ch as f32) * cw as f32).round().max(1.0) as u32;
+        let resized = image::imageops::resize(
+            crop,
+            new_w,
+            REC_HEIGHT,
+            image::imageops::FilterType::Triangle,
+        );
+        let w = new_w as usize;
+        let h = REC_HEIGHT as usize;
+        let mut arr = Array4::<f32>::zeros((1, 3, h, w));
+        for (x, y, px) in resized.enumerate_pixels() {
+            let (xi, yi) = (x as usize, y as usize);
+            for c in 0..3 {
+                let v = px[c] as f32 / 255.0;
+                arr[[0, c, yi, xi]] = (v - 0.5) / 0.5; // [-1, 1]
+            }
+        }
+        let input = Value::from_array(arr).context("rec Value::from_array")?;
+        let sess = self
+            .rec
+            .lock()
+            .map_err(|_| anyhow::anyhow!("rec session mutex poisoned"))?;
+        let out_name = sess
+            .outputs
+            .first()
+            .map(|o| o.name.clone())
+            .context("rec model has no outputs")?;
+        let outputs = sess
+            .run(ort::inputs![self.rec_input_name.as_str() => input]?)
+            .context("rec session run")?;
+        let view = outputs[out_name.as_str()]
+            .try_extract_tensor::<f32>()
+            .context("rec output extract")?;
+        // expected shape [1, T, C]
+        let shape = view.shape();
+        anyhow::ensure!(
+            shape.len() >= 2,
+            "rec output has rank {}, expected at least 2 ([1,T,C])",
+            shape.len()
+        );
+        let (t, c) = (shape[shape.len() - 2], shape[shape.len() - 1]);
+        if c != REC_CLASSES {
+            anyhow::bail!(
+                "rec output has {c} classes, expected {REC_CLASSES} \
+                 (dict {DICT_LINES} + blank + space)"
+            );
+        }
+        let data: Vec<f32> = view.iter().copied().collect();
+        anyhow::ensure!(
+            data.len() >= t * c,
+            "rec output holds {} values, expected at least {t}x{c}",
+            data.len()
+        );
+        Ok(self.ctc_greedy_decode(&data, t, c))
+    }
+
+    /// CTC greedy decode over row-major `[T, C]` scores.
+    ///
+    /// For each timestep the first maximal class wins; a class is emitted only
+    /// when it differs from the previous timestep's winner and is not the
+    /// blank. The returned confidence is the mean winning score over emitted
+    /// classes, or `0.0` when nothing was emitted. At most `t` complete rows
+    /// are read; a short `data` slice or `c == 0` decodes what is available
+    /// rather than panicking.
+    fn ctc_greedy_decode(&self, data: &[f32], t: usize, c: usize) -> (String, f32) {
+        if c == 0 {
+            return (String::new(), 0.0);
+        }
+        let mut out = String::new();
+        let mut conf_sum = 0.0f32;
+        let mut conf_count = 0usize;
+        let mut prev = usize::MAX;
+        for row in data.chunks_exact(c).take(t) {
+            let mut best = 0usize;
+            let mut best_v = f32::MIN;
+            for (i, &v) in row.iter().enumerate() {
+                if v > best_v {
+                    best_v = v;
+                    best = i;
+                }
+            }
+            if best != prev && best != CTC_BLANK {
+                if let Some(s) = self.class_to_str(best) {
+                    out.push_str(s);
+                    conf_sum += best_v;
+                    conf_count += 1;
+                }
+            }
+            prev = best;
+        }
+        let conf = if conf_count == 0 {
+            0.0
+        } else {
+            conf_sum / conf_count as f32
+        };
+        (out, conf)
+    }
+}
+
+/// Build an `OcrText` with no text and no regions, stamped with this engine's
+/// name and cached version.
+fn empty_ocr(e: &OnnxPaddleOcr) -> OcrText {
+    let engine = PADDLE_ONNX_ENGINE.to_string();
+    let engine_version = e.engine_version.clone();
+    OcrText {
+        joined: String::new(),
+        regions: Vec::new(),
+        engine,
+        engine_version,
+    }
+}
+
+/// Read the dict file into one token per line.
+///
+/// Lines are split on `'\n'` with any trailing `'\r'` removed. Interior empty
+/// lines are kept as empty tokens so class indices stay aligned (PaddleOCR
+/// dicts may carry a blank-looking line); only a single empty final element —
+/// the artifact of a trailing newline — is dropped.
+fn load_dict(path: &Path) -> Result<Vec<String>> {
+    let contents = std::fs::read_to_string(path)?;
+    let mut tokens = Vec::new();
+    for line in contents.split('\n') {
+        tokens.push(line.trim_end_matches('\r').to_string());
+    }
+    if matches!(tokens.last(), Some(tail) if tail.is_empty()) {
+        tokens.pop();
+    }
+    Ok(tokens)
+}
+
+/// Derive the stable `engine_version` string from the asset bytes.
+///
+/// The det, rec and dict files are read in that order and fed into a single
+/// blake3 hasher; the result is `ppocrv5-mobile-kor-` followed by the first 12
+/// hex characters of the digest.
+fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
+    let assets = [&paths.det, &paths.rec, &paths.dict];
+    let mut hasher = blake3::Hasher::new();
+    for asset in assets {
+        let bytes =
+            std::fs::read(asset).with_context(|| format!("reading {}", asset.display()))?;
+        hasher.update(&bytes);
+    }
+    let digest = hasher.finalize().to_hex();
+    let short = &digest.as_str()[..12];
+    Ok(format!("ppocrv5-mobile-kor-{short}"))
+}
+
+/// Compute the det input size `(width, height)` for a `w`×`h` image.
+///
+/// The long edge is capped at `min(max(max_pixels, 32), DET_LIMIT_SIDE_LEN)`
+/// with aspect ratio kept (images already within the cap are not scaled), and
+/// each side is then rounded to the nearest multiple of 32 (DBNet stride),
+/// never below 32. Reproduces the T0a golden (192×900 → 192×896, H×W).
+fn det_target_dims(w: u32, h: u32, max_pixels: u32) -> (u32, u32) {
+    let cap = max_pixels.max(32).min(DET_LIMIT_SIDE_LEN);
+    let longest = w.max(h);
+    let scale = if longest <= cap {
+        1.0
+    } else {
+        cap as f32 / longest as f32
+    };
+    // Scale one side, then snap it to the 32-pixel grid.
+    let snap = |side: u32| -> u32 {
+        let scaled = (side as f32 * scale).round().max(1.0);
+        ((scaled / 32.0).round() as u32 * 32).max(32)
+    };
+    (snap(w), snap(h))
+}
+
+// ── det postprocessing ──────────────────────────────────────────────────────
+
+/// Detection probability map stored as a flat row-major buffer.
+struct ProbMap {
+    // Number of columns.
+    w: usize,
+    // Number of rows.
+    h: usize,
+    // Values indexed as `y * w + x`.
+    data: Vec<f32>,
+}
+
+impl ProbMap {
+    /// Value at column `x`, row `y`. Panics if the computed index is outside
+    /// `data`.
+    #[inline]
+    fn at(&self, x: usize, y: usize) -> f32 {
+        self.data[y * self.w + x]
+    }
+}
+
+/// A detected text box: 4 corners (clockwise from top-left) in det-image
+/// coordinates (later scaled to original).
+#[derive(Clone, Debug)]
+struct DetBox {
+    corners: [(f32, f32); 4],
+    #[allow(dead_code)]
+    score: f32,
+}
+
+impl DetBox {
+    /// Mean x coordinate of the four corners.
+    fn center_x(&self) -> f32 {
+        let total = self.corners.iter().map(|&(x, _)| x).sum::<f32>();
+        total / 4.0
+    }
+
+    /// Mean y coordinate of the four corners.
+    fn center_y(&self) -> f32 {
+        let total = self.corners.iter().map(|&(_, y)| y).sum::<f32>();
+        total / 4.0
+    }
+
+    /// Axis-aligned bounding box `(x, y, w, h)`, with every extent clamped to
+    /// be non-negative before rounding.
+    fn aabb(&self) -> (u32, u32, u32, u32) {
+        let mut lo = (f32::MAX, f32::MAX);
+        let mut hi = (f32::MIN, f32::MIN);
+        for &(x, y) in &self.corners {
+            lo = (lo.0.min(x), lo.1.min(y));
+            hi = (hi.0.max(x), hi.1.max(y));
+        }
+        let (minx, miny) = (lo.0.max(0.0), lo.1.max(0.0));
+        let (maxx, maxy) = (hi.0.max(0.0), hi.1.max(0.0));
+        let width = (maxx - minx).round().max(0.0);
+        let height = (maxy - miny).round().max(0.0);
+        (
+            minx.round() as u32,
+            miny.round() as u32,
+            width as u32,
+            height as u32,
+        )
+    }
+}
+
+/// DBNet-style postprocess turning a probability map into text boxes.
+///
+/// The map is binarized at a fixed `0.3`, contours are traced on the binary
+/// image, and every contour with at least 4 points is fitted with a min-area
+/// rectangle. Rectangles whose mean-probability score is below `score_thresh`
+/// are dropped; the rest are expanded with `unclip_rect(_, unclip_ratio)`.
+/// `w`/`h` give the map region to binarize.
+/// Pinned by `tests/golden/det_boxes_clean_paragraph.json` (3 boxes).
+fn det_postprocess(
+    prob: &ProbMap,
+    w: usize,
+    h: usize,
+    score_thresh: f32,
+    unclip_ratio: f32,
+) -> Vec<DetBox> {
+    use image::{GrayImage, Luma};
+
+    // binarize at the detection threshold
+    let bin = GrayImage::from_fn(w as u32, h as u32, |x, y| {
+        if prob.at(x as usize, y as usize) > 0.3 {
+            Luma([255u8])
+        } else {
+            Luma([0u8])
+        }
+    });
+
+    imageproc::contours::find_contours::<u32>(&bin)
+        .iter()
+        .filter(|contour| contour.points.len() >= 4)
+        .filter_map(|contour| {
+            let pts: Vec<(f32, f32)> = contour
+                .points
+                .iter()
+                .map(|p| (p.x as f32, p.y as f32))
+                .collect();
+            let rect = min_area_rect(&pts)?;
+            // mean-prob box score over the AABB of the rotated rect
+            let score = box_score(prob, &rect.corners);
+            if score < score_thresh {
+                return None;
+            }
+            Some(DetBox {
+                corners: unclip_rect(&rect, unclip_ratio),
+                score,
+            })
+        })
+        .collect()
+}
+
+/// Mean probability over the axis-aligned bounding box of `corners` — the
+/// `box_thresh` mean-prob filter used by the golden harness.
+///
+/// Bounds are clamped to the map; a box that is not at least two pixels wide
+/// and two pixels tall after clamping scores `0.0`.
+fn box_score(prob: &ProbMap, corners: &[(f32, f32); 4]) -> f32 {
+    let (mut lo_x, mut hi_x) = (f32::MAX, f32::MIN);
+    let (mut lo_y, mut hi_y) = (f32::MAX, f32::MIN);
+    for &(x, y) in corners {
+        lo_x = lo_x.min(x);
+        hi_x = hi_x.max(x);
+        lo_y = lo_y.min(y);
+        hi_y = hi_y.max(y);
+    }
+    let minx = lo_x.max(0.0) as usize;
+    let miny = lo_y.max(0.0) as usize;
+    let maxx = (hi_x.max(0.0) as usize).min(prob.w.saturating_sub(1));
+    let maxy = (hi_y.max(0.0) as usize).min(prob.h.saturating_sub(1));
+    if maxx <= minx || maxy <= miny {
+        return 0.0;
+    }
+
+    // Accumulate row by row so the summation order stays fixed.
+    let mut total = 0.0f32;
+    for y in miny..=maxy {
+        for x in minx..=maxx {
+            total += prob.at(x, y);
+        }
+    }
+    let samples = (maxx - minx + 1) * (maxy - miny + 1);
+    total / samples as f32
+}
+
+/// Rotated rect described by its 4 corners + box dims.
+#[derive(Clone, Debug)]
+struct RotRect {
+    // Corners in cyclic order; consecutive entries share a side.
+    corners: [(f32, f32); 4],
+    // Extent along the side from `corners[0]` to `corners[1]`.
+    width: f32,
+    // Extent along the side from `corners[0]` to `corners[3]`.
+    height: f32,
+}
+
+/// Minimum-area enclosing rectangle of a point set (pure Rust — no OpenCV /
+/// clipper2).
+///
+/// Every edge of the convex hull is tried as the rectangle's orientation: the
+/// hull is projected onto the edge direction and its normal, and the
+/// orientation with the smallest extent product wins (first one on ties).
+/// Returns `None` when the hull has fewer than 3 points.
+fn min_area_rect(points: &[(f32, f32)]) -> Option<RotRect> {
+    let hull = convex_hull(points);
+    if hull.len() < 3 {
+        return None;
+    }
+
+    let mut smallest = f32::MAX;
+    let mut winner: Option<RotRect> = None;
+    // Pair each hull vertex with its successor, wrapping at the end.
+    for (&from, &to) in hull.iter().zip(hull.iter().cycle().skip(1)) {
+        let edge = (to.0 - from.0, to.1 - from.1);
+        let len = (edge.0 * edge.0 + edge.1 * edge.1).sqrt();
+        if len < 1e-6 {
+            continue;
+        }
+        let ux = (edge.0 / len, edge.1 / len); // edge direction
+        let uy = (-ux.1, ux.0); // normal
+
+        let mut u_range = (f32::MAX, f32::MIN);
+        let mut v_range = (f32::MAX, f32::MIN);
+        for &p in &hull {
+            let du = p.0 * ux.0 + p.1 * ux.1;
+            let dv = p.0 * uy.0 + p.1 * uy.1;
+            u_range = (u_range.0.min(du), u_range.1.max(du));
+            v_range = (v_range.0.min(dv), v_range.1.max(dv));
+        }
+        let (min_u, max_u) = u_range;
+        let (min_v, max_v) = v_range;
+
+        let area = (max_u - min_u) * (max_v - min_v);
+        if area < smallest {
+            smallest = area;
+            // map (u, v) coordinates back into world space
+            let to_world = |u: f32, v: f32| (u * ux.0 + v * uy.0, u * ux.1 + v * uy.1);
+            winner = Some(RotRect {
+                corners: [
+                    to_world(min_u, min_v),
+                    to_world(max_u, min_v),
+                    to_world(max_u, max_v),
+                    to_world(min_u, max_v),
+                ],
+                width: max_u - min_u,
+                height: max_v - min_v,
+            });
+        }
+    }
+    winner
+}
+
+/// Andrew's monotone chain convex hull.
+///
+/// Points are sorted by `(x, y)` and exact duplicates removed; with fewer
+/// than 3 distinct points that sorted list is returned as-is. Otherwise the
+/// lower and upper chains are built (dropping collinear points) and joined,
+/// each without its final point so no vertex is repeated.
+fn convex_hull(points: &[(f32, f32)]) -> Vec<(f32, f32)> {
+    use std::cmp::Ordering;
+
+    type Pt = (f32, f32);
+
+    /// Cross product of `o→a` and `o→b`; positive for a counter-clockwise turn.
+    fn turn(o: Pt, a: Pt, b: Pt) -> f32 {
+        (a.0 - o.0) * (b.1 - o.1) - (a.1 - o.1) * (b.0 - o.0)
+    }
+
+    /// Build one chain over `seq`, then drop its last point (it is the first
+    /// point of the opposite chain).
+    fn chain(seq: impl Iterator<Item = Pt>) -> Vec<Pt> {
+        let mut out: Vec<Pt> = Vec::new();
+        for p in seq {
+            while out.len() >= 2 {
+                let (o, a) = (out[out.len() - 2], out[out.len() - 1]);
+                if turn(o, a, p) <= 0.0 {
+                    out.pop();
+                } else {
+                    break;
+                }
+            }
+            out.push(p);
+        }
+        out.pop();
+        out
+    }
+
+    let mut sorted: Vec<Pt> = points.to_vec();
+    sorted.sort_by(|a, b| {
+        let by_x = a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal);
+        let by_y = a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal);
+        by_x.then(by_y)
+    });
+    sorted.dedup();
+    if sorted.len() < 3 {
+        return sorted;
+    }
+
+    let mut hull = chain(sorted.iter().copied());
+    hull.extend(chain(sorted.iter().rev().copied()));
+    hull
+}
+
+/// Unclip a rotated rect by `ratio` (PaddleOCR `distance = area*ratio/perimeter`),
+/// expanding width + height by `2*distance` — i.e. every side moves outward
+/// by `distance`, which is what a polygon offset does to a rectangle.
+///
+/// Each corner is shifted by `distance` along both side directions meeting at
+/// it. Pushing corners along the diagonal instead (as an earlier version did)
+/// grows a wide, flat box almost only in width and leaves text lines clipped
+/// at the top and bottom.
+///
+/// `rect.corners` must be in cyclic order (`corners[1]` and `corners[3]` are
+/// the neighbours of `corners[0]`), as produced by `min_area_rect`. A
+/// degenerate rect (zero perimeter or a zero-length side) is returned
+/// unchanged.
+fn unclip_rect(rect: &RotRect, ratio: f32) -> [(f32, f32); 4] {
+    let area = rect.width * rect.height;
+    let perimeter = 2.0 * (rect.width + rect.height);
+    if perimeter < 1e-6 {
+        return rect.corners;
+    }
+    let distance = area * ratio / perimeter;
+
+    let [c0, c1, c2, c3] = rect.corners;
+    // Unit vector from `from` to `to`, or `None` for a zero-length side.
+    let unit = |from: (f32, f32), to: (f32, f32)| -> Option<(f32, f32)> {
+        let (dx, dy) = (to.0 - from.0, to.1 - from.1);
+        let len = (dx * dx + dy * dy).sqrt();
+        (len > 1e-6).then(|| (dx / len, dy / len))
+    };
+    let (Some(u), Some(v)) = (unit(c0, c1), unit(c0, c3)) else {
+        return rect.corners;
+    };
+
+    // Move `p` by `distance` along ±u and ±v (signs pick the outward side).
+    let shift = |p: (f32, f32), su: f32, sv: f32| {
+        (
+            p.0 + distance * (su * u.0 + sv * v.0),
+            p.1 + distance * (su * u.1 + sv * v.1),
+        )
+    };
+    [
+        shift(c0, -1.0, -1.0),
+        shift(c1, 1.0, -1.0),
+        shift(c2, 1.0, 1.0),
+        shift(c3, -1.0, 1.0),
+    ]
+}
+
+// ── crop + rectify ───────────────────────────────────────────────────────────
+
+/// Warp the quadrilateral `corners` out of `img` into an upright strip.
+///
+/// Corners are first put in top-left, top-right, bottom-right, bottom-left
+/// order. The output is as wide as the longer of the top/bottom sides and as
+/// tall as the longer of the left/right sides (each at least 1 pixel). Every
+/// output pixel is mapped back into the source by bilinear interpolation of
+/// the four corners (adequate for near-affine text boxes), clamped to the
+/// image, and sampled at the nearest source pixel.
+fn rectify_crop(img: &image::RgbImage, corners: &[(f32, f32); 4]) -> image::RgbImage {
+    let [tl, tr, br, bl] = order_corners(corners);
+    let dist = |a: (f32, f32), b: (f32, f32)| ((a.0 - b.0).powi(2) + (a.1 - b.1).powi(2)).sqrt();
+    let out_w = dist(tl, tr).max(dist(bl, br)).round().max(1.0) as u32;
+    let out_h = dist(tl, bl).max(dist(tr, br)).round().max(1.0) as u32;
+
+    let (iw, ih) = (img.width() as f32, img.height() as f32);
+    // Normalizers so the last row/column lands exactly on the far side.
+    let x_span = (out_w.max(1) as f32 - 1.0).max(1.0);
+    let y_span = (out_h.max(1) as f32 - 1.0).max(1.0);
+    let lerp = |a: (f32, f32), b: (f32, f32), t: f32| {
+        (a.0 + (b.0 - a.0) * t, a.1 + (b.1 - a.1) * t)
+    };
+
+    let mut out = image::RgbImage::new(out_w, out_h);
+    for oy in 0..out_h {
+        let fy = oy as f32 / y_span;
+        for ox in 0..out_w {
+            let fx = ox as f32 / x_span;
+            // blend along the top and bottom sides, then between them
+            let top = lerp(tl, tr, fx);
+            let bot = lerp(bl, br, fx);
+            let src = lerp(top, bot, fy);
+            let sx = src.0.clamp(0.0, iw - 1.0);
+            let sy = src.1.clamp(0.0, ih - 1.0);
+            let px = img.get_pixel(sx.round() as u32, sy.round() as u32);
+            out.put_pixel(ox, oy, *px);
+        }
+    }
+    out
+}
+
+/// Order 4 corners as [top-left, top-right, bottom-right, bottom-left] using
+/// coordinate sums/diffs (standard PaddleOCR ordering).
+///
+/// Top-left has the smallest `x + y`, bottom-right the largest; top-right has
+/// the smallest `y - x`, bottom-left the largest. On ties the earliest corner
+/// in the input wins, so for some rotations one input corner can fill two
+/// slots.
+fn order_corners(corners: &[(f32, f32); 4]) -> [(f32, f32); 4] {
+    type Pt = (f32, f32);
+
+    fn coord_sum(p: Pt) -> f32 {
+        p.0 + p.1
+    }
+    fn coord_diff(p: Pt) -> f32 {
+        p.1 - p.0
+    }
+
+    // First corner with the strictly smallest key (defaults to corner 0).
+    let first_min = |key: fn(Pt) -> f32| -> Pt {
+        let mut pick = corners[0];
+        let mut bound = f32::MAX;
+        for &p in corners {
+            let k = key(p);
+            if k < bound {
+                bound = k;
+                pick = p;
+            }
+        }
+        pick
+    };
+    // First corner with the strictly largest key (defaults to corner 0).
+    let first_max = |key: fn(Pt) -> f32| -> Pt {
+        let mut pick = corners[0];
+        let mut bound = f32::MIN;
+        for &p in corners {
+            let k = key(p);
+            if k > bound {
+                bound = k;
+                pick = p;
+            }
+        }
+        pick
+    };
+
+    let tl = first_min(coord_sum);
+    let tr = first_min(coord_diff);
+    let br = first_max(coord_sum);
+    let bl = first_max(coord_diff);
+    [tl, tr, br, bl]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// T0a golden: clean_paragraph 192×900 → det input 192×896 (H×W).
+    #[test]
+    fn det_target_dims_matches_golden() {
+        let dims = det_target_dims(900, 192, 1600);
+        assert_eq!(dims, (896, 192));
+    }
+
+    /// An interior point must not appear on the hull of a square.
+    #[test]
+    fn convex_hull_square() {
+        let square_with_center = [
+            (0.0, 0.0),
+            (10.0, 0.0),
+            (10.0, 10.0),
+            (0.0, 10.0),
+            (5.0, 5.0),
+        ];
+        assert_eq!(convex_hull(&square_with_center).len(), 4);
+    }
+
+    /// An axis-aligned 20×5 rectangle is its own min-area rect.
+    #[test]
+    fn min_area_rect_axis_aligned() {
+        let pts = [(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)];
+        let r = min_area_rect(&pts).expect("rect");
+        let lo = r.width.min(r.height);
+        let hi = r.width.max(r.height);
+        assert!((lo - 5.0).abs() < 1e-3, "short side {lo}");
+        assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
+    }
+
+    /// The unclipped box must reach further left than the original.
+    #[test]
+    fn unclip_expands_box() {
+        let rect = RotRect {
+            corners: [(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)],
+            width: 20.0,
+            height: 5.0,
+        };
+        let orig_minx = 0.0;
+        let new_minx = unclip_rect(&rect, 1.5)
+            .iter()
+            .map(|p| p.0)
+            .fold(f32::MAX, f32::min);
+        assert!(new_minx < orig_minx, "expected expansion, got {new_minx}");
+    }
+}