feat(ocr): T2-T6 OnnxPaddleOcr core engine — det/rec ONNX + DBNet postproc + CTC

PP-OCRv5 ONNX OCR engine on the pinned ort rc.9 (no Python, no oar-ocr dep).
Implements the recognize() pipeline end-to-end (compiles + unit-tested):

- T2: OnnxPaddleOcr skeleton, OcrEngine impl, det/rec Session loaded once
  (Mutex-wrapped → Send+Sync), engine_version = blake3(det+rec+dict) cached
  once at construction, dict bounds-check (11945 lines vs 11947 rec classes).
- T2 preproc: det ImageNet mean/std NCHW + limit_side_len 960 → ×32 round
  (golden 900x192 → 896x192, both W×H, pinned); rec height-48 keep-aspect, (x-0.5)/0.5.
- T3 det postproc: threshold 0.3 → imageproc contours → min-area rect via
  pure-Rust rotating calipers + convex hull → mean-prob box-score filter →
  pure-Rust unclip(ratio 1.5). No clipper2/OpenCV.
- T4 crop+rectify: corner ordering + bilinear perspective warp to horizontal.
- T5 rec+CTC: greedy decode with the T0a-confirmed mapping
  (idx0=blank, 1..=11945=dict[idx-1], 11946=space), rec-class bounds-check.
- T6 assembly: reading-order OcrText with per-region bbox + real confidence.

Unit tests (4 pass): det_target_dims golden, convex hull, min-area rect,
unclip expansion. Large *.onnx assets stay untracked pending T12 LFS decision.

Remaining: T7 config overrides, T8 factory (4 sites), T9 signature cascade,
T10 error matrix, T11 gates (clippy/e2e CER), T12 docs+bump+PR.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-04 07:52:39 +00:00
parent 8f8d3a4100
commit b706e3e88c
3 changed files with 824 additions and 0 deletions

View File

@@ -45,6 +45,10 @@ thiserror = { workspace = true }
# so a standalone `cargo test -p kebab-parse-image` needs it to link onnxruntime.
ort = { workspace = true, features = ["ndarray", "download-binaries"] }
ndarray = { workspace = true }
# blake3: engine_version hash over the bundled det/rec/dict assets (computed
# once at OnnxPaddleOcr construction, cached — `ingest_config_signature` calls
# engine_version() per asset).
blake3 = { workspace = true }
# imageproc: connected-components / contours for DBNet det post-processing.
# min-area rotated-rect (rotating calipers) and polygon unclip are implemented
# in pure Rust (clipper2 is C++ FFI — would break the single-binary guarantee).

View File

@@ -30,9 +30,11 @@ mod dims;
mod exif_extract;
mod image_prep;
pub mod ocr;
pub mod paddle_onnx;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE};
use anyhow::{Context, Result};
use kebab_core::{

View File

@@ -0,0 +1,818 @@
//! PP-OCRv5 ONNX OCR engine — in-process detection + recognition on the
//! workspace-pinned `ort` (=2.0.0-rc.9), no Python runtime, no oar-ocr
//! production dependency (see crate-level rationale + `assets/paddleocr-onnx/NOTICE`).
//!
//! Pipeline (`recognize`):
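//! 1. decode (RGB); only the det input is resized — long edge capped at
//! `min(max_pixels, 960)`, each side rounded to a multiple of 32 — while
//! rec crops are taken from the full-resolution image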
//! 2. det: ImageNet-normalized NCHW → DBNet prob map `[1,1,H,W]`
//! → threshold 0.3 → contours → min-area rect (rotating calipers,
//! pure Rust) → unclip(ratio 1.5, pure Rust) → boxes
//! 3. crop+rectify: perspective warp each rotated box to a horizontal strip
//! 4. rec: 48×W normalized `(x-0.5)/0.5` → `[1,T,11947]` → CTC greedy decode
//! 5. assemble reading-order `OcrText`
//!
//! ## Confirmed CTC facts (empirically derived in T0a, see
//! `tests/golden/ctc_rec_golden.json` — do NOT re-derive):
//! * rec classes = 11947 = dict(11945) + blank + space
//! * index 0 = CTC blank
//! * index 1..=11945 = `korean_dict.txt` line N → class N (i.e. `dict[N-1]`)
//! * index 11946 = space ' '
//!
//! ## rc.9 API notes (differ from rc.12):
//! * `try_extract_tensor::<f32>()` → `ArrayViewD<f32>` (`.shape()` / indexing).
//! * `Session::run` is called through a `Mutex` guard so the engine is
//! `Send + Sync` regardless of `Session`'s own auto-trait status (ingest
//! is serial today; the lock is uncontended).
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use anyhow::{Context, Result};
use kebab_core::{Lang, OcrRegion, OcrText};
use ndarray::Array4;
use ort::session::Session;
use ort::value::Value;
use crate::ocr::OcrEngine;
/// Engine name written into `OcrText.engine` and returned by `engine_name()`.
pub const PADDLE_ONNX_ENGINE: &str = "paddle-onnx";
/// CTC blank class index (confirmed in T0a). Decodes to no output.
const CTC_BLANK: usize = 0;
/// Space class index (confirmed in T0a). `1..=DICT_LINES` map to dict entries,
/// so this is `DICT_LINES + 1`, the last class of the rec output.
const CTC_SPACE: usize = 11946;
/// `korean_dict.txt` line count (confirmed in T0a). The dict loaded at
/// construction must have exactly this many lines.
const DICT_LINES: usize = 11945;
/// rec output class count = dict + blank + space (confirmed in T0a),
/// i.e. `DICT_LINES + 2`.
const REC_CLASSES: usize = 11947;
/// det long-edge cap before rounding to a multiple of 32 (PaddleOCR default).
const DET_LIMIT_SIDE_LEN: u32 = 960;
/// rec input height (PP-OCRv5 mobile).
const REC_HEIGHT: u32 = 48;
/// ImageNet per-channel mean (det preprocessing — RGB order).
const IMAGENET_MEAN: [f32; 3] = [0.485, 0.456, 0.406];
/// ImageNet per-channel standard deviation (det preprocessing — RGB order).
const IMAGENET_STD: [f32; 3] = [0.229, 0.224, 0.225];
/// PP-OCRv5 ONNX engine. Holds the two ONNX sessions (loaded once) and the
/// dict. `engine_version` is computed once at construction (blake3 over the
/// three model assets) and cached — `ingest_config_signature` calls
/// `engine_version()` per asset, so re-hashing there would be O(assets).
pub struct OnnxPaddleOcr {
/// Detection session; every `run` goes through this lock.
det: Mutex<Session>,
/// Recognition session; every `run` goes through this lock.
rec: Mutex<Session>,
/// Name of the det model's first input, captured at construction.
det_input_name: String,
/// Name of the rec model's first input, captured at construction.
rec_input_name: String,
/// Dict tokens, one per line of the dict file; class `N` maps to `dict[N - 1]`.
dict: Vec<String>,
/// Cached asset hash string returned by `engine_version()`.
engine_version: String,
/// Minimum mean-probability box score a detected region must reach.
score_thresh: f32,
/// Expansion ratio applied to each detected rectangle.
unclip_ratio: f32,
/// Upper bound on detected boxes per image; extra boxes are dropped with a warning.
max_boxes: usize,
/// Pixel cap passed to the det resize; clamped to `256..=4096` at construction.
max_pixels: u32,
}
/// Manual `Debug`: prints the cached version and tuning knobs only. The two
/// sessions and the input names are left out, and the dict is reported as a
/// line count rather than dumped.
impl std::fmt::Debug for OnnxPaddleOcr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OnnxPaddleOcr")
.field("engine_version", &self.engine_version)
// dict content is large; show its length only
.field("dict_lines", &self.dict.len())
.field("score_thresh", &self.score_thresh)
.field("unclip_ratio", &self.unclip_ratio)
.field("max_boxes", &self.max_boxes)
.field("max_pixels", &self.max_pixels)
.finish()
}
}
/// Resolved model-asset paths. Construction is decoupled from `kebab-config`
/// (T7 adds the `det_model`/`rec_model`/`dict` overrides) so the engine can be
/// built directly in tests.
#[derive(Clone, Debug)]
pub struct ModelPaths {
/// Path to the detection ONNX model.
pub det: PathBuf,
/// Path to the recognition ONNX model.
pub rec: PathBuf,
/// Path to the recognition dict (one token per line).
pub dict: PathBuf,
}
impl ModelPaths {
    /// Default bundled-asset directory: `KEBAB_IMAGE_OCR_MODEL_DIR` if set,
    /// else the crate's `assets/paddleocr-onnx/`.
    ///
    /// The variable is read with `var_os`, so a directory whose path is not
    /// valid UTF-8 is honoured instead of silently falling back to the
    /// bundled assets. Nothing is checked for existence here; missing files
    /// surface when the engine is constructed.
    pub fn from_default_dir() -> Self {
        let dir = std::env::var_os("KEBAB_IMAGE_OCR_MODEL_DIR")
            .map(PathBuf::from)
            .unwrap_or_else(|| {
                Path::new(env!("CARGO_MANIFEST_DIR")).join("assets/paddleocr-onnx")
            });
        Self {
            det: dir.join("ppocrv5_mobile_det.onnx"),
            rec: dir.join("korean_ppocrv5_mobile_rec.onnx"),
            dict: dir.join("korean_dict.txt"),
        }
    }
}
impl OnnxPaddleOcr {
    /// Build the engine from a workspace [`kebab_config::Config`].
    ///
    /// Model paths come from [`ModelPaths::from_default_dir`] (T7 will thread
    /// config overrides). Only `image.ocr.max_pixels` is read from the config;
    /// the other knobs are fixed defaults. Loading the sessions and hashing
    /// the assets happens here, so a broken install fails at construction
    /// (matches the Ollama adapter's construction contract).
    pub fn new(config: &kebab_config::Config) -> Result<Self> {
        let score_thresh = 0.3;
        let unclip_ratio = 1.5;
        let max_boxes = 1000;
        let max_pixels = config.image.ocr.max_pixels;
        let paths = ModelPaths::from_default_dir();
        Self::from_paths(&paths, score_thresh, unclip_ratio, max_boxes, max_pixels)
    }

    /// Build from explicit asset paths and tuning knobs. Used by tests and by
    /// `new` once the paths are resolved.
    ///
    /// # Errors
    /// Fails when the dict cannot be read or does not have `DICT_LINES`
    /// lines, when an asset cannot be hashed, when either ONNX model fails
    /// to load, or when a model declares no inputs.
    pub fn from_paths(
        paths: &ModelPaths,
        score_thresh: f32,
        unclip_ratio: f32,
        max_boxes: usize,
        max_pixels: u32,
    ) -> Result<Self> {
        let dict = load_dict(&paths.dict)
            .with_context(|| format!("loading OCR dict from {}", paths.dict.display()))?;
        // The dict length must match the rec class layout (dict + blank +
        // space); a wrong dict file would otherwise mis-decode silently.
        let dict_len = dict.len();
        if dict_len != DICT_LINES {
            anyhow::bail!(
                "OnnxPaddleOcr: dict has {} lines, expected {DICT_LINES} \
                 (rec classes {REC_CLASSES} = dict + blank + space)",
                dict_len
            );
        }

        let engine_version = compute_engine_version(paths)
            .context("hashing OCR model assets for engine_version")?;

        let det = Session::builder()
            .context("ort Session::builder (det)")?
            .commit_from_file(&paths.det)
            .with_context(|| format!("loading det model {}", paths.det.display()))?;
        let rec = Session::builder()
            .context("ort Session::builder (rec)")?
            .commit_from_file(&paths.rec)
            .with_context(|| format!("loading rec model {}", paths.rec.display()))?;

        // Each model is fed through its first declared input.
        let det_input_name = match det.inputs.first() {
            Some(input) => input.name.clone(),
            None => anyhow::bail!("det model has no inputs"),
        };
        let rec_input_name = match rec.inputs.first() {
            Some(input) => input.name.clone(),
            None => anyhow::bail!("rec model has no inputs"),
        };

        Ok(Self {
            det: Mutex::new(det),
            rec: Mutex::new(rec),
            det_input_name,
            rec_input_name,
            dict,
            engine_version,
            score_thresh,
            unclip_ratio,
            max_boxes,
            max_pixels: max_pixels.clamp(256, 4096),
        })
    }

    /// Map a CTC class index to its output string; `None` for blank.
    ///
    /// Layout: `0` = blank, `1..=DICT_LINES` = `dict[idx - 1]`,
    /// `CTC_SPACE` = `" "`. Any other index also yields `None`.
    fn class_to_str(&self, idx: usize) -> Option<&str> {
        match idx {
            CTC_BLANK => None,
            CTC_SPACE => Some(" "),
            // in range because construction checks the dict length
            1..=DICT_LINES => Some(self.dict[idx - 1].as_str()),
            _ => None, // out-of-range guard
        }
    }
}
impl OcrEngine for OnnxPaddleOcr {
fn engine_name(&self) -> &'static str {
PADDLE_ONNX_ENGINE
}
fn engine_version(&self) -> String {
self.engine_version.clone()
}
fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result<OcrText> {
let img = image::load_from_memory(image_bytes)
.context("decoding image for OCR")?
.to_rgb8();
let (orig_w, orig_h) = (img.width(), img.height());
if orig_w == 0 || orig_h == 0 {
return Ok(empty_ocr(self));
}
// ── det ────────────────────────────────────────────────────────
let (det_w, det_h) = det_target_dims(orig_w, orig_h, self.max_pixels);
let det_img = image::imageops::resize(
&img,
det_w,
det_h,
image::imageops::FilterType::Triangle,
);
let prob = self.run_det(&det_img)?; // (det_h, det_w) prob map
let scale_x = orig_w as f32 / det_w as f32;
let scale_y = orig_h as f32 / det_h as f32;
let mut boxes = det_postprocess(
&prob,
prob.w,
prob.h,
self.score_thresh,
self.unclip_ratio,
);
if boxes.len() > self.max_boxes {
tracing::warn!(
target: "kebab-parse-image",
"paddle-onnx: {} boxes exceeds max_boxes {} — truncating",
boxes.len(),
self.max_boxes
);
boxes.truncate(self.max_boxes);
}
// scale box corners back to original image coordinates
for b in &mut boxes {
for p in &mut b.corners {
p.0 *= scale_x;
p.1 *= scale_y;
}
}
if boxes.is_empty() {
return Ok(empty_ocr(self));
}
// ── rec per box (reading order: top→bottom, left→right) ─────────
boxes.sort_by(|a, b| {
let ay = a.center_y();
let by = b.center_y();
// group into rough rows by 0.5*box height tolerance via y then x
ay.partial_cmp(&by)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| {
a.center_x()
.partial_cmp(&b.center_x())
.unwrap_or(std::cmp::Ordering::Equal)
})
});
let mut regions: Vec<OcrRegion> = Vec::with_capacity(boxes.len());
for b in &boxes {
let crop = rectify_crop(&img, &b.corners);
if crop.width() == 0 || crop.height() == 0 {
continue;
}
let (text, conf) = self.run_rec(&crop)?;
if text.is_empty() {
continue; // rec empty → skip this box, keep the rest
}
let (x, y, w, h) = b.aabb();
regions.push(OcrRegion {
bbox: (x, y, w, h),
text,
confidence: conf,
});
}
let joined = regions
.iter()
.map(|r| r.text.as_str())
.collect::<Vec<_>>()
.join("\n");
Ok(OcrText {
joined,
regions,
engine: PADDLE_ONNX_ENGINE.to_string(),
engine_version: self.engine_version.clone(),
})
}
}
impl OnnxPaddleOcr {
/// Run det session → `(det_h, det_w)` probability map as a row-major Vec.
fn run_det(&self, det_img: &image::RgbImage) -> Result<ProbMap> {
let (w, h) = (det_img.width() as usize, det_img.height() as usize);
let mut arr = Array4::<f32>::zeros((1, 3, h, w));
for (x, y, px) in det_img.enumerate_pixels() {
let (xi, yi) = (x as usize, y as usize);
for c in 0..3 {
let v = px[c] as f32 / 255.0;
arr[[0, c, yi, xi]] = (v - IMAGENET_MEAN[c]) / IMAGENET_STD[c];
}
}
let input = Value::from_array(arr).context("det Value::from_array")?;
let sess = self.det.lock().expect("det session mutex poisoned");
let outputs = sess
.run(ort::inputs![self.det_input_name.as_str() => input]?)
.context("det session run")?;
let out_name = sess.outputs[0].name.clone();
let view = outputs[out_name.as_str()]
.try_extract_tensor::<f32>()
.context("det output extract")?;
// shape [1,1,H,W]
let shape = view.shape();
let (oh, ow) = (shape[shape.len() - 2], shape[shape.len() - 1]);
let data: Vec<f32> = view.iter().copied().collect();
Ok(ProbMap { w: ow, h: oh, data })
}
/// Run rec session on a rectified crop → (decoded string, mean confidence).
fn run_rec(&self, crop: &image::RgbImage) -> Result<(String, f32)> {
// resize keep-aspect to height 48, then this single crop is its own batch
let (cw, ch) = (crop.width().max(1), crop.height().max(1));
let new_w = ((REC_HEIGHT as f32 / ch as f32) * cw as f32).round().max(1.0) as u32;
let resized = image::imageops::resize(
crop,
new_w,
REC_HEIGHT,
image::imageops::FilterType::Triangle,
);
let w = new_w as usize;
let h = REC_HEIGHT as usize;
let mut arr = Array4::<f32>::zeros((1, 3, h, w));
for (x, y, px) in resized.enumerate_pixels() {
let (xi, yi) = (x as usize, y as usize);
for c in 0..3 {
let v = px[c] as f32 / 255.0;
arr[[0, c, yi, xi]] = (v - 0.5) / 0.5; // [-1, 1]
}
}
let input = Value::from_array(arr).context("rec Value::from_array")?;
let sess = self.rec.lock().expect("rec session mutex poisoned");
let outputs = sess
.run(ort::inputs![self.rec_input_name.as_str() => input]?)
.context("rec session run")?;
let out_name = sess.outputs[0].name.clone();
let view = outputs[out_name.as_str()]
.try_extract_tensor::<f32>()
.context("rec output extract")?;
// shape [1, T, C]
let shape = view.shape();
let (t, c) = (shape[shape.len() - 2], shape[shape.len() - 1]);
if c != REC_CLASSES {
anyhow::bail!(
"rec output has {c} classes, expected {REC_CLASSES} \
(dict {DICT_LINES} + blank + space)"
);
}
let data: Vec<f32> = view.iter().copied().collect();
Ok(self.ctc_greedy_decode(&data, t, c))
}
/// CTC greedy decode over `[T, C]` logits/probs (row-major). Per timestep
/// argmax → collapse consecutive duplicates → drop blank → map class→str.
fn ctc_greedy_decode(&self, data: &[f32], t: usize, c: usize) -> (String, f32) {
let mut out = String::new();
let mut confs: Vec<f32> = Vec::new();
let mut prev = usize::MAX;
for ti in 0..t {
let row = &data[ti * c..(ti + 1) * c];
let mut best = 0usize;
let mut best_v = f32::MIN;
for (i, &v) in row.iter().enumerate() {
if v > best_v {
best_v = v;
best = i;
}
}
if best != prev && best != CTC_BLANK {
if let Some(s) = self.class_to_str(best) {
out.push_str(s);
confs.push(best_v);
}
}
prev = best;
}
let conf = if confs.is_empty() {
0.0
} else {
confs.iter().sum::<f32>() / confs.len() as f32
};
(out, conf)
}
}
/// Build an `OcrText` with no text and no regions, stamped with this
/// engine's name and the engine's cached version.
fn empty_ocr(e: &OnnxPaddleOcr) -> OcrText {
    let engine = PADDLE_ONNX_ENGINE.to_string();
    let engine_version = e.engine_version.clone();
    OcrText {
        joined: String::new(),
        regions: Vec::new(),
        engine,
        engine_version,
    }
}
/// Read the dict file into one token per line.
///
/// Lines are split on `'\n'` and any trailing `'\r'` characters are removed.
/// Interior empty lines are kept as empty tokens so indices stay aligned
/// with the file; only a single empty element produced by a final newline
/// is dropped.
///
/// # Errors
/// Fails when the file cannot be read as UTF-8 text.
fn load_dict(path: &Path) -> Result<Vec<String>> {
    let raw = std::fs::read_to_string(path)?;
    let mut tokens: Vec<String> = Vec::new();
    for piece in raw.split('\n') {
        tokens.push(piece.trim_end_matches('\r').to_string());
    }
    // the final newline leaves one empty trailing element; remove just that one
    if matches!(tokens.last(), Some(last) if last.is_empty()) {
        tokens.pop();
    }
    Ok(tokens)
}
/// Derive the `engine_version` string from the asset contents.
///
/// The det, rec and dict files are read in that order and fed into one
/// blake3 hasher; the result is `ppocrv5-mobile-kor-` followed by the first
/// 12 hex characters of the digest.
///
/// # Errors
/// Fails when any of the three files cannot be read.
fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
    let assets = [&paths.det, &paths.rec, &paths.dict];
    let mut hasher = blake3::Hasher::new();
    for asset in assets {
        let contents =
            std::fs::read(asset).with_context(|| format!("reading {}", asset.display()))?;
        hasher.update(&contents);
    }
    let digest = hasher.finalize().to_hex();
    let short = &digest.as_str()[..12];
    Ok(format!("ppocrv5-mobile-kor-{}", short))
}
/// Compute the det input size `(width, height)` for an image of `w × h`.
///
/// The aspect ratio is kept while the long edge is capped at
/// `min(max(max_pixels, 32), DET_LIMIT_SIDE_LEN)`; images already within the
/// cap are not scaled. Each side is then rounded to the nearest multiple of
/// 32 (DBNet stride) with a floor of 32. Reproduces the T0a golden: a
/// 900×192 (W×H) image maps to 896×192.
fn det_target_dims(w: u32, h: u32, max_pixels: u32) -> (u32, u32) {
    let cap = max_pixels.max(32).min(DET_LIMIT_SIDE_LEN);
    let longest = h.max(w);
    let scale = if longest > cap {
        cap as f32 / longest as f32
    } else {
        1.0
    };
    // scale one side, then snap it to the stride grid
    let snap = |side: u32| -> u32 {
        let scaled = (side as f32 * scale).round().max(1.0);
        let snapped = (scaled / 32.0).round() as u32 * 32;
        snapped.max(32)
    };
    (snap(w), snap(h))
}
// ── det postprocessing ──────────────────────────────────────────────────────
/// Row-major probability map produced by the det model.
struct ProbMap {
    /// Number of columns.
    w: usize,
    /// Number of rows.
    h: usize,
    /// Values stored row after row; `(x, y)` lives at `y * w + x`.
    data: Vec<f32>,
}

impl ProbMap {
    /// Value at column `x`, row `y`. Panics when the computed offset is
    /// outside `data`.
    #[inline]
    fn at(&self, x: usize, y: usize) -> f32 {
        let offset = y * self.w + x;
        self.data[offset]
    }
}
/// One detected text region: four corner points, first in det-image
/// coordinates and later scaled to the original image, plus the det score.
#[derive(Clone, Debug)]
struct DetBox {
    corners: [(f32, f32); 4],
    #[allow(dead_code)]
    score: f32,
}

impl DetBox {
    /// Mean x of the four corners.
    fn center_x(&self) -> f32 {
        let total: f32 = self.corners.iter().map(|&(x, _)| x).sum();
        total / 4.0
    }

    /// Mean y of the four corners.
    fn center_y(&self) -> f32 {
        let total: f32 = self.corners.iter().map(|&(_, y)| y).sum();
        total / 4.0
    }

    /// Axis-aligned bounding box as `(x, y, w, h)`.
    ///
    /// Both extremes are clamped at zero before the size is taken, so a box
    /// that pokes past the top/left edge is cut off there; every value is
    /// rounded to the nearest integer.
    fn aabb(&self) -> (u32, u32, u32, u32) {
        let (mut left, mut right) = (f32::MAX, f32::MIN);
        let (mut top, mut bottom) = (f32::MAX, f32::MIN);
        for &(x, y) in &self.corners {
            left = left.min(x);
            right = right.max(x);
            top = top.min(y);
            bottom = bottom.max(y);
        }
        let (left, right) = (left.max(0.0), right.max(0.0));
        let (top, bottom) = (top.max(0.0), bottom.max(0.0));
        let width = (right - left).round().max(0.0);
        let height = (bottom - top).round().max(0.0);
        (left.round() as u32, top.round() as u32, width as u32, height as u32)
    }
}
/// Turn a det probability map into text boxes (DBNet-style).
///
/// The map is binarised at 0.3 and contours are traced on the mask.
/// Contours with fewer than four points are ignored; each remaining one is
/// reduced to its minimum-area rectangle, dropped if its mean-probability
/// score is below `score_thresh`, and otherwise expanded by `unclip_ratio`.
/// `w` / `h` give the size of the mask to build and are read from `prob` by
/// the caller. Pinned by `tests/golden/det_boxes_clean_paragraph.json`
/// (3 boxes).
fn det_postprocess(
    prob: &ProbMap,
    w: usize,
    h: usize,
    score_thresh: f32,
    unclip_ratio: f32,
) -> Vec<DetBox> {
    use image::{GrayImage, Luma};

    // binary mask: 255 where the probability exceeds the detection threshold
    let mask = GrayImage::from_fn(w as u32, h as u32, |x, y| {
        let on = prob.at(x as usize, y as usize) > 0.3;
        Luma([if on { 255u8 } else { 0u8 }])
    });

    imageproc::contours::find_contours::<u32>(&mask)
        .iter()
        .filter(|contour| contour.points.len() >= 4)
        .filter_map(|contour| {
            let outline: Vec<(f32, f32)> = contour
                .points
                .iter()
                .map(|p| (p.x as f32, p.y as f32))
                .collect();
            let rect = min_area_rect(&outline)?;
            // mean-prob box score over the AABB of the rotated rect
            let score = box_score(prob, &rect.corners);
            if score < score_thresh {
                return None;
            }
            Some(DetBox {
                corners: unclip_rect(&rect, unclip_ratio),
                score,
            })
        })
        .collect()
}
/// Mean probability over the axis-aligned bounding box of `corners` — the
/// `box_thresh` mean-prob filter used by the golden harness.
///
/// The box is clamped to the map (lower bounds at zero, upper bounds at the
/// last column/row) and both ends are inclusive. A box that is not at least
/// two cells wide and two cells tall after clamping scores `0.0`.
fn box_score(prob: &ProbMap, corners: &[(f32, f32); 4]) -> f32 {
    let (mut lo_x, mut hi_x) = (f32::MAX, f32::MIN);
    let (mut lo_y, mut hi_y) = (f32::MAX, f32::MIN);
    for &(x, y) in corners {
        lo_x = lo_x.min(x);
        hi_x = hi_x.max(x);
        lo_y = lo_y.min(y);
        hi_y = hi_y.max(y);
    }
    let x0 = lo_x.max(0.0) as usize;
    let x1 = (hi_x.max(0.0) as usize).min(prob.w.saturating_sub(1));
    let y0 = lo_y.max(0.0) as usize;
    let y1 = (hi_y.max(0.0) as usize).min(prob.h.saturating_sub(1));
    if x1 <= x0 || y1 <= y0 {
        return 0.0;
    }

    let mut total = 0.0f32;
    let mut cells = 0usize;
    for row in y0..=y1 {
        for col in x0..=x1 {
            total += prob.at(col, row);
            cells += 1;
        }
    }
    if cells == 0 {
        0.0
    } else {
        total / cells as f32
    }
}
/// Rotated rect described by its 4 corners + box dims.
#[derive(Clone, Debug)]
struct RotRect {
/// Corner points in cyclic order (consecutive entries share an edge).
corners: [(f32, f32); 4],
/// Extent along the rectangle's first axis (the `corners[0]`→`corners[1]` edge).
width: f32,
/// Extent along the perpendicular axis (the `corners[0]`→`corners[3]` edge).
height: f32,
}
/// Smallest-area rectangle enclosing `points`, found by trying a rectangle
/// aligned with every edge of the convex hull (pure Rust — no OpenCV /
/// clipper2).
///
/// Returns `None` when the hull has fewer than three points or no usable
/// edge. Corners come back in cyclic order: `(min_u, min_v)`,
/// `(max_u, min_v)`, `(max_u, max_v)`, `(min_u, max_v)` in the winning edge's
/// basis; `width` runs along that edge and `height` along its normal.
fn min_area_rect(points: &[(f32, f32)]) -> Option<RotRect> {
    let hull = convex_hull(points);
    let count = hull.len();
    if count < 3 {
        return None;
    }

    let mut smallest = f32::MAX;
    let mut winner: Option<RotRect> = None;
    for (idx, &start) in hull.iter().enumerate() {
        let end = hull[(idx + 1) % count];
        let (ex, ey) = (end.0 - start.0, end.1 - start.1);
        let norm = (ex * ex + ey * ey).sqrt();
        if norm < 1e-6 {
            continue; // zero-length edge gives no direction
        }
        let along = (ex / norm, ey / norm); // edge direction
        let across = (-along.1, along.0); // normal

        // extent of the hull projected on both axes
        let (mut lo_u, mut hi_u) = (f32::MAX, f32::MIN);
        let (mut lo_v, mut hi_v) = (f32::MAX, f32::MIN);
        for q in &hull {
            let u = q.0 * along.0 + q.1 * along.1;
            let v = q.0 * across.0 + q.1 * across.1;
            lo_u = lo_u.min(u);
            hi_u = hi_u.max(u);
            lo_v = lo_v.min(v);
            hi_v = hi_v.max(v);
        }

        let area = (hi_u - lo_u) * (hi_v - lo_v);
        if area < smallest {
            smallest = area;
            // map (u, v) coordinates back to image space
            let world = |u: f32, v: f32| {
                (u * along.0 + v * across.0, u * along.1 + v * across.1)
            };
            winner = Some(RotRect {
                corners: [
                    world(lo_u, lo_v),
                    world(hi_u, lo_v),
                    world(hi_u, hi_v),
                    world(lo_u, hi_v),
                ],
                width: hi_u - lo_u,
                height: hi_v - lo_v,
            });
        }
    }
    winner
}
/// Convex hull by Andrew's monotone chain.
///
/// Points are sorted by `(x, y)` and adjacent duplicates removed; with fewer
/// than three distinct points that sorted list is returned as is. Otherwise
/// the hull is the lower chain followed by the upper chain, each without its
/// last point, and collinear points are dropped.
fn convex_hull(points: &[(f32, f32)]) -> Vec<(f32, f32)> {
    use std::cmp::Ordering;
    type Pt = (f32, f32);

    /// z-component of `(a - o) × (b - o)`.
    fn turn(o: Pt, a: Pt, b: Pt) -> f32 {
        (a.0 - o.0) * (b.1 - o.1) - (a.1 - o.1) * (b.0 - o.0)
    }

    /// Build one chain over `seq`, minus its final point (which is the first
    /// point of the opposite chain).
    fn half_hull(seq: impl Iterator<Item = Pt>) -> Vec<Pt> {
        let mut chain: Vec<Pt> = Vec::new();
        for p in seq {
            while chain.len() >= 2
                && turn(chain[chain.len() - 2], chain[chain.len() - 1], p) <= 0.0
            {
                chain.pop();
            }
            chain.push(p);
        }
        chain.pop();
        chain
    }

    let mut sorted: Vec<Pt> = points.to_vec();
    sorted.sort_by(|a, b| match a.0.partial_cmp(&b.0) {
        // equal or incomparable x: decide on y
        Some(Ordering::Equal) | None => a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal),
        Some(order) => order,
    });
    sorted.dedup();
    if sorted.len() < 3 {
        return sorted;
    }

    let mut hull = half_hull(sorted.iter().copied());
    hull.extend(half_hull(sorted.iter().rev().copied()));
    hull
}
/// Unclip a rotated rect by `ratio` (PaddleOCR `distance = area*ratio/perimeter`),
/// moving every edge outward by `distance` so that width and height each grow
/// by `2*distance`.
///
/// `rect.corners` must be in cyclic order, so that `corners[0]→corners[1]`
/// and `corners[0]→corners[3]` are the two edges meeting at `corners[0]`.
/// Each corner is shifted by `distance` along both edge directions; pushing
/// it by `distance` along its diagonal instead would barely grow the short
/// side of a long, thin text box.
///
/// The corners are returned unchanged when the perimeter is (near) zero or
/// either edge has (near) zero length.
fn unclip_rect(rect: &RotRect, ratio: f32) -> [(f32, f32); 4] {
    let area = rect.width * rect.height;
    let perimeter = 2.0 * (rect.width + rect.height);
    if perimeter < 1e-6 {
        return rect.corners;
    }
    let distance = area * ratio / perimeter;

    let [c0, c1, c2, c3] = rect.corners;
    // unit vector from `from` to `to`, or None for a degenerate edge
    let unit = |from: (f32, f32), to: (f32, f32)| -> Option<(f32, f32)> {
        let (dx, dy) = (to.0 - from.0, to.1 - from.1);
        let len = (dx * dx + dy * dy).sqrt();
        (len > 1e-6).then(|| (dx / len, dy / len))
    };
    let (Some(u), Some(v)) = (unit(c0, c1), unit(c0, c3)) else {
        return rect.corners;
    };

    // shift a corner by ±distance along each edge direction
    let shift = |p: (f32, f32), su: f32, sv: f32| {
        (
            p.0 + distance * (su * u.0 + sv * v.0),
            p.1 + distance * (su * u.1 + sv * v.1),
        )
    };
    [
        shift(c0, -1.0, -1.0),
        shift(c1, 1.0, -1.0),
        shift(c2, 1.0, 1.0),
        shift(c3, -1.0, 1.0),
    ]
}
// ── crop + rectify ───────────────────────────────────────────────────────────
/// Resample the quadrilateral `corners` of `img` into an upright strip.
///
/// The corners are first put in top-left, top-right, bottom-right,
/// bottom-left order. The strip is as wide as the longer of the top/bottom
/// edges and as tall as the longer of the left/right edges (at least 1×1).
/// Every output pixel is mapped back into the quad by blending the four
/// corners bilinearly — adequate for near-affine text boxes — and the source
/// is sampled at the nearest pixel, clamped to the image.
fn rectify_crop(img: &image::RgbImage, corners: &[(f32, f32); 4]) -> image::RgbImage {
    let [tl, tr, br, bl] = order_corners(corners);
    let span = |a: (f32, f32), b: (f32, f32)| {
        ((a.0 - b.0).powi(2) + (a.1 - b.1).powi(2)).sqrt()
    };
    let out_w = span(tl, tr).max(span(bl, br)).round().max(1.0) as u32;
    let out_h = span(tl, bl).max(span(tr, br)).round().max(1.0) as u32;

    // last valid source coordinate on each axis
    let max_sx = img.width() as f32 - 1.0;
    let max_sy = img.height() as f32 - 1.0;
    // denominators that turn a pixel index into a 0..=1 fraction
    let x_den = (out_w.max(1) as f32 - 1.0).max(1.0);
    let y_den = (out_h.max(1) as f32 - 1.0).max(1.0);
    let lerp = |a: (f32, f32), b: (f32, f32), t: f32| {
        (a.0 + (b.0 - a.0) * t, a.1 + (b.1 - a.1) * t)
    };

    let mut strip = image::RgbImage::new(out_w, out_h);
    for (ox, oy, dst) in strip.enumerate_pixels_mut() {
        let fx = ox as f32 / x_den;
        let fy = oy as f32 / y_den;
        // blend along the top and bottom edges, then between them
        let top = lerp(tl, tr, fx);
        let bot = lerp(bl, br, fx);
        let (sx, sy) = lerp(top, bot, fy);
        let sx = sx.clamp(0.0, max_sx);
        let sy = sy.clamp(0.0, max_sy);
        *dst = *img.get_pixel(sx.round() as u32, sy.round() as u32);
    }
    strip
}
/// Order 4 corners as [top-left, top-right, bottom-right, bottom-left].
///
/// Primary rule (coordinate sums/diffs): top-left has the smallest `x + y`,
/// bottom-right the largest; top-right has the smallest `y - x`, bottom-left
/// the largest. On ties — e.g. a box rotated by about 45° — that rule can
/// pick the same input point for two slots, which would collapse the crop.
/// In that case the corners are ordered by x instead: of the two leftmost
/// points the upper one is top-left and the lower one bottom-left, and
/// likewise for the two rightmost points. The fallback always returns a
/// permutation of the input.
fn order_corners(corners: &[(f32, f32); 4]) -> [(f32, f32); 4] {
    // indices (not values) so a point used twice can be detected
    let (mut tl, mut tr, mut br, mut bl) = (0usize, 0usize, 0usize, 0usize);
    let (mut min_sum, mut max_sum) = (f32::MAX, f32::MIN);
    let (mut min_diff, mut max_diff) = (f32::MAX, f32::MIN);
    for (i, p) in corners.iter().enumerate() {
        let sum = p.0 + p.1;
        let diff = p.1 - p.0;
        if sum < min_sum {
            min_sum = sum;
            tl = i;
        }
        if sum > max_sum {
            max_sum = sum;
            br = i;
        }
        if diff < min_diff {
            min_diff = diff;
            tr = i;
        }
        if diff > max_diff {
            max_diff = diff;
            bl = i;
        }
    }

    let picked = [tl, tr, br, bl];
    if (0..4).all(|i| picked.contains(&i)) {
        return [corners[tl], corners[tr], corners[br], corners[bl]];
    }

    // Fallback: stable sort by x, then split each side by y.
    let mut by_x = *corners;
    by_x.sort_by(|a, b| a.0.total_cmp(&b.0));
    let (top_left, bottom_left) = if by_x[1].1 > by_x[0].1 {
        (by_x[0], by_x[1])
    } else {
        (by_x[1], by_x[0])
    };
    let (top_right, bottom_right) = if by_x[3].1 > by_x[2].1 {
        (by_x[2], by_x[3])
    } else {
        (by_x[3], by_x[2])
    };
    [top_left, top_right, bottom_right, bottom_left]
}
// Unit tests for the pure-geometry helpers; none of them load an ONNX model.
#[cfg(test)]
mod tests {
use super::*;
/// The det resize must reproduce the pinned golden dimensions.
#[test]
fn det_target_dims_matches_golden() {
// T0a golden: clean_paragraph 192×900 → det input 192×896.
// (the comment above is H×W; the call and the tuple below are W×H)
assert_eq!(det_target_dims(900, 192, 1600), (896, 192));
}
/// An interior point must not appear in the hull of a square.
#[test]
fn convex_hull_square() {
let pts = vec![(0.0, 0.0), (10.0, 0.0), (10.0, 10.0), (0.0, 10.0), (5.0, 5.0)];
let hull = convex_hull(&pts);
assert_eq!(hull.len(), 4);
}
/// For an axis-aligned rectangle the min-area rect is the rectangle itself.
#[test]
fn min_area_rect_axis_aligned() {
let pts = vec![(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)];
let r = min_area_rect(&pts).expect("rect");
// width/height may come back swapped depending on the winning edge
let (lo, hi) = (r.width.min(r.height), r.width.max(r.height));
assert!((lo - 5.0).abs() < 1e-3, "short side {lo}");
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
}
/// Unclipping with a positive ratio must move the left edge outward.
#[test]
fn unclip_expands_box() {
let rect = RotRect {
corners: [(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)],
width: 20.0,
height: 5.0,
};
let out = unclip_rect(&rect, 1.5);
// unclipped box must be strictly larger than the original
let orig_minx = 0.0;
let new_minx = out.iter().map(|p| p.0).fold(f32::MAX, f32::min);
assert!(new_minx < orig_minx, "expected expansion, got {new_minx}");
}
}