chore(ocr): T11/T12 — clippy clean + docs + v0.27.0 bump

T11: fix 12 clippy lints in paddle_onnx.rs/paddle_e2e.rs (doc overindent, finish_non_exhaustive, map_or_else, RangeInclusive::contains, cast_lossless, is_some_and, usize::from). Full-workspace `clippy -D warnings` reports 0 warnings. Smoke (paddle-onnx, real binary): clean_paragraph OCR verbatim-correct, real per-region confidence (0.99/0.96/0.95), FTS5 lexical hit on Korean (검색) + English (embedding), parser_version folds in |ocr:1:paddle-onnx:<ver>. Big page: <4s inference (5.6s ingest incl. one-time session load). T12: README [image.ocr].engine + ARCHITECTURE OCR row + SMOKE paddle-onnx config + HANDOFF + HOTFIXES dated entry. Workspace version 0.26.2 → 0.27.0 (minor: new engine value + config keys). .gitattributes: onnx as plain blobs (no git-lfs). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 08:36:10 +00:00
parent 8cc4e6d563
commit 375a0693e4
12 changed files with 114 additions and 55 deletions
--- a/crates/kebab-app/tests/common/mock_ocr.rs
+++ b/crates/kebab-app/tests/common/mock_ocr.rs
@@ -39,6 +39,7 @@ impl OcrEngine for MockOcrEngine {
        "mock-v1".to_string()
    }

+    #[allow(clippy::unnecessary_literal_bound)]
    fn model(&self) -> &str {
        "mock-model"
    }
--- a/crates/kebab-parse-image/src/lib.rs
+++ b/crates/kebab-parse-image/src/lib.rs
@@ -34,7 +34,7 @@ pub mod paddle_onnx;

 pub use caption::{apply_caption, caption_image};
 pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr};
-pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};
+pub use paddle_onnx::{ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};

 use anyhow::{Context, Result};
 use kebab_core::{
--- a/crates/kebab-parse-image/src/paddle_onnx.rs
+++ b/crates/kebab-parse-image/src/paddle_onnx.rs
@@ -3,13 +3,13 @@
 //! production dependency (see crate-level rationale + `assets/paddleocr-onnx/NOTICE`).
 //!
 //! Pipeline (`recognize`):
-//!   1. decode (RGB) + downscale long edge to `max_pixels`
-//!   2. det:  ImageNet-normalized NCHW → DBNet prob map `[1,1,H,W]`
-//!            → threshold 0.3 → contours → min-area rect (rotating calipers,
-//!            pure Rust) → unclip(ratio 1.5, pure Rust) → boxes
-//!   3. crop+rectify: perspective warp each rotated box to a horizontal strip
-//!   4. rec:  48×W normalized `(x-0.5)/0.5` → `[1,T,11947]` → CTC greedy decode
-//!   5. assemble reading-order `OcrText`
+//! 1. decode (RGB) + downscale long edge to `max_pixels`
+//! 2. det: ImageNet-normalized NCHW → DBNet prob map `[1,1,H,W]` → threshold
+//!    0.3 → contours → min-area rect (rotating calipers, pure Rust) →
+//!    unclip(ratio 1.5, pure Rust) → boxes
+//! 3. crop+rectify: perspective warp each rotated box to a horizontal strip
+//! 4. rec: 48×W normalized `(x-0.5)/0.5` → `[1,T,11947]` → CTC greedy decode
+//! 5. assemble reading-order `OcrText`
 //!
 //! ## Confirmed CTC facts (empirically derived in T0a, see
 //! `tests/golden/ctc_rec_golden.json` — do NOT re-derive):
@@ -82,7 +82,7 @@ impl std::fmt::Debug for OnnxPaddleOcr {
            .field("unclip_ratio", &self.unclip_ratio)
            .field("max_boxes", &self.max_boxes)
            .field("max_pixels", &self.max_pixels)
-            .finish()
+            .finish_non_exhaustive()
    }
 }

@@ -100,11 +100,10 @@ impl ModelPaths {
    /// Default bundled-asset directory: `KEBAB_IMAGE_OCR_MODEL_DIR` if set,
    /// else the crate's `assets/paddleocr-onnx/`.
    pub fn from_default_dir() -> Self {
-        let dir = std::env::var("KEBAB_IMAGE_OCR_MODEL_DIR")
-            .map(PathBuf::from)
-            .unwrap_or_else(|_| {
-                Path::new(env!("CARGO_MANIFEST_DIR")).join("assets/paddleocr-onnx")
-            });
+        let dir = std::env::var("KEBAB_IMAGE_OCR_MODEL_DIR").map_or_else(
+            |_| Path::new(env!("CARGO_MANIFEST_DIR")).join("assets/paddleocr-onnx"),
+            PathBuf::from,
+        );
        Self {
            det: dir.join("ppocrv5_mobile_det.onnx"),
            rec: dir.join("korean_ppocrv5_mobile_rec.onnx"),
@@ -211,7 +210,7 @@ impl OnnxPaddleOcr {
        match idx {
            CTC_BLANK => None,
            CTC_SPACE => Some(" "),
-            i if i >= 1 && i <= DICT_LINES => Some(self.dict[i - 1].as_str()),
+            i if (1..=DICT_LINES).contains(&i) => Some(self.dict[i - 1].as_str()),
            _ => None, // out-of-range guard (should not happen for 11947 classes)
        }
    }
@@ -226,6 +225,10 @@ impl OcrEngine for OnnxPaddleOcr {
        self.engine_version.clone()
    }

+    // The trait method's elided lifetime ties the return to `&self`; the body
+    // returns a literal, but the signature must match the trait, so allow the
+    // `'static`-narrowing lint here.
+    #[allow(clippy::unnecessary_literal_bound)]
    fn model(&self) -> &str {
        // Static label for the progress display; the per-asset hash lives
        // in `engine_version`.
@@ -335,7 +338,7 @@ impl OnnxPaddleOcr {
        for (x, y, px) in det_img.enumerate_pixels() {
            let (xi, yi) = (x as usize, y as usize);
            for c in 0..3 {
-                let v = px[c] as f32 / 255.0;
+                let v = f32::from(px[c]) / 255.0;
                arr[[0, c, yi, xi]] = (v - IMAGENET_MEAN[c]) / IMAGENET_STD[c];
            }
        }
@@ -372,7 +375,7 @@ impl OnnxPaddleOcr {
        for (x, y, px) in resized.enumerate_pixels() {
            let (xi, yi) = (x as usize, y as usize);
            for c in 0..3 {
-                let v = px[c] as f32 / 255.0;
+                let v = f32::from(px[c]) / 255.0;
                arr[[0, c, yi, xi]] = (v - 0.5) / 0.5; // [-1, 1]
            }
        }
@@ -447,7 +450,7 @@ fn load_dict(path: &Path) -> Result<Vec<String>> {
    let raw = std::fs::read_to_string(path)?;
    // split on '\n'; drop a single trailing empty element from the final newline
    let mut lines: Vec<String> = raw.split('\n').map(|s| s.trim_end_matches('\r').to_string()).collect();
-    if lines.last().map(|s| s.is_empty()).unwrap_or(false) {
+    if lines.last().is_some_and(String::is_empty) {
        lines.pop();
    }
    Ok(lines)
--- a/crates/kebab-parse-image/tests/paddle_e2e.rs
+++ b/crates/kebab-parse-image/tests/paddle_e2e.rs
@@ -33,7 +33,7 @@ fn cer(gt: &str, pred: &str) -> f64 {
    for i in 1..=m {
        let mut cur = vec![i; n + 1];
        for j in 1..=n {
-            let cost = if g[i - 1] == p[j - 1] { 0 } else { 1 };
+            let cost = usize::from(g[i - 1] != p[j - 1]);
            cur[j] = (prev[j] + 1).min(cur[j - 1] + 1).min(prev[j - 1] + cost);
        }
        prev = cur;
@@ -42,11 +42,10 @@ fn cer(gt: &str, pred: &str) -> f64 {
 }

 fn fixture_dir() -> PathBuf {
-    std::env::var("KEBAB_TEST_OCR_FIXTURE_DIR")
-        .map(PathBuf::from)
-        .unwrap_or_else(|_| {
-            PathBuf::from("/build/dogfood/corpus/images/synthetic-ocr-bench")
-        })
+    std::env::var("KEBAB_TEST_OCR_FIXTURE_DIR").map_or_else(
+        |_| PathBuf::from("/build/dogfood/corpus/images/synthetic-ocr-bench"),
+        PathBuf::from,
+    )
 }

 /// T10: undecodable image bytes must surface as an error (the kebab-app caller