# PP-OCRv5 ONNX OCR engine on the pinned ort rc.9 (no Python, no oar-ocr dep).
#
# Implements the recognize() pipeline end-to-end (compiles + unit-tested):
# - T2: OnnxPaddleOcr skeleton, OcrEngine impl, det/rec Session loaded once
#   (Mutex-wrapped → Send+Sync), engine_version = blake3(det+rec+dict) cached
#   once at construction, dict bounds-check (11945 lines vs 11947 rec classes).
# - T2 preproc: det ImageNet mean/std NCHW + limit_side_len 960 → ×32 round
#   (golden 192x900→896x192 pinned); rec height-48 keep-aspect, (x-0.5)/0.5.
# - T3 det postproc: threshold 0.3 → imageproc contours → min-area rect via
#   pure-Rust rotating calipers + convex hull → mean-prob box-score filter →
#   pure-Rust unclip(ratio 1.5). No clipper2/OpenCV.
# - T4 crop+rectify: corner ordering + bilinear perspective warp to horizontal.
# - T5 rec+CTC: greedy decode with the T0a-confirmed mapping (idx0=blank,
#   1..=11945=dict[idx-1], 11946=space), rec-class bounds-check.
# - T6 assembly: reading-order OcrText with per-region bbox + real confidence.
#
# Unit tests (4 pass): det_target_dims golden, convex hull, min-area rect,
# unclip expansion. Large *.onnx assets stay untracked pending T12 LFS decision.
#
# Remaining: T7 config overrides, T8 factory (4 sites), T9 signature cascade,
# T10 error matrix, T11 gates (clippy/e2e CER), T12 docs+bump+PR.
#
# Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
#
# 79 lines, 3.8 KiB, TOML

# Manifest for the `kebab-parse-image` crate. Version, edition, MSRV, license
# and repository are all inherited from the workspace root.
[package]
name = "kebab-parse-image"
version = { workspace = true }
edition = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
rust-version = { workspace = true }
# NOTE(review): the description names only the Ollama-vision OCR path, while
# this manifest also carries the in-process paddle-onnx engine dependencies —
# consider updating the wording when the crate docs are next revised.
description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipeline (P6-1, P6-2)"

[dependencies]
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`)
# under a stable surface; the caption adapter consumes any
# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` at runtime
# (forbidden by p6-3 design §8) — the trait abstraction is exactly what
# the spec requires.
kebab-llm = { path = "../kebab-llm" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
# `image` ships a wide format menagerie under default features (BMP, DDS,
# Farbfeld, …). We only need PNG / JPEG / WebP / GIF / TIFF for v1 (HEIC/RAW
# are out of scope per the task spec). Trim defaults to keep the dep
# closure small.
image = { version = "0.25", default-features = false, features = ["png", "jpeg", "webp", "gif", "tiff"] }
# kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag
# extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software).
kamadak-exif = "0.6"
# Ollama-vision OCR adapter (P6-2) talks HTTP directly. We keep the
# feature surface identical to `kebab-llm-local` (blocking + json +
# rustls-tls) so both crates share the same TLS backend and the
# transitive tokio runtime is brought in once.
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
base64 = { workspace = true }
thiserror = { workspace = true }
# paddle-onnx OCR engine (PP-OCRv5, in-process). We reuse the workspace ort
# pin (=2.0.0-rc.9) so the ONNX Runtime native lib stays single-versioned with
# fastembed / kebab-nli (oar-ocr is intentionally NOT a dep — it would pull
# ort rc.12 + ndarray 0.17, splitting the native `links` and threatening the
# embedding stack). `download-binaries` extends the pin the same way
# `kebab-nli/Cargo.toml:23` does: this crate isn't in fastembed's build graph,
# so a standalone `cargo test -p kebab-parse-image` needs it to link onnxruntime.
ort = { workspace = true, features = ["ndarray", "download-binaries"] }
ndarray = { workspace = true }
# blake3: engine_version hash over the bundled det/rec/dict assets (computed
# once at OnnxPaddleOcr construction and cached, because
# `ingest_config_signature` calls engine_version() per asset).
blake3 = { workspace = true }
# imageproc: connected-components / contours for DBNet det post-processing.
# Min-area rotated-rect (rotating calipers) and polygon unclip are implemented
# in pure Rust (clipper2 is C++ FFI — would break the single-binary guarantee).
# Default features are disabled because imageproc's defaults include
# `image/default`; Cargo feature unification would apply that to the `image`
# dependency above and re-enable every codec trimmed there. Add
# `features = ["rayon"]` if the parallel imageproc code paths are wanted.
imageproc = { version = "0.25", default-features = false }

[dev-dependencies]
# `blake3` and `base64` are intentionally not repeated here: both are regular
# dependencies of this crate with the same workspace spec, and Cargo makes
# regular dependencies available to tests, examples and benches.
tempfile = { workspace = true }
# Shared test infrastructure with `kebab-llm-local`: wiremock under
# tokio for HTTP fixtures.
wiremock = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread"] }
# Used by `tests/common/mod.rs` to render the opt-in OCR integration
# fixture. Only loaded for tests; the production crate doesn't need
# font rendering.
ab_glyph = "0.2"
# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption
# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is
# only allowed at the dev-dep level here — the runtime crate stays
# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local`
# at runtime) is preserved.
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
kebab-llm-local = { path = "../kebab-llm-local" }

# Inherit the lint configuration declared at the workspace root.
[lints]
workspace = true