From fd918a60ce482373500330002892775e7e3e5b61 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 07:07:18 +0000 Subject: [PATCH] =?UTF-8?q?feat(config):=20add=20[pdf.ocr]=20section=20?= =?UTF-8?q?=E2=80=94=20qwen2.5vl:3b=20default,=20opt-in=20+=20env=20overri?= =?UTF-8?q?des=20+=20doc(app):=20PdfOcrOpts=20field=20doc=20(Step=204=20I-?= =?UTF-8?q?1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 5 (Group F) of v0.20.0 sub-item 1 (scanned PDF OCR) plan + Step 4 reviewer Important I-1 fix (PdfOcrOpts field doc) 동봉. F1 — `kebab-config::PdfCfg` + `PdfOcrCfg` + 4 default fn: - PdfCfg { ocr: PdfOcrCfg }. - PdfOcrCfg with 11 field (enabled/always_on/engine/model/endpoint/ languages/max_pixels/request_timeout_secs/valid_ratio_threshold/ min_char_count/lang_hint). - defaults: opt-in (enabled=false), qwen2.5vl:3b, 0.5 threshold, 20 char. - mirror of image OCR cfg pattern (spec §4.5). Config struct extension: - `pdf: PdfCfg` field with `#[serde(default = "PdfCfg::defaults")]`. 11 env var override (parallel to KEBAB_IMAGE_OCR_*): KEBAB_PDF_OCR_{ENABLED,ALWAYS_ON,ENGINE,MODEL,ENDPOINT,LANGUAGES, MAX_PIXELS,REQUEST_TIMEOUT_SECS,VALID_RATIO_THRESHOLD,MIN_CHAR_COUNT, LANG_HINT}. F2 — `crates/kebab-config/tests/pdf_ocr.rs` (신규): - toml roundtrip (11 field). - defaults (opt-in + qwen2.5vl:3b). - env override (4 key sample + default preservation). F3 (Step 4 I-1) — `pdf_ocr_apply.rs` 4 public item 의 doc comment: - PdfOcrOpts struct + 6 field. - PdfOcrSummary struct + 2 field. - apply_ocr_to_pdf_pages fn (Errors block 포함). - PdfOcrProgress enum + 2 variant + 5 field. body 변경 0, doc-only. 
spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.5) plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 5 F1+F2) prior: 9f003ef (Step 4) — code reviewer Important I-1 resolution contract: §9 (additive minor wire bump — Step 7) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/pdf_ocr_apply.rs | 55 ++++++++++- crates/kebab-config/src/lib.rs | 127 ++++++++++++++++++++++++++ crates/kebab-config/tests/pdf_ocr.rs | 80 ++++++++++++++++ 3 files changed, 260 insertions(+), 2 deletions(-) create mode 100644 crates/kebab-config/tests/pdf_ocr.rs diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs index 1c6ed14..f903507 100644 --- a/crates/kebab-app/src/pdf_ocr_apply.rs +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -22,11 +22,26 @@ use lopdf::Document as LopdfDocument; use time::OffsetDateTime; use tracing::warn; +/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`]. +/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade +/// (`kebab_app::ingest_one_pdf_asset`) fills these from +/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT). pub struct PdfOcrOpts { + /// Master switch. `false` short-circuits to + /// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse. pub enabled: bool, + /// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push). + /// `false` → text-detect block 의 `min_char_count` 또는 + /// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate). pub always_on: bool, + /// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가 + /// 본 임계 미만이면 OCR fallback. Default `0.5`. pub valid_ratio_threshold: f32, + /// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback. + /// empty page (cover, blank separator) 자동 skip. Default `20`. pub min_char_count: u32, + /// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`). + /// `None` → no hint passed to engine. 
pub lang_hint: Option, /// Optional per-page cancellation handle. checked at start of each page /// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4 @@ -34,12 +49,34 @@ pub struct PdfOcrOpts { pub cancel: Option>, } +/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's +/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2). #[derive(Debug)] pub struct PdfOcrSummary { + /// Number of pages 가 OCR pipeline 을 실제 통과 (skipped page 제외). pub pages_ocrd: u32, + /// Cumulative wall-clock duration of successful OCR engine calls (ms). + /// `saturating_add` 사용 — 24-day cumulative 까지 overflow-safe. pub ms_total: u64, } +/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page, +/// classifies each page via `text_quality::compute_valid_char_ratio` + +/// `min_char_count`, and either: +/// - skips (vector PDF + sufficient text + `always_on=false`), +/// - mutates the text-detect `Block::Paragraph` in-place with OCR output +/// (scanned/mojibake page), or +/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` + +/// vector page). +/// +/// Errors: +/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`. +/// - lopdf re-parse failure → `Err(...)`. +/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning` +/// event push + `emit_progress(Finished { skipped: true })` + continue +/// (no `Err` propagation). +/// +/// See spec §4.1 + §4.4 for the full pipeline. pub fn apply_ocr_to_pdf_pages( canonical: &mut CanonicalDocument, engine: &dyn OcrEngine, @@ -233,12 +270,26 @@ fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize { .expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)") } +/// Per-page OCR progress event 가 caller 의 `emit_progress` closure 호출 시 emit. +/// Step 6 의 ingest_one_pdf_asset 가 IngestEvent::PdfOcrStarted / PdfOcrFinished +/// 로 carry (spec §4.6.1 wire schema). 
pub enum PdfOcrProgress { - Started { page: u32 }, - Finished { + /// page 별 OCR 시작 시 emit. `engine.recognize` 호출 직전. + Started { + /// 1-based PDF page number. page: u32, + }, + /// page 별 OCR 종료 시 emit (성공 / skip / failure 모두). + Finished { + /// 1-based PDF page number. + page: u32, + /// `engine.recognize` wall-clock duration. skip path 의 의미는 mixed + /// (DCTDecode 부재 시 `0`, OCR engine 실패 시 actual latency before bail). ms: u64, + /// OCR result text 의 char count. skip 시 `0`. chars: u32, + /// `true` = DCTDecode 부재 또는 OCR engine 실패 로 skip. + /// `false` = 정상 OCR 완료. skipped: bool, }, } diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 5a4215e..672bc4c 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -50,6 +50,11 @@ pub struct Config { /// load cleanly with built-in defaults. #[serde(default)] pub ingest: IngestCfg, + /// v0.20.0 sub-item 1: PDF ingest pipeline settings. `#[serde(default)]` + /// so pre-v0.20 config files without a `[pdf]` section load with + /// built-in defaults (OCR disabled — opt-in for scanned PDF KB). + #[serde(default = "PdfCfg::defaults")] + pub pdf: PdfCfg, /// p9-fb-05: directory of the on-disk config file this `Config` /// was loaded from, if any. Populated by `Config::from_file` / /// `Config::load` — never serialized (`#[serde(skip)]`). Used by @@ -392,6 +397,88 @@ impl CaptionCfg { } } +/// Settings for the PDF ingest pipeline (P7 + v0.20.0 sub-item 1). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct PdfCfg { + #[serde(default = "PdfOcrCfg::defaults")] + pub ocr: PdfOcrCfg, +} + +impl PdfCfg { + pub fn defaults() -> Self { + Self { ocr: PdfOcrCfg::defaults() } + } +} + +impl Default for PdfCfg { + fn default() -> Self { Self::defaults() } +} + +/// v0.20.0 sub-item 1: scanned PDF OCR via Ollama vision LLM. Default +/// disabled — opt-in because OCR adds ~45-100s per scanned page on CPU +/// (qwen2.5vl:3b, remote). 
Enable for book / paper scan KB. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct PdfOcrCfg { + /// Run OCR on scanned PDF pages. Default `false` (opt-in). + pub enabled: bool, + /// `false` (default) — text-detect first + vision fallback on + /// scanned pages only. `true` — vision LLM 호출 on every page + /// (vector PDF 의 dual-text confidence boost — doubles chunk count). + pub always_on: bool, + /// Engine identifier. v1 only ships `"ollama-vision"`. + pub engine: String, + /// Vision model id. Default `"qwen2.5vl:3b"` per PoC (§3.5 family + /// asymmetry vs image OCR's gemma4:e4b is acknowledged). + pub model: String, + /// HTTP endpoint. `None` → fall back to `models.llm.endpoint`. + #[serde(default)] + pub endpoint: Option<String>, + /// BCP-47 language hints rendered into prompt. + pub languages: Vec<String>, + /// Long-edge cap (px). Larger images bloat prompt cost. + pub max_pixels: u32, + /// HTTP request timeout (sec). Same `0` = "fail immediately" + /// semantics as `image.ocr.request_timeout_secs` (NOT a disable + /// sentinel — see image.ocr docs). + #[serde(default = "default_pdf_ocr_request_timeout_secs")] + pub request_timeout_secs: u64, + /// Valid char ratio threshold (0.0..=1.0). Page with ratio below + /// this is classified as scanned/mojibake → OCR fallback. Default + /// `0.5`. + #[serde(default = "default_pdf_ocr_valid_ratio")] + pub valid_ratio_threshold: f32, + /// Minimum char count per page below which page is auto-scanned. + /// Default `20`. + #[serde(default = "default_pdf_ocr_min_char_count")] + pub min_char_count: u32, + /// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
+ #[serde(default = "default_pdf_ocr_lang_hint")] + pub lang_hint: Option<String>, +} + +impl PdfOcrCfg { + pub fn defaults() -> Self { + Self { + enabled: false, + always_on: false, + engine: "ollama-vision".to_string(), + model: "qwen2.5vl:3b".to_string(), + endpoint: None, + languages: vec!["eng".to_string(), "kor".to_string()], + max_pixels: 2048, + request_timeout_secs: default_pdf_ocr_request_timeout_secs(), + valid_ratio_threshold: default_pdf_ocr_valid_ratio(), + min_char_count: default_pdf_ocr_min_char_count(), + lang_hint: default_pdf_ocr_lang_hint(), + } + } +} + +fn default_pdf_ocr_request_timeout_secs() -> u64 { 600 } +fn default_pdf_ocr_valid_ratio() -> f32 { 0.5 } +fn default_pdf_ocr_min_char_count() -> u32 { 20 } +fn default_pdf_ocr_lang_hint() -> Option<String> { Some("kor".to_string()) } + /// p9-fb-14: TUI-only configuration. Currently a single `theme` /// selector (`"dark"` / `"light"`); future fields (custom role /// overrides, mode-machine cursor shapes, …) extend the same @@ -539,6 +626,7 @@ impl Config { image: ImageCfg::defaults(), ui: UiCfg::defaults(), ingest: IngestCfg::default(), + pdf: PdfCfg::defaults(), // p9-fb-05: defaults are not loaded from disk, so no // source_dir.
Relative `workspace.root` (rare with // defaults) falls back to caller `cwd` via the @@ -903,6 +991,45 @@ impl Config { self.image.caption.prompt_template_version = v.clone(); } + // pdf.ocr (v0.20.0 sub-item 1) + "KEBAB_PDF_OCR_ENABLED" => self.pdf.ocr.enabled = parse_bool(v), + "KEBAB_PDF_OCR_ALWAYS_ON" => self.pdf.ocr.always_on = parse_bool(v), + "KEBAB_PDF_OCR_ENGINE" => self.pdf.ocr.engine = v.clone(), + "KEBAB_PDF_OCR_MODEL" => self.pdf.ocr.model = v.clone(), + "KEBAB_PDF_OCR_ENDPOINT" => { + self.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; + } + "KEBAB_PDF_OCR_LANGUAGES" => { + self.pdf.ocr.languages = v + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + "KEBAB_PDF_OCR_MAX_PIXELS" => { + if let Ok(n) = v.parse::<u32>() { + self.pdf.ocr.max_pixels = n; + } + } + "KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => { + if let Ok(n) = v.parse::<u64>() { + self.pdf.ocr.request_timeout_secs = n; + } + } + "KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => { + if let Ok(n) = v.parse::<f32>() { + self.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0); + } + } + "KEBAB_PDF_OCR_MIN_CHAR_COUNT" => { + if let Ok(n) = v.parse::<u32>() { + self.pdf.ocr.min_char_count = n; + } + } + "KEBAB_PDF_OCR_LANG_HINT" => { + self.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) }; + } + // Unknown KEBAB_* keys are silently ignored — see // `env_unknown_key_is_ignored` test. _ => {} diff --git a/crates/kebab-config/tests/pdf_ocr.rs b/crates/kebab-config/tests/pdf_ocr.rs new file mode 100644 index 0000000..d559646 --- /dev/null +++ b/crates/kebab-config/tests/pdf_ocr.rs @@ -0,0 +1,80 @@ +// crates/kebab-config/tests/pdf_ocr.rs +// +// Integration tests for [pdf.ocr] config section (v0.20.0 sub-item 1). + +use std::collections::HashMap; +use kebab_config::{Config, PdfCfg}; + +// Test 1: toml roundtrip — spec §4.5 line 1034-1047 example block. +// Config requires many required fields; test the [pdf] section via PdfCfg wrapper.
+#[derive(serde::Deserialize)] +struct PdfWrapper { pdf: PdfCfg } + +#[test] +fn pdf_ocr_toml_roundtrip() { + let toml = r#" +[pdf.ocr] +enabled = true +always_on = false +engine = "ollama-vision" +model = "qwen2.5vl:7b" +endpoint = "http://192.168.0.47:11434" +languages = ["eng", "kor"] +max_pixels = 3072 +request_timeout_secs = 900 +valid_ratio_threshold = 0.6 +min_char_count = 30 +lang_hint = "kor" +"#; + let w: PdfWrapper = toml::from_str(toml).expect("parse toml"); + let ocr = &w.pdf.ocr; + assert!(ocr.enabled); + assert!(!ocr.always_on); + assert_eq!(ocr.engine, "ollama-vision"); + assert_eq!(ocr.model, "qwen2.5vl:7b"); + assert_eq!(ocr.endpoint.as_deref(), Some("http://192.168.0.47:11434")); + assert_eq!(ocr.languages, vec!["eng".to_string(), "kor".to_string()]); + assert_eq!(ocr.max_pixels, 3072); + assert_eq!(ocr.request_timeout_secs, 900); + assert!((ocr.valid_ratio_threshold - 0.6).abs() < 1e-6); + assert_eq!(ocr.min_char_count, 30); + assert_eq!(ocr.lang_hint.as_deref(), Some("kor")); +} + +// Test 2: defaults — opt-in, qwen2.5vl:3b model, 0.5 threshold, 20 min_char. +#[test] +fn pdf_ocr_defaults_off_with_qwen_3b() { + let cfg = Config::defaults(); + assert_eq!(cfg.pdf.ocr.enabled, false); + assert_eq!(cfg.pdf.ocr.always_on, false); + assert_eq!(cfg.pdf.ocr.engine, "ollama-vision"); + assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:3b"); + assert!(cfg.pdf.ocr.endpoint.is_none()); + assert_eq!(cfg.pdf.ocr.languages, vec!["eng".to_string(), "kor".to_string()]); + assert_eq!(cfg.pdf.ocr.max_pixels, 2048); + assert_eq!(cfg.pdf.ocr.request_timeout_secs, 600); + assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6); + assert_eq!(cfg.pdf.ocr.min_char_count, 20); + assert_eq!(cfg.pdf.ocr.lang_hint.as_deref(), Some("kor")); +} + +// Test 3: env var override — 4 keys 의 typical override case. 
+#[test] +fn pdf_ocr_env_overrides() { + let mut env: HashMap<String, String> = HashMap::new(); + env.insert("KEBAB_PDF_OCR_ENABLED".to_string(), "true".to_string()); + env.insert("KEBAB_PDF_OCR_MODEL".to_string(), "qwen2.5vl:7b".to_string()); + env.insert("KEBAB_PDF_OCR_ALWAYS_ON".to_string(), "true".to_string()); + env.insert("KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD".to_string(), "0.75".to_string()); + + let cfg = Config::defaults().apply_env(&env); + + assert_eq!(cfg.pdf.ocr.enabled, true); + assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:7b"); + assert_eq!(cfg.pdf.ocr.always_on, true); + assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6); + + // 다른 env var 가 default 보존 + assert_eq!(cfg.pdf.ocr.engine, "ollama-vision"); + assert_eq!(cfg.pdf.ocr.min_char_count, 20); +}