From 9f003ef1cd067fc9aa6677ec3715f18c72481410 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 06:42:01 +0000 Subject: [PATCH] =?UTF-8?q?feat(app):=20add=20pdf=5Focr=5Fapply=20helper?= =?UTF-8?q?=20(10=20test,=20F7=20split=20+=20cancel)=20=E2=80=94=20post-ex?= =?UTF-8?q?tract=20OCR=20enrichment=20for=20PDF=20(H-1=20resolution)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 4 (Group D) of v0.20.0 sub-item 1 (scanned PDF OCR) plan. D1 — `apply_ocr_to_pdf_pages(&mut canonical, &dyn OcrEngine, &bytes, &opts, emit_progress)` in `kebab-app::pdf_ocr_apply`. spec §4.1 line 381-599 body 그대로 + PdfOcrOpts.cancel field + per-page cancel check (verifier LOW L-1). post-extract enrichment pattern (H-1 resolution): kebab-parse-pdf 가 kebab-parse-image::OcrEngine 을 import 하지 않음 (parser isolation 보존). helper 가 kebab-app 의 facade 안 — both parser crate 의 cross-import 회피. Per-page decision matrix (spec §4.1 line 459-464): - always_on=true → 모든 page OCR (dual-block, ordinal = page-1 + page_count). - always_on=false + needs_ocr → in-place OCR (text-detect block mutate). - needs_ocr=false → skip. DCTDecode-only v1 (H-3): FlateDecode / CCITTFaxDecode page 는 extract_dctdecode_page_image=None → Warning event + skip + emit_progress(skipped=true). OcrEngine.recognize 실패 → Warning event + skip + emit_progress(skipped=true). D3 — per-page cancel handle (verifier LOW L-1 + spec §4.8 line 1159): PdfOcrOpts.cancel: Option<Arc<AtomicBool>>. set→true 시 `anyhow::bail!("PDF OCR cancelled mid-PDF at page N")`. lopdf = "0.32" added to [dependencies] (already transitive via kebab-parse-pdf; no new crate introduced — dep graph kebab-parse-* baseline unchanged). Integration test (`tests/pdf_ocr_apply.rs`, 10 test): - f1_input_with_ocr_enabled_replaces_empty_block — in-place mutate. - f3_input_with_ocr_enabled_keeps_text_detect_blocks — vector PDF skip. - f1_input_with_ocr_disabled_keeps_empty_block — disabled no-op. 
- f4_input_with_ocr_enabled_replaces_mojibake_block — mojibake → in-place mutate. - f3_input_with_always_on_pushes_dual_blocks — always_on dual-block. - f6_flatedecode_skipped_with_warning — FlateDecode skip + Warning event. - f7_ccittfax_skipped_with_warning — CCITTFax skip + Warning event (verifier M-4 split). - ocr_engine_failure_surfaces_as_warning — OCR failure → Warning event. - dual_block_ordinals_are_deterministic_and_unique — ordinal invariant. - cancel_handle_aborts_mid_pdf — cancel handle 의 production source (D3). MockOcrEngine fixture: spec §5.5 line 1284-1299. F3 fixture 부재 → mock CanonicalDocument construction + F1 bytes reuse pattern (Option B: PdfTextExtractor::extract 를 통한 실제 production path canonical 생성). spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.1 + §5.5) plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 4 D1+D2+D3) prior: c2cd3a7 (Step 3) + 8d81bc1 (Step 3 clippy fix) contract: §9 (additive minor wire bump — 후속 step) Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-app/Cargo.toml | 1 + crates/kebab-app/src/lib.rs | 1 + crates/kebab-app/src/pdf_ocr_apply.rs | 244 ++++++++++++++ crates/kebab-app/tests/pdf_ocr_apply.rs | 417 ++++++++++++++++++++++++ 4 files changed, 663 insertions(+) create mode 100644 crates/kebab-app/src/pdf_ocr_apply.rs create mode 100644 crates/kebab-app/tests/pdf_ocr_apply.rs diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index f1b991e..2ad9fcf 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -35,6 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" } # per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`. kebab-parse-pdf = { path = "../kebab-parse-pdf" } +lopdf = "0.32" # p10-1A-2: Rust AST extractor lives here. 
App threads it into the # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`. diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 96951e5..78c1ca8 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -62,6 +62,7 @@ pub mod external; pub mod fetch; pub mod ingest_progress; pub mod logging; +pub mod pdf_ocr_apply; pub mod reset; pub mod schema; mod staleness; diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs new file mode 100644 index 0000000..1c6ed14 --- /dev/null +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -0,0 +1,244 @@ +// crates/kebab-app/src/pdf_ocr_apply.rs +// +// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가 +// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠. +// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의 +// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는 +// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는 +// 새 Block::Paragraph 를 push (always_on dual-block). + +use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::time::Instant; + +use anyhow::{Context, Result}; +use kebab_core::{ + Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, + ProvenanceKind, SourceSpan, TextBlock, id_for_block, +}; +use kebab_parse_image::OcrEngine; +use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image}; +use lopdf::Document as LopdfDocument; +use time::OffsetDateTime; +use tracing::warn; + +pub struct PdfOcrOpts { + pub enabled: bool, + pub always_on: bool, + pub valid_ratio_threshold: f32, + pub min_char_count: u32, + pub lang_hint: Option, + /// Optional per-page cancellation handle. checked at start of each page + /// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. 
plan §6 E4 + /// + verifier LOW L-1 resolution + spec §4.8 line 1159 명시. + pub cancel: Option>, +} + +#[derive(Debug)] +pub struct PdfOcrSummary { + pub pages_ocrd: u32, + pub ms_total: u64, +} + +pub fn apply_ocr_to_pdf_pages( + canonical: &mut CanonicalDocument, + engine: &dyn OcrEngine, + pdf_bytes: &[u8], + opts: &PdfOcrOpts, + mut emit_progress: F, +) -> Result +where + F: FnMut(PdfOcrProgress), +{ + if !opts.enabled { + return Ok(PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }); + } + let pdf_doc = LopdfDocument::load_mem(pdf_bytes) + .context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?; + let page_count = pdf_doc.get_pages().len() as u32; + + let mut new_events: Vec = Vec::new(); + let mut ocr_blocks: Vec = Vec::new(); + let mut pages_ocrd: u32 = 0; + let mut ms_total: u64 = 0; + + // canonical.blocks 의 page → block index map (text-detect block 의 in-place + // mutate 또는 dual-block push 결정용). + // PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를 + // 생성 (§1.4) — 그 invariant 사용. + for page_num in 1..=page_count { + if let Some(cancel) = &opts.cancel { + if cancel.load(std::sync::atomic::Ordering::Relaxed) { + anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}"); + } + } + + let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num); + let text = match &canonical.blocks[text_block_idx] { + Block::Paragraph(tb) => tb.text.clone(), + _ => String::new(), + }; + let chars = text.chars().count() as u32; + let valid_ratio = compute_valid_char_ratio(&text); + let needs_ocr = + chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold; + + // 결정 matrix: + // always_on=true → 모든 page OCR (dual-block). + // always_on=false + needs_ocr → in-place OCR (text-detect block mutate). + // needs_ocr=false → skip. 
+ let do_ocr = opts.always_on || needs_ocr; + if !do_ocr { + continue; + } + + emit_progress(PdfOcrProgress::Started { page: page_num }); + + let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? { b } else { + let note = format!( + "page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)" + ); + warn!(target: "kebab-app", "{}", note); + new_events.push(ProvenanceEvent { + at: OffsetDateTime::now_utc(), + agent: "kb-parse-pdf".to_string(), + kind: ProvenanceKind::Warning, + note: Some(note), + }); + emit_progress(PdfOcrProgress::Finished { + page: page_num, + ms: 0, + chars: 0, + skipped: true, + }); + continue; + }; + + let start = Instant::now(); + let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) { + Ok(t) => t, + Err(e) => { + // OCR failure: warning event + skip (text-detect block 그대로). + let note = format!( + "page={} OCR failed engine={} version={} err={}", + page_num, + engine.engine_name(), + engine.engine_version(), + e + ); + warn!(target: "kebab-app", "{}", note); + new_events.push(ProvenanceEvent { + at: OffsetDateTime::now_utc(), + agent: "kb-parse-pdf".to_string(), + kind: ProvenanceKind::Warning, + note: Some(note), + }); + emit_progress(PdfOcrProgress::Finished { + page: page_num, + ms: start.elapsed().as_millis() as u64, + chars: 0, + skipped: true, + }); + continue; + } + }; + let elapsed_ms = start.elapsed().as_millis() as u64; + let chars_ocr = ocr.joined.chars().count() as u32; + + pages_ocrd = pages_ocrd.saturating_add(1); + ms_total = ms_total.saturating_add(elapsed_ms); + + if opts.always_on && !needs_ocr { + // dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count. 
+ let ocr_ordinal = (page_num - 1) + page_count; + let span_ocr = SourceSpan::Page { + page: page_num, + char_start: Some(0), + char_end: Some(chars_ocr), + }; + let block_id = + id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr); + let common = CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span_ocr, + }; + ocr_blocks.push(Block::Paragraph(TextBlock { + common, + text: ocr.joined.clone(), + inlines: if ocr.joined.is_empty() { + Vec::new() + } else { + vec![Inline::Text { + text: ocr.joined.clone(), + }] + }, + })); + } else { + // in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체. + // block_id / ordinal 보존 — span 의 char_end 만 갱신. + if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] { + tb.text = ocr.joined.clone(); + tb.inlines = if ocr.joined.is_empty() { + Vec::new() + } else { + vec![Inline::Text { + text: ocr.joined.clone(), + }] + }; + if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span { + *char_end = Some(chars_ocr); + } + } + } + + new_events.push(ProvenanceEvent { + at: OffsetDateTime::now_utc(), + agent: "kb-parse-pdf".to_string(), + kind: ProvenanceKind::OcrApplied, + note: Some(format!( + "page={} engine={} version={} regions={} ms={} chars={}", + page_num, + engine.engine_name(), + engine.engine_version(), + ocr.regions.len(), + elapsed_ms, + chars_ocr + )), + }); + + emit_progress(PdfOcrProgress::Finished { + page: page_num, + ms: elapsed_ms, + chars: chars_ocr, + skipped: false, + }); + } + + canonical.blocks.extend(ocr_blocks); + canonical.provenance.events.extend(new_events); + Ok(PdfOcrSummary { pages_ocrd, ms_total }) +} + +fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize { + blocks + .iter() + .position(|b| match b { + Block::Paragraph(tb) => matches!( + tb.common.source_span, + SourceSpan::Page { page, .. 
} if page == page_num + ), + _ => false, + }) + .expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)") +} + +pub enum PdfOcrProgress { + Started { page: u32 }, + Finished { + page: u32, + ms: u64, + chars: u32, + skipped: bool, + }, +} diff --git a/crates/kebab-app/tests/pdf_ocr_apply.rs b/crates/kebab-app/tests/pdf_ocr_apply.rs new file mode 100644 index 0000000..78e0597 --- /dev/null +++ b/crates/kebab-app/tests/pdf_ocr_apply.rs @@ -0,0 +1,417 @@ +//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern. + +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; + +use anyhow::Result; +use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages}; +use kebab_core::{ + AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, + Extractor, Inline, Lang, MediaType, OcrText, RawAsset, SourceSpan, + SourceUri, WorkspacePath, id_for_asset, +}; +use kebab_parse_image::OcrEngine; +use kebab_parse_pdf::PdfTextExtractor; +use time::OffsetDateTime; + +// ── MockOcrEngine fixture ───────────────────────────────────────────────── + +struct MockOcrEngine { + expected_text: String, + fail: bool, +} + +impl OcrEngine for MockOcrEngine { + fn engine_name(&self) -> &'static str { + "mock-ocr" + } + + fn engine_version(&self) -> String { + "mock-v1".to_string() + } + + fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result { + if self.fail { + anyhow::bail!("mock failure"); + } + Ok(OcrText { + joined: self.expected_text.clone(), + regions: Vec::new(), + engine: self.engine_name().to_string(), + engine_version: self.engine_version(), + }) + } +} + +// ── Fixture helpers ─────────────────────────────────────────────────────── + +fn f1_pdf_bytes() -> Vec { + std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf") + .expect("F1 fixture missing") +} + +fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset { + let fake_hash = 
"0".repeat(64); + let asset_id = id_for_asset(&fake_hash); + RawAsset { + asset_id, + source_uri: SourceUri::File(PathBuf::from(path)), + workspace_path: WorkspacePath::new(path.to_string()).unwrap(), + media_type, + byte_len, + checksum: Checksum(fake_hash.clone()), + discovered_at: OffsetDateTime::UNIX_EPOCH, + stored: AssetStorage::Copied { + path: PathBuf::from(path), + }, + } +} + +/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor. +/// F1 (scanned) returns an empty-text Block::Paragraph per page. +fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument { + let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64); + let workspace_root = Path::new("/"); + let config = ExtractConfig::default(); + let ctx = ExtractContext { + asset: &asset, + workspace_root, + config: &config, + }; + PdfTextExtractor::new().extract(&ctx, bytes).unwrap() +} + +/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1. +fn canonical_with_empty_block() -> CanonicalDocument { + extract_canonical_from_bytes(&f1_pdf_bytes()) +} + +/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20). +fn canonical_with_filled_block(text: &str) -> CanonicalDocument { + let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes()); + if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() { + let char_count = text.chars().count() as u32; + tb.text = text.to_string(); + tb.inlines = vec![Inline::Text { + text: text.to_string(), + }]; + if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span { + *char_end = Some(char_count); + } + } + canonical +} + +/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio). 
+fn canonical_with_mojibake_block() -> CanonicalDocument { + let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes()); + if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() { + let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0 + let char_count = pua.chars().count() as u32; + tb.text = pua.clone(); + tb.inlines = vec![Inline::Text { text: pua }]; + if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span { + *char_end = Some(char_count); + } + } + canonical +} + +fn default_opts(enabled: bool) -> PdfOcrOpts { + PdfOcrOpts { + enabled, + always_on: false, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: None, + cancel: None, + } +} + +// ── Tests ───────────────────────────────────────────────────────────────── + +// Test 1: F1 + enabled=true → in-place mutate +#[test] +fn f1_input_with_ocr_enabled_replaces_empty_block() { + let bytes = f1_pdf_bytes(); + let mut canonical = canonical_with_empty_block(); + let engine = MockOcrEngine { + expected_text: "MOCK_OCR_TEXT".into(), + fail: false, + }; + let opts = PdfOcrOpts { + enabled: true, + always_on: false, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: Some(Lang("kor".into())), + cancel: None, + }; + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 1); + let first_para = canonical.blocks.iter().find_map(|b| match b { + Block::Paragraph(tb) => Some(tb), + _ => None, + }); + assert!(first_para.is_some()); + assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT"); +} + +// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false) +#[test] +fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() { + let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text + let text = "충분한 한국어 텍스트 컨텐츠입니다. 
This has more than twenty characters."; + let mut canonical = canonical_with_filled_block(text); + let engine = MockOcrEngine { + expected_text: "SHOULD_NOT_BE_CALLED".into(), + fail: false, + }; + let opts = default_opts(true); + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0"); + let first_para = canonical.blocks.iter().find_map(|b| match b { + Block::Paragraph(tb) => Some(tb), + _ => None, + }); + if let Some(tb) = first_para { + assert!(tb.text.starts_with("충분한"), "원본 text 보존"); + } +} + +// Test 3: F1 + enabled=false → no-op +#[test] +fn f1_input_with_ocr_disabled_keeps_empty_block() { + let bytes = f1_pdf_bytes(); + let mut canonical = canonical_with_empty_block(); + let engine = MockOcrEngine { + expected_text: "IGNORED".into(), + fail: false, + }; + let opts = default_opts(false); + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 0); + assert_eq!(summary.ms_total, 0); +} + +// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate +#[test] +fn f4_input_with_ocr_enabled_replaces_mojibake_block() { + let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image + let mut canonical = canonical_with_mojibake_block(); + let engine = MockOcrEngine { + expected_text: "OCR_MOJIBAKE_REPLACEMENT".into(), + fail: false, + }; + let opts = PdfOcrOpts { + enabled: true, + always_on: false, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: None, + cancel: None, + }; + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출"); + let first_para = canonical.blocks.iter().find_map(|b| match b { + Block::Paragraph(tb) => Some(tb), + _ => None, + }); + if let Some(tb) = first_para { + assert_eq!(tb.text, "OCR_MOJIBAKE_REPLACEMENT"); + } +} + +// Test 5: 
filled canonical + always_on=true → dual-block (+1 OCR block) +#[test] +fn f3_input_with_always_on_pushes_dual_blocks() { + let bytes = f1_pdf_bytes(); + let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio."; + let mut canonical = canonical_with_filled_block(text); + let original_block_count = canonical.blocks.len(); + let engine = MockOcrEngine { + expected_text: "OCR_DUAL".into(), + fail: false, + }; + let opts = PdfOcrOpts { + enabled: true, + always_on: true, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: None, + cancel: None, + }; + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 1); + assert_eq!( + canonical.blocks.len(), + original_block_count + 1, + "always_on 시 새 Block::Paragraph push" + ); + let texts: Vec<&str> = canonical + .blocks + .iter() + .filter_map(|b| match b { + Block::Paragraph(tb) => Some(tb.text.as_str()), + _ => None, + }) + .collect(); + assert!(texts.contains(&"OCR_DUAL"), "OCR block 포함"); + assert!( + texts.iter().any(|t| t.starts_with("vector")), + "원본 text-detect block 보존" + ); +} + +// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → skip + warning +#[test] +fn f6_flatedecode_skipped_with_warning() { + let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf") + .expect("F6 fixture missing"); + let mut canonical = canonical_with_empty_block(); // page-1 block from F1 + let engine = MockOcrEngine { + expected_text: "SHOULD_NOT_BE_CALLED".into(), + fail: false, + }; + let opts = default_opts(true); + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!( + summary.pages_ocrd, 0, + "FlateDecode page 는 skip (DCTDecode-only v1 invariant)" + ); + let warning_count = canonical + .provenance + .events + .iter() + .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning) + .count(); + assert!(warning_count >= 1, "FlateDecode 
skip 시 Warning event 발행"); +} + +// Test 7: F7 CCITTFax → skip + warning (verifier M-4 split) +#[test] +fn f7_ccittfax_skipped_with_warning() { + let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf") + .expect("F7 fixture missing"); + let mut canonical = canonical_with_empty_block(); // page-1 block from F1 + let engine = MockOcrEngine { + expected_text: "SHOULD_NOT_BE_CALLED".into(), + fail: false, + }; + let opts = default_opts(true); + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip"); + let warning_count = canonical + .provenance + .events + .iter() + .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning) + .count(); + assert!(warning_count >= 1, "CCITTFax skip 시 Warning event 발행"); +} + +// Test 8: OCR engine failure → warning event + skip +#[test] +fn ocr_engine_failure_surfaces_as_warning() { + let bytes = f1_pdf_bytes(); + let mut canonical = canonical_with_empty_block(); + let engine = MockOcrEngine { + expected_text: String::new(), + fail: true, + }; + let opts = default_opts(true); + + let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0"); + let warning_with_failure = canonical.provenance.events.iter().any(|e| { + e.kind == kebab_core::ProvenanceKind::Warning + && e.note.as_deref().unwrap_or("").contains("mock failure") + }); + assert!( + warning_with_failure, + "OCR failure 의 error message 가 warning event 의 note 안" + ); +} + +// Test 9: dual-block ordinals are deterministic and unique +#[test] +fn dual_block_ordinals_are_deterministic_and_unique() { + let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1 + let text = "vector 충분한 텍스트. 
This text has more than twenty characters total."; + let mut canonical = canonical_with_filled_block(text); + let engine = MockOcrEngine { + expected_text: "DUAL".into(), + fail: false, + }; + let opts = PdfOcrOpts { + enabled: true, + always_on: true, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: None, + cancel: None, + }; + + apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap(); + + // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1) + let para_count = canonical + .blocks + .iter() + .filter(|b| matches!(b, Block::Paragraph(_))) + .count(); + assert_eq!(para_count, 2, "dual-block: text-detect + OCR"); + + let all_page_1 = canonical + .blocks + .iter() + .filter_map(|b| match b { + Block::Paragraph(tb) => Some(&tb.common.source_span), + _ => None, + }) + .all(|s| matches!(s, SourceSpan::Page { page: 1, .. })); + assert!(all_page_1, "두 block 모두 page=1"); +} + +// Test 10: cancel handle aborts mid-PDF +#[test] +fn cancel_handle_aborts_mid_pdf() { + let bytes = f1_pdf_bytes(); + let mut canonical = canonical_with_empty_block(); + let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel + let engine = MockOcrEngine { + expected_text: "IGNORED".into(), + fail: false, + }; + let opts = PdfOcrOpts { + enabled: true, + always_on: false, + valid_ratio_threshold: 0.5, + min_char_count: 20, + lang_hint: None, + cancel: Some(cancel.clone()), + }; + + let result = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}); + let err = result.expect_err("cancel=true 시 error 반환"); + assert!( + format!("{err}").contains("cancelled mid-PDF"), + "error message 가 'cancelled mid-PDF' 포함: {err}" + ); +}