feat(app): add pdf_ocr_apply helper (10 tests, F7 split + cancel) — post-extract OCR enrichment for PDF (H-1 resolution)

Step 4 (Group D) of v0.20.0 sub-item 1 (scanned PDF OCR) plan. D1 — `apply_ocr_to_pdf_pages(&mut canonical, &dyn OcrEngine, &bytes, &opts, emit_progress)` in `kebab-app::pdf_ocr_apply`. spec §4.1 line 381-599 body 그대로 + PdfOcrOpts.cancel field + per-page cancel check (verifier LOW L-1). post-extract enrichment pattern (H-1 resolution): kebab-parse-pdf 가 kebab-parse-image::OcrEngine 을 import 하지 않음 (parser isolation 보존). helper 가 kebab-app 의 facade 안 — both parser crate 의 cross-import 회피. Per-page decision matrix (spec §4.1 line 459-464): - always_on=true → 모든 page OCR (dual-block, ordinal = page-1 + page_count). - always_on=false + needs_ocr → in-place OCR (text-detect block mutate). - needs_ocr=false → skip. DCTDecode-only v1 (H-3): FlateDecode / CCITTFaxDecode page 는 extract_dctdecode_page_image=None → Warning event + skip + emit_progress(skipped=true). OcrEngine.recognize 실패 → Warning event + skip + emit_progress(skipped=true). D3 — per-page cancel handle (verifier LOW L-1 + spec §4.8 line 1159): PdfOcrOpts.cancel: Option<Arc<AtomicBool>>. set→true 시 `anyhow::bail!("PDF OCR cancelled mid-PDF at page N")`. lopdf = "0.32" added to [dependencies] (already transitive via kebab-parse-pdf; no new crate introduced — dep graph kebab-parse-* baseline unchanged). Integration test (`tests/pdf_ocr_apply.rs`, 10 test): - f1_input_with_ocr_enabled_replaces_empty_block — in-place mutate. - f3_input_with_ocr_enabled_keeps_text_detect_blocks — vector PDF skip. - f1_input_with_ocr_disabled_keeps_empty_block — disabled no-op. - f4_input_with_ocr_enabled_replaces_mojibake_block — mojibake → in-place mutate. - f3_input_with_always_on_pushes_dual_blocks — always_on dual-block. - f6_flatedecode_skipped_with_warning — FlateDecode skip + Warning event. - f7_ccittfax_skipped_with_warning — CCITTFax skip + Warning event (verifier M-4 split). - ocr_engine_failure_surfaces_as_warning — OCR failure → Warning event. - dual_block_ordinals_are_deterministic_and_unique — ordinal invariant. 
- cancel_handle_aborts_mid_pdf — cancel handle 의 production source (D3). MockOcrEngine fixture: spec §5.5 line 1284-1299. F3 fixture 부재 → mock CanonicalDocument construction + F1 bytes reuse pattern (Option B: PdfTextExtractor::extract 를 통한 실제 production path canonical 생성). spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.1 + §5.5) plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 4 D1+D2+D3) prior: c2cd3a7 (Step 3) + 8d81bc1 (Step 3 clippy fix) contract: §9 (additive minor wire bump — 후속 step) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 06:42:01 +00:00
parent 8d81bc1071
commit 9f003ef1cd
4 changed files with 663 additions and 0 deletions
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -35,6 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" }
 # per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
 # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
 kebab-parse-pdf = { path = "../kebab-parse-pdf" }
+lopdf            = "0.32"
 # p10-1A-2: Rust AST extractor lives here. App threads it into the
 # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
 # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -62,6 +62,7 @@ pub mod external;
 pub mod fetch;
 pub mod ingest_progress;
 pub mod logging;
+pub mod pdf_ocr_apply;
 pub mod reset;
 pub mod schema;
 mod staleness;
--- a/crates/kebab-app/src/pdf_ocr_apply.rs
+++ b/crates/kebab-app/src/pdf_ocr_apply.rs
@@ -0,0 +1,244 @@
+// crates/kebab-app/src/pdf_ocr_apply.rs
+//
+// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가
+// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠.
+// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의
+// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는
+// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는
+// 새 Block::Paragraph 를 push (always_on dual-block).
+
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+use std::time::Instant;
+
+use anyhow::{Context, Result};
+use kebab_core::{
+    Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent,
+    ProvenanceKind, SourceSpan, TextBlock, id_for_block,
+};
+use kebab_parse_image::OcrEngine;
+use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
+use lopdf::Document as LopdfDocument;
+use time::OffsetDateTime;
+use tracing::warn;
+
+/// Options controlling [`apply_ocr_to_pdf_pages`].
+pub struct PdfOcrOpts {
+    /// Master switch: when `false`, `apply_ocr_to_pdf_pages` returns an empty
+    /// summary before parsing the PDF or touching the document.
+    pub enabled: bool,
+    /// When `true`, every page is sent to OCR, not only the pages whose
+    /// extracted text fails the two thresholds below.
+    pub always_on: bool,
+    /// A page needs OCR when the valid-character ratio of its extracted text
+    /// is below this value.
+    pub valid_ratio_threshold: f32,
+    /// A page needs OCR when its extracted text has fewer characters than this.
+    pub min_char_count: u32,
+    /// Optional language hint forwarded to the OCR engine's `recognize` call.
+    pub lang_hint: Option<Lang>,
+    /// Optional per-page cancellation handle. Checked at the start of each
+    /// page-loop iteration; once it reads `true` the run returns a
+    /// `cancelled mid-PDF` error. See plan §6 E4, the verifier LOW L-1
+    /// resolution and spec §4.8 line 1159.
+    pub cancel: Option<Arc<AtomicBool>>,
+}
+
+/// Totals returned by [`apply_ocr_to_pdf_pages`].
+#[derive(Debug)]
+pub struct PdfOcrSummary {
+    /// Number of pages for which the OCR engine returned a result. Skipped
+    /// pages and pages whose recognition failed are not counted.
+    pub pages_ocrd: u32,
+    /// Sum of the per-page recognition times, in milliseconds, over the pages
+    /// counted in `pages_ocrd`.
+    pub ms_total: u64,
+}
+
+/// Enriches a PDF-derived [`CanonicalDocument`] with OCR text, one page at a time.
+///
+/// `pdf_bytes` is re-parsed with lopdf to obtain the page count and the page
+/// images. For every page (1-based) the first `Block::Paragraph` whose
+/// `SourceSpan::Page` matches that page is looked up, and its text decides
+/// what happens:
+///
+/// * The text "needs OCR" when it has fewer than `opts.min_char_count`
+///   characters or its valid-character ratio is below
+///   `opts.valid_ratio_threshold`. Such a page is OCR'd and the block's
+///   `text` / `inlines` are replaced in place; `block_id` is kept and only the
+///   span's `char_end` is updated.
+/// * With `opts.always_on`, a page whose text does not need OCR is OCR'd as
+///   well; the result is appended as an additional paragraph (dual-block) with
+///   ordinal `(page - 1) + page_count`, leaving the original block untouched.
+/// * Any other page is skipped without a progress event.
+///
+/// Pages that cannot be OCR'd — no DCTDecode page image, an OCR engine error,
+/// or no paragraph block for the page in `canonical` — are recorded as
+/// `ProvenanceKind::Warning` events and skipped; they do not fail the run.
+///
+/// `emit_progress` receives `Started` right before a page image is extracted
+/// and `Finished` once the page was OCR'd or skipped after `Started`.
+///
+/// When `opts.enabled` is `false` the function returns an all-zero summary
+/// without parsing `pdf_bytes`.
+///
+/// # Errors
+///
+/// * `pdf_bytes` cannot be parsed by lopdf, or the page count does not fit `u32`.
+/// * `extract_dctdecode_page_image` returns an error for a page.
+/// * `opts.cancel` reads `true` at the start of a page iteration
+///   (`PDF OCR cancelled mid-PDF at page N`).
+///
+/// On error, in-place replacements made for earlier pages remain in
+/// `canonical`, but the queued provenance events and dual-blocks are not
+/// appended.
+pub fn apply_ocr_to_pdf_pages<F>(
+    canonical: &mut CanonicalDocument,
+    engine: &dyn OcrEngine,
+    pdf_bytes: &[u8],
+    opts: &PdfOcrOpts,
+    mut emit_progress: F,
+) -> Result<PdfOcrSummary>
+where
+    F: FnMut(PdfOcrProgress),
+{
+    if !opts.enabled {
+        return Ok(PdfOcrSummary { pages_ocrd: 0, ms_total: 0 });
+    }
+    let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
+        .context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
+    let page_count = u32::try_from(pdf_doc.get_pages().len())
+        .context("kb-app::pdf_ocr_apply: PDF page count exceeds u32")?;
+
+    // New events and dual-blocks are queued and appended after the loop, so
+    // block indices returned by `find_paragraph_block_idx` stay valid while
+    // text-detect blocks are mutated in place.
+    let mut new_events: Vec<ProvenanceEvent> = Vec::new();
+    let mut ocr_blocks: Vec<Block> = Vec::new();
+    let mut pages_ocrd: u32 = 0;
+    let mut ms_total: u64 = 0;
+
+    for page_num in 1..=page_count {
+        if let Some(cancel) = &opts.cancel {
+            if cancel.load(std::sync::atomic::Ordering::Relaxed) {
+                anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
+            }
+        }
+
+        // `canonical` and `pdf_bytes` are supplied independently by the caller,
+        // so a page without a paragraph block is reported and skipped instead
+        // of panicking.
+        let Some(text_block_idx) = find_paragraph_block_idx(&canonical.blocks, page_num) else {
+            let note = format!(
+                "page={page_num} skipped: no text-detect Block::Paragraph for this page in the canonical document"
+            );
+            warn!(target: "kebab-app", "{}", note);
+            new_events.push(warning_event(note));
+            continue;
+        };
+
+        let text: &str = match &canonical.blocks[text_block_idx] {
+            Block::Paragraph(tb) => &tb.text,
+            _ => "",
+        };
+        let chars = saturating_char_count(text);
+        let valid_ratio = compute_valid_char_ratio(text);
+        let needs_ocr =
+            chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
+
+        // Decision matrix:
+        //   always_on=true, text fine  → OCR, result pushed as a dual-block.
+        //   needs_ocr                  → OCR, text-detect block replaced in place.
+        //   otherwise                  → skip.
+        if !(opts.always_on || needs_ocr) {
+            continue;
+        }
+
+        emit_progress(PdfOcrProgress::Started { page: page_num });
+
+        let Some(page_image_bytes) = extract_dctdecode_page_image(&pdf_doc, page_num)? else {
+            let note = format!(
+                "page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
+            );
+            warn!(target: "kebab-app", "{}", note);
+            new_events.push(warning_event(note));
+            emit_progress(PdfOcrProgress::Finished {
+                page: page_num,
+                ms: 0,
+                chars: 0,
+                skipped: true,
+            });
+            continue;
+        };
+
+        let start = Instant::now();
+        let recognized = engine.recognize(&page_image_bytes, opts.lang_hint.as_ref());
+        let elapsed_ms = millis_since(start);
+        let mut ocr = match recognized {
+            Ok(t) => t,
+            Err(e) => {
+                // OCR failure: warning event + skip (text-detect block is left as is).
+                let note = format!(
+                    "page={} OCR failed engine={} version={} err={}",
+                    page_num,
+                    engine.engine_name(),
+                    engine.engine_version(),
+                    e
+                );
+                warn!(target: "kebab-app", "{}", note);
+                new_events.push(warning_event(note));
+                emit_progress(PdfOcrProgress::Finished {
+                    page: page_num,
+                    ms: elapsed_ms,
+                    chars: 0,
+                    skipped: true,
+                });
+                continue;
+            }
+        };
+
+        // Take the recognized text out once; it is moved into exactly one block below.
+        let region_count = ocr.regions.len();
+        let joined = std::mem::take(&mut ocr.joined);
+        let chars_ocr = saturating_char_count(&joined);
+        let inlines = text_inlines(&joined);
+
+        pages_ocrd = pages_ocrd.saturating_add(1);
+        ms_total = ms_total.saturating_add(elapsed_ms);
+
+        if opts.always_on && !needs_ocr {
+            // Dual-block path: push a new Block::Paragraph whose ordinal is
+            // offset by page_count so it cannot collide with the text-detect block.
+            let ocr_ordinal = (page_num - 1) + page_count;
+            let span_ocr = SourceSpan::Page {
+                page: page_num,
+                char_start: Some(0),
+                char_end: Some(chars_ocr),
+            };
+            let block_id =
+                id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
+            ocr_blocks.push(Block::Paragraph(TextBlock {
+                common: CommonBlock {
+                    block_id,
+                    heading_path: Vec::new(),
+                    source_span: span_ocr,
+                },
+                text: joined,
+                inlines,
+            }));
+        } else if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
+            // In-place path: replace text / inlines of the empty or low-quality
+            // text-detect block. block_id and ordinal are preserved; only the
+            // span's char_end is refreshed.
+            tb.text = joined;
+            tb.inlines = inlines;
+            if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+                *char_end = Some(chars_ocr);
+            }
+        }
+
+        new_events.push(ProvenanceEvent {
+            at: OffsetDateTime::now_utc(),
+            agent: "kb-parse-pdf".to_string(),
+            kind: ProvenanceKind::OcrApplied,
+            note: Some(format!(
+                "page={} engine={} version={} regions={} ms={} chars={}",
+                page_num,
+                engine.engine_name(),
+                engine.engine_version(),
+                region_count,
+                elapsed_ms,
+                chars_ocr
+            )),
+        });
+
+        emit_progress(PdfOcrProgress::Finished {
+            page: page_num,
+            ms: elapsed_ms,
+            chars: chars_ocr,
+            skipped: false,
+        });
+    }
+
+    canonical.blocks.extend(ocr_blocks);
+    canonical.provenance.events.extend(new_events);
+    Ok(PdfOcrSummary { pages_ocrd, ms_total })
+}
+
+/// Builds a `Warning` provenance event stamped with the current UTC time.
+fn warning_event(note: String) -> ProvenanceEvent {
+    ProvenanceEvent {
+        at: OffsetDateTime::now_utc(),
+        agent: "kb-parse-pdf".to_string(),
+        kind: ProvenanceKind::Warning,
+        note: Some(note),
+    }
+}
+
+/// Wraps `text` in a single `Inline::Text`; empty text yields no inlines.
+fn text_inlines(text: &str) -> Vec<Inline> {
+    if text.is_empty() {
+        Vec::new()
+    } else {
+        vec![Inline::Text {
+            text: text.to_owned(),
+        }]
+    }
+}
+
+/// Number of `char`s in `text`, saturating at `u32::MAX` instead of truncating.
+fn saturating_char_count(text: &str) -> u32 {
+    u32::try_from(text.chars().count()).unwrap_or(u32::MAX)
+}
+
+/// Milliseconds elapsed since `start`, saturating at `u64::MAX`.
+fn millis_since(start: Instant) -> u64 {
+    u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX)
+}
+
+/// Index of the first `Block::Paragraph` whose span is `SourceSpan::Page` for
+/// `page_num`, or `None` when the document has no such block.
+fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> Option<usize> {
+    blocks.iter().position(|b| match b {
+        Block::Paragraph(tb) => matches!(
+            tb.common.source_span,
+            SourceSpan::Page { page, .. } if page == page_num
+        ),
+        _ => false,
+    })
+}
+
+/// Per-page progress event handed to the `emit_progress` callback of
+/// [`apply_ocr_to_pdf_pages`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum PdfOcrProgress {
+    /// OCR handling of `page` (1-based) is about to begin.
+    Started { page: u32 },
+    /// OCR handling of `page` ended, either with a result or with a skip.
+    Finished {
+        /// 1-based page number.
+        page: u32,
+        /// Time spent in the OCR engine, in milliseconds (0 when no engine call was made).
+        ms: u64,
+        /// Number of characters recognized (0 when skipped).
+        chars: u32,
+        /// `true` when the page produced no OCR text (no usable image or engine failure).
+        skipped: bool,
+    },
+}
--- a/crates/kebab-app/tests/pdf_ocr_apply.rs
+++ b/crates/kebab-app/tests/pdf_ocr_apply.rs
@@ -0,0 +1,417 @@
+//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
+
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+
+use anyhow::Result;
+use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
+use kebab_core::{
+    AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext,
+    Extractor, Inline, Lang, MediaType, OcrText, RawAsset, SourceSpan,
+    SourceUri, WorkspacePath, id_for_asset,
+};
+use kebab_parse_image::OcrEngine;
+use kebab_parse_pdf::PdfTextExtractor;
+use time::OffsetDateTime;
+
+// ── MockOcrEngine fixture ─────────────────────────────────────────────────
+
+/// Test double for [`OcrEngine`]: returns a canned transcription or fails on demand.
+struct MockOcrEngine {
+    /// Text reported as the OCR result when `fail` is `false`.
+    expected_text: String,
+    /// When `true`, `recognize` returns a "mock failure" error.
+    fail: bool,
+}
+
+impl OcrEngine for MockOcrEngine {
+    fn engine_name(&self) -> &'static str {
+        "mock-ocr"
+    }
+
+    fn engine_version(&self) -> String {
+        String::from("mock-v1")
+    }
+
+    /// Ignores the image and the language hint; the outcome depends only on
+    /// the fixture's own fields.
+    fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
+        anyhow::ensure!(!self.fail, "mock failure");
+        let joined = self.expected_text.clone();
+        let engine = self.engine_name().to_string();
+        let engine_version = self.engine_version();
+        Ok(OcrText {
+            joined,
+            regions: Vec::new(),
+            engine,
+            engine_version,
+        })
+    }
+}
+
+// ── Fixture helpers ───────────────────────────────────────────────────────
+
+/// Reads the F1 (scanned page) PDF fixture that lives in the sibling
+/// `kebab-parse-pdf` crate. Panics when the fixture cannot be read.
+fn f1_pdf_bytes() -> Vec<u8> {
+    let fixture_path = "../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf";
+    let contents = std::fs::read(fixture_path);
+    contents.expect("F1 fixture missing")
+}
+
+/// Builds a [`RawAsset`] for `path` with a placeholder all-zero checksum and
+/// the Unix epoch as discovery time.
+fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
+    let zero_hash = "0".repeat(64);
+    let location = PathBuf::from(path);
+    RawAsset {
+        asset_id: id_for_asset(&zero_hash),
+        source_uri: SourceUri::File(location.clone()),
+        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
+        media_type,
+        byte_len,
+        checksum: Checksum(zero_hash),
+        discovered_at: OffsetDateTime::UNIX_EPOCH,
+        stored: AssetStorage::Copied { path: location },
+    }
+}
+
+/// Runs `PdfTextExtractor` over raw PDF bytes and returns the resulting
+/// `CanonicalDocument`, i.e. the same document the production extract path
+/// would hand to the OCR helper. Panics if extraction fails.
+fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
+    let byte_len = bytes.len() as u64;
+    let asset = make_raw_asset("test.pdf", MediaType::Pdf, byte_len);
+    let config = ExtractConfig::default();
+    let ctx = ExtractContext {
+        asset: &asset,
+        workspace_root: Path::new("/"),
+        config: &config,
+    };
+    PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
+}
+
+/// Canonical document extracted from the F1 fixture bytes (the scanned page
+/// yields an empty page-1 paragraph).
+fn canonical_with_empty_block() -> CanonicalDocument {
+    let scanned = f1_pdf_bytes();
+    extract_canonical_from_bytes(&scanned)
+}
+
+/// F1-based canonical whose first block text is replaced by `text`
+/// (callers pass text with a high valid ratio and at least 20 chars).
+///
+/// Panics when the first block is not a `Block::Paragraph`, so a changed
+/// extractor layout fails loudly instead of leaving the fixture unmodified.
+fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
+    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
+    let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() else {
+        panic!("F1 canonical must start with a Block::Paragraph");
+    };
+    let char_count = text.chars().count() as u32;
+    tb.text = text.to_string();
+    tb.inlines = vec![Inline::Text {
+        text: text.to_string(),
+    }];
+    // Keep the span consistent with the injected text.
+    if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+        *char_end = Some(char_count);
+    }
+    canonical
+}
+
+/// F1-based canonical whose first block text is replaced by Private Use Area
+/// codepoints (low valid ratio).
+///
+/// Panics when the first block is not a `Block::Paragraph`: an unmodified
+/// empty block would still trigger OCR, so a silent no-op here would let the
+/// mojibake test pass without exercising the mojibake path.
+fn canonical_with_mojibake_block() -> CanonicalDocument {
+    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
+    let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() else {
+        panic!("F1 canonical must start with a Block::Paragraph");
+    };
+    let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
+    let char_count = pua.chars().count() as u32;
+    tb.text = pua.clone();
+    tb.inlines = vec![Inline::Text { text: pua }];
+    // Keep the span consistent with the injected text.
+    if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+        *char_end = Some(char_count);
+    }
+    canonical
+}
+
+/// Baseline options shared by the tests: fallback-only OCR (`always_on` off),
+/// ratio threshold 0.5, minimum 20 chars, no language hint, no cancel handle.
+fn default_opts(enabled: bool) -> PdfOcrOpts {
+    let (valid_ratio_threshold, min_char_count) = (0.5, 20);
+    PdfOcrOpts {
+        enabled,
+        always_on: false,
+        valid_ratio_threshold,
+        min_char_count,
+        lang_hint: None,
+        cancel: None,
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────
+
+// Test 1: F1 with OCR enabled → the empty text-detect block is replaced in place.
+#[test]
+fn f1_input_with_ocr_enabled_replaces_empty_block() {
+    let pdf = f1_pdf_bytes();
+    let mut doc = canonical_with_empty_block();
+    let engine = MockOcrEngine {
+        expected_text: "MOCK_OCR_TEXT".into(),
+        fail: false,
+    };
+    // Same values as the shared defaults, plus a Korean language hint.
+    let opts = PdfOcrOpts {
+        lang_hint: Some(Lang("kor".into())),
+        ..default_opts(true)
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut doc, &engine, &pdf, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1);
+    let mut paragraphs = doc.blocks.iter().filter_map(|block| match block {
+        Block::Paragraph(tb) => Some(tb),
+        _ => None,
+    });
+    let first_para = paragraphs.next();
+    assert!(first_para.is_some());
+    assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
+}
+
+// Test 2: F3 vector (mock filled canonical) with OCR enabled → OCR is skipped
+// because the page text does not need it (needs_ocr=false).
+#[test]
+fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
+    let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
+    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
+    let mut canonical = canonical_with_filled_block(text);
+    let engine = MockOcrEngine {
+        expected_text: "SHOULD_NOT_BE_CALLED".into(),
+        fail: false,
+    };
+    let opts = default_opts(true);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
+    // A missing paragraph must fail the test rather than skip the assertion.
+    let first_para = canonical
+        .blocks
+        .iter()
+        .find_map(|b| match b {
+            Block::Paragraph(tb) => Some(tb),
+            _ => None,
+        })
+        .expect("canonical must still contain the text-detect paragraph");
+    assert!(first_para.text.starts_with("충분한"), "원본 text 보존");
+}
+
+// Test 3: F1 with OCR disabled → no-op: zero summary and an untouched document.
+#[test]
+fn f1_input_with_ocr_disabled_keeps_empty_block() {
+    /// Paragraph texts of `doc`, in block order.
+    fn paragraph_texts(doc: &CanonicalDocument) -> Vec<String> {
+        doc.blocks
+            .iter()
+            .filter_map(|b| match b {
+                Block::Paragraph(tb) => Some(tb.text.clone()),
+                _ => None,
+            })
+            .collect()
+    }
+
+    let bytes = f1_pdf_bytes();
+    let mut canonical = canonical_with_empty_block();
+    let texts_before = paragraph_texts(&canonical);
+    let blocks_before = canonical.blocks.len();
+    let events_before = canonical.provenance.events.len();
+    let engine = MockOcrEngine {
+        expected_text: "IGNORED".into(),
+        fail: false,
+    };
+    let opts = default_opts(false);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0);
+    assert_eq!(summary.ms_total, 0);
+    // The name promises the block is kept: verify nothing was mutated or appended.
+    assert_eq!(canonical.blocks.len(), blocks_before, "disabled OCR must not add blocks");
+    assert_eq!(
+        paragraph_texts(&canonical),
+        texts_before,
+        "disabled OCR must not change paragraph text"
+    );
+    assert_eq!(
+        canonical.provenance.events.len(),
+        events_before,
+        "disabled OCR must not add provenance events"
+    );
+}
+
+// Test 4: mojibake canonical (PUA chars) with OCR enabled → replaced in place.
+#[test]
+fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
+    let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
+    let mut canonical = canonical_with_mojibake_block();
+    let engine = MockOcrEngine {
+        expected_text: "OCR_MOJIBAKE_REPLACEMENT".into(),
+        fail: false,
+    };
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
+    // A missing paragraph must fail the test rather than skip the assertion.
+    let first_para = canonical
+        .blocks
+        .iter()
+        .find_map(|b| match b {
+            Block::Paragraph(tb) => Some(tb),
+            _ => None,
+        })
+        .expect("canonical must still contain the text-detect paragraph");
+    assert_eq!(first_para.text, "OCR_MOJIBAKE_REPLACEMENT");
+}
+
+// Test 5: filled canonical with always_on=true → dual-block (one extra OCR block).
+#[test]
+fn f3_input_with_always_on_pushes_dual_blocks() {
+    let pdf = f1_pdf_bytes();
+    let source_text =
+        "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
+    let mut doc = canonical_with_filled_block(source_text);
+    let blocks_before = doc.blocks.len();
+    let engine = MockOcrEngine {
+        expected_text: "OCR_DUAL".into(),
+        fail: false,
+    };
+    let opts = PdfOcrOpts {
+        always_on: true,
+        ..default_opts(true)
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut doc, &engine, &pdf, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1);
+    assert_eq!(
+        doc.blocks.len(),
+        blocks_before + 1,
+        "always_on 시 새 Block::Paragraph push"
+    );
+
+    // Walk the paragraphs once and note whether each expected text shows up.
+    let mut saw_ocr_block = false;
+    let mut saw_original_block = false;
+    for block in &doc.blocks {
+        if let Block::Paragraph(tb) = block {
+            saw_ocr_block |= tb.text == "OCR_DUAL";
+            saw_original_block |= tb.text.starts_with("vector");
+        }
+    }
+    assert!(saw_ocr_block, "OCR block 포함");
+    assert!(saw_original_block, "원본 text-detect block 보존");
+}
+
+// Test 6: F6 FlateDecode → extract_dctdecode_page_image returns None → skip + warning.
+#[test]
+fn f6_flatedecode_skipped_with_warning() {
+    let flate_pdf = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
+        .expect("F6 fixture missing");
+    // The page-1 block comes from the F1 fixture; only the PDF bytes are F6.
+    let mut doc = canonical_with_empty_block();
+    let opts = default_opts(true);
+    let engine = MockOcrEngine {
+        expected_text: "SHOULD_NOT_BE_CALLED".into(),
+        fail: false,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut doc, &engine, &flate_pdf, &opts, |_| {}).unwrap();
+
+    assert_eq!(
+        summary.pages_ocrd, 0,
+        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
+    );
+    let has_warning = doc
+        .provenance
+        .events
+        .iter()
+        .any(|event| event.kind == kebab_core::ProvenanceKind::Warning);
+    assert!(has_warning, "FlateDecode skip 시 Warning event 발행");
+}
+
+// Test 7: F7 CCITTFax → skip + warning (split out per verifier M-4).
+#[test]
+fn f7_ccittfax_skipped_with_warning() {
+    let ccitt_pdf = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf")
+        .expect("F7 fixture missing");
+    // The page-1 block comes from the F1 fixture; only the PDF bytes are F7.
+    let mut doc = canonical_with_empty_block();
+    let opts = default_opts(true);
+    let engine = MockOcrEngine {
+        expected_text: "SHOULD_NOT_BE_CALLED".into(),
+        fail: false,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut doc, &engine, &ccitt_pdf, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
+    let has_warning = doc
+        .provenance
+        .events
+        .iter()
+        .any(|event| event.kind == kebab_core::ProvenanceKind::Warning);
+    assert!(has_warning, "CCITTFax skip 시 Warning event 발행");
+}
+
+// Test 8: OCR engine failure → warning event carrying the error text, page skipped.
+#[test]
+fn ocr_engine_failure_surfaces_as_warning() {
+    let pdf = f1_pdf_bytes();
+    let mut doc = canonical_with_empty_block();
+    let failing_engine = MockOcrEngine {
+        expected_text: String::new(),
+        fail: true,
+    };
+    let opts = default_opts(true);
+
+    let summary =
+        apply_ocr_to_pdf_pages(&mut doc, &failing_engine, &pdf, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
+    let mut failure_warnings = doc
+        .provenance
+        .events
+        .iter()
+        .filter(|event| event.kind == kebab_core::ProvenanceKind::Warning)
+        .filter(|event| event.note.as_deref().unwrap_or("").contains("mock failure"));
+    let warning_with_failure = failure_warnings.next().is_some();
+    assert!(
+        warning_with_failure,
+        "OCR failure 의 error message 가 warning event 의 note 안"
+    );
+}
+
+// Test 9: dual-block ordinals are deterministic and unique.
+#[test]
+fn dual_block_ordinals_are_deterministic_and_unique() {
+    let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
+    let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
+    let engine = MockOcrEngine {
+        expected_text: "DUAL".into(),
+        fail: false,
+    };
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: true,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+
+    // Run the helper on a fresh canonical each time so two runs can be compared.
+    let run = || {
+        let mut canonical = canonical_with_filled_block(text);
+        apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+        canonical
+    };
+    let canonical = run();
+    let rerun = run();
+
+    // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
+    let paragraphs: Vec<_> = canonical
+        .blocks
+        .iter()
+        .filter_map(|b| match b {
+            Block::Paragraph(tb) => Some(tb),
+            _ => None,
+        })
+        .collect();
+    let paragraphs_rerun: Vec<_> = rerun
+        .blocks
+        .iter()
+        .filter_map(|b| match b {
+            Block::Paragraph(tb) => Some(tb),
+            _ => None,
+        })
+        .collect();
+    assert_eq!(paragraphs.len(), 2, "dual-block: text-detect + OCR");
+    assert_eq!(paragraphs_rerun.len(), 2, "dual-block: text-detect + OCR");
+
+    let all_page_1 = paragraphs
+        .iter()
+        .all(|tb| matches!(tb.common.source_span, SourceSpan::Page { page: 1, .. }));
+    assert!(all_page_1, "두 block 모두 page=1");
+
+    // Unique: the OCR block must not reuse the text-detect block's id.
+    assert!(
+        paragraphs[0].common.block_id != paragraphs[1].common.block_id,
+        "text-detect and OCR blocks must have distinct block ids"
+    );
+    // Deterministic: an identical run yields the same ids in the same order.
+    let same_ids = paragraphs
+        .iter()
+        .zip(&paragraphs_rerun)
+        .all(|(a, b)| a.common.block_id == b.common.block_id);
+    assert!(same_ids, "block ids must be identical across identical runs");
+}
+
+// Test 10: a cancel handle that is already set aborts the run with an error.
+#[test]
+fn cancel_handle_aborts_mid_pdf() {
+    let pdf = f1_pdf_bytes();
+    let mut doc = canonical_with_empty_block();
+    // Pre-cancelled: the flag is already true before the first page is visited.
+    let cancel_flag = Arc::new(AtomicBool::new(true));
+    let engine = MockOcrEngine {
+        expected_text: "IGNORED".into(),
+        fail: false,
+    };
+    let opts = PdfOcrOpts {
+        cancel: Some(Arc::clone(&cancel_flag)),
+        ..default_opts(true)
+    };
+
+    let err = apply_ocr_to_pdf_pages(&mut doc, &engine, &pdf, &opts, |_| {})
+        .expect_err("cancel=true 시 error 반환");
+    let rendered = err.to_string();
+    assert!(
+        rendered.contains("cancelled mid-PDF"),
+        "error message 가 'cancelled mid-PDF' 포함: {err}"
+    );
+}