Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이 test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all 적용. 모든 import alphabetical reorder + line wrapping 정합. 추가 untracked artifact 동시 commit: - docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT) - docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT) workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
359 lines
13 KiB
Rust
359 lines
13 KiB
Rust
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
|
|
|
mod common;
|
|
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::Arc;
|
|
use std::sync::atomic::AtomicBool;
|
|
|
|
use common::mock_ocr::MockOcrEngine;
|
|
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
|
use kebab_core::{
|
|
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
|
|
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
|
|
};
|
|
use kebab_parse_pdf::PdfTextExtractor;
|
|
use time::OffsetDateTime;
|
|
|
|
// ── Fixture helpers ───────────────────────────────────────────────────────
|
|
|
|
fn f1_pdf_bytes() -> Vec<u8> {
|
|
std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
|
.expect("F1 fixture missing")
|
|
}
|
|
|
|
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
|
|
let fake_hash = "0".repeat(64);
|
|
let asset_id = id_for_asset(&fake_hash);
|
|
RawAsset {
|
|
asset_id,
|
|
source_uri: SourceUri::File(PathBuf::from(path)),
|
|
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
|
media_type,
|
|
byte_len,
|
|
checksum: Checksum(fake_hash.clone()),
|
|
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
|
stored: AssetStorage::Copied {
|
|
path: PathBuf::from(path),
|
|
},
|
|
}
|
|
}
|
|
|
|
/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
|
|
/// F1 (scanned) returns an empty-text Block::Paragraph per page.
|
|
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
|
|
let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
|
|
let workspace_root = Path::new("/");
|
|
let config = ExtractConfig::default();
|
|
let ctx = ExtractContext {
|
|
asset: &asset,
|
|
workspace_root,
|
|
config: &config,
|
|
};
|
|
PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
|
|
}
|
|
|
|
/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
|
|
fn canonical_with_empty_block() -> CanonicalDocument {
|
|
extract_canonical_from_bytes(&f1_pdf_bytes())
|
|
}
|
|
|
|
/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
|
|
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
|
|
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
|
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
|
let char_count = text.chars().count() as u32;
|
|
tb.text = text.to_string();
|
|
tb.inlines = vec![Inline::Text {
|
|
text: text.to_string(),
|
|
}];
|
|
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
|
*char_end = Some(char_count);
|
|
}
|
|
}
|
|
canonical
|
|
}
|
|
|
|
/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
|
|
fn canonical_with_mojibake_block() -> CanonicalDocument {
|
|
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
|
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
|
let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
|
|
let char_count = pua.chars().count() as u32;
|
|
tb.text = pua.clone();
|
|
tb.inlines = vec![Inline::Text { text: pua }];
|
|
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
|
*char_end = Some(char_count);
|
|
}
|
|
}
|
|
canonical
|
|
}
|
|
|
|
fn default_opts(enabled: bool) -> PdfOcrOpts {
|
|
PdfOcrOpts {
|
|
enabled,
|
|
always_on: false,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: None,
|
|
cancel: None,
|
|
}
|
|
}
|
|
|
|
// ── Tests ─────────────────────────────────────────────────────────────────
|
|
|
|
// Test 1: F1 + enabled=true → in-place mutate
|
|
#[test]
|
|
fn f1_input_with_ocr_enabled_replaces_empty_block() {
|
|
let bytes = f1_pdf_bytes();
|
|
let mut canonical = canonical_with_empty_block();
|
|
let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
|
|
let opts = PdfOcrOpts {
|
|
enabled: true,
|
|
always_on: false,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: Some(Lang("kor".into())),
|
|
cancel: None,
|
|
};
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 1);
|
|
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
|
Block::Paragraph(tb) => Some(tb),
|
|
_ => None,
|
|
});
|
|
assert!(first_para.is_some());
|
|
assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
|
|
}
|
|
|
|
// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false)
|
|
#[test]
|
|
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
|
|
let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
|
|
let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
|
|
let mut canonical = canonical_with_filled_block(text);
|
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
|
let opts = default_opts(true);
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
|
|
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
|
Block::Paragraph(tb) => Some(tb),
|
|
_ => None,
|
|
});
|
|
if let Some(tb) = first_para {
|
|
assert!(tb.text.starts_with("충분한"), "원본 text 보존");
|
|
}
|
|
}
|
|
|
|
// Test 3: F1 + enabled=false → no-op
|
|
#[test]
|
|
fn f1_input_with_ocr_disabled_keeps_empty_block() {
|
|
let bytes = f1_pdf_bytes();
|
|
let mut canonical = canonical_with_empty_block();
|
|
let engine = MockOcrEngine::single("IGNORED", false);
|
|
let opts = default_opts(false);
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 0);
|
|
assert_eq!(summary.ms_total, 0);
|
|
}
|
|
|
|
// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate
|
|
#[test]
|
|
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
|
|
let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
|
|
let mut canonical = canonical_with_mojibake_block();
|
|
let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
|
|
let opts = PdfOcrOpts {
|
|
enabled: true,
|
|
always_on: false,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: None,
|
|
cancel: None,
|
|
};
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
|
|
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
|
Block::Paragraph(tb) => Some(tb),
|
|
_ => None,
|
|
});
|
|
if let Some(tb) = first_para {
|
|
assert_eq!(tb.text, "OCR_MOJIBAKE_REPLACEMENT");
|
|
}
|
|
}
|
|
|
|
// Test 5: filled canonical + always_on=true → dual-block (+1 OCR block)
|
|
#[test]
|
|
fn f3_input_with_always_on_pushes_dual_blocks() {
|
|
let bytes = f1_pdf_bytes();
|
|
let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
|
|
let mut canonical = canonical_with_filled_block(text);
|
|
let original_block_count = canonical.blocks.len();
|
|
let engine = MockOcrEngine::single("OCR_DUAL", false);
|
|
let opts = PdfOcrOpts {
|
|
enabled: true,
|
|
always_on: true,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: None,
|
|
cancel: None,
|
|
};
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 1);
|
|
assert_eq!(
|
|
canonical.blocks.len(),
|
|
original_block_count + 1,
|
|
"always_on 시 새 Block::Paragraph push"
|
|
);
|
|
let texts: Vec<&str> = canonical
|
|
.blocks
|
|
.iter()
|
|
.filter_map(|b| match b {
|
|
Block::Paragraph(tb) => Some(tb.text.as_str()),
|
|
_ => None,
|
|
})
|
|
.collect();
|
|
assert!(texts.contains(&"OCR_DUAL"), "OCR block 포함");
|
|
assert!(
|
|
texts.iter().any(|t| t.starts_with("vector")),
|
|
"원본 text-detect block 보존"
|
|
);
|
|
}
|
|
|
|
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → skip + warning
|
|
#[test]
|
|
fn f6_flatedecode_skipped_with_warning() {
|
|
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
|
|
.expect("F6 fixture missing");
|
|
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
|
let opts = default_opts(true);
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(
|
|
summary.pages_ocrd, 0,
|
|
"FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
|
|
);
|
|
let warning_count = canonical
|
|
.provenance
|
|
.events
|
|
.iter()
|
|
.filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
|
|
.count();
|
|
assert!(warning_count >= 1, "FlateDecode skip 시 Warning event 발행");
|
|
}
|
|
|
|
// Test 7: F7 CCITTFax → skip + warning (verifier M-4 split)
|
|
#[test]
|
|
fn f7_ccittfax_skipped_with_warning() {
|
|
let bytes =
|
|
std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
|
|
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
|
let opts = default_opts(true);
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
|
|
let warning_count = canonical
|
|
.provenance
|
|
.events
|
|
.iter()
|
|
.filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
|
|
.count();
|
|
assert!(warning_count >= 1, "CCITTFax skip 시 Warning event 발행");
|
|
}
|
|
|
|
// Test 8: OCR engine failure → warning event + skip
|
|
#[test]
|
|
fn ocr_engine_failure_surfaces_as_warning() {
|
|
let bytes = f1_pdf_bytes();
|
|
let mut canonical = canonical_with_empty_block();
|
|
let engine = MockOcrEngine::single("", true);
|
|
let opts = default_opts(true);
|
|
|
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
|
|
let warning_with_failure = canonical.provenance.events.iter().any(|e| {
|
|
e.kind == kebab_core::ProvenanceKind::Warning
|
|
&& e.note.as_deref().unwrap_or("").contains("mock failure")
|
|
});
|
|
assert!(
|
|
warning_with_failure,
|
|
"OCR failure 의 error message 가 warning event 의 note 안"
|
|
);
|
|
}
|
|
|
|
// Test 9: dual-block ordinals are deterministic and unique
|
|
#[test]
|
|
fn dual_block_ordinals_are_deterministic_and_unique() {
|
|
let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
|
|
let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
|
|
let mut canonical = canonical_with_filled_block(text);
|
|
let engine = MockOcrEngine::single("DUAL", false);
|
|
let opts = PdfOcrOpts {
|
|
enabled: true,
|
|
always_on: true,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: None,
|
|
cancel: None,
|
|
};
|
|
|
|
apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
|
|
|
// page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
|
|
let para_count = canonical
|
|
.blocks
|
|
.iter()
|
|
.filter(|b| matches!(b, Block::Paragraph(_)))
|
|
.count();
|
|
assert_eq!(para_count, 2, "dual-block: text-detect + OCR");
|
|
|
|
let all_page_1 = canonical
|
|
.blocks
|
|
.iter()
|
|
.filter_map(|b| match b {
|
|
Block::Paragraph(tb) => Some(&tb.common.source_span),
|
|
_ => None,
|
|
})
|
|
.all(|s| matches!(s, SourceSpan::Page { page: 1, .. }));
|
|
assert!(all_page_1, "두 block 모두 page=1");
|
|
}
|
|
|
|
// Test 10: cancel handle aborts mid-PDF
|
|
#[test]
|
|
fn cancel_handle_aborts_mid_pdf() {
|
|
let bytes = f1_pdf_bytes();
|
|
let mut canonical = canonical_with_empty_block();
|
|
let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
|
|
let engine = MockOcrEngine::single("IGNORED", false);
|
|
let opts = PdfOcrOpts {
|
|
enabled: true,
|
|
always_on: false,
|
|
valid_ratio_threshold: 0.5,
|
|
min_char_count: 20,
|
|
lang_hint: None,
|
|
cancel: Some(cancel.clone()),
|
|
};
|
|
|
|
let result = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {});
|
|
let err = result.expect_err("cancel=true 시 error 반환");
|
|
assert!(
|
|
format!("{err}").contains("cancelled mid-PDF"),
|
|
"error message 가 'cancelled mid-PDF' 포함: {err}"
|
|
);
|
|
}
|