diff --git a/crates/kebab-app/tests/common/mock_ocr.rs b/crates/kebab-app/tests/common/mock_ocr.rs
new file mode 100644
index 0000000..3632214
--- /dev/null
+++ b/crates/kebab-app/tests/common/mock_ocr.rs
@@ -0,0 +1,68 @@
+//! Shared scripted `OcrEngine` double for the kebab-app integration tests.
+
+use std::sync::Mutex;
+
+use anyhow::Result;
+use kebab_core::{Lang, OcrText};
+use kebab_parse_image::OcrEngine;
+
+/// Test double that replays canned OCR text, or fails every call when `fail` is set.
+pub struct MockOcrEngine {
+    /// Texts handed out by successive successful `recognize` calls.
+    expected_texts: Vec<String>,
+    /// Number of successful `recognize` calls so far; indexes `expected_texts`.
+    call_index: Mutex<usize>,
+    /// When `true`, `recognize` bails with "mock failure" without advancing the cursor.
+    fail: bool,
+}
+
+impl MockOcrEngine {
+    /// Single text (backward-compat ctor for pdf_ocr_apply.rs 10 sites).
+    pub fn single(text: impl Into<String>, fail: bool) -> Self {
+        Self {
+            expected_texts: vec![text.into()],
+            call_index: Mutex::new(0),
+            fail,
+        }
+    }
+
+    /// Per-page texts (cursor advances per recognize call).
+    pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
+        Self {
+            expected_texts: texts,
+            call_index: Mutex::new(0),
+            fail,
+        }
+    }
+}
+
+impl OcrEngine for MockOcrEngine {
+    fn engine_name(&self) -> &'static str {
+        "mock-ocr"
+    }
+
+    fn engine_version(&self) -> String {
+        "mock-v1".to_string()
+    }
+
+    /// Returns the text at the current cursor and advances it. Once the cursor runs past the
+    /// end the last text is repeated; an empty text list yields an empty string.
+    fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
+        if self.fail {
+            anyhow::bail!("mock failure");
+        }
+        let mut idx = self.call_index.lock().expect("call_index mutex poisoned");
+        let text = self
+            .expected_texts
+            .get(*idx)
+            .cloned()
+            .unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
+        *idx += 1;
+        Ok(OcrText {
+            joined: text,
+            regions: vec![],
+            engine: self.engine_name().to_string(),
+            engine_version: self.engine_version(),
+        })
+    }
+}
diff --git a/crates/kebab-app/tests/common/mod.rs b/crates/kebab-app/tests/common/mod.rs
index c06098f..ee85663 100644
--- a/crates/kebab-app/tests/common/mod.rs
+++ b/crates/kebab-app/tests/common/mod.rs
@@ -169,3 +169,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
         }
     }
 }
+
+pub mod mock_ocr;
diff --git a/crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs b/crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs
new file mode 100644
index 0000000..b90730f
--- /dev/null
+++ b/crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs
@@ -0,0 +1,126 @@
+//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
+//! v0.20.0 sub-item 1 bugfix.
+//!
+//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
+//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
+//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
+//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
+
+mod common;
+
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+use common::mock_ocr::MockOcrEngine;
+use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
+use kebab_chunk::PdfPageV1Chunker;
+use kebab_core::{
+    AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
+    MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
+};
+use kebab_parse_image::OcrEngine;
+use kebab_parse_pdf::PdfTextExtractor;
+use time::OffsetDateTime;
+
+/// Builds a synthetic PDF `RawAsset` for `path`. The checksum is `hash_char` repeated 64
+/// times, and the asset id is derived from that fake checksum via `id_for_asset`.
+fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
+    let fake_hash: String = hash_char.to_string().repeat(64);
+    let asset_id = id_for_asset(&fake_hash);
+    RawAsset {
+        asset_id,
+        source_uri: SourceUri::File(PathBuf::from(path)),
+        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
+        media_type: MediaType::Pdf,
+        byte_len,
+        checksum: Checksum(fake_hash),
+        discovered_at: OffsetDateTime::UNIX_EPOCH,
+        stored: AssetStorage::Copied {
+            path: PathBuf::from(path),
+        },
+    }
+}
+
+/// Extracts `bytes` with `PdfTextExtractor`, then runs `apply_ocr_to_pdf_pages` with `engine`
+/// (OCR enabled, `always_on` off) and returns the resulting canonical document.
+fn extract_and_ocr(
+    bytes: &[u8],
+    path: &str,
+    hash_char: char,
+    engine: &dyn OcrEngine,
+) -> kebab_core::CanonicalDocument {
+    let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
+    let workspace_root = Path::new("/");
+    let config = ExtractConfig::default();
+    let ctx = ExtractContext {
+        asset: &asset,
+        workspace_root,
+        config: &config,
+    };
+    let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+    apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
+    canonical
+}
+
+#[test]
+fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
+    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
+        .expect("F1 fixture missing");
+    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
+        .expect("F2 fixture missing");
+
+    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
+    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
+    // overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
+    // collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
+    let trigger_text = format!("{}. {}", "가".repeat(10), "나".repeat(500));
+
+    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
+    let f2_engine = MockOcrEngine::single(&trigger_text, false);
+
+    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
+    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);
+
+    let chunk_policy = ChunkPolicy {
+        target_tokens: 500,
+        overlap_tokens: 80,
+        respect_markdown_headings: false,
+        chunker_version: PdfPageV1Chunker.chunker_version(),
+    };
+
+    let f1_chunks = PdfPageV1Chunker
+        .chunk(&f1_canonical, &chunk_policy)
+        .unwrap();
+    let f2_chunks = PdfPageV1Chunker
+        .chunk(&f2_canonical, &chunk_policy)
+        .unwrap();
+
+    assert!(
+        f2_chunks.len() >= 2,
+        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
+        f2_chunks.len()
+    );
+
+    let all_ids: Vec<&str> = f1_chunks
+        .iter()
+        .chain(f2_chunks.iter())
+        .map(|c| c.chunk_id.0.as_str())
+        .collect();
+    let total = all_ids.len();
+    let unique: HashSet<&str> = all_ids.iter().copied().collect();
+    assert_eq!(
+        unique.len(),
+        total,
+        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
+        unique.len(),
+        total,
+    );
+}
diff --git a/crates/kebab-app/tests/pdf_ocr_apply.rs b/crates/kebab-app/tests/pdf_ocr_apply.rs
index 78e0597..3f8eb4d 100644
--- a/crates/kebab-app/tests/pdf_ocr_apply.rs
+++ b/crates/kebab-app/tests/pdf_ocr_apply.rs
@@ -1,49 +1,21 @@
 //! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
 
+mod common;
+
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::sync::atomic::AtomicBool;
 
-use anyhow::Result;
+use common::mock_ocr::MockOcrEngine;
 use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
 use kebab_core::{
     AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext,
-    Extractor, Inline, Lang, MediaType, OcrText, RawAsset, SourceSpan,
+    Extractor, Inline, Lang, MediaType, RawAsset, SourceSpan,
     SourceUri, WorkspacePath, id_for_asset,
 };
-use kebab_parse_image::OcrEngine;
 use kebab_parse_pdf::PdfTextExtractor;
 use time::OffsetDateTime;
 
-// ── MockOcrEngine fixture ─────────────────────────────────────────────────
-
-struct MockOcrEngine {
-    expected_text: String,
-    fail: bool,
-}
-
-impl OcrEngine for MockOcrEngine {
-    fn engine_name(&self) -> &'static str {
-        "mock-ocr"
-    }
-
-    fn engine_version(&self) -> String {
-        "mock-v1".to_string()
-    }
-
-    fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
-        if self.fail {
-            anyhow::bail!("mock failure");
-        }
-        Ok(OcrText {
-            joined: self.expected_text.clone(),
-            regions: Vec::new(),
-            engine: self.engine_name().to_string(),
-            engine_version: self.engine_version(),
-        })
-    }
-}
-
 // ── Fixture helpers ───────────────────────────────────────────────────────
 
 fn f1_pdf_bytes() -> Vec<u8> {
@@ -136,10 +108,7 @@ fn default_opts(enabled: bool) -> PdfOcrOpts {
 fn f1_input_with_ocr_enabled_replaces_empty_block() {
     let bytes = f1_pdf_bytes();
     let mut canonical = canonical_with_empty_block();
-    let engine = MockOcrEngine {
-        expected_text: "MOCK_OCR_TEXT".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
     let opts = PdfOcrOpts {
         enabled: true,
         always_on: false,
@@ -166,10 +135,7 @@ fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
     let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
     let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
     let mut canonical = canonical_with_filled_block(text);
-    let engine = MockOcrEngine {
-        expected_text: "SHOULD_NOT_BE_CALLED".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
     let opts = default_opts(true);
 
     let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
@@ -189,10 +155,7 @@ fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
 fn f1_input_with_ocr_disabled_keeps_empty_block() {
     let bytes = f1_pdf_bytes();
     let mut canonical = canonical_with_empty_block();
-    let engine = MockOcrEngine {
-        expected_text: "IGNORED".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("IGNORED", false);
     let opts = default_opts(false);
 
     let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
@@ -206,10 +169,7 @@ fn f1_input_with_ocr_disabled_keeps_empty_block() {
 fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
     let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
     let mut canonical = canonical_with_mojibake_block();
-    let engine = MockOcrEngine {
-        expected_text: "OCR_MOJIBAKE_REPLACEMENT".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
     let opts = PdfOcrOpts {
         enabled: true,
         always_on: false,
@@ -238,10 +198,7 @@ fn f3_input_with_always_on_pushes_dual_blocks() {
     let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
     let mut canonical = canonical_with_filled_block(text);
     let original_block_count = canonical.blocks.len();
-    let engine = MockOcrEngine {
-        expected_text: "OCR_DUAL".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("OCR_DUAL", false);
     let opts = PdfOcrOpts {
         enabled: true,
         always_on: true,
@@ -280,10 +237,7 @@ fn f6_flatedecode_skipped_with_warning() {
     let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
         .expect("F6 fixture missing");
     let mut canonical = canonical_with_empty_block(); // page-1 block from F1
-    let engine = MockOcrEngine {
-        expected_text: "SHOULD_NOT_BE_CALLED".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
     let opts = default_opts(true);
 
     let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
@@ -307,10 +261,7 @@ fn f7_ccittfax_skipped_with_warning() {
     let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf")
         .expect("F7 fixture missing");
     let mut canonical = canonical_with_empty_block(); // page-1 block from F1
-    let engine = MockOcrEngine {
-        expected_text: "SHOULD_NOT_BE_CALLED".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
     let opts = default_opts(true);
 
     let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
@@ -330,10 +281,7 @@ fn f7_ccittfax_skipped_with_warning() {
 fn ocr_engine_failure_surfaces_as_warning() {
     let bytes = f1_pdf_bytes();
     let mut canonical = canonical_with_empty_block();
-    let engine = MockOcrEngine {
-        expected_text: String::new(),
-        fail: true,
-    };
+    let engine = MockOcrEngine::single("", true);
     let opts = default_opts(true);
 
     let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
@@ -355,10 +303,7 @@ fn dual_block_ordinals_are_deterministic_and_unique() {
     let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
     let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
     let mut canonical = canonical_with_filled_block(text);
-    let engine = MockOcrEngine {
-        expected_text: "DUAL".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("DUAL", false);
     let opts = PdfOcrOpts {
         enabled: true,
         always_on: true,
@@ -395,10 +340,7 @@ fn cancel_handle_aborts_mid_pdf() {
     let bytes = f1_pdf_bytes();
     let mut canonical = canonical_with_empty_block();
     let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
-    let engine = MockOcrEngine {
-        expected_text: "IGNORED".into(),
-        fail: false,
-    };
+    let engine = MockOcrEngine::single("IGNORED", false);
     let opts = PdfOcrOpts {
         enabled: true,
         always_on: false,