test(app): multi-scanned PDF chunk_id collision-free integration test (Bug #3 regression)
v0.20.0 sub-item 1 bugfix Step 3 (Group C) — integration-level regression
for Bug #3 (intra-doc chunk_id collision under aggressive overlap).
- `crates/kebab-app/tests/common/mod.rs`: append one line, `pub mod mock_ocr;`.
- `crates/kebab-app/tests/common/mock_ocr.rs` (new): MockOcrEngine lift +
`single` / `per_page` ctor (backward-compat single + per-page cursor).
- `crates/kebab-app/tests/pdf_ocr_apply.rs`: remove the inline MockOcrEngine and
import the shared one (`mod common; use common::mock_ocr::MockOcrEngine;`). Migrate
the 10 ctor call sites (`MockOcrEngine { .. }` → `MockOcrEngine::single(...)`).
- `crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs`
(new): F1 + F2 scanned PDF + Bug #3 trigger shape (10 char "가" + ". " +
500 char "나") via mock OCR. assertion: chunk_id global uniqueness (HashSet
dedup) across F1 + F2; F2 trigger text produces ≥2 chunks (collision shape).
- C1 decision: Option A (share via tests/common/mock_ocr.rs). Facade mock
injection unavailable (OllamaVisionOcr hardcoded) — helper-level chain test
(apply_ocr_to_pdf_pages → PdfPageV1Chunker) adds value beyond unit B5.
spec: docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md (§4.5)
plan: docs/superpowers/plans/2026-05-27-v0.20-sub1-bugfix-plan.md (Step 3)
prior: 436fd01 (Step 2 Bug #3 chunk_id fix)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use kebab_core::{Lang, OcrText};
|
||||||
|
use kebab_parse_image::OcrEngine;
|
||||||
|
|
||||||
|
/// Scripted [`OcrEngine`] test double shared by the `kebab-app` integration tests.
///
/// The engine never inspects the image bytes it is given. Each non-failing
/// `recognize` call returns the next entry of `expected_texts`; once the list
/// is exhausted the last entry is repeated, and an empty list yields an empty
/// string.
#[derive(Debug)]
pub struct MockOcrEngine {
    /// Texts to hand out, in call order.
    expected_texts: Vec<String>,
    /// Index of the next text to hand out. Kept behind a `Mutex` because
    /// `recognize` only receives `&self`.
    call_index: Mutex<usize>,
    /// When `true`, every `recognize` call fails with `"mock failure"`.
    fail: bool,
}

impl MockOcrEngine {
    /// Builds an engine that returns the same `text` on every call
    /// (backward-compatible constructor for the `pdf_ocr_apply.rs` call sites).
    ///
    /// `fail = true` makes every `recognize` call return an error instead.
    pub fn single(text: impl Into<String>, fail: bool) -> Self {
        // A one-element script is just the degenerate per-page case.
        Self::per_page(vec![text.into()], fail)
    }

    /// Builds an engine that returns `texts[0]`, `texts[1]`, ... on successive
    /// `recognize` calls (the cursor advances once per non-failing call).
    ///
    /// `fail = true` makes every `recognize` call return an error instead.
    pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
        Self {
            expected_texts: texts,
            call_index: Mutex::new(0),
            fail,
        }
    }
}
|
||||||
|
|
||||||
|
impl OcrEngine for MockOcrEngine {
|
||||||
|
fn engine_name(&self) -> &'static str {
|
||||||
|
"mock-ocr"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn engine_version(&self) -> String {
|
||||||
|
"mock-v1".to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||||
|
if self.fail {
|
||||||
|
anyhow::bail!("mock failure");
|
||||||
|
}
|
||||||
|
let mut idx = self.call_index.lock().unwrap();
|
||||||
|
let text = self
|
||||||
|
.expected_texts
|
||||||
|
.get(*idx)
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
|
||||||
|
*idx += 1;
|
||||||
|
Ok(OcrText {
|
||||||
|
joined: text,
|
||||||
|
regions: vec![],
|
||||||
|
engine: "mock-ocr".to_string(),
|
||||||
|
engine_version: "mock-v1".to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -169,3 +169,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub mod mock_ocr;
|
||||||
|
|||||||
@@ -0,0 +1,122 @@
|
|||||||
|
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
|
||||||
|
//! v0.20.0 sub-item 1 bugfix.
|
||||||
|
//!
|
||||||
|
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
|
||||||
|
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
|
||||||
|
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
|
||||||
|
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use common::mock_ocr::MockOcrEngine;
|
||||||
|
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||||
|
use kebab_chunk::PdfPageV1Chunker;
|
||||||
|
use kebab_core::{
|
||||||
|
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
|
||||||
|
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
|
||||||
|
};
|
||||||
|
use kebab_parse_image::OcrEngine;
|
||||||
|
use kebab_parse_pdf::PdfTextExtractor;
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
|
||||||
|
let fake_hash: String = hash_char.to_string().repeat(64);
|
||||||
|
let asset_id = id_for_asset(&fake_hash);
|
||||||
|
RawAsset {
|
||||||
|
asset_id,
|
||||||
|
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||||
|
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||||
|
media_type: MediaType::Pdf,
|
||||||
|
byte_len,
|
||||||
|
checksum: Checksum(fake_hash),
|
||||||
|
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||||
|
stored: AssetStorage::Copied {
|
||||||
|
path: PathBuf::from(path),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_and_ocr(
|
||||||
|
bytes: &[u8],
|
||||||
|
path: &str,
|
||||||
|
hash_char: char,
|
||||||
|
engine: &dyn OcrEngine,
|
||||||
|
) -> kebab_core::CanonicalDocument {
|
||||||
|
let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
|
||||||
|
let workspace_root = Path::new("/");
|
||||||
|
let config = ExtractConfig::default();
|
||||||
|
let ctx = ExtractContext {
|
||||||
|
asset: &asset,
|
||||||
|
workspace_root,
|
||||||
|
config: &config,
|
||||||
|
};
|
||||||
|
let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
|
||||||
|
let opts = PdfOcrOpts {
|
||||||
|
enabled: true,
|
||||||
|
always_on: false,
|
||||||
|
valid_ratio_threshold: 0.5,
|
||||||
|
min_char_count: 20,
|
||||||
|
lang_hint: None,
|
||||||
|
cancel: None,
|
||||||
|
};
|
||||||
|
apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
|
||||||
|
canonical
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Regression test for Bug #3: chunk ids produced from two scanned PDFs (F1 and
/// F2) must all be distinct, including the several chunks cut from F2's
/// single long OCR text.
#[test]
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
        .expect("F1 fixture missing");
    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
        .expect("F2 fixture missing");

    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
    // overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
    // collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
    let early_segment = "가".repeat(10);
    let long_tail = "나".repeat(500);
    let trigger_text = format!("{}. {}", early_segment, long_tail);

    // F1 gets a short, harmless text; F2 gets the collision-shaped text.
    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
    let f2_engine = MockOcrEngine::single(trigger_text.as_str(), false);

    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);

    let chunk_policy = ChunkPolicy {
        target_tokens: 500,
        overlap_tokens: 80,
        respect_markdown_headings: false,
        chunker_version: PdfPageV1Chunker.chunker_version(),
    };

    let f1_chunks = PdfPageV1Chunker
        .chunk(&f1_canonical, &chunk_policy)
        .unwrap();
    let f2_chunks = PdfPageV1Chunker
        .chunk(&f2_canonical, &chunk_policy)
        .unwrap();

    // Without at least two chunks in F2 the collision could not occur at all,
    // so the uniqueness check below would pass vacuously.
    let f2_chunk_count = f2_chunks.len();
    assert!(
        f2_chunk_count >= 2,
        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
        f2_chunk_count
    );

    // Deduplicate every id from both documents; any shrinkage means a collision.
    let total = f1_chunks.len() + f2_chunks.len();
    let unique: HashSet<&str> = f1_chunks
        .iter()
        .chain(f2_chunks.iter())
        .map(|chunk| chunk.chunk_id.0.as_str())
        .collect();
    assert_eq!(
        unique.len(),
        total,
        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
        unique.len(),
        total,
    );
}
|
||||||
@@ -1,49 +1,21 @@
|
|||||||
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::atomic::AtomicBool;
|
use std::sync::atomic::AtomicBool;
|
||||||
|
|
||||||
use anyhow::Result;
|
use common::mock_ocr::MockOcrEngine;
|
||||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||||
use kebab_core::{
|
use kebab_core::{
|
||||||
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext,
|
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext,
|
||||||
Extractor, Inline, Lang, MediaType, OcrText, RawAsset, SourceSpan,
|
Extractor, Inline, Lang, MediaType, RawAsset, SourceSpan,
|
||||||
SourceUri, WorkspacePath, id_for_asset,
|
SourceUri, WorkspacePath, id_for_asset,
|
||||||
};
|
};
|
||||||
use kebab_parse_image::OcrEngine;
|
|
||||||
use kebab_parse_pdf::PdfTextExtractor;
|
use kebab_parse_pdf::PdfTextExtractor;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
// ── MockOcrEngine fixture ─────────────────────────────────────────────────
|
|
||||||
|
|
||||||
struct MockOcrEngine {
|
|
||||||
expected_text: String,
|
|
||||||
fail: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl OcrEngine for MockOcrEngine {
|
|
||||||
fn engine_name(&self) -> &'static str {
|
|
||||||
"mock-ocr"
|
|
||||||
}
|
|
||||||
|
|
||||||
fn engine_version(&self) -> String {
|
|
||||||
"mock-v1".to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
|
||||||
if self.fail {
|
|
||||||
anyhow::bail!("mock failure");
|
|
||||||
}
|
|
||||||
Ok(OcrText {
|
|
||||||
joined: self.expected_text.clone(),
|
|
||||||
regions: Vec::new(),
|
|
||||||
engine: self.engine_name().to_string(),
|
|
||||||
engine_version: self.engine_version(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Fixture helpers ───────────────────────────────────────────────────────
|
// ── Fixture helpers ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn f1_pdf_bytes() -> Vec<u8> {
|
fn f1_pdf_bytes() -> Vec<u8> {
|
||||||
@@ -136,10 +108,7 @@ fn default_opts(enabled: bool) -> PdfOcrOpts {
|
|||||||
fn f1_input_with_ocr_enabled_replaces_empty_block() {
|
fn f1_input_with_ocr_enabled_replaces_empty_block() {
|
||||||
let bytes = f1_pdf_bytes();
|
let bytes = f1_pdf_bytes();
|
||||||
let mut canonical = canonical_with_empty_block();
|
let mut canonical = canonical_with_empty_block();
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
|
||||||
expected_text: "MOCK_OCR_TEXT".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = PdfOcrOpts {
|
let opts = PdfOcrOpts {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
always_on: false,
|
always_on: false,
|
||||||
@@ -166,10 +135,7 @@ fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
|
|||||||
let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
|
let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
|
||||||
let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
|
let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
|
||||||
let mut canonical = canonical_with_filled_block(text);
|
let mut canonical = canonical_with_filled_block(text);
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||||
expected_text: "SHOULD_NOT_BE_CALLED".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = default_opts(true);
|
let opts = default_opts(true);
|
||||||
|
|
||||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||||
@@ -189,10 +155,7 @@ fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
|
|||||||
fn f1_input_with_ocr_disabled_keeps_empty_block() {
|
fn f1_input_with_ocr_disabled_keeps_empty_block() {
|
||||||
let bytes = f1_pdf_bytes();
|
let bytes = f1_pdf_bytes();
|
||||||
let mut canonical = canonical_with_empty_block();
|
let mut canonical = canonical_with_empty_block();
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("IGNORED", false);
|
||||||
expected_text: "IGNORED".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = default_opts(false);
|
let opts = default_opts(false);
|
||||||
|
|
||||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||||
@@ -206,10 +169,7 @@ fn f1_input_with_ocr_disabled_keeps_empty_block() {
|
|||||||
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
|
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
|
||||||
let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
|
let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
|
||||||
let mut canonical = canonical_with_mojibake_block();
|
let mut canonical = canonical_with_mojibake_block();
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
|
||||||
expected_text: "OCR_MOJIBAKE_REPLACEMENT".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = PdfOcrOpts {
|
let opts = PdfOcrOpts {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
always_on: false,
|
always_on: false,
|
||||||
@@ -238,10 +198,7 @@ fn f3_input_with_always_on_pushes_dual_blocks() {
|
|||||||
let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
|
let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
|
||||||
let mut canonical = canonical_with_filled_block(text);
|
let mut canonical = canonical_with_filled_block(text);
|
||||||
let original_block_count = canonical.blocks.len();
|
let original_block_count = canonical.blocks.len();
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("OCR_DUAL", false);
|
||||||
expected_text: "OCR_DUAL".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = PdfOcrOpts {
|
let opts = PdfOcrOpts {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
always_on: true,
|
always_on: true,
|
||||||
@@ -280,10 +237,7 @@ fn f6_flatedecode_skipped_with_warning() {
|
|||||||
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
|
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
|
||||||
.expect("F6 fixture missing");
|
.expect("F6 fixture missing");
|
||||||
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||||
expected_text: "SHOULD_NOT_BE_CALLED".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = default_opts(true);
|
let opts = default_opts(true);
|
||||||
|
|
||||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||||
@@ -307,10 +261,7 @@ fn f7_ccittfax_skipped_with_warning() {
|
|||||||
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf")
|
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf")
|
||||||
.expect("F7 fixture missing");
|
.expect("F7 fixture missing");
|
||||||
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||||
expected_text: "SHOULD_NOT_BE_CALLED".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = default_opts(true);
|
let opts = default_opts(true);
|
||||||
|
|
||||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||||
@@ -330,10 +281,7 @@ fn f7_ccittfax_skipped_with_warning() {
|
|||||||
fn ocr_engine_failure_surfaces_as_warning() {
|
fn ocr_engine_failure_surfaces_as_warning() {
|
||||||
let bytes = f1_pdf_bytes();
|
let bytes = f1_pdf_bytes();
|
||||||
let mut canonical = canonical_with_empty_block();
|
let mut canonical = canonical_with_empty_block();
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("", true);
|
||||||
expected_text: String::new(),
|
|
||||||
fail: true,
|
|
||||||
};
|
|
||||||
let opts = default_opts(true);
|
let opts = default_opts(true);
|
||||||
|
|
||||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||||
@@ -355,10 +303,7 @@ fn dual_block_ordinals_are_deterministic_and_unique() {
|
|||||||
let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
|
let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
|
||||||
let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
|
let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
|
||||||
let mut canonical = canonical_with_filled_block(text);
|
let mut canonical = canonical_with_filled_block(text);
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("DUAL", false);
|
||||||
expected_text: "DUAL".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = PdfOcrOpts {
|
let opts = PdfOcrOpts {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
always_on: true,
|
always_on: true,
|
||||||
@@ -395,10 +340,7 @@ fn cancel_handle_aborts_mid_pdf() {
|
|||||||
let bytes = f1_pdf_bytes();
|
let bytes = f1_pdf_bytes();
|
||||||
let mut canonical = canonical_with_empty_block();
|
let mut canonical = canonical_with_empty_block();
|
||||||
let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
|
let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
|
||||||
let engine = MockOcrEngine {
|
let engine = MockOcrEngine::single("IGNORED", false);
|
||||||
expected_text: "IGNORED".into(),
|
|
||||||
fail: false,
|
|
||||||
};
|
|
||||||
let opts = PdfOcrOpts {
|
let opts = PdfOcrOpts {
|
||||||
enabled: true,
|
enabled: true,
|
||||||
always_on: false,
|
always_on: false,
|
||||||
|
|||||||
Reference in New Issue
Block a user