diff --git a/Cargo.toml b/Cargo.toml index 27f589d..0bdf230 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -141,6 +141,7 @@ proptest = "1" # p9-fb-19: LRU cache for `App::search` results. Bounded capacity # from `config.search.cache_capacity` (default 256, ~1.3 MB cap). lru = "0.12" +lopdf = "0.32" # fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature # in its default set (which also pulls `hf-hub` for first-run model # downloads). Pinned to the 4.x line per task p3-2 (current 5.x release diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index 2ad9fcf..239b380 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -35,7 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" } # per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`. kebab-parse-pdf = { path = "../kebab-parse-pdf" } -lopdf = "0.32" +lopdf = { workspace = true } # p10-1A-2: Rust AST extractor lives here. App threads it into the # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`. @@ -76,7 +76,7 @@ image = { version = "0.25", default-features = false, features = # lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned # to the same major (0.32) so byte output is identical between the two # fixture surfaces. -lopdf = "0.32" +lopdf = { workspace = true } # error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real # reqwest::Error (private constructor) — built from a connect-refused call. reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] } diff --git a/crates/kebab-app/src/ingest_progress.rs b/crates/kebab-app/src/ingest_progress.rs index 597ba81..19f1d6f 100644 --- a/crates/kebab-app/src/ingest_progress.rs +++ b/crates/kebab-app/src/ingest_progress.rs @@ -85,6 +85,15 @@ pub enum IngestEvent { /// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this /// task never produces `Aborted`. Aborted { counts: AggregateCounts }, + /// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1. + PdfOcrStarted { page: u32 }, + /// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1. + PdfOcrFinished { + page: u32, + ms: u64, + chars: u32, + ocr_engine: String, + }, } /// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`. diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 78c1ca8..e7839aa 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -48,7 +48,7 @@ use kebab_core::{ SourceUri, VectorRecord, VectorStore, }; use kebab_llm_local::OllamaLanguageModel; -use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr}; +use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr}; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -357,6 +357,29 @@ pub fn ingest_with_config_opts( caption_llm: caption_llm.as_deref(), }; + // p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution). + // image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast. + let pdf_ocr_engine: Option = + if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on { + let cfg = &app.config.pdf.ocr; + let endpoint = match cfg.endpoint.as_deref() { + Some(s) if !s.is_empty() => s.to_string(), + _ => app.config.models.llm.endpoint.clone(), + }; + Some( + OllamaVisionOcr::from_parts( + endpoint, + cfg.model.clone(), + cfg.languages.clone(), + cfg.max_pixels, + cfg.request_timeout_secs, + ) + .context("kb-app::ingest: build OllamaVisionOcr (pdf)")?, + ) + } else { + None + }; + // Pre-load every existing doc_id so we can label `IngestItem.kind` // as `New` vs `Updated` correctly. `list_documents` returns one // row per `(workspace_path, asset_id)` — index by the deterministic @@ -448,6 +471,9 @@ pub fn ingest_with_config_opts( &existing_doc_ids, &image_pipeline, force_reingest, + pdf_ocr_engine.as_ref(), + progress, + opts.cancel.as_ref(), ); let item = match item { @@ -476,6 +502,8 @@ pub fn ingest_with_config_opts( parser_version: None, chunker_version: None, warnings: Vec::new(), + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: Some(format!("{e:#}")), } } @@ -864,6 +892,8 @@ fn try_skip_unchanged( parser_version: Some(existing_doc.parser_version.clone()), chunker_version: existing_doc.last_chunker_version.clone(), warnings: Vec::new(), + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, })); } @@ -922,6 +952,8 @@ fn try_skip_unchanged( parser_version: Some(existing_doc.parser_version.clone()), chunker_version: existing_doc.last_chunker_version.clone(), warnings: Vec::new(), + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, })) } @@ -964,6 +996,9 @@ fn ingest_one_asset( existing_doc_ids: &std::collections::HashSet, image_pipeline: &ImagePipeline<'_>, force_reingest: bool, + pdf_ocr_engine: Option<&OllamaVisionOcr>, + progress: Option<&std::sync::mpsc::Sender>, + cancel: Option<&std::sync::Arc>, ) -> anyhow::Result { tracing::debug!( target: "kebab-app::ingest", @@ -999,6 +1034,9 @@ fn ingest_one_asset( vector_store, existing_doc_ids, force_reingest, + pdf_ocr_engine, + progress, + cancel, ); } // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added. @@ -1033,6 +1071,8 @@ fn ingest_one_asset( parser_version: None, chunker_version: None, warnings: vec![unsupported_media_warning(&asset.workspace_path.0)], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }); } @@ -1052,6 +1092,8 @@ fn ingest_one_asset( parser_version: None, chunker_version: None, warnings: vec!["kb:// URI not yet supported".to_string()], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }); } @@ -1201,6 +1243,8 @@ fn ingest_one_asset( parser_version: Some(parser_version.clone()), chunker_version: Some(MdHeadingV1Chunker.chunker_version()), warnings: warning_notes, + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }) } @@ -1246,6 +1290,8 @@ fn ingest_one_image_asset( warnings: vec![ "kb:// URI not yet supported".to_string(), ], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }); } @@ -1456,6 +1502,8 @@ fn ingest_one_image_asset( parser_version: Some(canonical.parser_version.clone()), chunker_version: Some(MdHeadingV1Chunker.chunker_version()), warnings: warning_notes, + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }) } @@ -1726,6 +1774,9 @@ fn ingest_one_pdf_asset( vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, force_reingest: bool, + pdf_ocr_engine: Option<&OllamaVisionOcr>, + progress: Option<&std::sync::mpsc::Sender>, + cancel: Option<&std::sync::Arc>, ) -> anyhow::Result { let path = match &asset.source_uri { SourceUri::File(p) => p.clone(), @@ -1743,6 +1794,8 @@ fn ingest_one_pdf_asset( warnings: vec![ "kb:// URI not yet supported".to_string(), ], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }); } @@ -1779,6 +1832,62 @@ fn ingest_one_pdf_asset( .extract_for(&asset.media_type, &ctx, &bytes) .context("kb-app::extract_for (pdf)")?; + // v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry + // dispatch invariant 보존 — extract_for 가 normal entry). + let (pdf_ocr_pages, pdf_ocr_ms_total): (Option, Option) = + if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on { + match pdf_ocr_engine { + Some(engine) => { + let ocr_opts = crate::pdf_ocr_apply::PdfOcrOpts { + enabled: app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on, + always_on: app.config.pdf.ocr.always_on, + valid_ratio_threshold: app.config.pdf.ocr.valid_ratio_threshold, + min_char_count: app.config.pdf.ocr.min_char_count, + lang_hint: app.config.pdf.ocr.lang_hint.clone().map(kebab_core::Lang), + cancel: cancel.cloned(), + }; + let summary = crate::pdf_ocr_apply::apply_ocr_to_pdf_pages( + &mut canonical, + engine, + &bytes, + &ocr_opts, + |p| match p { + crate::pdf_ocr_apply::PdfOcrProgress::Started { page } => { + if let Some(sender) = progress { + let _ = sender.send( + crate::ingest_progress::IngestEvent::PdfOcrStarted { + page, + }, + ); + } + } + crate::pdf_ocr_apply::PdfOcrProgress::Finished { + page, + ms, + chars, + skipped: _, + } => { + if let Some(sender) = progress { + let _ = sender.send( + crate::ingest_progress::IngestEvent::PdfOcrFinished { + page, + ms, + chars, + ocr_engine: engine.engine_name().to_string(), + }, + ); + } + } + }, + )?; + (Some(summary.pages_ocrd), Some(summary.ms_total)) + } + None => (Some(0), Some(0)), + } + } else { + (None, None) + }; + // Per-medium chunker selection: PDF docs always use pdf-page-v1 // regardless of `config.chunking.chunker_version`. The chunker // validates every block carries `SourceSpan::Page`; failure here @@ -1880,6 +1989,8 @@ fn ingest_one_pdf_asset( parser_version: Some(canonical.parser_version.clone()), chunker_version: Some(chunker.chunker_version()), warnings, + pdf_ocr_pages, + pdf_ocr_ms_total, error: None, }) } @@ -1921,6 +2032,8 @@ fn ingest_one_code_asset( warnings: vec![ "kb:// URI not yet supported".to_string(), ], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }); } @@ -2227,6 +2340,8 @@ fn ingest_one_code_asset( parser_version: Some(canonical.parser_version.clone()), chunker_version: Some(chunker_version), warnings, + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }) } diff --git a/crates/kebab-cli/src/progress.rs b/crates/kebab-cli/src/progress.rs index be17a46..6e79df3 100644 --- a/crates/kebab-cli/src/progress.rs +++ b/crates/kebab-cli/src/progress.rs @@ -201,6 +201,9 @@ impl ProgressDisplay { ); } } + // v0.20.0 sub-item 1: per-page PDF OCR events — not surfaced in + // human-readable progress output (no TTY bar update needed). + IngestEvent::PdfOcrStarted { .. } | IngestEvent::PdfOcrFinished { .. } => {} } Ok(()) } diff --git a/crates/kebab-core/src/ingest.rs b/crates/kebab-core/src/ingest.rs index 8fff25b..c3c5f3a 100644 --- a/crates/kebab-core/src/ingest.rs +++ b/crates/kebab-core/src/ingest.rs @@ -83,6 +83,12 @@ pub struct IngestItem { pub parser_version: Option, pub chunker_version: Option, pub warnings: Vec, + /// v0.20.0 sub-item 1: number of PDF pages 가 OCR pipeline 통과. + /// `None` = OCR disabled or non-PDF asset. + pub pdf_ocr_pages: Option, + /// v0.20.0 sub-item 1: cumulative OCR engine wall-clock duration (ms). + /// `None` = OCR disabled or non-PDF asset. + pub pdf_ocr_ms_total: Option, pub error: Option, } diff --git a/crates/kebab-parse-pdf/Cargo.toml b/crates/kebab-parse-pdf/Cargo.toml index c6037c8..836d421 100644 --- a/crates/kebab-parse-pdf/Cargo.toml +++ b/crates/kebab-parse-pdf/Cargo.toml @@ -20,7 +20,7 @@ tracing = { workspace = true } # crates (pom, postscript, type1-encoding-parser, …) buy us nothing # at v1 (we don't call its whole-doc API), and the future scanned-PDF # OCR fallback can re-add it when it actually needs it. -lopdf = "0.32" +lopdf = { workspace = true } [dev-dependencies] blake3 = { workspace = true } diff --git a/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json b/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json index 3fb065f..a2162ba 100644 --- a/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json +++ b/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json @@ -13,6 +13,8 @@ "error": null, "kind": "new", "parser_version": "md-frontmatter-v2", + "pdf_ocr_ms_total": null, + "pdf_ocr_pages": null, "warnings": [] }, { @@ -26,6 +28,8 @@ "error": null, "kind": "updated", "parser_version": "md-frontmatter-v2", + "pdf_ocr_ms_total": null, + "pdf_ocr_pages": null, "warnings": [ "malformed frontmatter" ] diff --git a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs index 6b15bc0..0fe1866 100644 --- a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs +++ b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs @@ -54,6 +54,8 @@ fn fixture_report() -> IngestReport { parser_version: Some(ParserVersion("md-frontmatter-v2".into())), chunker_version: Some(ChunkerVersion("md-heading-v1".into())), warnings: vec![], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }, IngestItem { @@ -67,6 +69,8 @@ fn fixture_report() -> IngestReport { parser_version: Some(ParserVersion("md-frontmatter-v2".into())), chunker_version: Some(ChunkerVersion("md-heading-v1".into())), warnings: vec!["malformed frontmatter".into()], + pdf_ocr_pages: None, + pdf_ocr_ms_total: None, error: None, }, ]), diff --git a/crates/kebab-tui/src/ingest_progress.rs b/crates/kebab-tui/src/ingest_progress.rs index 62eab84..7ce3e75 100644 --- a/crates/kebab-tui/src/ingest_progress.rs +++ b/crates/kebab-tui/src/ingest_progress.rs @@ -154,6 +154,9 @@ fn apply_event(state: &mut IngestState, event: IngestEvent) { state.terminal_at = Some(std::time::Instant::now()); state.aborted = true; } + // v0.20.0 sub-item 1: per-page PDF OCR events — TUI does not + // surface per-page OCR progress in v1; no counter to update. + IngestEvent::PdfOcrStarted { .. } | IngestEvent::PdfOcrFinished { .. } => {} } }