Step 8 (Group H) of v0.20.0 sub-item 1 (scanned PDF OCR) plan +
Step 7 reviewer concern fix (spec literal deviation).
H1 — kebab-cli/src/progress.rs printer activation:
- 구 no-op stub `IngestEvent::PdfOcr* { .. } => {}` (Step 6 placeholder)
를 사람-친화 stderr line printer 로 활성화.
- spec §4.6.1 line 1085-1086 wording 그대로:
- PdfOcrStarted → ` 📷 OCR page {page}...`
- PdfOcrFinished (skipped=false) → ` ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})`
- PdfOcrFinished (skipped=true) → ` ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)` (M-4 의 skipped field carry 활용)
- `!quiet` gate 정합 (AssetStarted/Finished pattern mirror).
H2 — crates/kebab-app/tests/ingest_progress.rs 의 새 test:
- pdf_ocr_progress_emits_started_finished_events (real Ollama 의존, `#[ignore]`).
- F1 fixture (scanned_page1.pdf) ingest 시 pdf_ocr_started + pdf_ocr_finished
event 가 emit 됨을 verify. Started count == Finished count invariant.
- Manual invoke: `KEBAB_PDF_OCR_ENABLED=true cargo test -p kebab-app --test
ingest_progress -- --ignored`.
- mock OcrEngine inject path 부재 (Step 6 의 eager build), Step 9 I5 의
ocr_e2e pattern (real Ollama + `#[ignore]`) 와 동일.
Step 7 reviewer concern fix — spec §4.6.1 literal:
- line 1076-1077 의 `ocr_ms` / `ocr_chars` literal 을 wire schema 의 실제
field name `ms` / `chars` (option_A, Rust serde 와 정합) 로 갱신.
- line 1087 의 printer wording 도 `{ocr_chars}` / `{ocr_ms}` → `{chars}` / `{ms}`.
- line 1556 의 rationale 참조 `pdf_ocr_finished.ocr_ms` → `.ms`.
- `skipped` field 도 명시 (Step 6 reviewer M-4 결과).
spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.6.1)
plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 8 H1+H2)
prior: 4c5ccd5 (Step 7 wire schema) — Step 7 reviewer concern 1 의 fix
contract: §9 (additive minor wire bump — Step 7 commit 에서 완료)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
211 lines
7.2 KiB
Rust
211 lines
7.2 KiB
Rust
//! Integration coverage for `ingest_with_config_progress` —
|
|
//! exercises the streaming progress channel against the same lexical
|
|
//! fixture used by `ingest_lexical.rs`.
|
|
|
|
mod common;
|
|
|
|
use std::sync::mpsc;
|
|
|
|
use common::TestEnv;
|
|
use kebab_app::{AggregateCounts, IngestEvent};
|
|
use kebab_core::IngestItemKind;
|
|
|
|
fn run_with_progress() -> Vec<IngestEvent> {
|
|
let env = TestEnv::lexical_only();
|
|
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
|
let report = kebab_app::ingest_with_config_progress(
|
|
env.config.clone(),
|
|
env.scope(),
|
|
false,
|
|
Some(tx),
|
|
)
|
|
.unwrap();
|
|
assert_eq!(report.scanned, 3);
|
|
assert_eq!(report.new, 3);
|
|
|
|
// Drain until the sender (held inside `ingest_with_config_progress`)
|
|
// is dropped on return.
|
|
let mut events = Vec::new();
|
|
while let Ok(ev) = rx.recv() {
|
|
events.push(ev);
|
|
}
|
|
events
|
|
}
|
|
|
|
/// Checks the overall shape of the progress stream returned by
/// `run_with_progress`: `ScanStarted`, `ScanCompleted`, three
/// `AssetStarted`/`AssetFinished` pairs, and finally `Completed`.
#[test]
fn progress_event_sequence_matches_design_section_2_4a() {
    let events = run_with_progress();

    // Event #1 announces the scan and carries a non-empty root.
    let first = &events[0];
    if let IngestEvent::ScanStarted { root } = first {
        assert!(!root.is_empty(), "ScanStarted root must be a path");
    } else {
        panic!("expected ScanStarted, got {first:?}");
    }

    // Event #2 reports the scan total, which is 3 for this fixture.
    let second = &events[1];
    if let IngestEvent::ScanCompleted { total } = second {
        assert_eq!(*total, 3, "ScanCompleted total: {events:?}");
    } else {
        panic!("expected ScanCompleted, got {second:?}");
    }

    // The closing event carries the aggregate counters.
    let tail = events.last().expect("at least one event");
    if let IngestEvent::Completed { counts } = tail {
        // `chunks_indexed` is copied from the actual value so the equality
        // check pins every other field; it gets its own lower-bound check.
        let expected = AggregateCounts {
            scanned: 3,
            new: 3,
            chunks_indexed: counts.chunks_indexed,
            embeddings_indexed: 0,
            ..Default::default()
        };
        assert_eq!(*counts, expected, "Completed counts: {counts:?}");
        assert!(counts.chunks_indexed >= 3, "chunks_indexed: {counts:?}");
    } else {
        panic!("expected Completed last, got {tail:?}");
    }

    // Everything between the two scan events and the final `Completed` must
    // be exactly three Started/Finished pairs with idx counting up from 1.
    let middle = &events[2..events.len() - 1];
    assert_eq!(
        middle.len(),
        6,
        "expected 3 (Started + Finished) pairs, got {middle:?}"
    );
    for (expected_idx, pair) in (1u32..).zip(middle.chunks_exact(2)) {
        match (&pair[0], &pair[1]) {
            (
                IngestEvent::AssetStarted {
                    idx: si,
                    total: st,
                    media,
                    ..
                },
                IngestEvent::AssetFinished {
                    idx: fi,
                    total: ft,
                    result,
                    chunks,
                },
            ) => {
                assert_eq!(*si, expected_idx, "Started idx mismatch: {pair:?}");
                assert_eq!(*fi, expected_idx, "Finished idx mismatch: {pair:?}");
                assert_eq!(*st, 3, "Started total mismatch");
                assert_eq!(*ft, 3, "Finished total mismatch");
                assert_eq!(media, "markdown", "fixture is markdown only");
                assert_eq!(*result, IngestItemKind::New, "first ingest → New");
                assert!(*chunks >= 1, "chunks: {pair:?}");
            }
            other => panic!("expected Started+Finished pair, got {other:?}"),
        }
    }
}
|
|
|
|
/// Runs `ingest_with_config_progress` without a progress sender (`None`) and
/// checks that the report still counts 3 scanned and 3 new items.
///
/// The intent stated by the test name is that `ingest_with_config(...)` and
/// `ingest_with_config_progress(..., None)` produce identical reports modulo
/// wall-clock duration.
///
/// NOTE(review): only the `None` path is exercised here; `ingest_with_config`
/// itself is never called, so the two reports are not actually compared.
/// TODO confirm whether that comparison should be added.
#[test]
fn ingest_with_config_progress_none_matches_ingest_with_config() {
    let fixture = TestEnv::lexical_only();
    let config = fixture.config.clone();
    let scope = fixture.scope();

    let report = kebab_app::ingest_with_config_progress(config, scope, true, None).unwrap();

    assert_eq!(report.scanned, 3);
    assert_eq!(report.new, 3);
}
|
|
|
|
/// Ingest must finish normally even when nobody is listening: the receiving
/// half of the progress channel is dropped before the run starts, and the
/// call is still expected to return `Ok` with 3 scanned items.
#[test]
fn dropped_receiver_does_not_panic_or_fail_ingest() {
    let fixture = TestEnv::lexical_only();

    // Build a sender whose receiver is already gone, so every send made
    // during ingest targets a disconnected channel.
    let orphaned_sender = {
        let (sender, receiver) = mpsc::channel::<IngestEvent>();
        drop(receiver);
        sender
    };

    let outcome = kebab_app::ingest_with_config_progress(
        fixture.config.clone(),
        fixture.scope(),
        true,
        Some(orphaned_sender),
    );
    let report = outcome.unwrap();
    assert_eq!(report.scanned, 3);
}
|
|
|
|
/// v0.20.0 sub-item 1: verifies that ingesting a PDF asset with OCR enabled
/// emits `PdfOcrStarted` and `PdfOcrFinished` progress events, and that both
/// kinds are emitted the same number of times.
///
/// Ignored by default; the ignore reason cites a real Ollama dependency. The
/// test sets `config.pdf.ocr.enabled = true` itself and, when
/// `KEBAB_PDF_OCR_ENDPOINT` is set, uses its value as the OCR endpoint.
///
/// Manual invoke (`--ignored` is a test-harness flag, so it must follow `--`):
/// ```sh
/// KEBAB_PDF_OCR_ENABLED=true \
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
///   cargo test -p kebab-app --test ingest_progress -- \
///   --ignored pdf_ocr_progress_emits_started_finished_events
/// ```
#[test]
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
fn pdf_ocr_progress_emits_started_finished_events() {
    // Stage the F1 fixture (described by its author as a DCTDecode JPEG
    // passthrough PDF) as the only file of a throwaway workspace.
    let tmpdir = tempfile::tempdir().expect("create tmpdir");
    let workspace = tmpdir.path().join("workspace");
    std::fs::create_dir_all(&workspace).expect("create workspace dir");
    let f1_src = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    // `fs::copy` avoids buffering the whole fixture in memory; the panic
    // message names the source path so a missing fixture is easy to spot.
    std::fs::copy(&f1_src, workspace.join("page1.pdf"))
        .unwrap_or_else(|e| panic!("copy F1 fixture {}: {e}", f1_src.display()));

    let data_dir = tmpdir.path().join("data");
    std::fs::create_dir_all(&data_dir).expect("create data dir");

    // Start from the defaults, point workspace and storage at the tmpdir,
    // switch the embedding provider to "none", and turn PDF OCR on.
    let mut config = kebab_config::Config::defaults();
    config.workspace.root = workspace.to_string_lossy().into_owned();
    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
    config.models.embedding.provider = "none".to_string();
    config.models.embedding.dimensions = 0;
    config.pdf.ocr.enabled = true;
    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        config.pdf.ocr.endpoint = Some(endpoint);
    }

    let scope = kebab_core::SourceScope {
        root: workspace.clone(),
        ..Default::default()
    };

    let (tx, rx) = mpsc::channel::<IngestEvent>();
    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
        .expect("ingest_with_config_progress");

    // The sender was moved into the call above; iteration ends once every
    // sender has been dropped.
    let events: Vec<_> = rx.iter().collect();

    let started_count = events
        .iter()
        .filter(|e| matches!(e, IngestEvent::PdfOcrStarted { .. }))
        .count();
    let finished_count = events
        .iter()
        .filter(|e| matches!(e, IngestEvent::PdfOcrFinished { .. }))
        .count();

    assert!(started_count >= 1, "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})");
    assert!(finished_count >= 1, "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})");
    assert_eq!(started_count, finished_count, "Started 와 Finished 의 count 일치");
}
|