Files
kebab/crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
altair823 d5c69f6715 refactor(config): v3 경로 call-site sweep (kebab-app/kebab-eval/kebab-parse-image)
부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부.
kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 12:40:06 +00:00

140 lines
4.7 KiB
Rust

//! Integration smoke test: dual-write (ndjson + SQLite) for PDF OCR events.
//! AC-3: SQLite row count and doc_id match the ndjson LogEvent::Ocr lines.
//!
//! Uses wiremock to stub the Ollama `/api/generate` endpoint so the test
//! runs without a live Ollama instance.
mod common;
use std::path::PathBuf;
use common::TestEnv;
use kebab_config::LoggingCfg;
use serde_json::Value;
use tokio::task::spawn_blocking;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};
/// Location of the scanned PDF fixture used by this test.
///
/// Resolved at compile time from `CARGO_MANIFEST_DIR`: the manifest
/// directory's parent joined with
/// `kebab-parse-pdf/tests/fixtures/scanned_page1.pdf`.
///
/// # Panics
///
/// Panics if the manifest directory has no parent.
fn scanned_pdf_src() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let parent_dir = manifest_dir.parent().unwrap();
    parent_dir.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
}
/// AC-3: ndjson OCR line count == pdf_ocr_events row count, and doc_id matches.
#[tokio::test]
async fn ingest_dual_write_doc_id_matches_ndjson() {
let src = scanned_pdf_src();
if !src.exists() {
eprintln!("skipping test: scanned_page1.pdf fixture not found");
return;
}
let server = MockServer::start().await;
// Stub Ollama /api/generate to return a minimal OCR response.
Mock::given(method("POST"))
.and(path("/api/generate"))
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
"model": "qwen2.5vl:3b",
"response": "test ocr output",
"done": true,
"done_reason": "stop"
})))
.mount(&server)
.await;
let mock_url = server.uri();
let result = spawn_blocking(move || {
let mut env = TestEnv::lexical_only();
// Enable PDF OCR + set up mock endpoint
env.config.ingest.pdf.ocr.enabled = true;
env.config.ingest.pdf.ocr.endpoint = Some(mock_url.clone());
env.config.ingest.pdf.ocr.model = "qwen2.5vl:3b".to_string();
// Enable ingest log
let log_dir = env.temp.path().join("logs");
std::fs::create_dir_all(&log_dir).unwrap();
env.config.logging = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: log_dir.clone(),
..Default::default()
};
// Copy scanned PDF into workspace
let dest = env.workspace_root.join("scanned.pdf");
std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF");
// Run ingest
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
// Read ndjson log
let log_files: Vec<_> = std::fs::read_dir(&log_dir)
.unwrap()
.filter_map(Result::ok)
.filter(|e| {
let name = e.file_name().to_string_lossy().to_string();
name.starts_with("ingest-") && name.ends_with(".ndjson")
})
.collect();
assert_eq!(log_files.len(), 1, "expected 1 ndjson log file");
let body = std::fs::read_to_string(log_files[0].path()).unwrap();
let ocr_lines: Vec<Value> = body
.lines()
.filter_map(|l| serde_json::from_str(l).ok())
.filter(|v: &Value| v.get("kind").and_then(Value::as_str) == Some("ocr"))
.collect();
// Read pdf_ocr_events from SQLite
let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
let conn = rusqlite::Connection::open(&db_path).expect("open db");
let rows: Vec<(Option<String>, String)> = {
let mut stmt = conn
.prepare("SELECT doc_id, doc_path FROM pdf_ocr_events ORDER BY id")
.expect("prepare");
stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?)))
.expect("query")
.map(|r| r.expect("row"))
.collect()
};
(ocr_lines, rows)
})
.await
.expect("spawn_blocking");
let (ocr_lines, rows) = result;
// At least one OCR event must be produced
assert!(!ocr_lines.is_empty(), "expected ≥1 ndjson ocr line");
assert!(!rows.is_empty(), "expected ≥1 pdf_ocr_events row");
// Row counts must match
assert_eq!(
ocr_lines.len(),
rows.len(),
"ndjson ocr lines ({}) must equal pdf_ocr_events rows ({})",
ocr_lines.len(),
rows.len()
);
// doc_id in both sources must be non-null and consistent
for (line, (sql_doc_id, _sql_doc_path)) in ocr_lines.iter().zip(rows.iter()) {
let json_doc_id = line.get("doc_id").and_then(Value::as_str);
assert!(
json_doc_id.is_some(),
"ndjson ocr line should have doc_id: {line}"
);
assert!(
sql_doc_id.is_some(),
"pdf_ocr_events row should have doc_id"
);
assert_eq!(
json_doc_id,
sql_doc_id.as_deref(),
"ndjson doc_id must equal SQLite doc_id"
);
}
}