diff --git a/Cargo.lock b/Cargo.lock index cfc034c..92ef438 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3408,12 +3408,14 @@ dependencies = [ "kebab-normalize", "kebab-parse-image", "kebab-parse-md", + "kebab-parse-pdf", "kebab-parse-types", "kebab-rag", "kebab-search", "kebab-source-fs", "kebab-store-sqlite", "kebab-store-vector", + "lopdf", "rusqlite", "serde", "serde_json", diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index c50ae8e..613d9ea 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -28,6 +28,10 @@ kebab-rag = { path = "../kebab-rag" } # image branch). Trait-only consumption — no `kebab-parse-image` # internals leak into kb-app code. kebab-parse-image = { path = "../kebab-parse-image" } +# P7-3: PDF text extractor lives here. App threads it into the +# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the +# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`. +kebab-parse-pdf = { path = "../kebab-parse-pdf" } anyhow = { workspace = true } blake3 = { workspace = true } serde = { workspace = true } @@ -48,3 +52,8 @@ tempfile = { workspace = true } wiremock = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } image = { version = "0.25", default-features = false, features = ["png"] } +# P7-3 PDF integration tests build in-memory PDF fixtures via the same +# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned +# to the same major (0.32) so byte output is identical between the two +# fixture surfaces. 
+lopdf = "0.32" diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index d916f7e..026a5d5 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::MdHeadingV1Chunker; +use kebab_chunk::{MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,6 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; +use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -520,6 +521,16 @@ fn ingest_one_asset( image_pipeline, ); } + MediaType::Pdf => { + return ingest_one_pdf_asset( + app, + asset, + chunk_policy, + embedder, + vector_store, + existing_doc_ids, + ); + } _ => { return Ok(kebab_core::IngestItem { kind: kebab_core::IngestItemKind::Skipped, @@ -938,6 +949,156 @@ fn record_image_analysis_failure( warning_notes.push(note); } +/// P7-3: process one `MediaType::Pdf` asset end-to-end. +/// +/// - Reads bytes from disk. +/// - Calls [`PdfTextExtractor::extract`]. Failure (corrupt header, +/// encrypted PDF, etc.) is returned as `Err` via `?` with context — +/// this function never builds an `IngestItemKind::Error` item itself +/// (presumably the caller records it and continues — TODO confirm; the `qpdf --decrypt` hint stays in the error chain). +/// - Hands the `CanonicalDocument` to [`PdfPageV1Chunker`] (per-medium +/// chunker selection — keyed on `MediaType::Pdf` at compile time). +/// Chunker validation failure (would only fire on P7-1 contract +/// drift OR a future routing bug) is propagated as `Err` the same way.
+/// - Persists doc + blocks + chunks via the same `DocumentStore` +/// calls the markdown / image branches use. +/// - Embeds chunks if both an embedder and a vector store are +/// configured. Embed failure marks the item as `Error` AFTER +/// doc/block/chunk rows are already written — re-running ingest +/// re-attempts the embed (consistent with the markdown path; whole- +/// asset rollback on embed-fail is a P+ task). +/// +/// `chunker_version` is hard-coded to `pdf-page-v1` (HOTFIXES entry — +/// `config.chunking.chunker_version` is single-valued today and serves +/// the markdown path; per-medium config split is a P+ chunker registry +/// task). +#[allow(clippy::too_many_arguments)] +fn ingest_one_pdf_asset( + app: &App, + asset: &RawAsset, + chunk_policy: &ChunkPolicy, + embedder: Option<&Arc>, + vector_store: Option<&Arc>, + existing_doc_ids: &std::collections::HashSet, +) -> anyhow::Result { + let path = match &asset.source_uri { + SourceUri::File(p) => p.clone(), + SourceUri::Kb(_) => { + return Ok(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Skipped, + doc_id: None, + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: None, + chunk_count: None, + parser_version: None, + chunker_version: None, + warnings: vec![ + "kb:// source URIs are not supported by the fs ingester".into(), + ], + error: None, + }); + } + }; + let bytes = std::fs::read(&path) + .with_context(|| format!("read PDF asset bytes from {}", path.display()))?; + + let extract_config = kebab_core::ExtractConfig::default(); + let workspace_root = std::path::PathBuf::from(&app.config.workspace.root); + let ctx = ExtractContext { + asset, + workspace_root: &workspace_root, + config: &extract_config, + }; + let canonical = PdfTextExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-pdf::PdfTextExtractor::extract")?; + + // Per-medium chunker selection: PDF docs always use pdf-page-v1 + // 
regardless of `config.chunking.chunker_version`. The chunker + // validates every block carries `SourceSpan::Page`; failure here + // means the parser drifted from its contract. + let chunker = PdfPageV1Chunker; + let chunks = chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::PdfPageV1Chunker::chunk")?; + + app.sqlite + .put_asset_with_bytes(asset, &bytes) + .context("DocumentStore::put_asset_with_bytes (pdf)")?; + app.sqlite + .put_document(&canonical) + .context("DocumentStore::put_document (pdf)")?; + app.sqlite + .put_blocks(&canonical.doc_id, &canonical.blocks) + .context("DocumentStore::put_blocks (pdf)")?; + app.sqlite + .put_chunks(&canonical.doc_id, &chunks) + .context("DocumentStore::put_chunks (pdf)")?; + + if let (Some(emb), Some(vec_store)) = (embedder, vector_store) + && !chunks.is_empty() + { + let inputs: Vec> = chunks + .iter() + .map(|c| EmbeddingInput { + text: c.text.as_str(), + kind: EmbeddingKind::Document, + }) + .collect(); + let vectors = emb + .embed(&inputs) + .context("Embedder::embed (pdf chunks)")?; + let model_id = emb.model_id(); + let model_version = emb.model_version(); + let dimensions = emb.dimensions(); + let records: Vec = chunks + .iter() + .zip(vectors) + .map(|(c, v)| VectorRecord { + embedding_id: kebab_core::id_for_embedding( + &c.chunk_id, + &model_id, + &model_version, + dimensions, + ), + chunk_id: c.chunk_id.clone(), + vector: v, + doc_id: canonical.doc_id.clone(), + text: c.text.clone(), + heading_path: c.heading_path.clone(), + model_id: model_id.clone(), + model_version: model_version.clone(), + dimensions, + }) + .collect(); + vec_store + .upsert(&records) + .context("VectorStore::upsert (pdf)")?; + } + + let kind = if existing_doc_ids.contains(&canonical.doc_id.0) { + kebab_core::IngestItemKind::Updated + } else { + kebab_core::IngestItemKind::New + }; + + Ok(kebab_core::IngestItem { + kind, + doc_id: Some(canonical.doc_id.clone()), + doc_path: asset.workspace_path.clone(), + asset_id: 
Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(canonical.blocks.len()).ok(), + chunk_count: u32::try_from(chunks.len()).ok(), + parser_version: Some(canonical.parser_version.clone()), + chunker_version: Some(chunker.chunker_version()), + warnings: Vec::new(), + error: None, + }) +} + /// Pull the BCP-47 language hint from the canonical document. P6-1 /// stamps `Lang("und")` by default; image-pipeline OCR / caption /// adapters special-case "und" so the hint is intentionally dropped diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs new file mode 100644 index 0000000..12accf1 --- /dev/null +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -0,0 +1,498 @@ +//! P7-3 PDF ingest wiring — end-to-end integration. +//! +//! Each test spins up a `TempDir` workspace + writes one or more PDF +//! fixtures via the same `lopdf` builder pattern +//! `kebab-parse-pdf::tests::common` uses, then runs `kebab_app:: +//! ingest_with_config` against it. PDF ingest needs no external HTTP +//! adapter (no OCR / caption / LM), so unlike the image pipeline these +//! tests do NOT need wiremock — they run sync, no async runtime. + +mod common; + +use std::path::Path; + +use common::TestEnv; +use kebab_config::Config; +use kebab_core::{Block, IngestItemKind, SourceSpan}; +use lopdf::content::{Content, Operation}; +use lopdf::{Document, Object, Stream, dictionary}; + +// ── Fixture helpers ────────────────────────────────────────────────────── + +/// Build a Helvetica-text PDF mirroring `kebab-parse-pdf::tests::common:: +/// build_text_pdf`. `pages` is one entry per page; `None` means the page +/// has no `/Contents` stream (the "scanned candidate" shape — extract +/// returns empty + emits a Provenance Warning). +fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> { + let mut doc = Document::with_version("1.5"); + let pages_id = doc.new_object_id(); + let font_id = doc.add_object(dictionary!
{ + "Type" => "Font", + "Subtype" => "Type1", + "BaseFont" => "Helvetica", + }); + let resources_id = doc.add_object(dictionary! { + "Font" => dictionary! { "F1" => font_id }, + }); + + let mut page_refs: Vec<Object> = Vec::new(); + for page in pages { + let mut page_dict = dictionary! { + "Type" => "Page", + "Parent" => pages_id, + }; + if let Some(text) = page { + let content = Content { + operations: vec![ + Operation::new("BT", vec![]), + Operation::new("Tf", vec!["F1".into(), 24.into()]), + Operation::new( + "Td", + vec![Object::Integer(100), Object::Integer(700)], + ), + Operation::new("Tj", vec![Object::string_literal(*text)]), + Operation::new("ET", vec![]), + ], + }; + let stream_data = content.encode().expect("content encode"); + let content_id = + doc.add_object(Stream::new(dictionary! {}, stream_data)); + page_dict.set("Contents", content_id); + } + let page_id = doc.add_object(page_dict); + page_refs.push(page_id.into()); + } + + let count = page_refs.len() as i64; + let pages_dict = dictionary! { + "Type" => "Pages", + "Kids" => page_refs, + "Count" => count, + "Resources" => resources_id, + "MediaBox" => vec![ + Object::Integer(0), + Object::Integer(0), + Object::Integer(595), + Object::Integer(842), + ], + }; + doc.objects + .insert(pages_id, Object::Dictionary(pages_dict)); + + let catalog_id = doc.add_object(dictionary! { + "Type" => "Catalog", + "Pages" => pages_id, + }); + doc.trailer.set("Root", catalog_id); + + let mut out: Vec<u8> = Vec::new(); + doc.save_to(&mut out).expect("save PDF to memory"); + out +} + +/// Build a one-page placeholder PDF and add a fake `/Encrypt` trailer entry +/// so `Document::is_encrypted()` flips to true. Mirrors +/// `kebab-parse-pdf::tests::common::make_encrypted_pdf`. +fn make_encrypted_pdf() -> Vec<u8> { + let bytes = build_text_pdf(&[Some("placeholder")]); + let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF"); + let enc_id = doc.add_object(dictionary!
{ + "Filter" => "Standard", + "V" => 1, + "R" => 2, + "Length" => 40, + "P" => -4, + }); + doc.trailer.set("Encrypt", enc_id); + let mut out = Vec::new(); + doc.save_to(&mut out).expect("save encrypted PDF"); + out +} + +/// Plain ASCII bytes with no `%PDF-` header — stand-in for a corrupt PDF file. +fn corrupt_pdf() -> Vec<u8> { + b"NOT A PDF; just plain bytes".to_vec() +} + +/// Write `bytes` to `root/name` and return the joined path; panics if the write fails. +fn write_pdf(root: &Path, name: &str, bytes: &[u8]) -> std::path::PathBuf { + let path = root.join(name); + std::fs::write(&path, bytes).expect("write PDF fixture"); + path +} + +/// Clone the env config, add `**/*.pdf` to the workspace include globs, and switch image OCR / caption off. +fn cfg_with_pdf(env: &TestEnv) -> Config { + let mut cfg = env.config.clone(); + cfg.workspace.include.push("**/*.pdf".to_string()); + // PDF ingest does not need OCR / caption / LM — force both off + // explicitly (ocr.enabled=false, caption.enabled=false) so the image + // pipeline construction step skips both adapters. + cfg.image.ocr.enabled = false; + cfg.image.caption.enabled = false; + cfg +} + +// ── Tests ──────────────────────────────────────────────────────────────── + +/// 3-page text PDF → 1 doc + 3 blocks + 3 chunks; each stored block's +/// `source_span` is `Page { page: i, .. }` with 1-based `i`.
+#[test] +fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { + let env = TestEnv::lexical_only(); + let bytes = build_text_pdf(&[ + Some("Hello page 1 body."), + Some("Hello page 2 body."), + Some("Hello page 3 body."), + ]); + write_pdf(&env.workspace_root, "three.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false) + .expect("PDF ingest must succeed"); + + assert_eq!(report.errors, 0); + let items = report.items.as_ref().expect("items present"); + let pdf_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("three.pdf")) + .expect("PDF item present"); + assert_eq!(pdf_item.kind, IngestItemKind::New); + assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page"); + assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page"); + assert_eq!( + pdf_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("pdf-text-v1") + ); + assert_eq!( + pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("pdf-page-v1") + ); + + // Read the doc back and check every block is a Paragraph carrying a 1-based SourceSpan::Page. + let doc = kebab_app::inspect_doc_with_config( + cfg, + pdf_item.doc_id.as_ref().unwrap(), + ) + .expect("inspect_doc returns the PDF document"); + assert_eq!(doc.blocks.len(), 3); + for (i, block) in doc.blocks.iter().enumerate() { + let want_page = (i as u32) + 1; + let common = match block { + Block::Paragraph(p) => &p.common, + other => panic!("expected Paragraph, got {other:?}"), + }; + match common.source_span { + SourceSpan::Page { page, .. } => assert_eq!(page, want_page), + ref other => panic!("expected Page span, got {other:?}"), + } + } +} + +/// Re-ingest the SAME PDF bytes → identical doc_id and item kind = +/// Updated (P1 idempotency contract); chunk ids are not compared here.
+#[test] +fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() { + let env = TestEnv::lexical_only(); + let bytes = build_text_pdf(&[Some("page 1"), Some("page 2")]); + write_pdf(&env.workspace_root, "stable.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report1 = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let item1 = report1 + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("stable.pdf")) + .cloned() + .unwrap(); + assert_eq!(item1.kind, IngestItemKind::New); + + let report2 = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let item2 = report2 + .items + .unwrap() + .into_iter() + .find(|i| i.doc_path.0.ends_with("stable.pdf")) + .unwrap(); + assert_eq!(item2.kind, IngestItemKind::Updated); + assert_eq!(item2.doc_id, item1.doc_id); +} + +/// Edit a PDF (replace bytes) → different blake3 → different asset_id +/// → different doc_id → `new+=1` for the new doc_id; first-pass row +/// remains untouched. +/// +/// **Currently `#[ignore]`** — exposes a storage-layer bug discovered +/// by this PR: `assets.workspace_path` carries a UNIQUE constraint and +/// `upsert_asset_row` only handles `ON CONFLICT(asset_id)`, so the +/// second insert (new `asset_id` for the edited bytes, same +/// `workspace_path`) trips constraint 2067. Affects markdown / image / +/// PDF paths equally; no test exercised it before P7-3. Logged in +/// `tasks/HOTFIXES.md` for a P+ storage-layer fix. 
+#[test] +#[ignore = "exposes storage-layer assets.workspace_path UNIQUE bug — see HOTFIXES 2026-05-02 P7-3"] +fn re_ingest_edited_pdf_produces_new_doc_id() { + let env = TestEnv::lexical_only(); + let path = env.workspace_root.join("evolving.pdf"); + let bytes_v1 = build_text_pdf(&[Some("version one body")]); + std::fs::write(&path, &bytes_v1).unwrap(); + let cfg = cfg_with_pdf(&env); + + let report_v1 = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let id_v1 = report_v1 + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("evolving.pdf")) + .unwrap() + .doc_id + .clone() + .unwrap(); + + let bytes_v2 = + build_text_pdf(&[Some("VERSION TWO entirely different body content.")]); + std::fs::write(&path, &bytes_v2).unwrap(); + + let report_v2 = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let item_v2 = report_v2 + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("evolving.pdf")) + .unwrap(); + assert_eq!( + item_v2.kind, + IngestItemKind::New, + "edited PDF gets a new asset_id → new doc_id → counted as New" + ); + assert_ne!(item_v2.doc_id.as_ref().unwrap().0, id_v1.0); +} + +/// Encrypted PDF → asset NOT stored; errors+=1; IngestItem.error +/// preserves the qpdf decrypt hint from kebab-parse-pdf verbatim. 
+#[test] +fn encrypted_pdf_fails_with_qpdf_hint() { + let env = TestEnv::lexical_only(); + let bytes = make_encrypted_pdf(); + write_pdf(&env.workspace_root, "secret.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); + assert!(report.errors >= 1, "encrypted PDF must increment errors"); + let items = report.items.as_ref().unwrap(); + let pdf_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("secret.pdf")) + .expect("encrypted PDF item present"); + assert_eq!(pdf_item.kind, IngestItemKind::Error); + let err = pdf_item.error.as_ref().expect("error field set"); + assert!( + err.contains("encrypted"), + "error mentions encryption: {err}" + ); + assert!( + err.contains("qpdf") || err.contains("decrypt"), + "error preserves remediation hint: {err}" + ); +} + +/// Corrupt header PDF → asset NOT stored; errors+=1. +#[test] +fn corrupt_pdf_fails_without_storing() { + let env = TestEnv::lexical_only(); + let bytes = corrupt_pdf(); + write_pdf(&env.workspace_root, "corrupt.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + assert!(report.errors >= 1); + let items = report.items.as_ref().unwrap(); + let pdf_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("corrupt.pdf")) + .unwrap(); + assert_eq!(pdf_item.kind, IngestItemKind::Error); + + // Confirm the doc was NOT stored — list_docs returns nothing for + // this path. + let summaries = kebab_app::list_docs_with_config( + cfg, + kebab_core::DocFilter::default(), + ) + .unwrap(); + assert!( + !summaries + .iter() + .any(|s| s.doc_path.0.ends_with("corrupt.pdf")), + "corrupt PDF must not have a stored doc row" + ); +} + +/// Mixed page PDF (text page 1, empty page 2, text page 3) → asset +/// stored; 2 chunks (pages 1 + 3); doc.provenance.events contains the +/// page-2 Warning emitted by kebab-parse-pdf. 
+#[test] +fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() { + let env = TestEnv::lexical_only(); + let bytes = + build_text_pdf(&[Some("first page"), None, Some("third page")]); + write_pdf(&env.workspace_root, "mixed.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error"); + let pdf_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("mixed.pdf")) + .unwrap(); + assert_eq!(pdf_item.kind, IngestItemKind::New); + assert_eq!( + pdf_item.block_count, + Some(3), + "still 3 blocks (P7-1 emits empty Block::Paragraph for the empty page)" + ); + assert_eq!( + pdf_item.chunk_count, + Some(2), + "pdf-page-v1 emits 0 chunks for the empty page; total = 2" + ); + + let doc = kebab_app::inspect_doc_with_config( + cfg, + pdf_item.doc_id.as_ref().unwrap(), + ) + .unwrap(); + let warnings: Vec<_> = doc + .provenance + .events + .iter() + .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning) + .collect(); + assert_eq!( + warnings.len(), + 1, + "exactly one Warning event for the empty page" + ); + let note = warnings[0].note.as_deref().unwrap_or(""); + assert!( + note.contains("page2") && note.contains("scanned candidate"), + "Warning note marks page 2 as scanned candidate: {note}" + ); +} + +/// IngestReport invariant `scanned == new + updated + skipped + errors` +/// when ingesting a mixed corpus including a corrupt PDF. 
+#[test] +fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() { + let env = TestEnv::lexical_only(); + write_pdf( + &env.workspace_root, + "good.pdf", + &build_text_pdf(&[Some("ok body")]), + ); + write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf()); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); + let total = report.new + report.updated + report.skipped + report.errors; + assert_eq!( + report.scanned, total, + "invariant: scanned ({}) == new ({}) + updated ({}) + skipped ({}) + errors ({})", + report.scanned, report.new, report.updated, report.skipped, report.errors + ); + // Sanity: 1 good (new) + 1 broken (error) = 2 scanned for our PDFs; + // markdown fixtures already in the workspace add to scanned/new + // alike, so we only assert the invariant rather than absolute counts. +} + +/// 50-page PDF → ≥50 chunks (≥1 per page); ingest completes; storage +/// round-trips. Vector embedding is disabled in the lexical-only env +/// so this exercises the SQLite write path only. +#[test] +fn long_pdf_round_trips_through_lexical_pipeline() { + let env = TestEnv::lexical_only(); + let pages: Vec<String> = (1..=50) + .map(|i| format!("Page {i} body — lorem ipsum dolor sit amet.")) + .collect(); + let page_refs: Vec<Option<&str>> = + pages.iter().map(|s| Some(s.as_str())).collect(); + let bytes = build_text_pdf(&page_refs); + write_pdf(&env.workspace_root, "long.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + assert_eq!(report.errors, 0); + let pdf_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("long.pdf")) + .unwrap(); + assert_eq!(pdf_item.block_count, Some(50)); + assert!( + pdf_item.chunk_count.unwrap() >= 50, + "chunk_count={:?} should be ≥50", + pdf_item.chunk_count + ); + + // Round-trip: list_docs sees the long PDF.
+ let summaries = + kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()) + .unwrap(); + assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf"))); +} + +/// `kebab inspect doc ` returns the PDF CanonicalDocument +/// with per-page Block::Paragraph + SourceSpan::Page intact. +#[test] +fn inspect_doc_surfaces_page_spans() { + let env = TestEnv::lexical_only(); + let bytes = + build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]); + write_pdf(&env.workspace_root, "inspect.pdf", &bytes); + let cfg = cfg_with_pdf(&env); + + let report = + kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let pdf_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("inspect.pdf")) + .unwrap(); + let doc = kebab_app::inspect_doc_with_config( + cfg, + pdf_item.doc_id.as_ref().unwrap(), + ) + .unwrap(); + assert_eq!(doc.parser_version.0, "pdf-text-v1"); + assert_eq!(doc.blocks.len(), 3); + for block in &doc.blocks { + match block { + Block::Paragraph(p) => assert!(matches!( + p.common.source_span, + SourceSpan::Page { .. } + )), + other => panic!("expected Paragraph, got {other:?}"), + } + } +} diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 3a17012..bc5a1f2 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -151,7 +151,26 @@ max_pixels = 768 prompt_template_version = "caption-v1" ``` -이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후). +이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로. + +## P7-3 PDF ingestion + +`config.toml` 의 `[workspace] include` 에 `**/*.pdf` 를 추가하면 `kebab ingest` 가 텍스트 PDF 자산도 색인합니다. 외부 service 의존 없음 — `kebab-parse-pdf` 가 lopdf 로 페이지 단위 텍스트 추출, `kebab-chunk::PdfPageV1Chunker` 가 페이지 경계를 절대 넘지 않는 chunk 생성. 
+ +```toml +[workspace] +include = ["**/*.md", "**/*.pdf"] +``` + +PDF 한 권당 페이지 수만큼 (또는 페이지 텍스트가 길면 그 이상의) chunk 가 한 transaction 안에서 commit. 검색 결과의 `chunk.source_spans[0]` 가 `Page { page, char_start, char_end }` 형태라 인용 시 페이지 번호가 그대로 사용 가능. + +```bash +kebab --config /tmp/kebab-smoke/config.toml ingest +kebab --config /tmp/kebab-smoke/config.toml search --mode hybrid "<본문 단어>" +kebab --config /tmp/kebab-smoke/config.toml inspect doc "" +``` + +암호화 PDF (예: DRM 책) → `errors+=1`, `error` 필드에 `qpdf --decrypt` 안내. 빈/스캔 페이지 (텍스트 추출 실패) → 0 chunk + `Provenance::Warning` ("scanned candidate"). v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 까지 대기. 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작. @@ -165,6 +184,7 @@ prompt_template_version = "caption-v1" - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`. - (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc ` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical ""` 가 그 image chunk 를 반환하면 wiring 정상. - OCR / caption 부분 실패는 `errors` 카운터 미증가 — `kebab inspect doc ` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인. +- (P7-3) `*.pdf` 자산을 워크스페이스에 두면 `kebab ingest` 출력에 PDF 도 `new` 카운터에 포함. `kebab inspect doc ` 가 `parser_version = "pdf-text-v1"` + 페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }`. 본문에 등장하는 단어로 `kebab search --mode hybrid` 시 PDF chunk 가 결과에 포함되고 `source_span.kind = "page"` 면 wiring 정상. 암호화 PDF 는 `errors+=1` 로 분류되며 `error` 필드에 `qpdf --decrypt` 안내 보존. 빈/스캔 페이지 (PDF 가 텍스트를 추출하지 못한 페이지) 는 0 chunk + `Provenance::Warning` ("scanned candidate") 로 표시 — P+ scanned-PDF OCR fallback 까지는 검색 불가. ## 정리 @@ -181,6 +201,9 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초. 
- `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작). - 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init. -- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후). +- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장. +- (P7-3) `config.chunking.chunker_version` 는 markdown 만 represent — PDF 자산은 `pdf-page-v1` 하드코딩. `config.toml` 의 `chunker_version = "md-heading-v1"` 을 봐도 PDF 는 영향 안 받음. HOTFIXES `2026-05-02 P7-3` entry 참조 (P+ chunker registry task 까지 유지). +- (P7-3) 한 PDF 가 N 페이지면 `kebab ingest` 가 N 개 (또는 그 이상의, 페이지 길면 multi-chunk) 의 chunk 를 한 transaction 안에서 commit. 500 페이지 책 → 500+ chunk 한 번에 → embedding throughput 가 bottleneck. 임베딩 활성 워크스페이스에서 큰 PDF 를 처음 ingest 하면 분-단위 시간 + WAL 크기 증가 가능 — P+ 스케일 hardening task 까지 정상 동작이지만 비용은 측정 가능. +- (P7-3) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 storage UNIQUE 제약 (`assets.workspace_path`) 에 트립 → `errors+=1`. md / image / pdf 모두 동일하지만 P7-3 의 통합 테스트가 처음 노출. 우회: 파일 path 변경 후 ingest. 영구 fix 는 P+ storage 작업. 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조. diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index ed69f17..4c4368f 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,26 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-02 — P7-3 PDF ingest wiring: chunker_version deviation + storage UNIQUE bug + +**Discovered**: P7-3 implementation start. 
+ +**Symptom 1 (deviation, intentional)**: `tasks/p7/p7-3-pdf-ingest-wiring.md` § Chunker selection notes that `config.chunking.chunker_version` is single-valued and serves the markdown path only. PDF ingest hard-codes `pdf-page-v1` regardless of the config value. A user who reads `config.toml` and sees `chunker_version = "md-heading-v1"` reasonably assumes PDFs use the same — they don't. + +**Fix 1**: `ingest_one_pdf_asset` (in `kebab-app::lib.rs`) instantiates `PdfPageV1Chunker` directly. The `Chunk.chunker_version` field on emitted PDF chunks records `pdf-page-v1` truthfully. A future P+ task (chunker registry) either splits `Config::chunking.chunker_version` per medium or replaces the dispatch with a runtime registry. No HOTFIX entry needed once that happens — this entry is the cross-reference. + +**Symptom 2 (storage-layer bug, exposed but not fixed by P7-3)**: P7-3's edited-bytes re-ingest test (`re_ingest_edited_pdf_produces_new_doc_id`) tripped on `sqlite error: UNIQUE constraint failed: assets.workspace_path: Error code 2067`. The assets table has a UNIQUE constraint on `workspace_path`, but `upsert_asset_row` (in `kebab-store-sqlite::store.rs:305`) only handles `ON CONFLICT(asset_id)`. When a file's bytes change, the new BLAKE3 produces a new `asset_id` while the `workspace_path` stays the same — INSERT picks the new asset_id branch, then trips the secondary UNIQUE on workspace_path. + +**Why it didn't surface earlier**: No existing test (markdown / image) exercises edited-bytes re-ingest. The image path's `re_ingest_image_produces_updated_with_same_doc_id` uses identical bytes (same asset_id → ON CONFLICT(asset_id) catches it). Real-world editing of a tracked file would hit the same bug across all media types. + +**Fix 2 (deferred)**: Storage-layer fix is out of scope for P7-3. The P7-3 implementation PR `#[ignore]`s the `re_ingest_edited_pdf_produces_new_doc_id` test with a doc-comment pointing here. 
A P+ storage task either: +- Handles the `workspace_path` conflict in addition to the existing `ON CONFLICT(asset_id)` clause — either as a second `ON CONFLICT(workspace_path) DO UPDATE` clause (SQLite ≥ 3.35.0 accepts several `ON CONFLICT` clauses in one UPSERT), or as DELETE-the-old + INSERT-the-new in a single statement where an UPSERT can only target one conflict path. +- Or drops the UNIQUE constraint on `assets.workspace_path` and relies on application-level uniqueness (workspace_path → asset_id mapping in a separate index table). + +**Amends**: +- tasks/p7/p7-3-pdf-ingest-wiring.md (chunker_version deviation, edited-bytes test ignored). +- (Implicitly) every previous task spec that assumed `assets.workspace_path` UNIQUE was safe — the constraint is in fact too strict for the byte-edit re-ingest case. + ## 2026-05-02 — P7-2 pdf-page-v1: chunk_id collision + BYTES_PER_TOKEN **Discovered**: P7-2 implementation start. diff --git a/tasks/p7/p7-3-pdf-ingest-wiring.md b/tasks/p7/p7-3-pdf-ingest-wiring.md index a96371a..f13eddb 100644 --- a/tasks/p7/p7-3-pdf-ingest-wiring.md +++ b/tasks/p7/p7-3-pdf-ingest-wiring.md @@ -3,7 +3,7 @@ phase: P7 component: kebab-app (PDF ingest dispatch + chunker selection) task_id: p7-3 title: "Wire PdfTextExtractor + PdfPageV1Chunker into kebab-app::ingest end-to-end" -status: planned +status: completed depends_on: [p7-1, p7-2, p1-6, p3-5, p6-4] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md