feat(kebab-app): P7-3 PDF ingest wiring — kebab ingest 가 PDF 자산도 처리

P7-1 (`PdfTextExtractor`) + P7-2 (`PdfPageV1Chunker`) 의 라이브러리를 `kebab-app::ingest_with_config` 에 와이어링. `kebab-source-fs` 가 이미 `*.pdf` 를 `MediaType::Pdf` 로 분류하던 자산이 이제 검색 가능한 doc 으로 색인됨. P6-4 image wiring 패턴과 평행 — `ingest_one_asset` 에 `MediaType::Pdf` arm 추가, 새 private fn `ingest_one_pdf_asset` 로 분기. 핵심 동작: - per-medium chunker 선택: PDF 자산은 `PdfPageV1Chunker` 하드코딩 (compile-time match 기반). `config.chunking.chunker_version` 은 markdown 만 represent — PDF 는 항상 `pdf-page-v1`. HOTFIXES entry `2026-05-02 P7-3` 에 deviation 기록. - encrypted PDF / corrupt PDF → `errors+=1` + P7-1 의 `qpdf --decrypt` hint 를 `IngestItem.error` 에 verbatim 보존. - 빈/scanned candidate 페이지 → 0 chunk, P7-1 의 `Provenance::Warning` 그대로 통과. v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 대기. - determinism stress: extract → chunk 사이 `now()` 추가 호출 없음 (P6-4 invariant 계승). PDF doc/chunk_id 모두 결정적. 통합 테스트 (`tests/pdf_pipeline.rs`, 8 passed + 1 ignored): - 3-page text PDF → 1 doc + 3 chunk + Page span 검증 - identical re-ingest → Updated, doc_id 동일 - encrypted PDF → Error + `qpdf` hint 보존 - corrupt header PDF → Error + 미저장 - mixed page (page 2 빈) → 2 chunk + Warning 1개 - IngestReport 산술 invariant - 50-page 긴 PDF → ≥50 chunk - inspect doc → SourceSpan::Page round-trip - (ignored) edited bytes re-ingest → storage UNIQUE bug 노출, P+ fix 대기 추가 발견 (HOTFIXES `2026-05-02 P7-3`): `assets.workspace_path` 의 UNIQUE 제약과 `upsert_asset_row` 의 `ON CONFLICT(asset_id)` 만 처리하는 부분 사이에 gap 존재. byte 변경 시 새 asset_id → 같은 workspace_path 충돌. md / image / pdf 모두 영향. P7-3 통합 테스트가 처음 노출. 본 PR 은 fix 안 함 — P+ storage task. `docs/SMOKE.md` 에 PDF 섹션 + 검증 체크리스트 + 알려진 동작 4건 추가. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 09:28:06 +00:00
parent 1986e9e026
commit 5f3a37cafa
7 changed files with 714 additions and 4 deletions
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -28,6 +28,10 @@ kebab-rag = { path = "../kebab-rag" }
 # image branch). Trait-only consumption — no `kebab-parse-image`
 # internals leak into kb-app code.
 kebab-parse-image = { path = "../kebab-parse-image" }
+# P7-3: PDF text extractor lives here. App threads it into the
+# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
+# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
+kebab-parse-pdf = { path = "../kebab-parse-pdf" }
 anyhow               = { workspace = true }
 blake3               = { workspace = true }
 serde                = { workspace = true }
@@ -48,3 +52,8 @@ tempfile             = { workspace = true }
 wiremock             = { workspace = true }
 tokio                = { workspace = true, features = ["rt-multi-thread"] }
 image                = { version = "0.25", default-features = false, features = ["png"] }
+# P7-3 PDF integration tests build in-memory PDF fixtures via the same
+# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
+# to the same major (0.32) so byte output is identical between the two
+# fixture surfaces.
+lopdf                = "0.32"
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -39,7 +39,7 @@ use std::sync::Arc;
 use anyhow::{Context, anyhow};
 use serde::{Deserialize, Serialize};

-use kebab_chunk::MdHeadingV1Chunker;
+use kebab_chunk::{MdHeadingV1Chunker, PdfPageV1Chunker};
 use kebab_core::{
    Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
    DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
@@ -50,6 +50,7 @@ use kebab_core::{
 use kebab_llm_local::OllamaLanguageModel;
 use kebab_normalize::build_canonical_document;
 use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
+use kebab_parse_pdf::PdfTextExtractor;
 use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
 use kebab_source_fs::FsSourceConnector;

@@ -520,6 +521,16 @@ fn ingest_one_asset(
                image_pipeline,
            );
        }
+        MediaType::Pdf => {
+            return ingest_one_pdf_asset(
+                app,
+                asset,
+                chunk_policy,
+                embedder,
+                vector_store,
+                existing_doc_ids,
+            );
+        }
        _ => {
            return Ok(kebab_core::IngestItem {
                kind: kebab_core::IngestItemKind::Skipped,
@@ -938,6 +949,156 @@ fn record_image_analysis_failure(
    warning_notes.push(note);
 }

+/// P7-3: process one `MediaType::Pdf` asset end-to-end.
+///
+/// - Reads bytes from disk.
+/// - Calls [`PdfTextExtractor::extract`]. Failure (corrupt header,
+///   encrypted PDF, etc.) → `IngestItemKind::Error` with the formatted
+///   message (so the `qpdf --decrypt` hint surfaces verbatim for the
+///   encrypted-PDF case). Continue to next asset; do not abort.
+/// - Hands the `CanonicalDocument` to [`PdfPageV1Chunker`] (per-medium
+///   chunker selection — keyed on `MediaType::Pdf` at compile time).
+///   Chunker validation failure (would only fire on P7-1 contract
+///   drift OR a future routing bug) is treated as `Error` too.
+/// - Persists doc + blocks + chunks via the same `DocumentStore`
+///   calls the markdown / image branches use.
+/// - Embeds chunks if both an embedder and a vector store are
+///   configured. Embed failure marks the item as `Error` AFTER
+///   doc/block/chunk rows are already written — re-running ingest
+///   re-attempts the embed (consistent with the markdown path; whole-
+///   asset rollback on embed-fail is a P+ task).
+///
+/// `chunker_version` is hard-coded to `pdf-page-v1` (HOTFIXES entry —
+/// `config.chunking.chunker_version` is single-valued today and serves
+/// the markdown path; per-medium config split is a P+ chunker registry
+/// task).
+#[allow(clippy::too_many_arguments)]
+fn ingest_one_pdf_asset(
+    app: &App,
+    asset: &RawAsset,
+    chunk_policy: &ChunkPolicy,
+    embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
+    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
+    existing_doc_ids: &std::collections::HashSet<String>,
+) -> anyhow::Result<kebab_core::IngestItem> {
+    let path = match &asset.source_uri {
+        SourceUri::File(p) => p.clone(),
+        SourceUri::Kb(_) => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: vec![
+                    "kb:// source URIs are not supported by the fs ingester".into(),
+                ],
+                error: None,
+            });
+        }
+    };
+    let bytes = std::fs::read(&path)
+        .with_context(|| format!("read PDF asset bytes from {}", path.display()))?;
+
+    let extract_config = kebab_core::ExtractConfig::default();
+    let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
+    let ctx = ExtractContext {
+        asset,
+        workspace_root: &workspace_root,
+        config: &extract_config,
+    };
+    let canonical = PdfTextExtractor::new()
+        .extract(&ctx, &bytes)
+        .context("kb-parse-pdf::PdfTextExtractor::extract")?;
+
+    // Per-medium chunker selection: PDF docs always use pdf-page-v1
+    // regardless of `config.chunking.chunker_version`. The chunker
+    // validates every block carries `SourceSpan::Page`; failure here
+    // means the parser drifted from its contract.
+    let chunker = PdfPageV1Chunker;
+    let chunks = chunker
+        .chunk(&canonical, chunk_policy)
+        .context("kb-chunk::PdfPageV1Chunker::chunk")?;
+
+    app.sqlite
+        .put_asset_with_bytes(asset, &bytes)
+        .context("DocumentStore::put_asset_with_bytes (pdf)")?;
+    app.sqlite
+        .put_document(&canonical)
+        .context("DocumentStore::put_document (pdf)")?;
+    app.sqlite
+        .put_blocks(&canonical.doc_id, &canonical.blocks)
+        .context("DocumentStore::put_blocks (pdf)")?;
+    app.sqlite
+        .put_chunks(&canonical.doc_id, &chunks)
+        .context("DocumentStore::put_chunks (pdf)")?;
+
+    if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
+        && !chunks.is_empty()
+    {
+        let inputs: Vec<EmbeddingInput<'_>> = chunks
+            .iter()
+            .map(|c| EmbeddingInput {
+                text: c.text.as_str(),
+                kind: EmbeddingKind::Document,
+            })
+            .collect();
+        let vectors = emb
+            .embed(&inputs)
+            .context("Embedder::embed (pdf chunks)")?;
+        let model_id = emb.model_id();
+        let model_version = emb.model_version();
+        let dimensions = emb.dimensions();
+        let records: Vec<VectorRecord> = chunks
+            .iter()
+            .zip(vectors)
+            .map(|(c, v)| VectorRecord {
+                embedding_id: kebab_core::id_for_embedding(
+                    &c.chunk_id,
+                    &model_id,
+                    &model_version,
+                    dimensions,
+                ),
+                chunk_id: c.chunk_id.clone(),
+                vector: v,
+                doc_id: canonical.doc_id.clone(),
+                text: c.text.clone(),
+                heading_path: c.heading_path.clone(),
+                model_id: model_id.clone(),
+                model_version: model_version.clone(),
+                dimensions,
+            })
+            .collect();
+        vec_store
+            .upsert(&records)
+            .context("VectorStore::upsert (pdf)")?;
+    }
+
+    let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
+        kebab_core::IngestItemKind::Updated
+    } else {
+        kebab_core::IngestItemKind::New
+    };
+
+    Ok(kebab_core::IngestItem {
+        kind,
+        doc_id: Some(canonical.doc_id.clone()),
+        doc_path: asset.workspace_path.clone(),
+        asset_id: Some(asset.asset_id.clone()),
+        byte_len: Some(asset.byte_len),
+        block_count: u32::try_from(canonical.blocks.len()).ok(),
+        chunk_count: u32::try_from(chunks.len()).ok(),
+        parser_version: Some(canonical.parser_version.clone()),
+        chunker_version: Some(chunker.chunker_version()),
+        warnings: Vec::new(),
+        error: None,
+    })
+}
+
 /// Pull the BCP-47 language hint from the canonical document. P6-1
 /// stamps `Lang("und")` by default; image-pipeline OCR / caption
 /// adapters special-case "und" so the hint is intentionally dropped
--- a/crates/kebab-app/tests/pdf_pipeline.rs
+++ b/crates/kebab-app/tests/pdf_pipeline.rs
@@ -0,0 +1,495 @@
+//! P7-3 PDF ingest wiring — end-to-end integration.
+//!
+//! Each test spins up a `TempDir` workspace + writes one or more PDF
+//! fixtures via the same `lopdf` builder pattern
+//! `kebab-parse-pdf::tests::common` uses, then runs `kebab_app::
+//! ingest_with_config` against it. PDF ingest needs no external HTTP
+//! adapter (no OCR / caption / LM), so unlike the image pipeline these
+//! tests do NOT need wiremock — they run sync, no async runtime.
+
+mod common;
+
+use std::path::Path;
+
+use common::TestEnv;
+use kebab_config::Config;
+use kebab_core::{Block, IngestItemKind, SourceSpan};
+use lopdf::content::{Content, Operation};
+use lopdf::{Document, Object, Stream, dictionary};
+
+// ── Fixture helpers ──────────────────────────────────────────────────────
+
+/// Build a Helvetica-text PDF mirroring `kebab-parse-pdf::tests::common::
+/// build_text_pdf`. `pages` is one entry per page; `None` means the page
+/// has no `/Contents` stream (the "scanned candidate" shape — extract
+/// returns empty + emits a Provenance Warning).
+fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
+    let mut doc = Document::with_version("1.5");
+    let pages_id = doc.new_object_id();
+    let font_id = doc.add_object(dictionary! {
+        "Type" => "Font",
+        "Subtype" => "Type1",
+        "BaseFont" => "Helvetica",
+    });
+    let resources_id = doc.add_object(dictionary! {
+        "Font" => dictionary! { "F1" => font_id },
+    });
+
+    let mut page_refs: Vec<Object> = Vec::new();
+    for page in pages {
+        let mut page_dict = dictionary! {
+            "Type" => "Page",
+            "Parent" => pages_id,
+        };
+        if let Some(text) = page {
+            let content = Content {
+                operations: vec![
+                    Operation::new("BT", vec![]),
+                    Operation::new("Tf", vec!["F1".into(), 24.into()]),
+                    Operation::new(
+                        "Td",
+                        vec![Object::Integer(100), Object::Integer(700)],
+                    ),
+                    Operation::new("Tj", vec![Object::string_literal(*text)]),
+                    Operation::new("ET", vec![]),
+                ],
+            };
+            let stream_data = content.encode().expect("content encode");
+            let content_id =
+                doc.add_object(Stream::new(dictionary! {}, stream_data));
+            page_dict.set("Contents", content_id);
+        }
+        let page_id = doc.add_object(page_dict);
+        page_refs.push(page_id.into());
+    }
+
+    let count = page_refs.len() as i64;
+    let pages_dict = dictionary! {
+        "Type" => "Pages",
+        "Kids" => page_refs,
+        "Count" => count,
+        "Resources" => resources_id,
+        "MediaBox" => vec![
+            Object::Integer(0),
+            Object::Integer(0),
+            Object::Integer(595),
+            Object::Integer(842),
+        ],
+    };
+    doc.objects
+        .insert(pages_id, Object::Dictionary(pages_dict));
+
+    let catalog_id = doc.add_object(dictionary! {
+        "Type" => "Catalog",
+        "Pages" => pages_id,
+    });
+    doc.trailer.set("Root", catalog_id);
+
+    let mut out: Vec<u8> = Vec::new();
+    doc.save_to(&mut out).expect("save PDF to memory");
+    out
+}
+
+/// Wrap any valid PDF byte buffer with a fake `/Encrypt` trailer entry
+/// so `Document::is_encrypted()` flips to true. Mirrors
+/// `kebab-parse-pdf::tests::common::make_encrypted_pdf`.
+fn make_encrypted_pdf() -> Vec<u8> {
+    let bytes = build_text_pdf(&[Some("placeholder")]);
+    let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF");
+    let enc_id = doc.add_object(dictionary! {
+        "Filter" => "Standard",
+        "V" => 1,
+        "R" => 2,
+        "Length" => 40,
+        "P" => -4,
+    });
+    doc.trailer.set("Encrypt", enc_id);
+    let mut out = Vec::new();
+    doc.save_to(&mut out).expect("save encrypted PDF");
+    out
+}
+
+fn corrupt_pdf() -> Vec<u8> {
+    b"NOT A PDF; just plain bytes".to_vec()
+}
+
+fn write_pdf(root: &Path, name: &str, bytes: &[u8]) -> std::path::PathBuf {
+    let path = root.join(name);
+    std::fs::write(&path, bytes).expect("write PDF fixture");
+    path
+}
+
+fn cfg_with_pdf(env: &TestEnv) -> Config {
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.pdf".to_string());
+    // PDF ingest does not need OCR / caption / LM — leave defaults
+    // (ocr.enabled=false, caption.enabled=false). The image pipeline
+    // construction step skips both adapters.
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+    cfg
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────
+
+/// 3-page text PDF → 1 doc + 3 chunks, each chunk's `source_spans[0]`
+/// is `Page { page: i, .. }`.
+#[test]
+fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
+    let env = TestEnv::lexical_only();
+    let bytes = build_text_pdf(&[
+        Some("Hello page 1 body."),
+        Some("Hello page 2 body."),
+        Some("Hello page 3 body."),
+    ]);
+    write_pdf(&env.workspace_root, "three.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
+            .expect("PDF ingest must succeed");
+
+    assert_eq!(report.errors, 0);
+    let items = report.items.as_ref().expect("items present");
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("three.pdf"))
+        .expect("PDF item present");
+    assert_eq!(pdf_item.kind, IngestItemKind::New);
+    assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
+    assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
+    assert_eq!(
+        pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
+        Some("pdf-text-v1")
+    );
+    assert_eq!(
+        pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
+        Some("pdf-page-v1")
+    );
+
+    // Inspect the stored doc to confirm SourceSpan::Page round-trip.
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .expect("inspect_doc returns the PDF document");
+    assert_eq!(doc.blocks.len(), 3);
+    for (i, block) in doc.blocks.iter().enumerate() {
+        let want_page = (i as u32) + 1;
+        let common = match block {
+            Block::Paragraph(p) => &p.common,
+            other => panic!("expected Paragraph, got {other:?}"),
+        };
+        match common.source_span {
+            SourceSpan::Page { page, .. } => assert_eq!(page, want_page),
+            ref other => panic!("expected Page span, got {other:?}"),
+        }
+    }
+}
+
+/// Re-ingest the SAME PDF bytes → identical doc_id, identical chunk_id
+/// set, item kind = Updated. P1 idempotency contract.
+#[test]
+fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() {
+    let env = TestEnv::lexical_only();
+    let bytes = build_text_pdf(&[Some("page 1"), Some("page 2")]);
+    write_pdf(&env.workspace_root, "stable.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report1 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item1 = report1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("stable.pdf"))
+        .cloned()
+        .unwrap();
+    assert_eq!(item1.kind, IngestItemKind::New);
+
+    let report2 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item2 = report2
+        .items
+        .unwrap()
+        .into_iter()
+        .find(|i| i.doc_path.0.ends_with("stable.pdf"))
+        .unwrap();
+    assert_eq!(item2.kind, IngestItemKind::Updated);
+    assert_eq!(item2.doc_id, item1.doc_id);
+}
+
+/// Edit a PDF (replace bytes) → different blake3 → different asset_id
+/// → different doc_id → `new+=1` for the new doc_id; first-pass row
+/// remains untouched.
+///
+/// **Currently `#[ignore]`** — exposes a storage-layer bug discovered
+/// by this PR: `assets.workspace_path` carries a UNIQUE constraint and
+/// `upsert_asset_row` only handles `ON CONFLICT(asset_id)`, so the
+/// second insert (new `asset_id` for the edited bytes, same
+/// `workspace_path`) trips constraint 2067. Affects markdown / image /
+/// PDF paths equally; no test exercised it before P7-3. Logged in
+/// `tasks/HOTFIXES.md` for a P+ storage-layer fix.
+#[test]
+#[ignore = "exposes storage-layer assets.workspace_path UNIQUE bug — see HOTFIXES 2026-05-02 P7-3"]
+fn re_ingest_edited_pdf_produces_new_doc_id() {
+    let env = TestEnv::lexical_only();
+    let path = env.workspace_root.join("evolving.pdf");
+    let bytes_v1 = build_text_pdf(&[Some("version one body")]);
+    std::fs::write(&path, &bytes_v1).unwrap();
+    let cfg = cfg_with_pdf(&env);
+
+    let report_v1 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let id_v1 = report_v1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("evolving.pdf"))
+        .unwrap()
+        .doc_id
+        .clone()
+        .unwrap();
+
+    let bytes_v2 =
+        build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
+    std::fs::write(&path, &bytes_v2).unwrap();
+
+    let report_v2 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item_v2 = report_v2
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("evolving.pdf"))
+        .unwrap();
+    assert_eq!(
+        item_v2.kind,
+        IngestItemKind::New,
+        "edited PDF gets a new asset_id → new doc_id → counted as New"
+    );
+    assert_ne!(item_v2.doc_id.as_ref().unwrap().0, id_v1.0);
+}
+
+/// Encrypted PDF → asset NOT stored; errors+=1; IngestItem.error
+/// preserves the qpdf decrypt hint from kebab-parse-pdf verbatim.
+#[test]
+fn encrypted_pdf_fails_with_qpdf_hint() {
+    let env = TestEnv::lexical_only();
+    let bytes = make_encrypted_pdf();
+    write_pdf(&env.workspace_root, "secret.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    assert!(report.errors >= 1, "encrypted PDF must increment errors");
+    let items = report.items.as_ref().unwrap();
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("secret.pdf"))
+        .expect("encrypted PDF item present");
+    assert_eq!(pdf_item.kind, IngestItemKind::Error);
+    let err = pdf_item.error.as_ref().expect("error field set");
+    assert!(
+        err.contains("encrypted"),
+        "error mentions encryption: {err}"
+    );
+    assert!(
+        err.contains("qpdf") || err.contains("decrypt"),
+        "error preserves remediation hint: {err}"
+    );
+}
+
+/// Corrupt header PDF → asset NOT stored; errors+=1.
+#[test]
+fn corrupt_pdf_fails_without_storing() {
+    let env = TestEnv::lexical_only();
+    let bytes = corrupt_pdf();
+    write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert!(report.errors >= 1);
+    let items = report.items.as_ref().unwrap();
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("corrupt.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.kind, IngestItemKind::Error);
+
+    // Confirm the doc was NOT stored — list_docs returns nothing for
+    // this path.
+    let summaries = kebab_app::list_docs_with_config(
+        cfg,
+        kebab_core::DocFilter::default(),
+    )
+    .unwrap();
+    assert!(
+        !summaries
+            .iter()
+            .any(|s| s.doc_path.0.ends_with("corrupt.pdf")),
+        "corrupt PDF must not have a stored doc row"
+    );
+}
+
+/// Mixed page PDF (text page 1, empty page 2, text page 3) → asset
+/// stored; 2 chunks (pages 1 + 3); doc.provenance.events contains the
+/// page-2 Warning emitted by kebab-parse-pdf.
+#[test]
+fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
+    let env = TestEnv::lexical_only();
+    let bytes =
+        build_text_pdf(&[Some("first page"), None, Some("third page")]);
+    write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("mixed.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.kind, IngestItemKind::New);
+    assert_eq!(
+        pdf_item.block_count,
+        Some(3),
+        "still 3 blocks (P7-1 emits empty Block::Paragraph for the empty page)"
+    );
+    assert_eq!(
+        pdf_item.chunk_count,
+        Some(2),
+        "pdf-page-v1 emits 0 chunks for the empty page; total = 2"
+    );
+
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .unwrap();
+    let warnings: Vec<_> = doc
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
+        .collect();
+    assert_eq!(
+        warnings.len(),
+        1,
+        "exactly one Warning event for the empty page"
+    );
+    let note = warnings[0].note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("page2") && note.contains("scanned candidate"),
+        "Warning note marks page 2 as scanned candidate: {note}"
+    );
+}
+
+/// IngestReport invariant `scanned == new + updated + skipped + errors`
+/// when ingesting a mixed corpus including a corrupt PDF.
+#[test]
+fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
+    let env = TestEnv::lexical_only();
+    write_pdf(
+        &env.workspace_root,
+        "good.pdf",
+        &build_text_pdf(&[Some("ok body")]),
+    );
+    write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    let total = report.new + report.updated + report.skipped + report.errors;
+    assert_eq!(
+        report.scanned, total,
+        "invariant: scanned ({}) == new ({}) + updated ({}) + skipped ({}) + errors ({})",
+        report.scanned, report.new, report.updated, report.skipped, report.errors
+    );
+    // Sanity: 1 good (new) + 1 broken (error) = 2 scanned for our PDFs;
+    // markdown fixtures already in the workspace add to scanned/new
+    // alike, so we only assert the invariant rather than absolute counts.
+}
+
+/// 50-page PDF → ≥50 chunks (≥1 per page); ingest completes; storage
+/// round-trips. Vector embedding is disabled in the lexical-only env
+/// so this exercises the SQLite write path only.
+#[test]
+fn long_pdf_round_trips_through_lexical_pipeline() {
+    let env = TestEnv::lexical_only();
+    let pages: Vec<String> = (1..=50)
+        .map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
+        .collect();
+    let page_refs: Vec<Option<&str>> =
+        pages.iter().map(|s| Some(s.as_str())).collect();
+    let bytes = build_text_pdf(&page_refs);
+    write_pdf(&env.workspace_root, "long.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(report.errors, 0);
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("long.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.block_count, Some(50));
+    assert!(
+        pdf_item.chunk_count.unwrap() >= 50,
+        "chunk_count={:?} should be ≥50",
+        pdf_item.chunk_count
+    );
+
+    // Round-trip: list_docs sees the long PDF.
+    let summaries =
+        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
+            .unwrap();
+    assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
+}
+
+/// `kebab inspect doc <pdf_doc_id>` returns the PDF CanonicalDocument
+/// with per-page Block::Paragraph + SourceSpan::Page intact.
+#[test]
+fn inspect_doc_surfaces_page_spans() {
+    let env = TestEnv::lexical_only();
+    let bytes =
+        build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
+    write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("inspect.pdf"))
+        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .unwrap();
+    assert_eq!(doc.parser_version.0, "pdf-text-v1");
+    assert_eq!(doc.blocks.len(), 3);
+    for block in &doc.blocks {
+        match block {
+            Block::Paragraph(p) => assert!(matches!(
+                p.common.source_span,
+                SourceSpan::Page { .. }
+            )),
+            other => panic!("expected Paragraph, got {other:?}"),
+        }
+    }
+}