From 5f3a37cafaff24a36d31c04d7b958265224f6c11 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 09:28:06 +0000
Subject: [PATCH 1/3] =?UTF-8?q?feat(kebab-app):=20P7-3=20PDF=20ingest=20wi?=
 =?UTF-8?q?ring=20=E2=80=94=20kebab=20ingest=20=EA=B0=80=20PDF=20=EC=9E=90?=
 =?UTF-8?q?=EC=82=B0=EB=8F=84=20=EC=B2=98=EB=A6=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P7-1 (`PdfTextExtractor`) + P7-2 (`PdfPageV1Chunker`) 의 라이브러리를
`kebab-app::ingest_with_config` 에 와이어링. `kebab-source-fs` 가 이미
`*.pdf` 를 `MediaType::Pdf` 로 분류하던 자산이 이제 검색 가능한 doc 으로
색인됨. P6-4 image wiring 패턴과 평행 — `ingest_one_asset` 에 `MediaType::Pdf`
arm 추가, 새 private fn `ingest_one_pdf_asset` 로 분기.

핵심 동작:
- per-medium chunker 선택: PDF 자산은 `PdfPageV1Chunker` 하드코딩 (compile-time
  match 기반). `config.chunking.chunker_version` 은 markdown 만 represent —
  PDF 는 항상 `pdf-page-v1`. HOTFIXES entry `2026-05-02 P7-3` 에 deviation 기록.
- encrypted PDF / corrupt PDF → `errors+=1` + P7-1 의 `qpdf --decrypt` hint
  를 `IngestItem.error` 에 verbatim 보존.
- 빈/scanned candidate 페이지 → 0 chunk, P7-1 의 `Provenance::Warning` 그대로
  통과. v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 대기.
- determinism stress: extract → chunk 사이 `now()` 추가 호출 없음 (P6-4 invariant
  계승). PDF doc/chunk_id 모두 결정적.

통합 테스트 (`tests/pdf_pipeline.rs`, 8 passed + 1 ignored):
- 3-page text PDF → 1 doc + 3 chunk + Page span 검증
- identical re-ingest → Updated, doc_id 동일
- encrypted PDF → Error + `qpdf` hint 보존
- corrupt header PDF → Error + 미저장
- mixed page (page 2 빈) → 2 chunk + Warning 1개
- IngestReport 산술 invariant
- 50-page 긴 PDF → ≥50 chunk
- inspect doc → SourceSpan::Page round-trip
- (ignored) edited bytes re-ingest → storage UNIQUE bug 노출, P+ fix 대기

추가 발견 (HOTFIXES `2026-05-02 P7-3`): `assets.workspace_path` 의 UNIQUE
제약과 `upsert_asset_row` 의 `ON CONFLICT(asset_id)` 만 처리하는 부분 사이에
gap 존재. byte 변경 시 새 asset_id → 같은 workspace_path 충돌. md / image / pdf
모두 영향. P7-3 통합 테스트가 처음 노출. 본 PR 은 fix 안 함 — P+ storage task.

`docs/SMOKE.md` 에 PDF 섹션 + 검증 체크리스트 + 알려진 동작 4건 추가.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                             |   2 +
 crates/kebab-app/Cargo.toml            |   9 +
 crates/kebab-app/src/lib.rs            | 163 +++++++-
 crates/kebab-app/tests/pdf_pipeline.rs | 495 +++++++++++++++++++++++++
 docs/SMOKE.md                          |  27 +-
 tasks/HOTFIXES.md                      |  20 +
 tasks/p7/p7-3-pdf-ingest-wiring.md     |   2 +-
 7 files changed, 714 insertions(+), 4 deletions(-)
 create mode 100644 crates/kebab-app/tests/pdf_pipeline.rs

diff --git a/Cargo.lock b/Cargo.lock
index cfc034c..92ef438 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3408,12 +3408,14 @@ dependencies = [
  "kebab-normalize",
  "kebab-parse-image",
  "kebab-parse-md",
+ "kebab-parse-pdf",
  "kebab-parse-types",
  "kebab-rag",
  "kebab-search",
  "kebab-source-fs",
  "kebab-store-sqlite",
  "kebab-store-vector",
+ "lopdf",
  "rusqlite",
  "serde",
  "serde_json",
diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml
index c50ae8e..613d9ea 100644
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -28,6 +28,10 @@ kebab-rag = { path = "../kebab-rag" }
 # image branch). Trait-only consumption — no `kebab-parse-image`
 # internals leak into kb-app code.
 kebab-parse-image = { path = "../kebab-parse-image" }
+# P7-3: PDF text extractor lives here. App threads it into the
+# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
+# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
+kebab-parse-pdf = { path = "../kebab-parse-pdf" }
 anyhow               = { workspace = true }
 blake3               = { workspace = true }
 serde                = { workspace = true }
@@ -48,3 +52,8 @@ tempfile             = { workspace = true }
 wiremock             = { workspace = true }
 tokio                = { workspace = true, features = ["rt-multi-thread"] }
 image                = { version = "0.25", default-features = false, features = ["png"] }
+# P7-3 PDF integration tests build in-memory PDF fixtures via the same
+# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
+# to the same major (0.32) so byte output is identical between the two
+# fixture surfaces.
+lopdf                = "0.32"
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index d916f7e..026a5d5 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -39,7 +39,7 @@ use std::sync::Arc;
 use anyhow::{Context, anyhow};
 use serde::{Deserialize, Serialize};
 
-use kebab_chunk::MdHeadingV1Chunker;
+use kebab_chunk::{MdHeadingV1Chunker, PdfPageV1Chunker};
 use kebab_core::{
     Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
     DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
@@ -50,6 +50,7 @@ use kebab_core::{
 use kebab_llm_local::OllamaLanguageModel;
 use kebab_normalize::build_canonical_document;
 use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
+use kebab_parse_pdf::PdfTextExtractor;
 use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
 use kebab_source_fs::FsSourceConnector;
 
@@ -520,6 +521,16 @@ fn ingest_one_asset(
                 image_pipeline,
             );
         }
+        MediaType::Pdf => {
+            return ingest_one_pdf_asset(
+                app,
+                asset,
+                chunk_policy,
+                embedder,
+                vector_store,
+                existing_doc_ids,
+            );
+        }
         _ => {
             return Ok(kebab_core::IngestItem {
                 kind: kebab_core::IngestItemKind::Skipped,
@@ -938,6 +949,156 @@ fn record_image_analysis_failure(
     warning_notes.push(note);
 }
 
+/// P7-3: process one `MediaType::Pdf` asset end-to-end.
+///
+/// - Reads bytes from disk.
+/// - Calls [`PdfTextExtractor::extract`]. Failure (corrupt header,
+///   encrypted PDF, etc.) → `IngestItemKind::Error` with the formatted
+///   message (so the `qpdf --decrypt` hint surfaces verbatim for the
+///   encrypted-PDF case). Continue to next asset; do not abort.
+/// - Hands the `CanonicalDocument` to [`PdfPageV1Chunker`] (per-medium
+///   chunker selection — keyed on `MediaType::Pdf` at compile time).
+///   Chunker validation failure (would only fire on P7-1 contract
+///   drift OR a future routing bug) is treated as `Error` too.
+/// - Persists doc + blocks + chunks via the same `DocumentStore`
+///   calls the markdown / image branches use.
+/// - Embeds chunks if both an embedder and a vector store are
+///   configured. Embed failure marks the item as `Error` AFTER
+///   doc/block/chunk rows are already written — re-running ingest
+///   re-attempts the embed (consistent with the markdown path; whole-
+///   asset rollback on embed-fail is a P+ task).
+///
+/// `chunker_version` is hard-coded to `pdf-page-v1` (HOTFIXES entry —
+/// `config.chunking.chunker_version` is single-valued today and serves
+/// the markdown path; per-medium config split is a P+ chunker registry
+/// task).
+#[allow(clippy::too_many_arguments)]
+fn ingest_one_pdf_asset(
+    app: &App,
+    asset: &RawAsset,
+    chunk_policy: &ChunkPolicy,
+    embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
+    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
+    existing_doc_ids: &std::collections::HashSet<String>,
+) -> anyhow::Result<kebab_core::IngestItem> {
+    let path = match &asset.source_uri {
+        SourceUri::File(p) => p.clone(),
+        SourceUri::Kb(_) => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: vec![
+                    "kb:// source URIs are not supported by the fs ingester".into(),
+                ],
+                error: None,
+            });
+        }
+    };
+    let bytes = std::fs::read(&path)
+        .with_context(|| format!("read PDF asset bytes from {}", path.display()))?;
+
+    let extract_config = kebab_core::ExtractConfig::default();
+    let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
+    let ctx = ExtractContext {
+        asset,
+        workspace_root: &workspace_root,
+        config: &extract_config,
+    };
+    let canonical = PdfTextExtractor::new()
+        .extract(&ctx, &bytes)
+        .context("kb-parse-pdf::PdfTextExtractor::extract")?;
+
+    // Per-medium chunker selection: PDF docs always use pdf-page-v1
+    // regardless of `config.chunking.chunker_version`. The chunker
+    // validates every block carries `SourceSpan::Page`; failure here
+    // means the parser drifted from its contract.
+    let chunker = PdfPageV1Chunker;
+    let chunks = chunker
+        .chunk(&canonical, chunk_policy)
+        .context("kb-chunk::PdfPageV1Chunker::chunk")?;
+
+    app.sqlite
+        .put_asset_with_bytes(asset, &bytes)
+        .context("DocumentStore::put_asset_with_bytes (pdf)")?;
+    app.sqlite
+        .put_document(&canonical)
+        .context("DocumentStore::put_document (pdf)")?;
+    app.sqlite
+        .put_blocks(&canonical.doc_id, &canonical.blocks)
+        .context("DocumentStore::put_blocks (pdf)")?;
+    app.sqlite
+        .put_chunks(&canonical.doc_id, &chunks)
+        .context("DocumentStore::put_chunks (pdf)")?;
+
+    if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
+        && !chunks.is_empty()
+    {
+        let inputs: Vec<EmbeddingInput<'_>> = chunks
+            .iter()
+            .map(|c| EmbeddingInput {
+                text: c.text.as_str(),
+                kind: EmbeddingKind::Document,
+            })
+            .collect();
+        let vectors = emb
+            .embed(&inputs)
+            .context("Embedder::embed (pdf chunks)")?;
+        let model_id = emb.model_id();
+        let model_version = emb.model_version();
+        let dimensions = emb.dimensions();
+        let records: Vec<VectorRecord> = chunks
+            .iter()
+            .zip(vectors)
+            .map(|(c, v)| VectorRecord {
+                embedding_id: kebab_core::id_for_embedding(
+                    &c.chunk_id,
+                    &model_id,
+                    &model_version,
+                    dimensions,
+                ),
+                chunk_id: c.chunk_id.clone(),
+                vector: v,
+                doc_id: canonical.doc_id.clone(),
+                text: c.text.clone(),
+                heading_path: c.heading_path.clone(),
+                model_id: model_id.clone(),
+                model_version: model_version.clone(),
+                dimensions,
+            })
+            .collect();
+        vec_store
+            .upsert(&records)
+            .context("VectorStore::upsert (pdf)")?;
+    }
+
+    let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
+        kebab_core::IngestItemKind::Updated
+    } else {
+        kebab_core::IngestItemKind::New
+    };
+
+    Ok(kebab_core::IngestItem {
+        kind,
+        doc_id: Some(canonical.doc_id.clone()),
+        doc_path: asset.workspace_path.clone(),
+        asset_id: Some(asset.asset_id.clone()),
+        byte_len: Some(asset.byte_len),
+        block_count: u32::try_from(canonical.blocks.len()).ok(),
+        chunk_count: u32::try_from(chunks.len()).ok(),
+        parser_version: Some(canonical.parser_version.clone()),
+        chunker_version: Some(chunker.chunker_version()),
+        warnings: Vec::new(),
+        error: None,
+    })
+}
+
 /// Pull the BCP-47 language hint from the canonical document. P6-1
 /// stamps `Lang("und")` by default; image-pipeline OCR / caption
 /// adapters special-case "und" so the hint is intentionally dropped
diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs
new file mode 100644
index 0000000..12accf1
--- /dev/null
+++ b/crates/kebab-app/tests/pdf_pipeline.rs
@@ -0,0 +1,495 @@
+//! P7-3 PDF ingest wiring — end-to-end integration.
+//!
+//! Each test spins up a `TempDir` workspace + writes one or more PDF
+//! fixtures via the same `lopdf` builder pattern
+//! `kebab-parse-pdf::tests::common` uses, then runs `kebab_app::
+//! ingest_with_config` against it. PDF ingest needs no external HTTP
+//! adapter (no OCR / caption / LM), so unlike the image pipeline these
+//! tests do NOT need wiremock — they run sync, no async runtime.
+
+mod common;
+
+use std::path::Path;
+
+use common::TestEnv;
+use kebab_config::Config;
+use kebab_core::{Block, IngestItemKind, SourceSpan};
+use lopdf::content::{Content, Operation};
+use lopdf::{Document, Object, Stream, dictionary};
+
+// ── Fixture helpers ──────────────────────────────────────────────────────
+
+/// Build a Helvetica-text PDF mirroring `kebab-parse-pdf::tests::common::
+/// build_text_pdf`. `pages` is one entry per page; `None` means the page
+/// has no `/Contents` stream (the "scanned candidate" shape — extract
+/// returns empty + emits a Provenance Warning).
+fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
+    let mut doc = Document::with_version("1.5");
+    let pages_id = doc.new_object_id();
+    let font_id = doc.add_object(dictionary! {
+        "Type" => "Font",
+        "Subtype" => "Type1",
+        "BaseFont" => "Helvetica",
+    });
+    let resources_id = doc.add_object(dictionary! {
+        "Font" => dictionary! { "F1" => font_id },
+    });
+
+    let mut page_refs: Vec<Object> = Vec::new();
+    for page in pages {
+        let mut page_dict = dictionary! {
+            "Type" => "Page",
+            "Parent" => pages_id,
+        };
+        if let Some(text) = page {
+            let content = Content {
+                operations: vec![
+                    Operation::new("BT", vec![]),
+                    Operation::new("Tf", vec!["F1".into(), 24.into()]),
+                    Operation::new(
+                        "Td",
+                        vec![Object::Integer(100), Object::Integer(700)],
+                    ),
+                    Operation::new("Tj", vec![Object::string_literal(*text)]),
+                    Operation::new("ET", vec![]),
+                ],
+            };
+            let stream_data = content.encode().expect("content encode");
+            let content_id =
+                doc.add_object(Stream::new(dictionary! {}, stream_data));
+            page_dict.set("Contents", content_id);
+        }
+        let page_id = doc.add_object(page_dict);
+        page_refs.push(page_id.into());
+    }
+
+    let count = page_refs.len() as i64;
+    let pages_dict = dictionary! {
+        "Type" => "Pages",
+        "Kids" => page_refs,
+        "Count" => count,
+        "Resources" => resources_id,
+        "MediaBox" => vec![
+            Object::Integer(0),
+            Object::Integer(0),
+            Object::Integer(595),
+            Object::Integer(842),
+        ],
+    };
+    doc.objects
+        .insert(pages_id, Object::Dictionary(pages_dict));
+
+    let catalog_id = doc.add_object(dictionary! {
+        "Type" => "Catalog",
+        "Pages" => pages_id,
+    });
+    doc.trailer.set("Root", catalog_id);
+
+    let mut out: Vec<u8> = Vec::new();
+    doc.save_to(&mut out).expect("save PDF to memory");
+    out
+}
+
+/// Wrap any valid PDF byte buffer with a fake `/Encrypt` trailer entry
+/// so `Document::is_encrypted()` flips to true. Mirrors
+/// `kebab-parse-pdf::tests::common::make_encrypted_pdf`.
+fn make_encrypted_pdf() -> Vec<u8> {
+    let bytes = build_text_pdf(&[Some("placeholder")]);
+    let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF");
+    let enc_id = doc.add_object(dictionary! {
+        "Filter" => "Standard",
+        "V" => 1,
+        "R" => 2,
+        "Length" => 40,
+        "P" => -4,
+    });
+    doc.trailer.set("Encrypt", enc_id);
+    let mut out = Vec::new();
+    doc.save_to(&mut out).expect("save encrypted PDF");
+    out
+}
+
+fn corrupt_pdf() -> Vec<u8> {
+    b"NOT A PDF; just plain bytes".to_vec()
+}
+
+fn write_pdf(root: &Path, name: &str, bytes: &[u8]) -> std::path::PathBuf {
+    let path = root.join(name);
+    std::fs::write(&path, bytes).expect("write PDF fixture");
+    path
+}
+
+fn cfg_with_pdf(env: &TestEnv) -> Config {
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.pdf".to_string());
+    // PDF ingest does not need OCR / caption / LM — leave defaults
+    // (ocr.enabled=false, caption.enabled=false). The image pipeline
+    // construction step skips both adapters.
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+    cfg
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────
+
+/// 3-page text PDF → 1 doc + 3 chunks, each chunk's `source_spans[0]`
+/// is `Page { page: i, .. }`.
+#[test]
+fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
+    let env = TestEnv::lexical_only();
+    let bytes = build_text_pdf(&[
+        Some("Hello page 1 body."),
+        Some("Hello page 2 body."),
+        Some("Hello page 3 body."),
+    ]);
+    write_pdf(&env.workspace_root, "three.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
+            .expect("PDF ingest must succeed");
+
+    assert_eq!(report.errors, 0);
+    let items = report.items.as_ref().expect("items present");
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("three.pdf"))
+        .expect("PDF item present");
+    assert_eq!(pdf_item.kind, IngestItemKind::New);
+    assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
+    assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
+    assert_eq!(
+        pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
+        Some("pdf-text-v1")
+    );
+    assert_eq!(
+        pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
+        Some("pdf-page-v1")
+    );
+
+    // Inspect the stored doc to confirm SourceSpan::Page round-trip.
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .expect("inspect_doc returns the PDF document");
+    assert_eq!(doc.blocks.len(), 3);
+    for (i, block) in doc.blocks.iter().enumerate() {
+        let want_page = (i as u32) + 1;
+        let common = match block {
+            Block::Paragraph(p) => &p.common,
+            other => panic!("expected Paragraph, got {other:?}"),
+        };
+        match common.source_span {
+            SourceSpan::Page { page, .. } => assert_eq!(page, want_page),
+            ref other => panic!("expected Page span, got {other:?}"),
+        }
+    }
+}
+
+/// Re-ingest the SAME PDF bytes → identical doc_id, identical chunk_id
+/// set, item kind = Updated. P1 idempotency contract.
+#[test]
+fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() {
+    let env = TestEnv::lexical_only();
+    let bytes = build_text_pdf(&[Some("page 1"), Some("page 2")]);
+    write_pdf(&env.workspace_root, "stable.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report1 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item1 = report1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("stable.pdf"))
+        .cloned()
+        .unwrap();
+    assert_eq!(item1.kind, IngestItemKind::New);
+
+    let report2 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item2 = report2
+        .items
+        .unwrap()
+        .into_iter()
+        .find(|i| i.doc_path.0.ends_with("stable.pdf"))
+        .unwrap();
+    assert_eq!(item2.kind, IngestItemKind::Updated);
+    assert_eq!(item2.doc_id, item1.doc_id);
+}
+
+/// Edit a PDF (replace bytes) → different blake3 → different asset_id
+/// → different doc_id → `new+=1` for the new doc_id; first-pass row
+/// remains untouched.
+///
+/// **Currently `#[ignore]`** — exposes a storage-layer bug discovered
+/// by this PR: `assets.workspace_path` carries a UNIQUE constraint and
+/// `upsert_asset_row` only handles `ON CONFLICT(asset_id)`, so the
+/// second insert (new `asset_id` for the edited bytes, same
+/// `workspace_path`) trips constraint 2067. Affects markdown / image /
+/// PDF paths equally; no test exercised it before P7-3. Logged in
+/// `tasks/HOTFIXES.md` for a P+ storage-layer fix.
+#[test]
+#[ignore = "exposes storage-layer assets.workspace_path UNIQUE bug — see HOTFIXES 2026-05-02 P7-3"]
+fn re_ingest_edited_pdf_produces_new_doc_id() {
+    let env = TestEnv::lexical_only();
+    let path = env.workspace_root.join("evolving.pdf");
+    let bytes_v1 = build_text_pdf(&[Some("version one body")]);
+    std::fs::write(&path, &bytes_v1).unwrap();
+    let cfg = cfg_with_pdf(&env);
+
+    let report_v1 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let id_v1 = report_v1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("evolving.pdf"))
+        .unwrap()
+        .doc_id
+        .clone()
+        .unwrap();
+
+    let bytes_v2 =
+        build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
+    std::fs::write(&path, &bytes_v2).unwrap();
+
+    let report_v2 =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let item_v2 = report_v2
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("evolving.pdf"))
+        .unwrap();
+    assert_eq!(
+        item_v2.kind,
+        IngestItemKind::New,
+        "edited PDF gets a new asset_id → new doc_id → counted as New"
+    );
+    assert_ne!(item_v2.doc_id.as_ref().unwrap().0, id_v1.0);
+}
+
+/// Encrypted PDF → asset NOT stored; errors+=1; IngestItem.error
+/// preserves the qpdf decrypt hint from kebab-parse-pdf verbatim.
+#[test]
+fn encrypted_pdf_fails_with_qpdf_hint() {
+    let env = TestEnv::lexical_only();
+    let bytes = make_encrypted_pdf();
+    write_pdf(&env.workspace_root, "secret.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    assert!(report.errors >= 1, "encrypted PDF must increment errors");
+    let items = report.items.as_ref().unwrap();
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("secret.pdf"))
+        .expect("encrypted PDF item present");
+    assert_eq!(pdf_item.kind, IngestItemKind::Error);
+    let err = pdf_item.error.as_ref().expect("error field set");
+    assert!(
+        err.contains("encrypted"),
+        "error mentions encryption: {err}"
+    );
+    assert!(
+        err.contains("qpdf") || err.contains("decrypt"),
+        "error preserves remediation hint: {err}"
+    );
+}
+
+/// Corrupt header PDF → asset NOT stored; errors+=1.
+#[test]
+fn corrupt_pdf_fails_without_storing() {
+    let env = TestEnv::lexical_only();
+    let bytes = corrupt_pdf();
+    write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert!(report.errors >= 1);
+    let items = report.items.as_ref().unwrap();
+    let pdf_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("corrupt.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.kind, IngestItemKind::Error);
+
+    // Confirm the doc was NOT stored — list_docs returns nothing for
+    // this path.
+    let summaries = kebab_app::list_docs_with_config(
+        cfg,
+        kebab_core::DocFilter::default(),
+    )
+    .unwrap();
+    assert!(
+        !summaries
+            .iter()
+            .any(|s| s.doc_path.0.ends_with("corrupt.pdf")),
+        "corrupt PDF must not have a stored doc row"
+    );
+}
+
+/// Mixed page PDF (text page 1, empty page 2, text page 3) → asset
+/// stored; 2 chunks (pages 1 + 3); doc.provenance.events contains the
+/// page-2 Warning emitted by kebab-parse-pdf.
+#[test]
+fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
+    let env = TestEnv::lexical_only();
+    let bytes =
+        build_text_pdf(&[Some("first page"), None, Some("third page")]);
+    write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("mixed.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.kind, IngestItemKind::New);
+    assert_eq!(
+        pdf_item.block_count,
+        Some(3),
+        "still 3 blocks (P7-1 emits empty Block::Paragraph for the empty page)"
+    );
+    assert_eq!(
+        pdf_item.chunk_count,
+        Some(2),
+        "pdf-page-v1 emits 0 chunks for the empty page; total = 2"
+    );
+
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .unwrap();
+    let warnings: Vec<_> = doc
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
+        .collect();
+    assert_eq!(
+        warnings.len(),
+        1,
+        "exactly one Warning event for the empty page"
+    );
+    let note = warnings[0].note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("page2") && note.contains("scanned candidate"),
+        "Warning note marks page 2 as scanned candidate: {note}"
+    );
+}
+
+/// IngestReport invariant `scanned == new + updated + skipped + errors`
+/// when ingesting a mixed corpus including a corrupt PDF.
+#[test]
+fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
+    let env = TestEnv::lexical_only();
+    write_pdf(
+        &env.workspace_root,
+        "good.pdf",
+        &build_text_pdf(&[Some("ok body")]),
+    );
+    write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    let total = report.new + report.updated + report.skipped + report.errors;
+    assert_eq!(
+        report.scanned, total,
+        "invariant: scanned ({}) == new ({}) + updated ({}) + skipped ({}) + errors ({})",
+        report.scanned, report.new, report.updated, report.skipped, report.errors
+    );
+    // Sanity: 1 good (new) + 1 broken (error) = 2 scanned for our PDFs;
+    // markdown fixtures already in the workspace add to scanned/new
+    // alike, so we only assert the invariant rather than absolute counts.
+}
+
+/// 50-page PDF → ≥50 chunks (≥1 per page); ingest completes; storage
+/// round-trips. Vector embedding is disabled in the lexical-only env
+/// so this exercises the SQLite write path only.
+#[test]
+fn long_pdf_round_trips_through_lexical_pipeline() {
+    let env = TestEnv::lexical_only();
+    let pages: Vec<String> = (1..=50)
+        .map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
+        .collect();
+    let page_refs: Vec<Option<&str>> =
+        pages.iter().map(|s| Some(s.as_str())).collect();
+    let bytes = build_text_pdf(&page_refs);
+    write_pdf(&env.workspace_root, "long.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(report.errors, 0);
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("long.pdf"))
+        .unwrap();
+    assert_eq!(pdf_item.block_count, Some(50));
+    assert!(
+        pdf_item.chunk_count.unwrap() >= 50,
+        "chunk_count={:?} should be ≥50",
+        pdf_item.chunk_count
+    );
+
+    // Round-trip: list_docs sees the long PDF.
+    let summaries =
+        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
+            .unwrap();
+    assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
+}
+
+/// `kebab inspect doc <pdf_doc_id>` returns the PDF CanonicalDocument
+/// with per-page Block::Paragraph + SourceSpan::Page intact.
+#[test]
+fn inspect_doc_surfaces_page_spans() {
+    let env = TestEnv::lexical_only();
+    let bytes =
+        build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
+    write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
+    let cfg = cfg_with_pdf(&env);
+
+    let report =
+        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let pdf_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("inspect.pdf"))
+        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(
+        cfg,
+        pdf_item.doc_id.as_ref().unwrap(),
+    )
+    .unwrap();
+    assert_eq!(doc.parser_version.0, "pdf-text-v1");
+    assert_eq!(doc.blocks.len(), 3);
+    for block in &doc.blocks {
+        match block {
+            Block::Paragraph(p) => assert!(matches!(
+                p.common.source_span,
+                SourceSpan::Page { .. }
+            )),
+            other => panic!("expected Paragraph, got {other:?}"),
+        }
+    }
+}
diff --git a/docs/SMOKE.md b/docs/SMOKE.md
index 3a17012..bc5a1f2 100644
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -151,7 +151,26 @@ max_pixels = 768
 prompt_template_version = "caption-v1"
 ```
 
-이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후).
+이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로.
+
+## P7-3 PDF ingestion
+
+`config.toml` 의 `[workspace] include` 에 `**/*.pdf` 를 추가하면 `kebab ingest` 가 텍스트 PDF 자산도 색인합니다. 외부 service 의존 없음 — `kebab-parse-pdf` 가 lopdf 로 페이지 단위 텍스트 추출, `kebab-chunk::PdfPageV1Chunker` 가 페이지 경계를 절대 넘지 않는 chunk 생성.
+
+```toml
+[workspace]
+include = ["**/*.md", "**/*.pdf"]
+```
+
+PDF 한 권당 페이지 수만큼 (또는 페이지 텍스트가 길면 그 이상의) chunk 가 한 transaction 안에서 commit. 검색 결과의 `chunk.source_spans[0]` 가 `Page { page, char_start, char_end }` 형태라 인용 시 페이지 번호가 그대로 사용 가능.
+
+```bash
+kebab --config /tmp/kebab-smoke/config.toml ingest
+kebab --config /tmp/kebab-smoke/config.toml search --mode hybrid "<본문 단어>"
+kebab --config /tmp/kebab-smoke/config.toml inspect doc "<pdf_doc_id>"
+```
+
+암호화 PDF (예: DRM 책) → `errors+=1`, `error` 필드에 `qpdf --decrypt` 안내. 빈/스캔 페이지 (텍스트 추출 실패) → 0 chunk + `Provenance::Warning` ("scanned candidate"). v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 까지 대기.
 
 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작.
 
@@ -165,6 +184,7 @@ prompt_template_version = "caption-v1"
 - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`.
 - (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc <image_doc_id>` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical "<OCR text>"` 가 그 image chunk 를 반환하면 wiring 정상.
 - OCR / caption 부분 실패는 `errors` 카운터 미증가 — `kebab inspect doc <id>` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인.
+- (P7-3) `*.pdf` 자산을 워크스페이스에 두면 `kebab ingest` 출력에 PDF 도 `new` 카운터에 포함. `kebab inspect doc <pdf_doc_id>` 가 `parser_version = "pdf-text-v1"` + 페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }`. 본문에 등장하는 단어로 `kebab search --mode hybrid` 시 PDF chunk 가 결과에 포함되고 `source_span.kind = "page"` 면 wiring 정상. 암호화 PDF 는 `errors+=1` 로 분류되며 `error` 필드에 `qpdf --decrypt` 안내 보존. 빈/스캔 페이지 (PDF 가 텍스트를 추출하지 못한 페이지) 는 0 chunk + `Provenance::Warning` ("scanned candidate") 로 표시 — P+ scanned-PDF OCR fallback 까지는 검색 불가.
 
 ## 정리
 
@@ -181,6 +201,9 @@ rm -rf /tmp/kebab-smoke              # 통째로 정리
 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초.
 - `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작).
 - 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init.
-- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후).
+- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장.
+- (P7-3) `config.chunking.chunker_version` 는 markdown 만 represent — PDF 자산은 `pdf-page-v1` 하드코딩. `config.toml` 의 `chunker_version = "md-heading-v1"` 을 봐도 PDF 는 영향 안 받음. HOTFIXES `2026-05-02 P7-3` entry 참조 (P+ chunker registry task 까지 유지).
+- (P7-3) 한 PDF 가 N 페이지면 `kebab ingest` 가 N 개 (또는 그 이상의, 페이지 길면 multi-chunk) 의 chunk 를 한 transaction 안에서 commit. 500 페이지 책 → 500+ chunk 한 번에 → embedding throughput 가 bottleneck. 임베딩 활성 워크스페이스에서 큰 PDF 를 처음 ingest 하면 분-단위 시간 + WAL 크기 증가 가능 — P+ 스케일 hardening task 까지 정상 동작이지만 비용은 측정 가능.
+- (P7-3) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 storage UNIQUE 제약 (`assets.workspace_path`) 에 트립 → `errors+=1`. md / image / pdf 모두 동일하지만 P7-3 의 통합 테스트가 처음 노출. 우회: 파일 path 변경 후 ingest. 영구 fix 는 P+ storage 작업.
 
 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조.
diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md
index ed69f17..4c4368f 100644
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -14,6 +14,26 @@ historical contract that was implemented; this file accumulates the
 deltas so phase 5+ readers can find the live behavior without diffing
 git history.
 
+## 2026-05-02 — P7-3 PDF ingest wiring: chunker_version deviation + storage UNIQUE bug
+
+**Discovered**: P7-3 implementation start.
+
+**Symptom 1 (deviation, intentional)**: `tasks/p7/p7-3-pdf-ingest-wiring.md` § Chunker selection notes that `config.chunking.chunker_version` is single-valued and serves the markdown path only. PDF ingest hard-codes `pdf-page-v1` regardless of the config value. A user who reads `config.toml` and sees `chunker_version = "md-heading-v1"` reasonably assumes PDFs use the same — they don't.
+
+**Fix 1**: `ingest_one_pdf_asset` (in `kebab-app::lib.rs`) instantiates `PdfPageV1Chunker` directly. The `Chunk.chunker_version` field on emitted PDF chunks records `pdf-page-v1` truthfully. A future P+ task (chunker registry) either splits `Config::chunking.chunker_version` per medium or replaces the dispatch with a runtime registry. No HOTFIX entry needed once that happens — this entry is the cross-reference.
+
+**Symptom 2 (storage-layer bug, exposed but not fixed by P7-3)**: P7-3's edited-bytes re-ingest test (`re_ingest_edited_pdf_produces_new_doc_id`) tripped on `sqlite error: UNIQUE constraint failed: assets.workspace_path: Error code 2067`. The assets table has a UNIQUE constraint on `workspace_path`, but `upsert_asset_row` (in `kebab-store-sqlite::store.rs:305`) only handles `ON CONFLICT(asset_id)`. When a file's bytes change, the new BLAKE3 produces a new `asset_id` while the `workspace_path` stays the same — INSERT picks the new asset_id branch, then trips the secondary UNIQUE on workspace_path.
+
+**Why it didn't surface earlier**: No existing test (markdown / image) exercises edited-bytes re-ingest. The image path's `re_ingest_image_produces_updated_with_same_doc_id` uses identical bytes (same asset_id → ON CONFLICT(asset_id) catches it). Real-world editing of a tracked file would hit the same bug across all media types.
+
+**Fix 2 (deferred)**: Storage-layer fix is out of scope for P7-3. The P7-3 implementation PR `#[ignore]`s the `re_ingest_edited_pdf_produces_new_doc_id` test with a doc-comment pointing here. A P+ storage task either:
+- Adds `ON CONFLICT(workspace_path) DO UPDATE` alongside the existing `ON CONFLICT(asset_id)` clause (DELETE-the-old + INSERT-the-new in a single statement, since UPSERT can only target one conflict path).
+- Or drops the UNIQUE constraint on `assets.workspace_path` and relies on application-level uniqueness (workspace_path → asset_id mapping in a separate index table).
+
+**Amends**:
+- tasks/p7/p7-3-pdf-ingest-wiring.md (chunker_version deviation, edited-bytes test ignored).
+- (Implicitly) every previous task spec that assumed `assets.workspace_path` UNIQUE was safe — the constraint is in fact too strict for the byte-edit re-ingest case.
+
 ## 2026-05-02 — P7-2 pdf-page-v1: chunk_id collision + BYTES_PER_TOKEN
 
 **Discovered**: P7-2 implementation start.
diff --git a/tasks/p7/p7-3-pdf-ingest-wiring.md b/tasks/p7/p7-3-pdf-ingest-wiring.md
index a96371a..f13eddb 100644
--- a/tasks/p7/p7-3-pdf-ingest-wiring.md
+++ b/tasks/p7/p7-3-pdf-ingest-wiring.md
@@ -3,7 +3,7 @@ phase: P7
 component: kebab-app (PDF ingest dispatch + chunker selection)
 task_id: p7-3
 title: "Wire PdfTextExtractor + PdfPageV1Chunker into kebab-app::ingest end-to-end"
-status: planned
+status: completed
 depends_on: [p7-1, p7-2, p1-6, p3-5, p6-4]
 unblocks: []
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
-- 
2.49.1


From 4ad4ef271eae0e6ebf9e58be6bc307d3e3c29fdb Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 09:31:55 +0000
Subject: [PATCH 2/3] =?UTF-8?q?review(p7-3):=20=ED=9A=8C=EC=B0=A8=201=20?=
 =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `IngestItem.warnings` 가 PDF path 에서 빈 vec 였던 갭 해소. P7-1 의
  Provenance Warning (scanned candidate / extract panic 흡수) 노트들을
  `IngestItem.warnings` 로 surface — md path 의 `fm_warns + blk_warns`
  patten 과 평행. 사용자가 ingest summary 에서 "이 PDF page 2 가 스캔
  이라 검색 불가" 를 즉시 확인 가능.
- `mixed_page_pdf_stores_asset_with_scanned_candidate_warning` 에
  `IngestItem.warnings` 단정 추가 (정확히 1건 + 노트 내용 검증).
- `encrypted_pdf` / `corrupt_pdf` 테스트의 `errors >= 1` → `errors == 1`
  strict 단정. 미래에 다른 source 가 errors 늘리면 즉시 빨개짐.
- `re_ingest_identical_pdf` 에 `chunk_count` 동일성 단정 추가. P1
  idempotency contract 의 chunk-단위 axis 검증 (chunk_id 전체 set 비교는
  pdf-page-v1 의 `deterministic_chunk_ids_1000` 가 잠그고 있어 chunk_count
  가 가벼운 proxy 로 충분).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kebab-app/src/lib.rs            | 15 +++++++++++++-
 crates/kebab-app/tests/pdf_pipeline.rs | 28 ++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 026a5d5..7ee324e 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -1084,6 +1084,19 @@ fn ingest_one_pdf_asset(
         kebab_core::IngestItemKind::New
     };
 
+    // Surface every `Provenance::Warning` note onto `IngestItem.warnings`
+    // so the ingest summary shows partial-success signals (e.g. "page 2
+    // empty (scanned candidate)") without forcing the operator into
+    // `kebab inspect doc <id>`. Mirrors how the markdown path threads
+    // frontmatter / block warnings up to the same field.
+    let warnings: Vec<String> = canonical
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
+        .filter_map(|e| e.note.clone())
+        .collect();
+
     Ok(kebab_core::IngestItem {
         kind,
         doc_id: Some(canonical.doc_id.clone()),
@@ -1094,7 +1107,7 @@ fn ingest_one_pdf_asset(
         chunk_count: u32::try_from(chunks.len()).ok(),
         parser_version: Some(canonical.parser_version.clone()),
         chunker_version: Some(chunker.chunker_version()),
-        warnings: Vec::new(),
+        warnings,
         error: None,
     })
 }
diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs
index 12accf1..f5cbea2 100644
--- a/crates/kebab-app/tests/pdf_pipeline.rs
+++ b/crates/kebab-app/tests/pdf_pipeline.rs
@@ -218,6 +218,15 @@ fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() {
         .unwrap();
     assert_eq!(item2.kind, IngestItemKind::Updated);
     assert_eq!(item2.doc_id, item1.doc_id);
+    // P1 idempotency contract: identical bytes → identical chunk set.
+    // Comparing `chunk_count` as a proxy (full chunk_id set comparison
+    // would need direct sqlite access; the per-chunk #c{char_start}
+    // hash variant in pdf-page-v1 is already tested for stability in
+    // `kebab-chunk::pdf_page_v1::deterministic_chunk_ids_1000`).
+    assert_eq!(
+        item1.chunk_count, item2.chunk_count,
+        "identical bytes must produce identical chunk count"
+    );
 }
 
 /// Edit a PDF (replace bytes) → different blake3 → different asset_id
@@ -285,7 +294,7 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
 
     let report =
         kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
-    assert!(report.errors >= 1, "encrypted PDF must increment errors");
+    assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
     let items = report.items.as_ref().unwrap();
     let pdf_item = items
         .iter()
@@ -313,7 +322,7 @@ fn corrupt_pdf_fails_without_storing() {
 
     let report =
         kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
-    assert!(report.errors >= 1);
+    assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
     let items = report.items.as_ref().unwrap();
     let pdf_item = items
         .iter()
@@ -390,6 +399,21 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
         note.contains("page2") && note.contains("scanned candidate"),
         "Warning note marks page 2 as scanned candidate: {note}"
     );
+
+    // R1: Warning notes also surface on `IngestItem.warnings` so
+    // operators can see the partial-success signal in the ingest
+    // summary without `kebab inspect doc`.
+    assert_eq!(
+        pdf_item.warnings.len(),
+        1,
+        "exactly one warning surfaced on IngestItem"
+    );
+    assert!(
+        pdf_item.warnings[0].contains("page2")
+            && pdf_item.warnings[0].contains("scanned candidate"),
+        "IngestItem.warnings preserves the Provenance Warning note: {:?}",
+        pdf_item.warnings
+    );
 }
 
 /// IngestReport invariant `scanned == new + updated + skipped + errors`
-- 
2.49.1


From 3a57cab1eb01f596d0def058e3dc6a1224193f90 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 11:41:23 +0000
Subject: [PATCH 3/3] fix(kebab-store-sqlite): purge stale assets row on
 workspace_path orphan + smoke
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P7-3 통합 테스트가 노출한 storage 레이어 버그 fix.
`assets.workspace_path` 의 UNIQUE 제약과 `upsert_asset_row` 의
`ON CONFLICT(asset_id)` 만 처리하던 gap 사이 — byte 가 변경된 자산
re-ingest 시 새 asset_id 가 같은 workspace_path 에서 secondary UNIQUE
충돌. md / image / pdf 모두 영향.

Fix:
- 새 helper `purge_orphan_at_workspace_path` 가 같은 `workspace_path`
  의 *다른* `asset_id` 를 발견하면 documents → assets 순서로 sweep.
  documents 의 ON DELETE RESTRICT 회피 + CASCADE 로 blocks / chunks /
  embedding_records 정리. copied 모드면 storage_path 의 byte 파일도
  best-effort 삭제.
- `put_asset_with_bytes` 의 두 분기 (copy / reference) + `DocumentStore
  ::put_asset` 모두 호출.
- 회귀 테스트 `put_asset_with_bytes_sweeps_workspace_path_orphan` (이전
  의 "UPSERT 실패시 orphan 청소" 테스트가 더 이상 doable 하지 않으므로
  대체).
- `re_ingest_edited_pdf_produces_new_doc_id` integration `#[ignore]` 해제 →
  9 통합 테스트 모두 default 로 통과.

Vector store orphan 은 별도 P+ task — LanceDB 가 SQLite cascade 와 무관하게
운영되므로 stale chunk_id vector 가 디스크에 남음. 검색에는 영향 없음 (search 가
SQLite join 통해 surface).

Smoke 검증 (release binary, markdown 2 + image 1 + PDF 2):
- doctor pass
- 첫 ingest: 5 new
- list docs: 5 docs all media types
- search lexical "pdf-page-v1 chunker" → whitepaper.pdf hit
- search hybrid → cross-media 결과
- inspect doc PDF: parser_version=pdf-text-v1, blocks 가 SourceSpan::Page
- 동일 byte re-ingest: 5 updated, 0 errors (P1 idempotency)
- byte 수정 후 re-ingest: 1 new (해당 PDF) + 4 updated, 0 errors (storage fix)
- corrupt PDF 추가: errors+=1 + IngestItem.error 메시지 정확, 다른 자산 영향 0
- 정리 후 다시 ingest: errors=0
- RAG ask: PDF 인용 + `citations[].citation` 에 `kind: "page"` + `page: <N>` +
  `path: <pdf_path>` 정확히 노출

운영 fixture 보조:
- `crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs` — `cargo run --release
  --example gen_smoke_pdf -p kebab-parse-pdf -- <out.pdf> <text-pages>` 로
  reportlab/qpdf 없이 in-tree PDF 생성.
- `crates/kebab-parse-image/examples/gen_smoke_png.rs` — 동일 방식의 PNG
  fixture 생성.
- SMOKE.md 가 두 example 사용법 + 갱신된 HOTFIXES 동작 (byte 수정 시
  errors+=1 → new+=1) 반영.

HOTFIXES `2026-05-02 P7-3` entry 가 \"deferred\" → \"fixed in same PR\" 로
업데이트, vector store orphan caveat 만 남음.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kebab-app/tests/pdf_pipeline.rs        | 15 +---
 .../examples/gen_smoke_png.rs                 | 21 +++++
 .../kebab-parse-pdf/examples/gen_smoke_pdf.rs | 83 ++++++++++++++++++
 crates/kebab-store-sqlite/src/documents.rs    |  9 +-
 crates/kebab-store-sqlite/src/store.rs        | 87 ++++++++++++++++++-
 .../kebab-store-sqlite/tests/asset_writer.rs  | 69 +++++++--------
 docs/SMOKE.md                                 | 17 +++-
 tasks/HOTFIXES.md                             | 22 +++--
 8 files changed, 267 insertions(+), 56 deletions(-)
 create mode 100644 crates/kebab-parse-image/examples/gen_smoke_png.rs
 create mode 100644 crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs

diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs
index f5cbea2..365f42e 100644
--- a/crates/kebab-app/tests/pdf_pipeline.rs
+++ b/crates/kebab-app/tests/pdf_pipeline.rs
@@ -230,18 +230,11 @@ fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() {
 }
 
 /// Edit a PDF (replace bytes) → different blake3 → different asset_id
-/// → different doc_id → `new+=1` for the new doc_id; first-pass row
-/// remains untouched.
-///
-/// **Currently `#[ignore]`** — exposes a storage-layer bug discovered
-/// by this PR: `assets.workspace_path` carries a UNIQUE constraint and
-/// `upsert_asset_row` only handles `ON CONFLICT(asset_id)`, so the
-/// second insert (new `asset_id` for the edited bytes, same
-/// `workspace_path`) trips constraint 2067. Affects markdown / image /
-/// PDF paths equally; no test exercised it before P7-3. Logged in
-/// `tasks/HOTFIXES.md` for a P+ storage-layer fix.
+/// → different doc_id → `new+=1` for the new doc_id; stale doc /
+/// asset / chunk / embedding rows for the prior bytes are swept by
+/// `purge_orphan_at_workspace_path` (HOTFIXES 2026-05-02 P7-3 storage
+/// fix shipped alongside this test).
 #[test]
-#[ignore = "exposes storage-layer assets.workspace_path UNIQUE bug — see HOTFIXES 2026-05-02 P7-3"]
 fn re_ingest_edited_pdf_produces_new_doc_id() {
     let env = TestEnv::lexical_only();
     let path = env.workspace_root.join("evolving.pdf");
diff --git a/crates/kebab-parse-image/examples/gen_smoke_png.rs b/crates/kebab-parse-image/examples/gen_smoke_png.rs
new file mode 100644
index 0000000..1e14d42
--- /dev/null
+++ b/crates/kebab-parse-image/examples/gen_smoke_png.rs
@@ -0,0 +1,21 @@
+//! `cargo run --example gen_smoke_png -p kebab-parse-image -- <out.png>`
+//!
+//! 100×50 solid-red PNG used by the SMOKE runbook for the image
+//! filename-indexing path (OCR / caption disabled).
+
+use image::{ImageBuffer, Rgb};
+use std::io::Cursor;
+
+fn main() {
+    let out = std::env::args()
+        .nth(1)
+        .expect("usage: gen_smoke_png <out.png>");
+    let img: ImageBuffer<Rgb<u8>, _> =
+        ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
+    let mut buf = Cursor::new(Vec::new());
+    img.write_to(&mut buf, image::ImageFormat::Png)
+        .expect("encode PNG");
+    let bytes = buf.into_inner();
+    std::fs::write(&out, &bytes).expect("write");
+    println!("wrote {} ({} bytes)", out, bytes.len());
+}
diff --git a/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs b/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs
new file mode 100644
index 0000000..450273c
--- /dev/null
+++ b/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs
@@ -0,0 +1,83 @@
+//! `cargo run --example gen_smoke_pdf -p kebab-parse-pdf -- <out.pdf> <text-page-1> [<text-page-N> ...]`
+//!
+//! Tiny generator used by the SMOKE runbook to produce text PDFs without
+//! adding `reportlab` / `qpdf` to the dev-machine prerequisites. Mirrors
+//! the `tests/common::build_text_pdf` builder.
+
+use lopdf::content::{Content, Operation};
+use lopdf::{Document, Object, Stream, dictionary};
+use std::fs::File;
+use std::io::Write;
+
+fn main() {
+    let mut args = std::env::args().skip(1);
+    let out = args.next().expect("usage: gen_smoke_pdf <out.pdf> <text...>");
+    let pages: Vec<String> = args.collect();
+    if pages.is_empty() {
+        eprintln!("at least one page text required");
+        std::process::exit(2);
+    }
+
+    let mut doc = Document::with_version("1.5");
+    let pages_id = doc.new_object_id();
+    let font_id = doc.add_object(dictionary! {
+        "Type" => "Font",
+        "Subtype" => "Type1",
+        "BaseFont" => "Helvetica",
+    });
+    let resources_id = doc.add_object(dictionary! {
+        "Font" => dictionary! { "F1" => font_id },
+    });
+    let mut page_refs: Vec<Object> = Vec::new();
+    for text in &pages {
+        let content = Content {
+            operations: vec![
+                Operation::new("BT", vec![]),
+                Operation::new("Tf", vec!["F1".into(), 14.into()]),
+                Operation::new(
+                    "Td",
+                    vec![Object::Integer(72), Object::Integer(720)],
+                ),
+                Operation::new("Tj", vec![Object::string_literal(text.as_str())]),
+                Operation::new("ET", vec![]),
+            ],
+        };
+        let stream_data = content.encode().expect("content encode");
+        let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
+        let page_id = doc.add_object(dictionary! {
+            "Type" => "Page",
+            "Parent" => pages_id,
+            "Contents" => content_id,
+        });
+        page_refs.push(page_id.into());
+    }
+
+    let count = page_refs.len() as i64;
+    let pages_dict = dictionary! {
+        "Type" => "Pages",
+        "Kids" => page_refs,
+        "Count" => count,
+        "Resources" => resources_id,
+        "MediaBox" => vec![
+            Object::Integer(0),
+            Object::Integer(0),
+            Object::Integer(595),
+            Object::Integer(842),
+        ],
+    };
+    doc.objects
+        .insert(pages_id, Object::Dictionary(pages_dict));
+    let catalog_id = doc.add_object(dictionary! {
+        "Type" => "Catalog",
+        "Pages" => pages_id,
+    });
+    doc.trailer.set("Root", catalog_id);
+
+    let mut buf: Vec<u8> = Vec::new();
+    doc.save_to(&mut buf).expect("save PDF");
+    File::create(&out)
+        .expect("create out")
+        .write_all(&buf)
+        .expect("write");
+    println!("wrote {} ({} bytes, {} pages)", out, buf.len(), pages.len());
+}
diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs
index b321e12..14b62ad 100644
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -16,7 +16,9 @@ use rusqlite::params;
 use time::OffsetDateTime;
 
 use crate::error::StoreError;
-use crate::store::{SqliteStore, upsert_asset_row, validate_asset_id};
+use crate::store::{
+    SqliteStore, purge_orphan_at_workspace_path, upsert_asset_row, validate_asset_id,
+};
 
 impl kebab_core::DocumentStore for SqliteStore {
     fn put_asset(&self, asset: &kebab_core::RawAsset) -> Result<()> {
@@ -38,6 +40,11 @@ impl kebab_core::DocumentStore for SqliteStore {
             }
         };
         let conn = self.lock_conn();
+        purge_orphan_at_workspace_path(
+            &conn,
+            &asset.workspace_path.0,
+            &asset.asset_id.0,
+        )?;
         upsert_asset_row(&conn, asset, storage_kind, &storage_path)
     }
 
diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs
index 0ac1408..b2455fa 100644
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -11,7 +11,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Mutex, MutexGuard};
 
 use anyhow::{Context, Result};
-use rusqlite::Connection;
+use rusqlite::{Connection, OptionalExtension, params};
 
 use crate::error::StoreError;
 use crate::schema;
@@ -208,6 +208,11 @@ impl SqliteStore {
                 // in place.
                 {
                     let conn = self.lock_conn();
+                    purge_orphan_at_workspace_path(
+                        &conn,
+                        &asset.workspace_path.0,
+                        &asset.asset_id.0,
+                    )?;
                     upsert_asset_row(
                         &conn,
                         asset,
@@ -243,6 +248,11 @@ impl SqliteStore {
                 kebab_core::SourceUri::Kb(u) => u.clone(),
             };
             let conn = self.lock_conn();
+            purge_orphan_at_workspace_path(
+                &conn,
+                &asset.workspace_path.0,
+                &asset.asset_id.0,
+            )?;
             upsert_asset_row(&conn, asset, "reference", &storage_path)?;
             Ok(())
         }
@@ -298,6 +308,81 @@ fn temp_path_for(dest: &Path) -> PathBuf {
     parent.join(format!("{file_name}.tmp.{pid}.{n}"))
 }
 
+/// Sweep stale `assets` + `documents` + downstream rows when the file
+/// at `workspace_path` is being re-ingested with bytes that produce a
+/// **different** `asset_id` (i.e. the file was edited).
+///
+/// Why this exists (HOTFIXES 2026-05-02 P7-3): `idx_assets_workspace_path`
+/// is a UNIQUE index. The original `upsert_asset_row` only handles
+/// `ON CONFLICT(asset_id)`, so a brand-new `asset_id` colliding on
+/// `workspace_path` raises `Error code 2067` and the ingest fails. This
+/// helper does the cleanup `ON CONFLICT(workspace_path)` would have done
+/// if SQLite let UPSERT target two indexes at once.
+///
+/// Order matters:
+/// 1. `documents.asset_id` is `ON DELETE RESTRICT`, so we must drop the
+///    old `documents` rows first. CASCADE on documents → blocks /
+///    chunks / embedding_records sweeps the dependent rows in the same
+///    statement.
+/// 2. Then DELETE the stale `assets` row, freeing the
+///    `workspace_path` slot for the new one.
+/// 3. If the stale asset was stored in `copied` mode, best-effort
+///    delete the on-disk byte file at `storage_path` so the data dir
+///    doesn't accumulate orphans across edits.
+///
+/// **Caveat — vector store orphans.** `embedding_records.chunk_id`
+/// CASCADE clears the SQLite side, but the LanceDB rows keyed on
+/// `chunk_id` live in a separate store and are not touched here.
+/// Stale vectors do not affect retrieval (search joins through
+/// SQLite, so an orphan vector is never surfaced) but they consume
+/// disk in `data_dir/lancedb/`. A future task should reconcile by
+/// `chunk_id` set diff. Tracked alongside this entry in HOTFIXES.
+pub(crate) fn purge_orphan_at_workspace_path(
+    conn: &Connection,
+    workspace_path: &str,
+    new_asset_id: &str,
+) -> Result<()> {
+    let stale: Option<(String, String, String)> = conn
+        .query_row(
+            "SELECT asset_id, storage_kind, storage_path
+             FROM assets
+             WHERE workspace_path = ? AND asset_id != ?",
+            params![workspace_path, new_asset_id],
+            |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
+        )
+        .optional()
+        .map_err(StoreError::from)?;
+
+    let Some((stale_asset_id, storage_kind, storage_path)) = stale else {
+        return Ok(());
+    };
+
+    // documents → blocks / chunks / embedding_records via CASCADE.
+    conn.execute(
+        "DELETE FROM documents WHERE asset_id = ?",
+        params![stale_asset_id],
+    )
+    .map_err(StoreError::from)?;
+    conn.execute(
+        "DELETE FROM assets WHERE asset_id = ?",
+        params![stale_asset_id],
+    )
+    .map_err(StoreError::from)?;
+
+    if storage_kind == "copied" {
+        let _ = std::fs::remove_file(&storage_path);
+    }
+
+    tracing::debug!(
+        target: "kebab-store-sqlite",
+        workspace_path = %workspace_path,
+        stale_asset_id = %stale_asset_id,
+        new_asset_id = %new_asset_id,
+        "purged stale asset (file edited; bytes changed)"
+    );
+    Ok(())
+}
+
 /// UPSERT a row into `assets`. Used by both the `put_asset_with_bytes`
 /// path (which has bytes + computed `storage_kind/path`) and the
 /// `DocumentStore::put_asset` path (which only has the `RawAsset` and
diff --git a/crates/kebab-store-sqlite/tests/asset_writer.rs b/crates/kebab-store-sqlite/tests/asset_writer.rs
index 54f6268..8eac8e6 100644
--- a/crates/kebab-store-sqlite/tests/asset_writer.rs
+++ b/crates/kebab-store-sqlite/tests/asset_writer.rs
@@ -100,22 +100,23 @@ fn reference_mode_does_not_write_file_but_records_path() {
 }
 
 #[test]
-fn put_asset_with_bytes_orphan_cleanup_on_upsert_failure() {
-    // Goal: prove that if the row UPSERT fails AFTER the bytes have been
-    // staged on disk, no `<aa>/<asset_id>` file is left behind.
+fn put_asset_with_bytes_sweeps_workspace_path_orphan() {
+    // HOTFIXES 2026-05-02 P7-3: the original behaviour erred on
+    // workspace_path UNIQUE conflict (`ON CONFLICT(asset_id)` only) so
+    // a re-ingest of an edited file was unrecoverable. The fix is
+    // `purge_orphan_at_workspace_path`, which sweeps the stale
+    // documents → assets chain before the new INSERT lands.
     //
-    // Lever: the `assets` table has a UNIQUE INDEX on `workspace_path`
-    // (V001), but the UPSERT is `ON CONFLICT(asset_id)`. So if some other
-    // row already owns this `workspace_path`, the INSERT half of the
-    // UPSERT trips a UNIQUE constraint that the ON CONFLICT clause does
-    // NOT handle — UPSERT errors. The new asset's bytes were already
-    // staged; we assert they are NOT visible at the final destination.
+    // This test exercises the no-documents flavour (raw asset row only)
+    // — the put_asset_with_bytes path. The documents-cascade flavour
+    // is exercised end-to-end in `kebab-app::tests::pdf_pipeline::
+    // re_ingest_edited_pdf_produces_new_doc_id`.
     let env = common::TestEnv::with_threshold(100);
     let store = SqliteStore::open(&env.config()).unwrap();
     store.run_migrations().unwrap();
 
-    // Pre-populate a row that owns `notes/foo.md` (the workspace_path our
-    // fixture asset will also claim) under a *different* asset_id.
+    // Pre-populate a row that owns `notes/foo.md` under a *different*
+    // asset_id, simulating a prior ingest of an earlier byte version.
     env.with_conn(|c| {
         c.execute(
             "INSERT INTO assets (
@@ -140,36 +141,36 @@ fn put_asset_with_bytes_orphan_cleanup_on_upsert_failure() {
     let cs = b3_full_hex(bytes);
     let asset = fixed_asset(bytes, bytes.len() as u64, &cs);
 
-    let err = store
+    store
         .put_asset_with_bytes(&asset, bytes)
-        .expect_err("UPSERT must fail on workspace_path UNIQUE violation");
-    let msg = format!("{err:#}");
-    assert!(
-        msg.to_lowercase().contains("unique") || msg.to_lowercase().contains("constraint"),
-        "expected UNIQUE constraint failure, got: {msg}"
-    );
+        .expect("orphan sweep + INSERT must succeed");
 
-    // Final destination must NOT exist (no orphan).
+    // Stale row gone, new row owns the workspace_path.
+    let stale_count: i64 = env.with_conn(|c| {
+        c.query_row(
+            "SELECT COUNT(*) FROM assets WHERE asset_id = ?",
+            rusqlite::params!["b".repeat(32)],
+            |row| row.get(0),
+        )
+    });
+    assert_eq!(stale_count, 0, "stale asset_id must be purged");
+    let new_count: i64 = env.with_conn(|c| {
+        c.query_row(
+            "SELECT COUNT(*) FROM assets WHERE asset_id = ?",
+            rusqlite::params![asset.asset_id.0],
+            |row| row.get(0),
+        )
+    });
+    assert_eq!(new_count, 1, "new asset_id must own the workspace_path slot");
+
+    // New asset's bytes published at the final destination.
     let aa = &asset.asset_id.0[..2];
     let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0);
     assert!(
-        !dest.exists(),
-        "asset bytes were left orphan at {} after UPSERT failure",
+        dest.exists(),
+        "new asset bytes must be visible at {}",
         dest.display()
     );
-    // No `*.tmp.*` either — temp file must be cleaned up too.
-    let shard_dir = env.data_dir().join("assets").join(aa);
-    if let Ok(entries) = std::fs::read_dir(&shard_dir) {
-        for entry in entries.flatten() {
-            let name = entry.file_name();
-            let s = name.to_string_lossy();
-            assert!(
-                !s.contains(".tmp."),
-                "temp file leaked at {}",
-                entry.path().display()
-            );
-        }
-    }
 }
 
 #[test]
diff --git a/docs/SMOKE.md b/docs/SMOKE.md
index bc5a1f2..82ae10a 100644
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -162,16 +162,29 @@ prompt_template_version = "caption-v1"
 include = ["**/*.md", "**/*.pdf"]
 ```
 
-PDF 한 권당 페이지 수만큼 (또는 페이지 텍스트가 길면 그 이상의) chunk 가 한 transaction 안에서 commit. 검색 결과의 `chunk.source_spans[0]` 가 `Page { page, char_start, char_end }` 형태라 인용 시 페이지 번호가 그대로 사용 가능.
+PDF 한 권당 페이지 수만큼 (또는 페이지 텍스트가 길면 그 이상의) chunk 가 한 transaction 안에서 commit. 검색 결과의 `chunk.source_spans[0]` 가 `Page { page, char_start, char_end }` 형태라 인용 시 페이지 번호가 그대로 사용 가능. `kebab ask --json` 의 `citations[].citation` 도 `kind: "page"` + `page: <N>` + `path: <pdf_path>` 로 노출.
+
+테스트 fixture 가 필요할 때는 두 example 바이너리를 사용 — `reportlab` / `qpdf` 같은 시스템 dep 없이 in-tree 로 PDF / PNG 생성:
+
+```bash
+cargo run --release --example gen_smoke_pdf -p kebab-parse-pdf -- \
+  /tmp/kebab-smoke/workspace/whitepaper.pdf "page one body" "page two body"
+
+cargo run --release --example gen_smoke_png -p kebab-parse-image -- \
+  /tmp/kebab-smoke/workspace/diagram.png
+```
 
 ```bash
 kebab --config /tmp/kebab-smoke/config.toml ingest
 kebab --config /tmp/kebab-smoke/config.toml search --mode hybrid "<본문 단어>"
 kebab --config /tmp/kebab-smoke/config.toml inspect doc "<pdf_doc_id>"
+kebab --config /tmp/kebab-smoke/config.toml ask "<PDF 본문에 관한 질문>" --json
 ```
 
 암호화 PDF (예: DRM 책) → `errors+=1`, `error` 필드에 `qpdf --decrypt` 안내. 빈/스캔 페이지 (텍스트 추출 실패) → 0 chunk + `Provenance::Warning` ("scanned candidate"). v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 까지 대기.
 
+수정된 PDF 를 같은 path 에 다시 배치하면 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embeddings 를 sweep 하고 새 byte 가 새 `doc_id` 로 색인됨 — `IngestReport` 에 그 자산만 `new+=1` 로 분류 (다른 자산은 `updated`). HOTFIXES `2026-05-02 P7-3` 참조.
+
 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작.
 
 ## 검증 체크리스트
@@ -204,6 +217,6 @@ rm -rf /tmp/kebab-smoke              # 통째로 정리
 - (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장.
 - (P7-3) `config.chunking.chunker_version` 는 markdown 만 represent — PDF 자산은 `pdf-page-v1` 하드코딩. `config.toml` 의 `chunker_version = "md-heading-v1"` 을 봐도 PDF 는 영향 안 받음. HOTFIXES `2026-05-02 P7-3` entry 참조 (P+ chunker registry task 까지 유지).
 - (P7-3) 한 PDF 가 N 페이지면 `kebab ingest` 가 N 개 (또는 그 이상의, 페이지 길면 multi-chunk) 의 chunk 를 한 transaction 안에서 commit. 500 페이지 책 → 500+ chunk 한 번에 → embedding throughput 가 bottleneck. 임베딩 활성 워크스페이스에서 큰 PDF 를 처음 ingest 하면 분-단위 시간 + WAL 크기 증가 가능 — P+ 스케일 hardening task 까지 정상 동작이지만 비용은 측정 가능.
-- (P7-3) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 storage UNIQUE 제약 (`assets.workspace_path`) 에 트립 → `errors+=1`. md / image / pdf 모두 동일하지만 P7-3 의 통합 테스트가 처음 노출. 우회: 파일 path 변경 후 ingest. 영구 fix 는 P+ storage 작업.
+- (P7-3) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embeddings 를 sweep 하고 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). LanceDB 는 별도 store 라 옛 vector 가 잔존하지만 검색에는 영향 없음 (SQLite join 으로 surface 안 됨) — 디스크 cleanup 은 P+.
 
 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조.
diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md
index 4c4368f..d6a0708 100644
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -22,17 +22,25 @@ git history.
 
 **Fix 1**: `ingest_one_pdf_asset` (in `kebab-app::lib.rs`) instantiates `PdfPageV1Chunker` directly. The `Chunk.chunker_version` field on emitted PDF chunks records `pdf-page-v1` truthfully. A future P+ task (chunker registry) either splits `Config::chunking.chunker_version` per medium or replaces the dispatch with a runtime registry. No HOTFIX entry needed once that happens — this entry is the cross-reference.
 
-**Symptom 2 (storage-layer bug, exposed but not fixed by P7-3)**: P7-3's edited-bytes re-ingest test (`re_ingest_edited_pdf_produces_new_doc_id`) tripped on `sqlite error: UNIQUE constraint failed: assets.workspace_path: Error code 2067`. The assets table has a UNIQUE constraint on `workspace_path`, but `upsert_asset_row` (in `kebab-store-sqlite::store.rs:305`) only handles `ON CONFLICT(asset_id)`. When a file's bytes change, the new BLAKE3 produces a new `asset_id` while the `workspace_path` stays the same — INSERT picks the new asset_id branch, then trips the secondary UNIQUE on workspace_path.
+**Symptom 2 (storage-layer bug, fixed in same PR)**: P7-3's edited-bytes re-ingest test (`re_ingest_edited_pdf_produces_new_doc_id`) tripped on `sqlite error: UNIQUE constraint failed: assets.workspace_path: Error code 2067`. The assets table has a UNIQUE constraint on `workspace_path`, but `upsert_asset_row` (in `kebab-store-sqlite::store.rs`) only handles `ON CONFLICT(asset_id)`. When a file's bytes change, the new BLAKE3 produces a new `asset_id` while the `workspace_path` stays the same — INSERT picks the new asset_id branch, then trips the secondary UNIQUE on `workspace_path`.
 
-**Why it didn't surface earlier**: No existing test (markdown / image) exercises edited-bytes re-ingest. The image path's `re_ingest_image_produces_updated_with_same_doc_id` uses identical bytes (same asset_id → ON CONFLICT(asset_id) catches it). Real-world editing of a tracked file would hit the same bug across all media types.
+**Why it didn't surface earlier**: No existing test (markdown / image) exercised edited-bytes re-ingest. The image path's `re_ingest_image_produces_updated_with_same_doc_id` uses identical bytes (same asset_id → `ON CONFLICT(asset_id)` catches it). Real-world editing of a tracked file would hit the same bug across all media types.
 
-**Fix 2 (deferred)**: Storage-layer fix is out of scope for P7-3. The P7-3 implementation PR `#[ignore]`s the `re_ingest_edited_pdf_produces_new_doc_id` test with a doc-comment pointing here. A P+ storage task either:
-- Adds `ON CONFLICT(workspace_path) DO UPDATE` alongside the existing `ON CONFLICT(asset_id)` clause (DELETE-the-old + INSERT-the-new in a single statement, since UPSERT can only target one conflict path).
-- Or drops the UNIQUE constraint on `assets.workspace_path` and relies on application-level uniqueness (workspace_path → asset_id mapping in a separate index table).
+**Fix 2** (P7-3 implementation PR): new `purge_orphan_at_workspace_path` helper in `kebab-store-sqlite::store.rs`. Runs immediately before each `upsert_asset_row` call (both `put_asset_with_bytes` paths AND `DocumentStore::put_asset`). It:
+1. SELECTs the stale row at `workspace_path` whose `asset_id` differs from the incoming one (none → no-op return).
+2. DELETEs from `documents WHERE asset_id = stale` — `documents.asset_id ON DELETE RESTRICT` requires the documents go first; CASCADE on documents → `blocks` / `chunks` / `embedding_records` sweeps the dependent rows in the same statement.
+3. DELETEs the stale `assets` row, freeing the `workspace_path` slot.
+4. If the stale storage was `copied`, best-effort removes the byte file at `storage_path` so `data_dir/assets/` does not accumulate orphans across edits.
+
+**Caveat (still deferred)**: `embedding_records.chunk_id` CASCADE clears the SQLite side, but the LanceDB rows keyed on `chunk_id` live in a separate store and are not touched. Stale vectors do not affect retrieval (search joins through SQLite, so an orphan vector is never surfaced) but they consume disk in `data_dir/lancedb/`. A future task should reconcile by `chunk_id` set diff.
+
+The previously-`#[ignore]`d `re_ingest_edited_pdf_produces_new_doc_id` integration test runs by default after this fix, plus a dedicated unit test `put_asset_with_bytes_sweeps_workspace_path_orphan` in `kebab-store-sqlite::tests::asset_writer` that exercises the no-documents flavour. Verified end-to-end via the SMOKE runbook: `kebab ingest` → edit a tracked PDF → `kebab ingest` reports `new=1` for that asset (rest `updated`) and the prior doc/chunks are gone from `inspect` / `list docs`.
 
 **Amends**:
-- tasks/p7/p7-3-pdf-ingest-wiring.md (chunker_version deviation, edited-bytes test ignored).
-- (Implicitly) every previous task spec that assumed `assets.workspace_path` UNIQUE was safe — the constraint is in fact too strict for the byte-edit re-ingest case.
+- tasks/p7/p7-3-pdf-ingest-wiring.md (chunker_version deviation; edited-bytes test runs).
+- crates/kebab-store-sqlite (new `purge_orphan_at_workspace_path` helper called from both `put_asset_with_bytes` branches and `DocumentStore::put_asset`).
+- crates/kebab-store-sqlite/tests/asset_writer.rs (`put_asset_with_bytes_sweeps_workspace_path_orphan` replaces the prior orphan-cleanup-on-failure test, since the failure path no longer exists).
+- docs/SMOKE.md (note that edited-PDF re-ingest produces `new=1` rather than an error).
 
 ## 2026-05-02 — P7-2 pdf-page-v1: chunk_id collision + BYTES_PER_TOKEN
 
-- 
2.49.1