Merge pull request 'feat(pdf): scanned PDF OCR via qwen2.5vl:3b vision LLM (v0.20.0 sub-item 1)' (#189) from feat/pdf-scanned-ocr into main

Reviewed-on: #189
2026-05-28 04:37:41 +00:00
parent bcd1e37dab 685007789a
commit 09333d0b05
280 changed files with 18330 additions and 3897 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4127,7 +4127,7 @@ dependencies = [

 [[package]]
 name = "kebab-app"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "base64 0.22.1",
@@ -4166,12 +4166,13 @@ dependencies = [
 "tracing-appender",
 "tracing-subscriber",
 "unicode-normalization",
+ "uuid",
 "wiremock",
 ]

 [[package]]
 name = "kebab-chunk"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4187,7 +4188,7 @@ dependencies = [

 [[package]]
 name = "kebab-cli"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "clap",
@@ -4208,7 +4209,7 @@ dependencies = [

 [[package]]
 name = "kebab-config"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "dirs 5.0.1",
@@ -4223,7 +4224,7 @@ dependencies = [

 [[package]]
 name = "kebab-core"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4237,7 +4238,7 @@ dependencies = [

 [[package]]
 name = "kebab-embed"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4251,7 +4252,7 @@ dependencies = [

 [[package]]
 name = "kebab-embed-local"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "fastembed",
@@ -4264,7 +4265,7 @@ dependencies = [

 [[package]]
 name = "kebab-eval"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "kebab-app",
@@ -4283,7 +4284,7 @@ dependencies = [

 [[package]]
 name = "kebab-llm"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "kebab-core",
@@ -4292,7 +4293,7 @@ dependencies = [

 [[package]]
 name = "kebab-llm-local"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "kebab-config",
@@ -4309,7 +4310,7 @@ dependencies = [

 [[package]]
 name = "kebab-mcp"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "kebab-app",
@@ -4327,7 +4328,7 @@ dependencies = [

 [[package]]
 name = "kebab-nli"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "hf-hub",
@@ -4342,7 +4343,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-code"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "gix",
@@ -4365,7 +4366,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-image"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "ab_glyph",
 "anyhow",
@@ -4389,7 +4390,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-md"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "kebab-core",
@@ -4406,20 +4407,22 @@ dependencies = [

 [[package]]
 name = "kebab-parse-pdf"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
 "kebab-core",
+ "kebab-parse-image",
 "lopdf",
 "serde_json",
+ "strsim",
 "time",
 "tracing",
 ]

 [[package]]
 name = "kebab-rag"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4441,7 +4444,7 @@ dependencies = [

 [[package]]
 name = "kebab-search"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "globset",
@@ -4460,7 +4463,7 @@ dependencies = [

 [[package]]
 name = "kebab-source-fs"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4478,7 +4481,7 @@ dependencies = [

 [[package]]
 name = "kebab-store-sqlite"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4498,7 +4501,7 @@ dependencies = [

 [[package]]
 name = "kebab-store-vector"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "arrow",
@@ -4522,7 +4525,7 @@ dependencies = [

 [[package]]
 name = "kebab-tui"
-version = "0.19.0"
+version = "0.20.0"
 dependencies = [
 "anyhow",
 "crossterm",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,7 +30,7 @@ edition       = "2024"
 rust-version  = "1.85"
 license       = "MIT OR Apache-2.0"
 repository    = "https://github.com/altair823/kebab"
-version       = "0.19.0"
+version       = "0.20.0"   # v0.20.0 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b) — CLAUDE.md §Release 사용자 도그푸딩 트리거

 # pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
 # intentional allow-list. The allowed lints are either cosmetic (doc style),
@@ -141,6 +141,7 @@ proptest     = "1"
 # p9-fb-19: LRU cache for `App::search` results. Bounded capacity
 # from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
 lru          = "0.12"
+lopdf        = "0.32"
 # fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
 # in its default set (which also pulls `hf-hub` for first-run model
 # downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
--- a/HANDOFF.md
+++ b/HANDOFF.md
@@ -17,7 +17,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체
 | **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
 | **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
 | **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
-| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
+| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
 | **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
 | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
 | **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** |
@@ -32,6 +32,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.

 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:

+- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
 - **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
 - **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 의 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
 - **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).
--- a/README.md
+++ b/README.md
@@ -192,7 +192,7 @@ flowchart TB

 ## Configuration

- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절. 
+- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[pdf.ocr]`, `[search]`, `[rag]`, `[ui]` 절. 
  - `[models.embedding]` — 
    - `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
    - `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
@@ -211,6 +211,29 @@ flowchart TB

 config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.

+### `[pdf.ocr]` — scanned PDF OCR (v0.20.0+)
+
+embedded text 가 없는 scanned PDF (책 스캔, 영수증, 카메라로 촬영한 page 등) 에 대한 OCR 을 활성화. **default off (opt-in)** — page 당 OCR cost 가 ~45-100s (qwen2.5vl:3b on CPU) 이므로, 책 / 논문 archive 처럼 OCR 이 명시적으로 필요한 KB 에서만 활성화 권장.
+
+```toml
+[pdf.ocr]
+enabled = false              # opt-in: 책 / 논문 archive KB 에서 true
+always_on = false            # true 시 vector PDF page 도 dual-block OCR (confidence boost)
+engine = "ollama-vision"
+model = "qwen2.5vl:3b"       # PoC alnum 94.79% page1 / 81.56% 받침 (vs gemma4:e4b 의 27%)
+# endpoint = "http://localhost:11434"   # 미명시 시 models.llm.endpoint fallback
+languages = ["eng", "kor"]
+max_pixels = 2048
+request_timeout_secs = 600
+valid_ratio_threshold = 0.5  # text-detect threshold — mojibake / scanned 판정 boundary
+min_char_count = 20
+lang_hint = "kor"
+```
+
+env override: `KEBAB_PDF_OCR_*` 11 변수 (예: `KEBAB_PDF_OCR_ENABLED=true kebab ingest`).
+
+**v0.20 업그레이드 후 주의**: v0.19 에서 빈 block + warning 으로 indexed 된 scanned PDF 는 업그레이드만으로는 OCR 이 자동 재실행되지 않음 (parser_version `"pdf-text-v1"` 보존). 명시적 재처리: `kebab ingest --force-reingest`.
+
 ## 외부 AI 통합

 `--json` 출력 + frozen wire schema v1 가 stable contract. 통합 옵션:
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -35,6 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" }
 # per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
 # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
 kebab-parse-pdf = { path = "../kebab-parse-pdf" }
+lopdf            = { workspace = true }
 # p10-1A-2: Rust AST extractor lives here. App threads it into the
 # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
 # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
@@ -44,6 +45,7 @@ blake3               = { workspace = true }
 serde                = { workspace = true }
 serde_json           = { workspace = true }
 time                 = { workspace = true }
+uuid                 = { workspace = true }
 tracing              = { workspace = true }
 tracing-subscriber   = { version = "0.3", features = ["env-filter", "fmt", "json"] }
 tracing-appender     = "0.2"
@@ -75,7 +77,7 @@ image                = { version = "0.25", default-features = false, features =
 # lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
 # to the same major (0.32) so byte output is identical between the two
 # fixture surfaces.
-lopdf                = "0.32"
+lopdf                = { workspace = true }
 # error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
 # reqwest::Error (private constructor) — built from a connect-refused call.
 reqwest      = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
--- a/crates/kebab-app/src/app.rs
+++ b/crates/kebab-app/src/app.rs
@@ -46,9 +46,8 @@ use kebab_core::{
 use kebab_embed_local::FastembedEmbedder;
 use kebab_llm_local::OllamaLanguageModel;
 use kebab_parse_code::{
-    CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor,
-    JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor,
-    TypescriptAstExtractor,
+    CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
+    KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor,
 };
 use kebab_parse_image::ImageExtractor;
 use kebab_parse_pdf::PdfTextExtractor;
@@ -242,15 +241,15 @@ impl App {
        // kebab-nli construction. Failure (`?`) surfaces as a user-
        // facing error at App boot — never a panic in the pipeline's
        // `expect("verifier must be Some when nli_threshold > 0.0")`.
-        let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> =
-            if config.rag.nli_threshold > 0.0 {
-                let v = kebab_nli::OnnxNliVerifier::new(&config).context(
-                    "kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)",
-                )?;
-                Some(Arc::new(v))
-            } else {
-                None
-            };
+        let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> = if config.rag.nli_threshold
+            > 0.0
+        {
+            let v = kebab_nli::OnnxNliVerifier::new(&config)
+                .context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?;
+            Some(Arc::new(v))
+        } else {
+            None
+        };
        Ok(Self {
            config,
            sqlite: Arc::new(sqlite),
@@ -350,7 +349,9 @@ impl App {
        // so other in-flight searches can use the cache concurrently.
        drop(guard);
        let hits = self.search_uncached(query)?;
-        let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
+        let mut guard = cache
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
        guard.put(key, hits.clone());
        Ok(hits)
    }
@@ -430,11 +431,7 @@ impl App {
    ///
    /// `SearchResponse.next_cursor` and `truncated` are independent
    /// signals — see `SearchResponse` doc for details.
-    pub fn search_with_opts(
-        &self,
-        query: SearchQuery,
-        opts: SearchOpts,
-    ) -> Result<SearchResponse> {
+    pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result<SearchResponse> {
        use crate::cursor;

        let corpus_revision = self.sqlite.corpus_revision().to_string();
@@ -519,8 +516,7 @@ impl App {
            // Apply offset + k_effective truncation (mirrors non-trace path).
            let drop_n = offset.min(traced_hits.len());
            traced_hits.drain(..drop_n);
-            let mut hits: Vec<SearchHit> =
-                traced_hits.into_iter().take(k_effective).collect();
+            let mut hits: Vec<SearchHit> = traced_hits.into_iter().take(k_effective).collect();

            // Snippet truncation if opts.snippet_chars set (mirror non-trace path).
            if opts.snippet_chars.is_some() {
@@ -551,8 +547,7 @@ impl App {
        // Skip offset.
        let drop_n = offset.min(all_hits.len());
        all_hits.drain(..drop_n);
-        let mut hits: Vec<SearchHit> =
-            all_hits.into_iter().take(k_effective).collect();
+        let mut hits: Vec<SearchHit> = all_hits.into_iter().take(k_effective).collect();

        // Apply snippet_chars override if shorter than what the
        // retriever returned (retriever already honored
@@ -573,15 +568,11 @@ impl App {
            // Step 1: shorten snippets progressively to a 60-char floor.
            const SNIPPET_FLOOR: usize = 60;
            let mut current_snippet_cap = snippet_chars;
-            while estimate_chars(&hits) > max_chars
-                && current_snippet_cap > SNIPPET_FLOOR
-            {
-                current_snippet_cap =
-                    (current_snippet_cap / 2).max(SNIPPET_FLOOR);
+            while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR {
+                current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR);
                for h in &mut hits {
                    if h.snippet.chars().count() > current_snippet_cap {
-                        h.snippet =
-                            trim_to_chars(&h.snippet, current_snippet_cap);
+                        h.snippet = trim_to_chars(&h.snippet, current_snippet_cap);
                        truncated = true;
                    }
                }
@@ -651,8 +642,7 @@ impl App {
        retriever: Arc<dyn Retriever>,
        llm: Arc<dyn LanguageModel>,
    ) -> RagPipeline {
-        let pipeline =
-            RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
+        let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
        match &self.pipeline_verifier {
            Some(v) => pipeline.with_verifier(v.clone()),
            None => pipeline,
@@ -723,12 +713,7 @@ impl App {
    /// returns; on persistence error, the answer is still returned
    /// (don't lose the user's compute) but the error is logged so
    /// the operator notices.
-    pub fn ask_with_session(
-        &self,
-        session_id: &str,
-        query: &str,
-        opts: AskOpts,
-    ) -> Result<Answer> {
+    pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result<Answer> {
        use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow};
        use std::time::{SystemTime, UNIX_EPOCH};

@@ -766,13 +751,8 @@ impl App {
        let retriever = self.build_retriever(opts.mode)?;
        let llm = self.llm()?;
        let pipeline = self.build_pipeline(retriever, llm);
-        let answer = pipeline.ask_with_history(
-            query,
-            history,
-            session_id.to_string(),
-            next_index,
-            opts,
-        )?;
+        let answer =
+            pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?;

        // Auto-create the session header on first use. Title from
        // the first question (≤40 chars after trim).
@@ -813,7 +793,8 @@ impl App {
            turn_index: next_index,
            question: query.to_string(),
            answer: answer.answer.clone(),
-            citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()),
+            citations_json: serde_json::to_string(&answer.citations)
+                .unwrap_or_else(|_| "[]".to_string()),
            created_at: now_unix,
        };
        if let Err(e) = self.sqlite.append_turn(&turn_row) {
@@ -848,8 +829,7 @@ impl App {
            return Ok(Some(e.clone()));
        }
        let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
-            FastembedEmbedder::new(&self.config)
-                .context("kb-app: load FastembedEmbedder")?,
+            FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?,
        );
        // `set` returns Err if another thread won the race; in that case
        // the loser still returns the (now-cached) winner via `get()`.
@@ -925,7 +905,9 @@ impl App {
    /// clear` admin command). No-op when the cache is disabled.
    pub fn clear_search_cache(&self) {
        if let Some(cache) = self.search_cache.as_ref() {
-            let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
+            let mut guard = cache
+                .lock()
+                .unwrap_or_else(std::sync::PoisonError::into_inner);
            guard.clear();
        }
    }
@@ -946,8 +928,8 @@ impl App {
    /// git tree) correctly keep `repo: None` — `Metadata.repo` is already
    /// `None` for those, so the assignment is a no-op.
    fn backfill_repo(&self, hits: &mut [SearchHit]) {
-        use std::collections::HashMap;
        use kebab_core::DocumentId;
+        use std::collections::HashMap;

        // doc_id → Option<String> where None means "not found / no repo"
        let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
@@ -956,26 +938,24 @@ impl App {
            if hit.repo.is_some() {
                continue;
            }
-            let repo_val = cache
-                .entry(hit.doc_id.clone())
-                .or_insert_with(|| {
-                    // Deliberately non-aborting: a failed store lookup for
-                    // one hit must not abort the whole search response. Log
-                    // the error so it's observable rather than silently
-                    // dropped (review #140 round 1).
-                    match self.sqlite.get_document(&hit.doc_id) {
-                        Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
-                        Err(e) => {
-                            tracing::warn!(
-                                target: "kebab-app",
-                                doc_id = %hit.doc_id,
-                                error = %e,
-                                "backfill_repo: get_document failed; leaving hit.repo = None"
-                            );
-                            None
-                        }
+            let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| {
+                // Deliberately non-aborting: a failed store lookup for
+                // one hit must not abort the whole search response. Log
+                // the error so it's observable rather than silently
+                // dropped (review #140 round 1).
+                match self.sqlite.get_document(&hit.doc_id) {
+                    Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
+                    Err(e) => {
+                        tracing::warn!(
+                            target: "kebab-app",
+                            doc_id = %hit.doc_id,
+                            error = %e,
+                            "backfill_repo: get_document failed; leaving hit.repo = None"
+                        );
+                        None
                    }
-                });
+                }
+            });
            if let Some(r) = repo_val {
                hit.repo = Some(r.clone());
            }
@@ -986,10 +966,7 @@ impl App {
    /// "switch to --mode lexical" error when embeddings are disabled.
    fn require_embeddings(
        &self,
-    ) -> Result<(
-        Arc<dyn Embedder + Send + Sync>,
-        Arc<LanceVectorStore>,
-    )> {
+    ) -> Result<(Arc<dyn Embedder + Send + Sync>, Arc<LanceVectorStore>)> {
        let emb = self.embedder()?.ok_or_else(|| {
            anyhow!(
                "embeddings disabled (config.models.embedding.provider == \"none\" \
@@ -1278,8 +1255,8 @@ mod tests_extractor_dispatch {
            MediaType::Code("kotlin".into()),
            MediaType::Code("c".into()),
            MediaType::Code("cpp".into()),
-            MediaType::Code("yaml".into()),  // registry NOT cover
-            MediaType::Code("shell".into()), // registry NOT cover
+            MediaType::Code("yaml".into()),   // registry NOT cover
+            MediaType::Code("shell".into()),  // registry NOT cover
            MediaType::Audio(AudioType::Wav), // registry NOT cover
        ];
        for sample in &samples {
--- a/crates/kebab-app/src/bulk.rs
+++ b/crates/kebab-app/src/bulk.rs
@@ -215,7 +215,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
            .and_then(serde_json::Value::as_u64)
            .map(|n| n as usize),
        cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
-        trace: obj.get("trace").and_then(serde_json::Value::as_bool).unwrap_or(false),
+        trace: obj
+            .get("trace")
+            .and_then(serde_json::Value::as_bool)
+            .unwrap_or(false),
    };

    Ok((
--- a/crates/kebab-app/src/error_signal.rs
+++ b/crates/kebab-app/src/error_signal.rs
@@ -10,6 +10,6 @@

 pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};

+pub use kebab_config::{ConfigInvalid, ConfigNotFound};
 pub use kebab_llm_local::LlmError;
-pub use kebab_config::ConfigInvalid;
 pub use kebab_store_sqlite::NotIndexed;
--- a/crates/kebab-app/src/error_wire.rs
+++ b/crates/kebab-app/src/error_wire.rs
@@ -9,7 +9,7 @@
 use serde::{Deserialize, Serialize};
 use serde_json::{Value, json};

-use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
+use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};

 // p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
 // and surfaced through `StructuredError` (an anyhow-friendly wrapper
@@ -65,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
            hint: Some("check `--config <path>` and TOML syntax".to_string()),
        };
    }
+    if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
+        return ErrorV1 {
+            schema_version: ERROR_V1_ID.to_string(),
+            code: "config_not_found".to_string(),
+            message: s.to_string(),
+            details: json!({
+                "path": s.path.to_string_lossy(),
+            }),
+            hint: Some(
+                "verify --config <path>; pass an existing toml file or omit --config to use XDG default"
+                    .to_string(),
+            ),
+        };
+    }
    if let Some(s) = err.downcast_ref::<NotIndexed>() {
        return ErrorV1 {
            schema_version: ERROR_V1_ID.to_string(),
@@ -158,7 +172,10 @@ mod tests {
        });
        let v1 = classify(&err, false);
        assert_eq!(v1.code, "config_invalid");
-        assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
+        assert_eq!(
+            v1.details.get("path").and_then(|p| p.as_str()),
+            Some("/tmp/x.toml")
+        );
        assert!(v1.hint.is_some());
    }

@@ -182,7 +199,8 @@ mod tests {
        // the resulting LlmError::Unreachable maps to "model_unreachable".
        let client = reqwest::blocking::Client::builder()
            .timeout(std::time::Duration::from_millis(500))
-            .build().unwrap();
+            .build()
+            .unwrap();
        let err = client.get("http://127.0.0.1:1").send().unwrap_err();
        let llm = LlmError::Unreachable {
            endpoint: "http://127.0.0.1:1".to_string(),
@@ -198,7 +216,10 @@ mod tests {
        let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
        let v1 = classify(&anyhow::Error::new(llm), false);
        assert_eq!(v1.code, "model_not_pulled");
-        assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
+        assert_eq!(
+            v1.details.get("model").and_then(|p| p.as_str()),
+            Some("gemma4:e4b")
+        );
    }

    #[test]
@@ -235,7 +256,10 @@ mod tests {
        // (single source of truth). classify must not pattern-match on
        // anyhow string contents — that would create two sources of
        // truth. The bare anyhow string falls through to "generic".
-        assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string");
+        assert_ne!(
+            v1.code, "stale_cursor",
+            "classify must not produce stale_cursor from bare anyhow string"
+        );
    }

    #[test]
--- a/crates/kebab-app/src/external.rs
+++ b/crates/kebab-app/src/external.rs
@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
    } else {
        String::new()
    };
-    let already = existing
-        .lines()
-        .any(|line| line.trim() == KEBABIGNORE_LINE);
+    let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
    if already {
        return Ok(());
    }
@@ -57,11 +55,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
 /// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
 /// destination file already exists with the expected hash, the existing
 /// file is reused (no second write). Returns the destination path.
-pub fn copy_to_external(
-    external_dir: &Path,
-    bytes: &[u8],
-    ext: &str,
-) -> Result<PathBuf> {
+pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
    let hash = blake3::hash(bytes);
    let hex = hash.to_hex();
    let prefix = &hex.as_str()[..12];
@@ -82,11 +76,7 @@ pub fn copy_to_external(
 /// Internal `yaml_quote` always uses double-quoted YAML form with backslash
 /// escapes for `"` / `\` / control chars — agent-supplied titles with
 /// special characters are safe.
-pub fn inject_frontmatter(
-    body: &str,
-    title: &str,
-    source_uri: Option<&str>,
-) -> Result<String> {
+pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
    let head = body.trim_start();
    if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
        anyhow::bail!(
--- a/crates/kebab-app/src/fetch.rs
+++ b/crates/kebab-app/src/fetch.rs
@@ -50,14 +50,14 @@ impl App {
 fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
    let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
        .ok_or_else(|| {
-            anyhow::Error::new(StructuredError(ErrorV1 {
-                schema_version: ERROR_V1_ID.to_string(),
-                code: "chunk_not_found".to_string(),
-                message: format!("chunk_id '{}' not found", id.0),
-                details: serde_json::Value::Null,
-                hint: None,
-            }))
-        })?;
+        anyhow::Error::new(StructuredError(ErrorV1 {
+            schema_version: ERROR_V1_ID.to_string(),
+            code: "chunk_not_found".to_string(),
+            message: format!("chunk_id '{}' not found", id.0),
+            details: serde_json::Value::Null,
+            hint: None,
+        }))
+    })?;

    let doc_id = target.doc_id.clone();
    let doc =
@@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
 fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
    let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
        .ok_or_else(|| {
-            anyhow::Error::new(StructuredError(ErrorV1 {
-                schema_version: ERROR_V1_ID.to_string(),
-                code: "doc_not_found".to_string(),
-                message: format!("doc_id '{}' not found", id.0),
-                details: serde_json::Value::Null,
-                hint: None,
-            }))
-        })?;
+        anyhow::Error::new(StructuredError(ErrorV1 {
+            schema_version: ERROR_V1_ID.to_string(),
+            code: "doc_not_found".to_string(),
+            message: format!("doc_id '{}' not found", id.0),
+            details: serde_json::Value::Null,
+            hint: None,
+        }))
+    })?;

    let mut text = fmt_canonical_to_markdown(&doc);
    let mut truncated = false;
@@ -176,14 +176,14 @@ fn fetch_span(
 ) -> Result<FetchResult> {
    let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
        .ok_or_else(|| {
-            anyhow::Error::new(StructuredError(ErrorV1 {
-                schema_version: ERROR_V1_ID.to_string(),
-                code: "doc_not_found".to_string(),
-                message: format!("doc_id '{}' not found", id.0),
-                details: serde_json::Value::Null,
-                hint: None,
-            }))
-        })?;
+        anyhow::Error::new(StructuredError(ErrorV1 {
+            schema_version: ERROR_V1_ID.to_string(),
+            code: "doc_not_found".to_string(),
+            message: format!("doc_id '{}' not found", id.0),
+            details: serde_json::Value::Null,
+            hint: None,
+        }))
+    })?;

    // Reject line-incompatible media types (PDF / audio). `SourceType`
    // (markdown / note / paper / reference / inbox) is the *user-facing*
--- a/crates/kebab-app/src/ingest_log.rs
+++ b/crates/kebab-app/src/ingest_log.rs
@@ -0,0 +1,328 @@
+//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
+//!
+//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
+//! `config.logging.ingest_log_dir`. Records are appended one JSON object per line;
+//! a run that completes ends with a `kind="summary"` record. `IngestLogWriter::open`
+//! returns `Ok(None)` when `ingest_log_enabled = false`, so callers carry an `Option`.
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::{Path, PathBuf};
+use std::time::SystemTime;
+
+use serde::{Deserialize, Serialize};
+use time::format_description::well_known::Rfc3339;
+
+pub struct IngestLogWriter {
+    file: BufWriter<File>, // buffered; flushed by `flush()` and again on drop
+    path: PathBuf, // `<log dir>/ingest-{run_id}.ndjson`, built in `open`
+    run_id: String, // produced by `generate_run_id()` in `open`
+    started_at: SystemTime, // wall-clock time captured when `open` succeeds
+}
+
+impl IngestLogWriter {
+    /// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
+    pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
+        if !cfg.ingest_log_enabled {
+            return Ok(None);
+        }
+        let run_id = generate_run_id();
+        let log_dir = expand_log_dir(&cfg.ingest_log_dir);
+        std::fs::create_dir_all(&log_dir)?; // also creates missing parent directories
+        let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
+        let file = BufWriter::new(File::create(&path)?); // `File::create` truncates an existing file
+        Ok(Some(Self {
+            file,
+            path,
+            run_id,
+            started_at: SystemTime::now(),
+        }))
+    }
+
+    pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
+        serde_json::to_writer(&mut self.file, event)?; // compact JSON, no trailing newline
+        writeln!(self.file)?; // the newline terminates this ndjson record
+        Ok(())
+    }
+
+    pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
+        serde_json::to_writer(&mut self.file, summary)?; // same line format as `write_event`
+        writeln!(self.file)?;
+        Ok(())
+    }
+
+    pub fn flush(&mut self) -> anyhow::Result<()> {
+        self.file.flush()?; // unlike the `Drop` impl, this surfaces the I/O error
+        Ok(())
+    }
+
+    pub fn run_id(&self) -> &str {
+        &self.run_id
+    }
+
+    pub fn path(&self) -> &Path {
+        &self.path
+    }
+
+    pub fn started_at(&self) -> SystemTime {
+        self.started_at
+    }
+}
+
+impl Drop for IngestLogWriter {
+    fn drop(&mut self) {
+        let _ = self.file.flush(); // best-effort: `drop` has no way to report the error
+    }
+}
+
+/// Compact UTC timestamp + last 8 hex chars of a fresh uuid v7: `20260528T013000Z-abc123de`.
+/// uuid v7 is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
+fn generate_run_id() -> String {
+    use time::macros::format_description;
+    let now = time::OffsetDateTime::now_utc();
+    let ts = now
+        .format(format_description!(
+            "[year][month][day]T[hour][minute][second]Z"
+        ))
+        .unwrap_or_else(|_| "19700101T000000Z".to_string()); // epoch placeholder if formatting fails
+    let uid = uuid::Uuid::now_v7().simple().to_string(); // hyphen-free hex form
+    let suffix = &uid[uid.len() - 8..]; // tail of the uuid, not its timestamp-bearing head
+    format!("{ts}-{suffix}")
+}
+
+/// Substitute `{state_dir}` in `path` with `Config::xdg_state_dir()` (spec §6 R-3).
+/// Does no tilde/env expansion itself — that is left to `kebab_config::expand_path`.
+fn expand_log_dir(path: &Path) -> PathBuf {
+    let path_str = path.to_string_lossy(); // non-UTF-8 sequences become U+FFFD
+    if path_str.contains("{state_dir}") {
+        let state_dir = kebab_config::Config::xdg_state_dir();
+        PathBuf::from(path_str.replace("{state_dir}", &state_dir.to_string_lossy()))
+    } else {
+        path.to_path_buf() // no placeholder: returned unchanged
+    }
+}
+
+/// Current UTC time as an RFC 3339 string for log records; epoch string if formatting fails.
+#[allow(dead_code)] // NOTE(review): lib.rs calls this in the same commit — allow may be redundant; confirm
+pub(crate) fn now_ts() -> String {
+    time::OffsetDateTime::now_utc()
+        .format(&Rfc3339)
+        .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
+}
+
+/// Ingest event record (one ndjson line). The variant is carried in an internal `kind` tag.
+#[derive(Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")] // e.g. `ParseError` -> "kind":"parse_error"
+pub enum LogEvent<'a> {
+    Ocr {
+        ts: String,
+        doc_path: &'a str,
+        page: u32,
+        image_byte_size: Option<u64>,
+        image_width: Option<u32>,
+        image_height: Option<u32>,
+        ms: u64, // presumably elapsed OCR time in milliseconds — confirm at the producer
+        chars: u32,
+        success: bool,
+        reason: Option<&'a str>,
+        ocr_engine: &'a str,
+    },
+    ParseError {
+        ts: String,
+        doc_path: &'a str,
+        reason: &'a str,
+        message: &'a str,
+    },
+    Skip {
+        ts: String,
+        doc_path: &'a str,
+        reason: &'a str,
+        detail: Option<&'a str>,
+    },
+    Error {
+        ts: String,
+        code: &'a str,
+        message: &'a str,
+    },
+}
+
+/// Final summary record — meant to be the last line of the log file (the writer does not enforce it).
+/// `kind` is a plain field; `IngestSummary::new` sets it to `"summary"`.
+#[derive(Serialize, Deserialize)]
+pub struct IngestSummary {
+    pub kind: String,
+    pub ts: String,
+    pub run_id: String,
+    pub scanned: u32,
+    pub new: u32,
+    pub errors: u32,
+    pub ocr_pages: u32,
+    pub ocr_failures: u32,
+    pub ocr_p50_ms: Option<u64>, // the three `ocr_*_ms` fields are `None` when no samples were given
+    pub ocr_p90_ms: Option<u64>,
+    pub ocr_max_ms: Option<u64>,
+    pub duration_ms: u64,
+}
+
+impl IngestSummary {
+    #[allow(clippy::too_many_arguments)] // flat constructor: one argument per record field
+    pub fn new(
+        ts: String,
+        run_id: String,
+        scanned: u32,
+        new: u32,
+        errors: u32,
+        ocr_pages: u32,
+        ocr_failures: u32,
+        ocr_ms_samples: &[u64],
+        duration_ms: u64,
+    ) -> Self {
+        let (p50, p90, max) = percentiles(ocr_ms_samples); // all `None` for an empty slice
+        Self {
+            kind: "summary".to_string(), // fixed discriminator for the summary line
+            ts,
+            run_id,
+            scanned,
+            new,
+            errors,
+            ocr_pages,
+            ocr_failures,
+            ocr_p50_ms: p50,
+            ocr_p90_ms: p90,
+            ocr_max_ms: max,
+            duration_ms,
+        }
+    }
+}
+
+/// Percentiles read from a sorted copy of `samples` at index `n * p / 100` (no interpolation).
+/// Returns `(p50, p90, max)`. All `None` when samples is empty.
+pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>) {
+    if samples.is_empty() {
+        return (None, None, None);
+    }
+    let mut sorted = samples.to_vec(); // copy so the caller's slice keeps its order
+    sorted.sort_unstable();
+    let n = sorted.len();
+    let p50 = sorted[n * 50 / 100]; // for n = 2 this is the upper of the two values
+    let p90 = sorted[n * 90 / 100]; // index is always < n, so this cannot go out of bounds
+    let max = *sorted.last().unwrap(); // non-empty: guarded by the early return above
+    (Some(p50), Some(p90), Some(max))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kebab_config::LoggingCfg;
+    use tempfile::TempDir;
+
+    #[test]
+    fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
+        let id = generate_run_id();
+        // Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
+        assert_eq!(id.len(), 25, "run_id len should be 25: {id}");
+        let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");
+        assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
+        assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
+        assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");
+        assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");
+        assert!(
+            suffix.chars().all(|c| c.is_ascii_hexdigit()),
+            "suffix should be hex: {suffix}"
+        );
+    }
+
+    #[test]
+    fn expand_log_dir_substitutes_state_dir_placeholder() {
+        let input = PathBuf::from("{state_dir}/logs");
+        let expanded = expand_log_dir(&input);
+        let expected = kebab_config::Config::xdg_state_dir().join("logs");
+        assert_eq!(expanded, expected);
+        assert!(!expanded.to_string_lossy().contains("{state_dir}"));
+    }
+
+    #[test]
+    fn writer_disabled_returns_none() {
+        let cfg = LoggingCfg {
+            ingest_log_enabled: false,
+            ingest_log_dir: PathBuf::from("/tmp/should-not-exist"), // never created: `open` returns first
+        };
+        let result = IngestLogWriter::open(&cfg).expect("open should not error");
+        assert!(result.is_none(), "disabled writer should return None");
+    }
+
+    #[test]
+    fn writer_writes_one_event_per_line_with_kind_discriminator() {
+        let tmp = TempDir::new().unwrap();
+        let cfg = LoggingCfg {
+            ingest_log_enabled: true,
+            ingest_log_dir: tmp.path().to_path_buf(),
+        };
+        let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
+        let path = writer.path().to_path_buf();
+
+        writer
+            .write_event(&LogEvent::Skip {
+                ts: now_ts(),
+                doc_path: "a.zip",
+                reason: "builtin_blacklist",
+                detail: Some(".zip extension"),
+            })
+            .unwrap();
+        writer
+            .write_event(&LogEvent::Error {
+                ts: now_ts(),
+                code: "ingest_fatal",
+                message: "something bad",
+            })
+            .unwrap();
+        writer
+            .write_event(&LogEvent::ParseError {
+                ts: now_ts(),
+                doc_path: "weird.pdf",
+                reason: "lopdf_error",
+                message: "unexpected EOF",
+            })
+            .unwrap();
+        writer.flush().unwrap(); // push buffered lines to disk before reading the file back
+
+        let contents = std::fs::read_to_string(&path).unwrap();
+        let lines: Vec<&str> = contents.lines().collect();
+        assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
+        for line in &lines {
+            assert!(
+                line.starts_with('{'),
+                "each line should be JSON object: {line}"
+            );
+            assert!(
+                line.contains("\"kind\""),
+                "each line should have 'kind': {line}"
+            );
+        }
+    }
+
+    #[test]
+    fn drop_flushes_pending_buffer() {
+        let tmp = TempDir::new().unwrap();
+        let cfg = LoggingCfg {
+            ingest_log_enabled: true,
+            ingest_log_dir: tmp.path().to_path_buf(),
+        };
+        let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
+        let path = writer.path().to_path_buf();
+        writer
+            .write_event(&LogEvent::Error {
+                ts: now_ts(),
+                code: "test",
+                message: "drop flush test",
+            })
+            .unwrap();
+        // Drop without explicit flush — Drop impl should flush BufWriter.
+        drop(writer);
+        let contents = std::fs::read_to_string(&path).unwrap();
+        assert!(
+            contents.lines().count() >= 1,
+            "file should have at least 1 line after drop"
+        );
+    }
+}
--- a/crates/kebab-app/src/ingest_progress.rs
+++ b/crates/kebab-app/src/ingest_progress.rs
@@ -46,10 +46,13 @@ pub struct AggregateCounts {
 /// Ordering invariant per design §2.4a:
 ///
 /// ```text
-/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
-///                             < (Completed | Aborted)
+/// ScanStarted < ScanCompleted
+///   < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)*
+///   < (Completed | Aborted)
 /// ```
 ///
+/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1).
+///
 /// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
 /// in §2.4a) are reserved for a future iteration and are not emitted
 /// by this task; the spec calls them out as "임의 위치" (optional).
@@ -85,6 +88,30 @@ pub enum IngestEvent {
    /// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
    /// task never produces `Aborted`.
    Aborted { counts: AggregateCounts },
+    /// Emitted when OCR starts for a PDF page. v0.20.0 sub-item 1.
+    PdfOcrStarted { page: u32 },
+    /// Emitted when OCR finishes for a PDF page. v0.20.0 sub-item 1.
+    /// `skipped = true` means OCR was not performed (DCTDecode absent or engine failure).
+    /// `chars = 0` alone cannot tell a skip from a 0-char OCR result; `skipped` makes it explicit.
+    PdfOcrFinished {
+        page: u32,
+        ms: u64,
+        chars: u32,
+        ocr_engine: String,
+        skipped: bool,
+        /// v0.20.x ingest log: raster image byte size (additive minor, optional).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        image_byte_size: Option<u64>,
+        /// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        image_width: Option<u32>,
+        /// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        image_height: Option<u32>,
+        /// v0.20.x ingest log: OCR failure reason (additive minor, optional).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        failure_reason: Option<String>,
+    },
 }

 /// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
@@ -118,10 +145,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
 /// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
 /// is silently absorbed — the ingest hot path must not stall on a slow
 /// consumer. Logged at `trace` for diagnostics.
-pub(crate) fn emit(
-    progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
-    event: IngestEvent,
-) {
+pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
    if let Some(tx) = progress {
        if tx.send(event).is_err() {
            tracing::trace!(
@@ -165,7 +189,10 @@ mod tests {
            media: "markdown".into(),
        };
        let v = serde_json::to_value(&ev).unwrap();
-        assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
+        assert_eq!(
+            v.get("kind").and_then(|s| s.as_str()),
+            Some("asset_started")
+        );
        assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
        assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
        assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
@@ -184,8 +211,14 @@ mod tests {
        let v = serde_json::to_value(&ev).unwrap();
        assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
        let counts = v.get("counts").unwrap();
-        assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(5));
-        assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
+        assert_eq!(
+            counts.get("scanned").and_then(serde_json::Value::as_u64),
+            Some(5)
+        );
+        assert_eq!(
+            counts.get("new").and_then(serde_json::Value::as_u64),
+            Some(2)
+        );
    }

    #[test]
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -34,21 +34,25 @@
 //! still allowing the cross-crate calls.

 use std::path::PathBuf;
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};

 use anyhow::{Context, anyhow};
 use serde::{Deserialize, Serialize};

-use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker};
+use kebab_chunk::{
+    CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker,
+    CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker,
+    CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker,
+    K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker,
+};
 use kebab_core::{
-    Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
-    DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
-    EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType,
-    ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
-    SourceUri, VectorRecord, VectorStore,
+    Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
+    DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
+    ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
+    SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
 };
 use kebab_llm_local::OllamaLanguageModel;
-use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr};
+use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
 use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
 use kebab_source_fs::FsSourceConnector;

@@ -60,20 +64,26 @@ pub mod error_signal;
 pub mod error_wire;
 pub mod external;
 pub mod fetch;
+pub mod ingest_log;
 pub mod ingest_progress;
 pub mod logging;
+pub mod pdf_ocr_apply;
 pub mod reset;
 pub mod schema;
 mod staleness;

 pub use app::{App, SearchResponse, short_query_hint};
-pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
-pub use reset::{ResetReport, ResetScope, enumerate_orphans};
-pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
-pub use fetch::fetch_with_config;
 #[doc(hidden)]
 pub use bulk::{BULK_QUERIES_MAX, bulk_search_with_config};
-pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config};
+pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
+pub use fetch::fetch_with_config;
+pub use ingest_log::{IngestLogWriter, IngestSummary, LogEvent};
+pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
+pub use kebab_config::{ConfigInvalid, ConfigNotFound};
+pub use reset::{ResetReport, ResetScope, enumerate_orphans};
+pub use schema::{
+    Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config,
+};
 pub use staleness::{compute_stale, mark_stale_in_place};

 /// p9-fb-25: sentinel for files without an extension in
@@ -293,6 +303,24 @@ pub fn ingest_with_config_opts(

    let app = App::open_with_config(config)?;

+    // v0.20.x Hook 1: init per-run log writer (None when disabled or on open failure).
+    let log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>> =
+        match crate::ingest_log::IngestLogWriter::open(&app.config.logging) {
+            Ok(Some(w)) => Some(Arc::new(Mutex::new(w))),
+            Ok(None) => None,
+            Err(e) => {
+                tracing::warn!(
+                    target: "kebab-app",
+                    error = %e,
+                    "ingest_log: failed to open log file; logging disabled for this run"
+                );
+                None
+            }
+        };
+    let ocr_ms_samples: Arc<Mutex<Vec<u64>>> = Arc::new(Mutex::new(Vec::new()));
+    let ocr_pages_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
+    let ocr_failures_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
+
    // Walk the workspace.
    crate::ingest_progress::emit(
        progress,
@@ -300,8 +328,8 @@ pub fn ingest_with_config_opts(
            root: scope.root.to_string_lossy().into_owned(),
        },
    );
-    let connector = FsSourceConnector::new(&app.config)
-        .context("kb-app::ingest: build FsSourceConnector")?;
+    let connector =
+        FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
    let (assets, fs_skips) = connector
        .scan_with_skips(&scope)
        .context("kb-app::ingest: scan workspace")?;
@@ -312,6 +340,20 @@ pub fn ingest_with_config_opts(
        },
    );

+    // v0.20.x Hook 4: emit skip events from scan into log writer.
+    if let Some(ref lw) = log_writer {
+        for ev in &fs_skips.events {
+            if let Ok(mut w) = lw.lock() {
+                let _ = w.write_event(&crate::ingest_log::LogEvent::Skip {
+                    ts: crate::ingest_log::now_ts(),
+                    doc_path: &ev.doc_path,
+                    reason: ev.reason,
+                    detail: ev.detail.as_deref(),
+                });
+            }
+        }
+    }
+
    // Embedder + vector store: build once at the top so the cold-start
    // cost is paid once even when the workspace has 1000 markdown files.
    let embedder = app.embedder()?;
@@ -336,18 +378,14 @@ pub fn ingest_with_config_opts(
    // endpoint) aborts ingest fail-fast — better than silently disabling
    // OCR/caption mid-run.
    let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
-        Some(
-            OllamaVisionOcr::new(&app.config)
-                .context("kb-app::ingest: build OllamaVisionOcr")?,
-        )
+        Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
    } else {
        None
    };
    let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
-        Some(Box::new(
-            OllamaLanguageModel::new(&app.config)
-                .context("kb-app::ingest: build OllamaLanguageModel for caption")?,
-        ))
+        Some(Box::new(OllamaLanguageModel::new(&app.config).context(
+            "kb-app::ingest: build OllamaLanguageModel for caption",
+        )?))
    } else {
        None
    };
@@ -356,6 +394,29 @@ pub fn ingest_with_config_opts(
        caption_llm: caption_llm.as_deref(),
    };

+    // p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
+    // Mirrors the image OCR pattern: built once per ingest; a build failure aborts the run (fail-fast).
+    let pdf_ocr_engine: Option<OllamaVisionOcr> =
+        if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
+            let cfg = &app.config.pdf.ocr;
+            let endpoint = match cfg.endpoint.as_deref() {
+                Some(s) if !s.is_empty() => s.to_string(),
+                _ => app.config.models.llm.endpoint.clone(),
+            };
+            Some(
+                OllamaVisionOcr::from_parts(
+                    endpoint,
+                    cfg.model.clone(),
+                    cfg.languages.clone(),
+                    cfg.max_pixels,
+                    cfg.request_timeout_secs,
+                )
+                .context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
+            )
+        } else {
+            None
+        };
+
    // Pre-load every existing doc_id so we can label `IngestItem.kind`
    // as `New` vs `Updated` correctly. `list_documents` returns one
    // row per `(workspace_path, asset_id)` — index by the deterministic
@@ -381,10 +442,8 @@ pub fn ingest_with_config_opts(
    // current walker scope (config narrowing / include-glob change) is
    // NOT purged — we leave it in place to protect against accidental
    // data loss via config edits.
-    let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> = assets
-        .iter()
-        .map(|a| a.workspace_path.clone())
-        .collect();
+    let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> =
+        assets.iter().map(|a| a.workspace_path.clone()).collect();
    let purged_deleted_files = sweep_deleted_files(
        &app,
        &scanned_paths,
@@ -447,6 +506,13 @@ pub fn ingest_with_config_opts(
            &existing_doc_ids,
            &image_pipeline,
            force_reingest,
+            pdf_ocr_engine.as_ref(),
+            progress,
+            opts.cancel.as_ref(),
+            log_writer.clone(),
+            ocr_ms_samples.clone(),
+            ocr_pages_cnt.clone(),
+            ocr_failures_cnt.clone(),
        );

        let item = match item {
@@ -458,6 +524,16 @@ pub fn ingest_with_config_opts(
                    error = %e,
                    "kb-app::ingest: per-file fatal"
                );
+                // v0.20.x Hook 3: write per-asset error to log writer.
+                if let Some(ref lw) = log_writer {
+                    if let Ok(mut w) = lw.lock() {
+                        let _ = w.write_event(&crate::ingest_log::LogEvent::Error {
+                            ts: crate::ingest_log::now_ts(),
+                            code: "ingest_asset_error",
+                            message: &format!("{e:#}"),
+                        });
+                    }
+                }
                // Note: `error_count += 1` happens below in the
                // `match item.kind { Error => ... }` arm — incrementing
                // here too would double-count (a regression first
@@ -475,6 +551,8 @@ pub fn ingest_with_config_opts(
                    parser_version: None,
                    chunker_version: None,
                    warnings: Vec::new(),
+                    pdf_ocr_pages: None,
+                    pdf_ocr_ms_total: None,
                    error: Some(format!("{e:#}")),
                }
            }
@@ -581,8 +659,7 @@ pub fn ingest_with_config_opts(
        }
    }

-    let duration_ms = u32::try_from(started_instant.elapsed().as_millis())
-        .unwrap_or(u32::MAX);
+    let duration_ms = u32::try_from(started_instant.elapsed().as_millis()).unwrap_or(u32::MAX);
    let finished_at = time::OffsetDateTime::now_utc();

    // Record the ingest_runs row with aggregate counts.
@@ -682,6 +759,29 @@ pub fn ingest_with_config_opts(
        }
    }

+    // v0.20.x Hook 1 exit: write summary record + flush log writer.
+    if let Some(ref lw) = log_writer {
+        if let Ok(mut w) = lw.lock() {
+            let run_id = w.run_id().to_string();
+            let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
+            let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
+            let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
+            let summary = crate::ingest_log::IngestSummary::new(
+                crate::ingest_log::now_ts(),
+                run_id,
+                scanned_count,
+                new_count,
+                error_count,
+                pages,
+                failures,
+                &ms_samples,
+                started_instant.elapsed().as_millis() as u64,
+            );
+            let _ = w.write_summary(&summary);
+            let _ = w.flush();
+        }
+    }
+
    Ok(IngestReport {
        scope,
        scanned: scanned_count,
@@ -840,8 +940,8 @@ fn try_skip_unchanged(

    if stored_is_tier3_fallback {
        // Embedder version still must match.
-        let embedder_match = existing_doc.last_embedding_version.as_ref()
-            == current_embedding_version;
+        let embedder_match =
+            existing_doc.last_embedding_version.as_ref() == current_embedding_version;
        if !embedder_match {
            return Ok(None);
        }
@@ -863,6 +963,8 @@ fn try_skip_unchanged(
            parser_version: Some(existing_doc.parser_version.clone()),
            chunker_version: existing_doc.last_chunker_version.clone(),
            warnings: Vec::new(),
+            pdf_ocr_pages: None,
+            pdf_ocr_ms_total: None,
            error: None,
        }));
    }
@@ -883,23 +985,17 @@ fn try_skip_unchanged(
        // sentinel removes every doc at this path (the new doc_id is
        // not yet known here — it's computed downstream from the new
        // PARSER_VERSION).
-        purge_workspace_path_for_parser_bump(app, asset).with_context(|| {
-            format!(
-                "parser-bump orphan purge at {}",
-                asset.workspace_path.0
-            )
-        })?;
+        purge_workspace_path_for_parser_bump(app, asset)
+            .with_context(|| format!("parser-bump orphan purge at {}", asset.workspace_path.0))?;
        return Ok(None);
    }
    // 3. Chunker unchanged.
-    let chunker_match = existing_doc.last_chunker_version.as_ref()
-        == Some(current_chunker_version);
+    let chunker_match = existing_doc.last_chunker_version.as_ref() == Some(current_chunker_version);
    if !chunker_match {
        return Ok(None);
    }
    // 4. Embedder unchanged.
-    let embedder_match = existing_doc.last_embedding_version.as_ref()
-        == current_embedding_version;
+    let embedder_match = existing_doc.last_embedding_version.as_ref() == current_embedding_version;
    if !embedder_match {
        return Ok(None);
    }
@@ -921,6 +1017,8 @@ fn try_skip_unchanged(
        parser_version: Some(existing_doc.parser_version.clone()),
        chunker_version: existing_doc.last_chunker_version.clone(),
        warnings: Vec::new(),
+        pdf_ocr_pages: None,
+        pdf_ocr_ms_total: None,
        error: None,
    }))
 }
@@ -933,7 +1031,8 @@ fn try_skip_unchanged(
 fn ext_for_skip_warning(path: &str) -> String {
    std::path::Path::new(path)
        .extension()
-        .and_then(|s| s.to_str()).map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
+        .and_then(|s| s.to_str())
+        .map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
 }

 /// p9-fb-25: render the `IngestItem.warnings` line for a Skipped
@@ -963,6 +1062,13 @@ fn ingest_one_asset(
    existing_doc_ids: &std::collections::HashSet<String>,
    image_pipeline: &ImagePipeline<'_>,
    force_reingest: bool,
+    pdf_ocr_engine: Option<&OllamaVisionOcr>,
+    progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
+    cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
+    log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
+    ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
+    ocr_pages_cnt: Arc<Mutex<u32>>,
+    ocr_failures_cnt: Arc<Mutex<u32>>,
 ) -> anyhow::Result<kebab_core::IngestItem> {
    tracing::debug!(
        target: "kebab-app::ingest",
@@ -998,14 +1104,37 @@ fn ingest_one_asset(
                vector_store,
                existing_doc_ids,
                force_reingest,
+                pdf_ocr_engine,
+                progress,
+                cancel,
+                log_writer,
+                ocr_ms_samples,
+                ocr_pages_cnt,
+                ocr_failures_cnt,
            );
        }
        // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added.
        MediaType::Code(lang)
-            if matches!(lang.as_str(),
-                "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin"
-                | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
-                | "shell" | "c" | "cpp") =>
+            if matches!(
+                lang.as_str(),
+                "rust"
+                    | "python"
+                    | "typescript"
+                    | "javascript"
+                    | "go"
+                    | "java"
+                    | "kotlin"
+                    | "yaml"
+                    | "dockerfile"
+                    | "toml"
+                    | "json"
+                    | "xml"
+                    | "groovy"
+                    | "go-mod"
+                    | "shell"
+                    | "c"
+                    | "cpp"
+            ) =>
        {
            return ingest_one_code_asset(
                app,
@@ -1032,6 +1161,8 @@ fn ingest_one_asset(
                parser_version: None,
                chunker_version: None,
                warnings: vec![unsupported_media_warning(&asset.workspace_path.0)],
+                pdf_ocr_pages: None,
+                pdf_ocr_ms_total: None,
                error: None,
            });
        }
@@ -1051,6 +1182,8 @@ fn ingest_one_asset(
                parser_version: None,
                chunker_version: None,
                warnings: vec!["kb:// URI not yet supported".to_string()],
+                pdf_ocr_pages: None,
+                pdf_ocr_ms_total: None,
                error: None,
            });
        }
@@ -1081,16 +1214,17 @@ fn ingest_one_asset(

    // Frontmatter — `parse_frontmatter` returns Ok even on malformed
    // frontmatter (warnings are surfaced through the `Vec<Warning>`).
-    let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints)
-        .context("kb-parse-md::parse_frontmatter")?;
+    let (metadata, fm_span, fm_warns) =
+        parse_frontmatter(&bytes, &body_hints).context("kb-parse-md::parse_frontmatter")?;

    let body_offset_lines = match fm_span {
        Some(span) => count_lines_in(&bytes[..span.end]),
        None => 0,
    };

-    let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
-        .context("kb-parse-md::parse_blocks")?;
+    let (parsed_blocks, blk_warns) =
+        parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
+            .context("kb-parse-md::parse_blocks")?;

    let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len());
    all_warnings.extend(fm_warns);
@@ -1103,14 +1237,9 @@ fn ingest_one_asset(
        .map(|w| format!("{:?}: {}", w.kind, w.note))
        .collect();

-    let mut canonical = build_canonical_document(
-        asset,
-        metadata,
-        parsed_blocks,
-        parser_version,
-        all_warnings,
-    )
-    .context("kb-parse-md::build_canonical_document")?;
+    let mut canonical =
+        build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings)
+            .context("kb-parse-md::build_canonical_document")?;

    let chunks = MdHeadingV1Chunker
        .chunk(&canonical, chunk_policy)
@@ -1177,9 +1306,7 @@ fn ingest_one_asset(
                    dimensions,
                })
                .collect();
-            vec_store
-                .upsert(&records)
-                .context("VectorStore::upsert")?;
+            vec_store.upsert(&records).context("VectorStore::upsert")?;
        }
    }

@@ -1200,6 +1327,8 @@ fn ingest_one_asset(
        parser_version: Some(parser_version.clone()),
        chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
        warnings: warning_notes,
+        pdf_ocr_pages: None,
+        pdf_ocr_ms_total: None,
        error: None,
    })
 }
@@ -1242,9 +1371,9 @@ fn ingest_one_image_asset(
                chunk_count: None,
                parser_version: None,
                chunker_version: None,
-                warnings: vec![
-                    "kb:// URI not yet supported".to_string(),
-                ],
+                warnings: vec!["kb:// URI not yet supported".to_string()],
+                pdf_ocr_pages: None,
+                pdf_ocr_ms_total: None,
                error: None,
            });
        }
@@ -1354,17 +1483,19 @@ fn ingest_one_image_asset(
                "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
                other.map(|b| std::mem::discriminant(b))
            );
-            canonical.provenance.events.push(kebab_core::ProvenanceEvent {
-                at: now,
-                agent: "kb-app".to_string(),
-                kind: kebab_core::ProvenanceKind::Warning,
-                note: Some(
-                    "image document missing leading ImageRef block — OCR/caption skipped"
-                        .to_string(),
-                ),
-            });
-            warning_notes
-                .push("ImageDispatchAnomaly: missing ImageRef block".to_string());
+            canonical
+                .provenance
+                .events
+                .push(kebab_core::ProvenanceEvent {
+                    at: now,
+                    agent: "kb-app".to_string(),
+                    kind: kebab_core::ProvenanceKind::Warning,
+                    note: Some(
+                        "image document missing leading ImageRef block — OCR/caption skipped"
+                            .to_string(),
+                    ),
+                });
+            warning_notes.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
        }
    }

@@ -1455,6 +1586,8 @@ fn ingest_one_image_asset(
        parser_version: Some(canonical.parser_version.clone()),
        chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
        warnings: warning_notes,
+        pdf_ocr_pages: None,
+        pdf_ocr_ms_total: None,
        error: None,
    })
 }
@@ -1510,10 +1643,7 @@ fn record_image_analysis_failure(
 /// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` /
 ///    `chunks` / `embedding_records`). The `assets` row stays — same
 ///    bytes, same asset_id, only the derived `doc_id` changed.
-fn purge_workspace_path_for_parser_bump(
-    app: &App,
-    asset: &RawAsset,
-) -> anyhow::Result<()> {
+fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow::Result<()> {
    let path = &asset.workspace_path.0;
    let stale = app
        .sqlite
@@ -1648,21 +1778,19 @@ fn sweep_deleted_files(
        }

        // File is truly absent → purge.
-        let chunk_ids = match kebab_store_sqlite::purge_deleted_workspace_path(
-            &app.sqlite,
-            &stored_path,
-        ) {
-            Ok(ids) => ids,
-            Err(e) => {
-                tracing::warn!(
-                    target: "kebab-app",
-                    path = %stored_path.0,
-                    error = %e,
-                    "sweep_deleted_files: purge failed; skipping this path"
-                );
-                continue;
-            }
-        };
+        let chunk_ids =
+            match kebab_store_sqlite::purge_deleted_workspace_path(&app.sqlite, &stored_path) {
+                Ok(ids) => ids,
+                Err(e) => {
+                    tracing::warn!(
+                        target: "kebab-app",
+                        path = %stored_path.0,
+                        error = %e,
+                        "sweep_deleted_files: purge failed; skipping this path"
+                    );
+                    continue;
+                }
+            };

        // Purge associated vectors (best-effort; partial failure
        // acceptable — orphan vectors get cleaned by `kebab reset
@@ -1725,6 +1853,13 @@ fn ingest_one_pdf_asset(
    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
    existing_doc_ids: &std::collections::HashSet<String>,
    force_reingest: bool,
+    pdf_ocr_engine: Option<&OllamaVisionOcr>,
+    progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
+    cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
+    log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
+    ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
+    ocr_pages_cnt: Arc<Mutex<u32>>,
+    ocr_failures_cnt: Arc<Mutex<u32>>,
 ) -> anyhow::Result<kebab_core::IngestItem> {
    let path = match &asset.source_uri {
        SourceUri::File(p) => p.clone(),
@@ -1739,9 +1874,9 @@ fn ingest_one_pdf_asset(
                chunk_count: None,
                parser_version: None,
                chunker_version: None,
-                warnings: vec![
-                    "kb:// URI not yet supported".to_string(),
-                ],
+                warnings: vec!["kb:// URI not yet supported".to_string()],
+                pdf_ocr_pages: None,
+                pdf_ocr_ms_total: None,
                error: None,
            });
        }
@@ -1778,6 +1913,105 @@ fn ingest_one_pdf_asset(
        .extract_for(&asset.media_type, &ctx, &bytes)
        .context("kb-app::extract_for (pdf)")?;

+    // v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
+    // dispatch invariant preserved — extract_for stays the normal entry).
+    let (pdf_ocr_pages, pdf_ocr_ms_total): (Option<u32>, Option<u64>) =
+        if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
+            match pdf_ocr_engine {
+                Some(engine) => {
+                    let ocr_opts = crate::pdf_ocr_apply::PdfOcrOpts {
+                        enabled: app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on,
+                        always_on: app.config.pdf.ocr.always_on,
+                        valid_ratio_threshold: app.config.pdf.ocr.valid_ratio_threshold,
+                        min_char_count: app.config.pdf.ocr.min_char_count,
+                        lang_hint: app.config.pdf.ocr.lang_hint.clone().map(kebab_core::Lang),
+                        cancel: cancel.cloned(),
+                    };
+                    // v0.20.x Hook 2: pre-clone Arcs for capture by OCR closure.
+                    let lw_for_ocr = log_writer.clone();
+                    let samples_for_ocr = ocr_ms_samples.clone();
+                    let pages_for_ocr = ocr_pages_cnt.clone();
+                    let failures_for_ocr = ocr_failures_cnt.clone();
+                    let doc_path_for_log = asset.workspace_path.0.clone();
+
+                    let summary = crate::pdf_ocr_apply::apply_ocr_to_pdf_pages(
+                        &mut canonical,
+                        engine,
+                        &bytes,
+                        &ocr_opts,
+                        |p| match p {
+                            crate::pdf_ocr_apply::PdfOcrProgress::Started { page } => {
+                                if let Some(sender) = progress {
+                                    let _ = sender.send(
+                                        crate::ingest_progress::IngestEvent::PdfOcrStarted { page },
+                                    );
+                                }
+                            }
+                            crate::pdf_ocr_apply::PdfOcrProgress::Finished {
+                                page,
+                                ms,
+                                chars,
+                                skipped,
+                                image_byte_size,
+                                image_width,
+                                image_height,
+                                ref failure_reason,
+                            } => {
+                                if let Some(sender) = progress {
+                                    let _ = sender.send(
+                                        crate::ingest_progress::IngestEvent::PdfOcrFinished {
+                                            page,
+                                            ms,
+                                            chars,
+                                            ocr_engine: engine.engine_name().to_string(),
+                                            skipped,
+                                            image_byte_size,
+                                            image_width,
+                                            image_height,
+                                            failure_reason: failure_reason.clone(),
+                                        },
+                                    );
+                                }
+                                // v0.20.x Hook 2: write OCR event to log writer.
+                                let success = !skipped && failure_reason.is_none();
+                                if let Some(ref lw) = lw_for_ocr {
+                                    if let Ok(mut w) = lw.lock() {
+                                        let _ = w.write_event(&crate::ingest_log::LogEvent::Ocr {
+                                            ts: crate::ingest_log::now_ts(),
+                                            doc_path: &doc_path_for_log,
+                                            page,
+                                            image_byte_size,
+                                            image_width,
+                                            image_height,
+                                            ms,
+                                            chars,
+                                            success,
+                                            reason: failure_reason.as_deref(),
+                                            ocr_engine: engine.engine_name(),
+                                        });
+                                    }
+                                }
+                                if let Ok(mut p) = pages_for_ocr.lock() {
+                                    *p += 1;
+                                }
+                                if success {
+                                    if let Ok(mut s) = samples_for_ocr.lock() {
+                                        s.push(ms);
+                                    }
+                                } else if let Ok(mut f) = failures_for_ocr.lock() {
+                                    *f += 1;
+                                }
+                            }
+                        },
+                    )?;
+                    (Some(summary.pages_ocrd), Some(summary.ms_total))
+                }
+                None => (Some(0), Some(0)),
+            }
+        } else {
+            (None, None)
+        };
+
    // Per-medium chunker selection: PDF docs always use pdf-page-v1
    // regardless of `config.chunking.chunker_version`. The chunker
    // validates every block carries `SourceSpan::Page`; failure here
@@ -1818,9 +2052,7 @@ fn ingest_one_pdf_asset(
                kind: EmbeddingKind::Document,
            })
            .collect();
-        let vectors = emb
-            .embed(&inputs)
-            .context("Embedder::embed (pdf chunks)")?;
+        let vectors = emb.embed(&inputs).context("Embedder::embed (pdf chunks)")?;
        let model_id = emb.model_id();
        let model_version = emb.model_version();
        let dimensions = emb.dimensions();
@@ -1879,6 +2111,8 @@ fn ingest_one_pdf_asset(
        parser_version: Some(canonical.parser_version.clone()),
        chunker_version: Some(chunker.chunker_version()),
        warnings,
+        pdf_ocr_pages,
+        pdf_ocr_ms_total,
        error: None,
    })
 }
@@ -1902,7 +2136,7 @@ fn ingest_one_code_asset(
    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
    existing_doc_ids: &std::collections::HashSet<String>,
    force_reingest: bool,
-    code_lang: &str,                        // <-- NEW (p10-1b Task D)
+    code_lang: &str, // <-- NEW (p10-1b Task D)
 ) -> anyhow::Result<kebab_core::IngestItem> {
    let path = match &asset.source_uri {
        SourceUri::File(p) => p.clone(),
@@ -1917,9 +2151,9 @@ fn ingest_one_code_asset(
                chunk_count: None,
                parser_version: None,
                chunker_version: None,
-                warnings: vec![
-                    "kb:// URI not yet supported".to_string(),
-                ],
+                warnings: vec!["kb:// URI not yet supported".to_string()],
+                pdf_ocr_pages: None,
+                pdf_ocr_ms_total: None,
                error: None,
            });
        }
@@ -1927,43 +2161,43 @@ fn ingest_one_code_asset(

    // p10-1b Task D/G/J: parser_version per-lang.
    let parser_version = match code_lang {
-        "rust"       => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
-        "python"     => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
+        "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
+        "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
        "typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()),
        "javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()),
        "go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()),
        "java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()),
        "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()),
        // p10-2: Tier 2 has no parse step — sentinel "none-v1".
-        "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
-            => ParserVersion("none-v1".to_string()),
+        "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => {
+            ParserVersion("none-v1".to_string())
+        }
        // p10-3: shell direct routes to Tier 3 (no parse step).
        "shell" => ParserVersion("none-v1".to_string()),
        // p10-1D: C + C++ AST extractors.
-        "c"   => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
+        "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
        "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()),
        other => anyhow::bail!("unsupported code_lang: {other}"),
    };

    // p10-1b Task D/G/J/L: chunker_version per-lang.
    let mut chunker_version = match code_lang {
-        "rust"       => CodeRustAstV1Chunker.chunker_version(),
-        "python"     => CodePythonAstV1Chunker.chunker_version(),
+        "rust" => CodeRustAstV1Chunker.chunker_version(),
+        "python" => CodePythonAstV1Chunker.chunker_version(),
        "typescript" => CodeTsAstV1Chunker.chunker_version(),
        "javascript" => CodeJsAstV1Chunker.chunker_version(),
        "go" => CodeGoAstV1Chunker.chunker_version(),
        "java" => CodeJavaAstV1Chunker.chunker_version(),
-        "kotlin"     => CodeKotlinAstV1Chunker.chunker_version(),
+        "kotlin" => CodeKotlinAstV1Chunker.chunker_version(),
        // p10-2 Tier 2:
-        "yaml"       => K8sManifestResourceV1Chunker.chunker_version(),
+        "yaml" => K8sManifestResourceV1Chunker.chunker_version(),
        "dockerfile" => DockerfileFileV1Chunker.chunker_version(),
-        "toml" | "json" | "xml" | "groovy" | "go-mod"
-                     => ManifestFileV1Chunker.chunker_version(),
+        "toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker.chunker_version(),
        // p10-3:
-        "shell"      => CodeTextParagraphV1Chunker.chunker_version(),
+        "shell" => CodeTextParagraphV1Chunker.chunker_version(),
        // p10-1D: C + C++ AST chunkers.
-        "c"          => CodeCAstV1Chunker.chunker_version(),
-        "cpp"        => CodeCppAstV1Chunker.chunker_version(),
+        "c" => CodeCAstV1Chunker.chunker_version(),
+        "cpp" => CodeCppAstV1Chunker.chunker_version(),
        other => anyhow::bail!("unreachable chunker_version: {other}"),
    };

@@ -2026,8 +2260,12 @@ fn ingest_one_code_asset(
    // Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate.
    let mut canonical = match canonical_result {
        Ok(d) => d,
-        Err(e) if code_lang == "shell"
-            || matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") =>
+        Err(e)
+            if code_lang == "shell"
+                || matches!(
+                    code_lang,
+                    "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
+                ) =>
        {
            return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang");
        }
@@ -2051,7 +2289,10 @@ fn ingest_one_code_asset(
    // Tier 2 langs already have "none-v1" parser_version normally, so exclude them
    // from the extract_fell_back guard with the !matches! exclusion.
    let extract_fell_back = canonical.parser_version.0 == "none-v1"
-        && !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell");
+        && !matches!(
+            code_lang,
+            "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"
+        );

    let chunks_result: anyhow::Result<Vec<Chunk>> = if extract_fell_back {
        // Tier 1 lang whose extractor errored — go straight to Tier 3 chunker.
@@ -2110,7 +2351,7 @@ fn ingest_one_code_asset(
    // "shell" direct path is already Tier 3 — don't retry-double-up.
    let chunks: Vec<Chunk> = match chunks_result {
        Ok(v) if !v.is_empty() => v,
-        other if code_lang == "shell" => other?,  // shell propagates directly
+        other if code_lang == "shell" => other?, // shell propagates directly
        Ok(_empty) => {
            tracing::warn!(
                workspace_path = %asset.workspace_path.0,
@@ -2134,7 +2375,9 @@ fn ingest_one_code_asset(
            canonical.parser_version = ParserVersion("none-v1".to_string());
            CodeTextParagraphV1Chunker
                .chunk(&canonical, chunk_policy)
-                .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")?
+                .context(
+                    "kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)",
+                )?
        }
    };

@@ -2226,6 +2469,8 @@ fn ingest_one_code_asset(
        parser_version: Some(canonical.parser_version.clone()),
        chunker_version: Some(chunker_version),
        warnings,
+        pdf_ocr_pages: None,
+        pdf_ocr_ms_total: None,
        error: None,
    })
 }
@@ -2260,13 +2505,7 @@ fn synthesize_tier2_document(
        symbol: Some("<file>".to_string()),
        lang: Some(code_lang.to_string()),
    };
-    let block_id: BlockId = id_for_block(
-        &doc_id,
-        "code",
-        &[],
-        0,
-        &span,
-    );
+    let block_id: BlockId = id_for_block(&doc_id, "code", &[], 0, &span);
    let block = kebab_core::Block::Code(CodeBlock {
        common: CommonBlock {
            block_id,
@@ -2312,7 +2551,9 @@ fn synthesize_tier2_document(
    };

    let title = {
-        let fname = asset.workspace_path.0
+        let fname = asset
+            .workspace_path
+            .0
            .rsplit('/')
            .next()
            .unwrap_or(&asset.workspace_path.0);
@@ -2558,7 +2799,9 @@ pub fn ask_with_session_with_config(
 /// `data_dir_writable` check probes the resolved `storage.data_dir`
 /// from that config (so `--config` users see their custom paths
 /// reflected in the report rather than the XDG defaults).
-pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow::Result<DoctorReport> {
+pub fn doctor_with_config_path(
+    config_path: Option<&std::path::Path>,
+) -> anyhow::Result<DoctorReport> {
    tracing::debug!("doctor() invoked");
    let mut checks = Vec::new();

@@ -2576,11 +2819,7 @@ pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow:
    } else if config_path.is_some() {
        // Explicit `--config <path>` that doesn't exist is a hard error
        // — defaults would silently mask the user's intent.
-        (
-            false,
-            format!("{} (not found)", cfg_path.display()),
-            None,
-        )
+        (false, format!("{} (not found)", cfg_path.display()), None)
    } else {
        // No `--config` and no XDG file: defaults are always loadable.
        (true, format!("{} (defaults)", cfg_path.display()), None)
@@ -2666,16 +2905,18 @@ pub fn ingest_file_with_config(
    path: &std::path::Path,
 ) -> anyhow::Result<IngestReport> {
    if !path.exists() {
-        anyhow::bail!("ingest-file: source path does not exist: {}", path.display());
+        anyhow::bail!(
+            "ingest-file: source path does not exist: {}",
+            path.display()
+        );
    }
    if !path.is_file() {
        anyhow::bail!("ingest-file: not a regular file: {}", path.display());
    }

-    let ext_raw = path
-        .extension()
-        .and_then(|e| e.to_str())
-        .ok_or_else(|| anyhow::anyhow!("ingest-file: source has no extension: {}", path.display()))?;
+    let ext_raw = path.extension().and_then(|e| e.to_str()).ok_or_else(|| {
+        anyhow::anyhow!("ingest-file: source has no extension: {}", path.display())
+    })?;
    let ext = ext_raw.to_lowercase();

    const SUPPORTED_EXTS: &[&str] = &["md", "pdf", "png", "jpg", "jpeg"];
@@ -2752,11 +2993,7 @@ pub fn ingest_stdin_with_config(
    let external_dir = crate::external::ensure_external_dir(&workspace_root)?;
    crate::external::ensure_kebabignore_entry(&workspace_root)?;

-    let dest = crate::external::copy_to_external(
-        &external_dir,
-        wrapped.as_bytes(),
-        "md",
-    )?;
+    let dest = crate::external::copy_to_external(&external_dir, wrapped.as_bytes(), "md")?;

    ingest_file_with_config(config, &dest)
 }
@@ -2764,7 +3001,10 @@ pub fn ingest_stdin_with_config(
 /// Returns true if `source_path` matches any `.kebabignore` pattern
 /// rooted at `workspace_root`. Used by `ingest_file_with_config` to
 /// emit a stderr warn before bypassing the ignore.
-fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::path::Path) -> bool {
+fn check_kebabignore_match(
+    workspace_root: &std::path::Path,
+    source_path: &std::path::Path,
+) -> bool {
    let kebabignore = workspace_root.join(".kebabignore");
    if !kebabignore.exists() {
        return false;
@@ -2785,5 +3025,7 @@ fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::
        Ok(m) => m,
        Err(_) => return false,
    };
-    matcher.matched(source_path, source_path.is_dir()).is_ignore()
+    matcher
+        .matched(source_path, source_path.is_dir())
+        .is_ignore()
 }
--- a/crates/kebab-app/src/logging.rs
+++ b/crates/kebab-app/src/logging.rs
@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
    let (nb, guard) = tracing_appender::non_blocking(file_appender);

    let env_filter = match level {
-        LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
+        LogLevel::Default => {
+            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
+        }
        LogLevel::Verbose => EnvFilter::new("info"),
        LogLevel::Debug => EnvFilter::new("debug"),
    };
--- a/crates/kebab-app/src/pdf_ocr_apply.rs
+++ b/crates/kebab-app/src/pdf_ocr_apply.rs
@@ -0,0 +1,323 @@
+// crates/kebab-app/src/pdf_ocr_apply.rs
+//
+// PDF post-extract OCR enrichment. To preserve parser isolation, this helper lives in
+// kebab-app so that kebab-parse-pdf never has to import kebab-parse-image::OcrEngine.
+// It is the PDF-page counterpart of the image path's apply_ocr
+// (kebab-parse-image::ocr::apply_ocr): the image path mutates ImageRefBlock.ocr, whereas
+// the PDF path either mutates Block::Paragraph.text / inlines in place (single OCR
+// fallback) or pushes a new Block::Paragraph (always_on dual-block).
+
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+use std::time::Instant;
+
+use anyhow::{Context, Result};
+use kebab_core::{
+    Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
+    SourceSpan, TextBlock, id_for_block,
+};
+use kebab_parse_image::OcrEngine;
+use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
+use lopdf::Document as LopdfDocument;
+use time::OffsetDateTime;
+use tracing::warn;
+
+/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
+/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
+/// (`kebab_app::ingest_one_pdf_asset`) fills these from
+/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
+pub struct PdfOcrOpts {
+    /// Master switch. `false` short-circuits to
+    /// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
+    pub enabled: bool,
+    /// `true` → OCR every page: a page whose text already passes both thresholds gets an
+    /// extra `Block::Paragraph` (dual-block); any other page is mutated in place.
+    /// `false` → OCR only pages failing `min_char_count` or `valid_ratio_threshold` (in place).
+    pub always_on: bool,
+    /// 0.0..=1.0. A page whose text-detect block scores below this threshold in
+    /// `compute_valid_char_ratio` falls back to OCR. Default `0.5`.
+    pub valid_ratio_threshold: f32,
+    /// A page whose text-detect block has fewer chars than this falls back to OCR. Default `20`.
+    /// An empty page (cover, blank separator) is below it too; it is skipped only if it has no DCTDecode image.
+    pub min_char_count: u32,
+    /// Language hint forwarded to the OCR engine (e.g. `Lang("kor".into())`).
+    /// `None` → no hint passed to engine.
+    pub lang_hint: Option<Lang>,
+    /// Optional per-page cancellation handle, checked at the start of each page
+    /// loop iteration; once it reads `true` a `cancelled mid-PDF` error is returned.
+    /// Specified in plan §6 E4 + verifier LOW L-1 resolution + spec §4.8 line 1159.
+    pub cancel: Option<Arc<AtomicBool>>,
+}
+
+/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's
+/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2).
+#[derive(Debug)]
+pub struct PdfOcrSummary {
+    /// Number of pages that actually went through the OCR pipeline (skipped pages excluded).
+    pub pages_ocrd: u32,
+    /// Cumulative wall-clock duration of successful OCR engine calls (ms).
+    /// Accumulated with `saturating_add`, so it clamps at `u64::MAX` instead of wrapping.
+    pub ms_total: u64,
+}
+
+/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
+/// classifies each page via `text_quality::compute_valid_char_ratio` +
+/// `min_char_count`, and either:
+/// - skips (vector PDF + sufficient text + `always_on=false`),
+/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
+///   (scanned/mojibake page), or
+/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
+///   vector page).
+///
+/// Errors:
+/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
+/// - lopdf re-parse failure, or an `Err` from `extract_dctdecode_page_image` → `Err(...)`.
+/// - per-page OCR engine failure or a missing DCTDecode image → `ProvenanceKind::Warning`
+///   event push + `emit_progress(Finished { skipped: true })` + continue
+///   (no `Err` propagation).
+///
+/// Panics if a page has no `Block::Paragraph`. See spec §4.1 + §4.4 for the full pipeline.
+pub fn apply_ocr_to_pdf_pages<F>(
+    canonical: &mut CanonicalDocument,
+    engine: &dyn OcrEngine,
+    pdf_bytes: &[u8],
+    opts: &PdfOcrOpts,
+    mut emit_progress: F,
+) -> Result<PdfOcrSummary>
+where
+    F: FnMut(PdfOcrProgress),
+{
+    if !opts.enabled {
+        return Ok(PdfOcrSummary {
+            pages_ocrd: 0,
+            ms_total: 0,
+        });
+    }
+    let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
+        .context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
+    let page_count = pdf_doc.get_pages().len() as u32;
+
+    let mut new_events: Vec<ProvenanceEvent> = Vec::new();
+    let mut ocr_blocks: Vec<Block> = Vec::new();
+    let mut pages_ocrd: u32 = 0;
+    let mut ms_total: u64 = 0;
+
+    // Map each page to its block index in canonical.blocks (needed to decide between an
+    // in-place mutate of the text-detect block and a dual-block push).
+    // PdfTextExtractor is expected to produce 1 Block::Paragraph + SourceSpan::Page per
+    // page (§1.4) — the lookup below relies on that invariant.
+    for page_num in 1..=page_count {
+        if let Some(cancel) = &opts.cancel {
+            if cancel.load(std::sync::atomic::Ordering::Relaxed) {
+                anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
+            }
+        }
+
+        let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
+        let text = match &canonical.blocks[text_block_idx] {
+            Block::Paragraph(tb) => tb.text.clone(),
+            _ => String::new(),
+        };
+        let chars = text.chars().count() as u32;
+        let valid_ratio = compute_valid_char_ratio(&text);
+        let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
+
+        // Decision matrix:
+        //   always_on=true → OCR every page (dual-block when needs_ocr=false, else in place).
+        //   always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
+        //   always_on=false + needs_ocr=false → skip.
+        let do_ocr = opts.always_on || needs_ocr;
+        if !do_ocr {
+            continue;
+        }
+
+        emit_progress(PdfOcrProgress::Started { page: page_num });
+
+        let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
+            b
+        } else {
+            let note = format!(
+                "page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
+            );
+            warn!(target: "kebab-app", "{}", note);
+            new_events.push(ProvenanceEvent {
+                at: OffsetDateTime::now_utc(),
+                agent: "kb-parse-pdf".to_string(),
+                kind: ProvenanceKind::Warning,
+                note: Some(note),
+            });
+            emit_progress(PdfOcrProgress::Finished {
+                page: page_num,
+                ms: 0,
+                chars: 0,
+                skipped: true,
+                image_byte_size: None,
+                image_width: None,
+                image_height: None,
+                failure_reason: None,
+            });
+            continue;
+        };
+
+        let start = Instant::now();
+        let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
+            Ok(t) => t,
+            Err(e) => {
+                // OCR failure: warning event + skip (text-detect block left untouched).
+                let note = format!(
+                    "page={} OCR failed engine={} version={} err={}",
+                    page_num,
+                    engine.engine_name(),
+                    engine.engine_version(),
+                    e
+                );
+                warn!(target: "kebab-app", "{}", note);
+                new_events.push(ProvenanceEvent {
+                    at: OffsetDateTime::now_utc(),
+                    agent: "kb-parse-pdf".to_string(),
+                    kind: ProvenanceKind::Warning,
+                    note: Some(note),
+                });
+                emit_progress(PdfOcrProgress::Finished {
+                    page: page_num,
+                    ms: start.elapsed().as_millis() as u64,
+                    chars: 0,
+                    skipped: true,
+                    image_byte_size: Some(page_image_bytes.len() as u64),
+                    image_width: None,
+                    image_height: None,
+                    failure_reason: Some("ocr_error".to_string()),
+                });
+                continue;
+            }
+        };
+        let elapsed_ms = start.elapsed().as_millis() as u64;
+        let chars_ocr = ocr.joined.chars().count() as u32;
+
+        pages_ocrd = pages_ocrd.saturating_add(1);
+        ms_total = ms_total.saturating_add(elapsed_ms);
+
+        if opts.always_on && !needs_ocr {
+            // dual-block path: push a new Block::Paragraph, ordinal = page-1 + page_count.
+            let ocr_ordinal = (page_num - 1) + page_count;
+            let span_ocr = SourceSpan::Page {
+                page: page_num,
+                char_start: Some(0),
+                char_end: Some(chars_ocr),
+            };
+            let block_id =
+                id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
+            let common = CommonBlock {
+                block_id,
+                heading_path: Vec::new(),
+                source_span: span_ocr,
+            };
+            ocr_blocks.push(Block::Paragraph(TextBlock {
+                common,
+                text: ocr.joined.clone(),
+                inlines: if ocr.joined.is_empty() {
+                    Vec::new()
+                } else {
+                    vec![Inline::Text {
+                        text: ocr.joined.clone(),
+                    }]
+                },
+            }));
+        } else {
+            // in-place mutate: replace the text/inlines of the text-detect block (empty or
+            // low-valid). block_id / ordinal are preserved — only the span's char_end changes.
+            if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
+                tb.text = ocr.joined.clone();
+                tb.inlines = if ocr.joined.is_empty() {
+                    Vec::new()
+                } else {
+                    vec![Inline::Text {
+                        text: ocr.joined.clone(),
+                    }]
+                };
+                if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+                    *char_end = Some(chars_ocr);
+                }
+            }
+        }
+
+        new_events.push(ProvenanceEvent {
+            at: OffsetDateTime::now_utc(),
+            agent: "kb-parse-pdf".to_string(),
+            kind: ProvenanceKind::OcrApplied,
+            note: Some(format!(
+                "page={} engine={} version={} regions={} ms={} chars={}",
+                page_num,
+                engine.engine_name(),
+                engine.engine_version(),
+                ocr.regions.len(),
+                elapsed_ms,
+                chars_ocr
+            )),
+        });
+
+        emit_progress(PdfOcrProgress::Finished {
+            page: page_num,
+            ms: elapsed_ms,
+            chars: chars_ocr,
+            skipped: false,
+            image_byte_size: Some(page_image_bytes.len() as u64),
+            image_width: None,
+            image_height: None,
+            failure_reason: None,
+        });
+    }
+
+    canonical.blocks.extend(ocr_blocks); // dual-block paragraphs land after every pre-existing block
+    canonical.provenance.events.extend(new_events); // events are attached only when the loop finishes without Err
+    Ok(PdfOcrSummary {
+        pages_ocrd,
+        ms_total,
+    })
+}
+
+/// Index of the first `Block::Paragraph` whose page span equals `page_num`; panics if none.
+fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
+    let on_page = |candidate: &Block| {
+        let Block::Paragraph(tb) = candidate else {
+            return false;
+        };
+        matches!(tb.common.source_span, SourceSpan::Page { page, .. } if page == page_num)
+    };
+    let found = blocks.iter().position(on_page);
+    // Relies on the upstream one-paragraph-per-page invariant named in the message.
+    found.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
+}
+
+/// Per-page OCR progress event, delivered by calling the caller's `emit_progress` closure.
+/// `ingest_one_pdf_asset` (Step 6) carries these as IngestEvent::PdfOcrStarted /
+/// PdfOcrFinished (spec §4.6.1 wire schema).
+pub enum PdfOcrProgress {
+    /// Emitted once a page is selected for OCR, before image extraction and `engine.recognize`.
+    Started {
+        /// 1-based PDF page number.
+        page: u32,
+    },
+    /// Emitted when OCR ends for a page (success, skip and failure alike).
+    Finished {
+        /// 1-based PDF page number.
+        page: u32,
+        /// `engine.recognize` wall-clock duration. On the skip paths the meaning is mixed:
+        /// `0` when no DCTDecode image exists, the elapsed latency when the OCR engine failed.
+        ms: u64,
+        /// Char count of the OCR result text. `0` when skipped.
+        chars: u32,
+        /// `true` = skipped because of a missing DCTDecode image or an OCR engine failure.
+        /// `false` = OCR completed normally.
+        skipped: bool,
+        /// v0.20.x ingest log: raster image byte size (additive, optional).
+        image_byte_size: Option<u64>,
+        /// v0.20.x ingest log: raster image width in pixels (additive, optional).
+        image_width: Option<u32>,
+        /// v0.20.x ingest log: raster image height in pixels (additive, optional).
+        image_height: Option<u32>,
+        /// v0.20.x ingest log: failure reason string when OCR failed (additive, optional).
+        /// Values: "timeout" | "ocr_error" | "network_error" | None (success); `apply_ocr_to_pdf_pages` only sets "ocr_error".
+        failure_reason: Option<String>,
+    },
+}
--- a/crates/kebab-app/src/reset.rs
+++ b/crates/kebab-app/src/reset.rs
@@ -85,8 +85,7 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
        ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
        ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
        ResetScope::VectorOnly => {
-            let vector_dir =
-                expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
+            let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
            vec![vector_dir]
        }
        ResetScope::ConfigOnly => vec![cfg_dir],
@@ -137,8 +136,8 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
 /// the double scan is acceptable for a rare destructive operation.
 pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
    use kebab_core::DocumentStore as _;
-    use kebab_source_fs::FsSourceConnector;
    use kebab_core::SourceScope;
+    use kebab_source_fs::FsSourceConnector;

    let store = kebab_store_sqlite::SqliteStore::open(cfg)
        .context("enumerate_orphans: open SqliteStore")?;
@@ -160,16 +159,13 @@ pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
        ..Default::default()
    };

-    let connector = FsSourceConnector::new(cfg)
-        .context("enumerate_orphans: build FsSourceConnector")?;
+    let connector =
+        FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
    let (assets, _skips) = connector
        .scan_with_skips(&scope)
        .context("enumerate_orphans: scan workspace")?;

-    let scanned: HashSet<WorkspacePath> = assets
-        .into_iter()
-        .map(|a| a.workspace_path)
-        .collect();
+    let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();

    let mut orphans: Vec<WorkspacePath> = stored
        .into_iter()
@@ -206,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
        if !p.exists() {
            continue;
        }
-        std::fs::remove_dir_all(p)
-            .with_context(|| format!("remove {}", p.display()))?;
+        std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
        removed.push(p.clone());
    }

@@ -229,8 +224,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
 /// Execute the `OrphansOnly` variant: reconcile stored docs against the
 /// current walker scope without touching any filesystem directory.
 fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
-    let orphans = enumerate_orphans(cfg)
-        .context("execute_orphans_only: enumerate orphans")?;
+    let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;

    if orphans.is_empty() {
        return Ok(ResetReport {
--- a/crates/kebab-app/src/schema.rs
+++ b/crates/kebab-app/src/schema.rs
@@ -39,6 +39,14 @@ pub struct Capabilities {
 pub struct Models {
    pub parser_version: String,
    pub chunker_version: String,
+    /// v0.20.1+ (Bug #13). Every parser version active in the corpus.
+    /// Empty corpus → empty Vec. Backward compat: the `parser_version` field is kept.
+    #[serde(default)]
+    pub active_parsers: Vec<String>,
+    /// v0.20.1+ (Bug #13). Every chunker version active in the corpus.
+    /// Empty corpus → empty Vec.
+    #[serde(default)]
+    pub active_chunkers: Vec<String>,
    pub embedding_version: String,
    pub prompt_template_version: String,
    pub index_version: String,
@@ -142,10 +150,10 @@ fn capabilities_snapshot() -> Capabilities {
        rag_multi_turn: true,
        search_cache: true,
        incremental_ingest: true,
-        streaming_ask: false,
+        streaming_ask: true,
        http_daemon: false,
        mcp_server: true,
-        single_file_ingest: false,
+        single_file_ingest: true,
        bulk_search: true,
    }
 }
@@ -160,12 +168,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
    kebab_store_sqlite::SqliteStore::open_existing(&db_path)
 }

-fn collect_stats(
-    cfg: &Config,
-    store: &kebab_store_sqlite::SqliteStore,
-) -> anyhow::Result<Stats> {
-    let counts = store
-        .count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
+fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
+    let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
    let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
    let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
        .map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
@@ -190,12 +194,16 @@ fn collect_stats(
 }

 fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
+    let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
+    let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
    Models {
        // markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
        // maintain their own versions; surface those when SchemaV1.models
        // becomes a multi-medium map (P+).
        parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
        chunker_version: cfg.chunking.chunker_version.clone(),
+        active_parsers,
+        active_chunkers,
        // EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
        embedding_version: cfg.models.embedding.model.clone(),
        prompt_template_version: cfg.rag.prompt_template_version.clone(),
@@ -268,3 +276,27 @@ mod tests_stats_ext {
        assert_eq!(s.stats.stale_doc_count, 0);
    }
 }
+
+#[cfg(test)]
+mod tests_capabilities {
+    use super::*;
+
+    #[test]
+    fn capabilities_streaming_ask_matches_cli_surface() {
+        // Bug #9: `kebab ask --stream` correctly emits 191 answer_event.v1 ndjson events →
+        // capabilities.streaming_ask must be true.
+        let caps = capabilities_snapshot();
+        assert!(caps.streaming_ask, "streaming_ask must be true (Bug #9)");
+    }
+
+    #[test]
+    fn capabilities_single_file_ingest_matches_cli_surface() {
+        // Bug #9: `kebab ingest-file <path>` and `kebab ingest-stdin --title <T>` both
+        // correctly emit ingest_report.v1 → capabilities.single_file_ingest must be true.
+        let caps = capabilities_snapshot();
+        assert!(
+            caps.single_file_ingest,
+            "single_file_ingest must be true (Bug #9)"
+        );
+    }
+}
--- a/crates/kebab-app/src/staleness.rs
+++ b/crates/kebab-app/src/staleness.rs
@@ -10,11 +10,7 @@ use kebab_core::SearchHit;
 ///
 /// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
 /// rule prevents `kebab-rag → kebab-app`). Update both together.
-pub fn compute_stale(
-    indexed_at: OffsetDateTime,
-    now: OffsetDateTime,
-    threshold_days: u32,
-) -> bool {
+pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
    if threshold_days == 0 {
        return false;
    }
@@ -23,11 +19,7 @@ pub fn compute_stale(
 }

 /// Sets `stale` on each hit in place using `compute_stale`.
-pub fn mark_stale_in_place(
-    hits: &mut [SearchHit],
-    now: OffsetDateTime,
-    threshold_days: u32,
-) {
+pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
    for h in hits {
        h.stale = compute_stale(h.indexed_at, now, threshold_days);
    }
--- a/crates/kebab-app/tests/code_ingest_smoke.rs
+++ b/crates/kebab-app/tests/code_ingest_smoke.rs
@@ -29,9 +29,8 @@ fn rust_file_ingests_and_searches_as_code_citation() {
    )
    .unwrap();

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("ingest must succeed");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");

    assert_eq!(report.errors, 0, "no errors expected: {report:?}");
    let items = report.items.as_ref().expect("items present");
@@ -127,9 +126,8 @@ fn rust_code_search_hit_has_repo() {
    )
    .unwrap();

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("ingest must succeed");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul"))
@@ -147,8 +145,7 @@ fn rust_code_search_hit_has_repo() {
        .and_then(|n| n.to_str())
        .map(str::to_owned);
    assert_eq!(
-        h.repo,
-        expected_repo,
+        h.repo, expected_repo,
        "SearchHit.repo must match the workspace dir name (detect_repo result)"
    );
    // Also sanity-check code_lang is still filled.
@@ -177,9 +174,8 @@ fn python_file_ingests_and_searches_as_code_citation() {
    )
    .unwrap();

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("ingest must succeed");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");

    assert!(report.new >= 1, "python file ingested: {report:?}");

@@ -254,9 +250,8 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
    )
    .unwrap();

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("ingest must succeed");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");

    assert!(report.new >= 1, "ts file ingested: {report:?}");

@@ -331,9 +326,8 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
    )
    .unwrap();

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("ingest must succeed");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("ingest must succeed");

    assert!(report.new >= 1, "js file ingested: {report:?}");

@@ -515,7 +509,11 @@ fn java_file_ingests_and_searches_as_code_citation() {
            line_start,
            ..
        } => {
-            assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'");
+            assert_eq!(
+                lang.as_deref(),
+                Some("java"),
+                "citation.lang must be 'java'"
+            );
            assert_eq!(
                symbol.as_deref(),
                Some("com.foo.Foo.bar"),
@@ -586,7 +584,11 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
            line_start,
            ..
        } => {
-            assert_eq!(lang.as_deref(), Some("kotlin"), "citation.lang must be 'kotlin'");
+            assert_eq!(
+                lang.as_deref(),
+                Some("kotlin"),
+                "citation.lang must be 'kotlin'"
+            );
            assert_eq!(
                symbol.as_deref(),
                Some("com.foo.Foo.bar"),
@@ -651,8 +653,8 @@ fn tier2_k8s_yaml_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -666,7 +668,11 @@ fn tier2_k8s_yaml_ingest_searchable() {
            line_start,
            ..
        } => {
-            assert_eq!(lang.as_deref(), Some("yaml"), "citation.lang must be 'yaml'");
+            assert_eq!(
+                lang.as_deref(),
+                Some("yaml"),
+                "citation.lang must be 'yaml'"
+            );
            assert_eq!(
                symbol.as_deref(),
                Some("Deployment/prod/api"),
@@ -730,8 +736,8 @@ fn tier2_dockerfile_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -813,8 +819,8 @@ fn tier2_cargo_toml_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -896,8 +902,8 @@ fn tier3_shell_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -987,8 +993,8 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -1031,14 +1037,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
 fn rust_file_re_ingest_is_unchanged() {
    let env = TestEnv::lexical_only();

-    std::fs::write(
-        env.workspace_root.join("stable.rs"),
-        "pub fn noop() {}\n",
-    )
-    .unwrap();
+    std::fs::write(env.workspace_root.join("stable.rs"), "pub fn noop() {}\n").unwrap();

-    let r1 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    let item1 = r1
        .items
        .as_ref()
@@ -1049,8 +1050,7 @@ fn rust_file_re_ingest_is_unchanged() {
        .unwrap();
    assert_eq!(item1.kind, IngestItemKind::New);

-    let r2 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    let item2 = r2
        .items
        .unwrap()
@@ -1081,9 +1081,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
    )
    .unwrap();

-    let report1 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("first ingest");
+    let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("first ingest");
    let item1 = report1
        .items
        .as_ref()
@@ -1093,7 +1092,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
        .expect("docker-compose.yml in first report");
    assert!(
        matches!(item1.kind, IngestItemKind::New),
-        "first ingest must be New, got {:?}", item1.kind
+        "first ingest must be New, got {:?}",
+        item1.kind
    );
    assert_eq!(
        item1.chunker_version.as_ref().map(|c| c.0.as_str()),
@@ -1101,9 +1101,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
        "first ingest must use Tier 3 fallback chunker"
    );

-    let report2 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("second ingest");
+    let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("second ingest");
    let item2 = report2
        .items
        .as_ref()
@@ -1113,7 +1112,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
        .expect("docker-compose.yml in second report");
    assert!(
        matches!(item2.kind, IngestItemKind::Unchanged),
-        "second ingest must be Unchanged, got {:?}", item2.kind
+        "second ingest must be Unchanged, got {:?}",
+        item2.kind
    );
 }

@@ -1163,8 +1163,8 @@ fn tier1_c_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -1247,8 +1247,8 @@ fn tier1_cpp_ingest_searchable() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");

    let h = hits
        .iter()
@@ -1266,7 +1266,9 @@ fn tier1_cpp_ingest_searchable() {
            // Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar"
            // (method) depending on which chunk ranks first.
            assert!(
-                symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
+                symbol
+                    .as_deref()
+                    .is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
                "C++ symbol must start with namespace::Class prefix, got {symbol:?}"
            );
            assert!(*line_start >= 1, "line_start must be >=1");
@@ -1335,8 +1337,8 @@ fn tier2_k8s_multi_resource_yaml_ingests_without_collision() {
            ..Default::default()
        },
    };
-    let hits = kebab_app::search_with_config(env.config.clone(), query)
-        .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
    assert!(
        hits.len() >= 2,
        "expected ≥2 hits (Deployment + Service), got {}",
@@ -1359,9 +1361,8 @@ fn tier3_shell_reingest_is_unchanged() {
    )
    .unwrap();

-    let report1 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("first ingest");
+    let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("first ingest");
    let item1 = report1
        .items
        .as_ref()
@@ -1371,12 +1372,12 @@ fn tier3_shell_reingest_is_unchanged() {
        .expect("deploy.sh in first report");
    assert!(
        matches!(item1.kind, IngestItemKind::New),
-        "first ingest must be New, got {:?}", item1.kind
+        "first ingest must be New, got {:?}",
+        item1.kind
    );

-    let report2 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
-            .expect("second ingest");
+    let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+        .expect("second ingest");
    let item2 = report2
        .items
        .as_ref()
@@ -1386,6 +1387,7 @@ fn tier3_shell_reingest_is_unchanged() {
        .expect("deploy.sh in second report");
    assert!(
        matches!(item2.kind, IngestItemKind::Unchanged),
-        "shell reingest must be Unchanged, got {:?}", item2.kind
+        "shell reingest must be Unchanged, got {:?}",
+        item2.kind
    );
 }
--- a/crates/kebab-app/tests/common/mock_ocr.rs
+++ b/crates/kebab-app/tests/common/mock_ocr.rs
@@ -0,0 +1,60 @@
+use std::sync::Mutex;
+
+use anyhow::Result;
+use kebab_core::{Lang, OcrText};
+use kebab_parse_image::OcrEngine;
+
+/// Test double for `OcrEngine`: replays canned texts, or fails every call.
+pub struct MockOcrEngine {
+    /// Canned texts, consumed in order by `recognize`.
+    expected_texts: Vec<String>,
+    /// Index of the next text to hand out; advanced on each non-failing call.
+    call_index: Mutex<usize>,
+    /// When `true`, every `recognize` call returns an error.
+    fail: bool,
+}
+
+impl MockOcrEngine {
+    /// Single text (backward-compat ctor for pdf_ocr_apply.rs 10 sites).
+    pub fn single(text: impl Into<String>, fail: bool) -> Self {
+        Self::per_page(vec![text.into()], fail)
+    }
+
+    /// Per-page texts (cursor advances per recognize call).
+    pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
+        Self {
+            call_index: Mutex::new(0),
+            expected_texts: texts,
+            fail,
+        }
+    }
+}
+
+impl OcrEngine for MockOcrEngine {
+    fn engine_name(&self) -> &'static str {
+        "mock-ocr"
+    }
+
+    fn engine_version(&self) -> String {
+        String::from("mock-v1")
+    }
+
+    /// Returns the next canned text; once the list is exhausted the last entry repeats
+    /// (empty string for an empty list). Always errors when `fail` is set.
+    fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
+        if self.fail {
+            anyhow::bail!("mock failure");
+        }
+        let mut cursor = self.call_index.lock().unwrap();
+        // Past the end of the list, fall back to the final entry.
+        let canned = self.expected_texts.get(*cursor);
+        let chosen = canned.or_else(|| self.expected_texts.last());
+        *cursor += 1;
+        Ok(OcrText {
+            joined: chosen.cloned().unwrap_or_default(),
+            regions: Vec::new(),
+            engine: self.engine_name().to_string(),
+            engine_version: self.engine_version(),
+        })
+    }
+}
--- a/crates/kebab-app/tests/common/mod.rs
+++ b/crates/kebab-app/tests/common/mod.rs
@@ -93,8 +93,7 @@ impl TestEnv {
    /// directly. Caller can invoke this multiple times to simulate
    /// re-opening the binary after a corpus revision bump.
    pub fn app(&self) -> kebab_app::App {
-        kebab_app::App::open_with_config(self.config.clone())
-            .expect("App::open_with_config")
+        kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
    }
 }

@@ -169,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
        }
    }
 }
+
+pub mod mock_ocr;
--- a/crates/kebab-app/tests/fetch_integration.rs
+++ b/crates/kebab-app/tests/fetch_integration.rs
@@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App {
 #[test]
 fn fetch_chunk_returns_target_only_when_no_context() {
    let env = common::TestEnv::new();
-    common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
+    common::ingest_md(
+        &env,
+        "a.md",
+        "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
+    );
    let app = open(&env);

    // Find a chunk via search to obtain its id.
@@ -42,7 +46,8 @@ fn fetch_chunk_with_context_returns_neighbors() {
    // match. The earlier fixture used 2-char tokens like `A1`/`A3` for
    // section bodies — those zero-hit under trigram. Use 5-char unique
    // words per section so the query can pin one chunk deterministically.
-    let body = "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
+    let body =
+        "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
    common::ingest_md(&env, "multi.md", body);
    let app = env.app();

@@ -110,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() {
        .unwrap();
    assert_eq!(result.kind, FetchKind::Doc);
    let text = result.text.expect("doc text");
-    assert!(text.contains("Heading One"), "doc text contains heading: {text:?}");
+    assert!(
+        text.contains("Heading One"),
+        "doc text contains heading: {text:?}"
+    );
    assert!(text.contains("First paragraph"), "doc text contains body");
    assert!(!result.truncated);
 }
@@ -155,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() {
        .unwrap();
    assert!(result.truncated);
    let text = result.text.expect("doc text");
-    assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
+    assert!(
+        text.chars().count() <= 100,
+        "trimmed text len {}",
+        text.chars().count()
+    );
 }

 #[test]
@@ -292,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() {
 fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
    let env = common::TestEnv::new();
    // Multi-chunk markdown so context ±N has neighbors.
-    let body =
-        "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
+    let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
    common::ingest_md(&env, "boundary.md", body);
    let app = env.app();
    let q = kebab_core::SearchQuery {
--- a/crates/kebab-app/tests/file_deletion_auto_purge.rs
+++ b/crates/kebab-app/tests/file_deletion_auto_purge.rs
@@ -16,8 +16,8 @@
 mod common;

 use common::TestEnv;
-use kebab_app::ingest_with_config_opts;
 use kebab_app::IngestOpts;
+use kebab_app::ingest_with_config_opts;
 use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};

 /// Helper: open the store via `TestEnv` and run `list_documents`.
@@ -125,17 +125,10 @@ fn include_scope_narrowing_does_not_purge() {
        include: vec!["**/*.rs".to_string()],
        exclude: env.config.workspace.exclude.clone(),
    };
-    let first = ingest_with_config_opts(
-        env.config.clone(),
-        wide_scope,
-        false,
-        IngestOpts::default(),
-    )
-    .expect("first ingest (wide) must succeed");
-    assert!(
-        first.new >= 2,
-        "expected at least 2 new docs: {first:?}"
-    );
+    let first =
+        ingest_with_config_opts(env.config.clone(), wide_scope, false, IngestOpts::default())
+            .expect("first ingest (wide) must succeed");
+    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
--- a/crates/kebab-app/tests/image_pipeline.rs
+++ b/crates/kebab-app/tests/image_pipeline.rs
@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
 /// inspectable in stored DB rows.
 fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
    use image::{ImageBuffer, Rgb};
-    let img: ImageBuffer<Rgb<u8>, _> =
-        ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
+    let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
    let path = root.join(name);
    img.save(&path).expect("write PNG fixture");
    path
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {

    // Counters: scanned should include the PNG; new ≥ 1 (markdown
    // fixtures from the workspace tree may also count).
-    assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
+    assert!(
+        report.scanned >= 1,
+        "scanned={}, items={:?}",
+        report.scanned,
+        report.items
+    );
    assert_eq!(report.errors, 0, "no errors on lenient OCR path");

    // Locate the image doc in the report items.
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
        kebab_core::IngestItemKind::New,
        "image asset must be classified New on first ingest"
    );
-    assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
+    assert_eq!(
+        img_item.chunk_count,
+        Some(1),
+        "image emits exactly one chunk"
+    );

    // Inspect the stored chunk text via kb-app's inspect_chunk facade.
    let doc_id = img_item.doc_id.clone().expect("image doc id");
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {

    // Sanity: the doc was actually persisted into SQLite (kb-app's
    // list_docs facade reads the same store the chunker writes to).
-    let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
-        .expect("list_docs");
+    let summaries =
+        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
    assert!(
-        summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
+        summaries
+            .iter()
+            .any(|s| s.doc_path.0.ends_with("diagram.png")),
        "image doc must appear in list_docs"
    );

@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
        .iter()
        .find(|i| i.doc_path.0.ends_with("diagram.png"))
        .unwrap();
-    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
-        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
    let block = match &doc.blocks[0] {
        kebab_core::Block::ImageRef(b) => b,
        _ => unreachable!(),
@@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
    let cfg_clone = cfg.clone();
    let scope = env.scope();
    let report = spawn_blocking(move || {
-        kebab_app::ingest_with_config(cfg_clone, scope, false)
-            .expect("ingest with no OCR/caption")
+        kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
    })
    .await
    .expect("task");
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
        .find(|i| i.doc_path.0.ends_with("raw.png"))
        .unwrap();
    assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
-    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
-        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
    let block = match &doc.blocks[0] {
        kebab_core::Block::ImageRef(b) => b,
        _ => unreachable!(),
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
    let scope1 = scope.clone();
    let scope2 = scope.clone();

-    let r1 = spawn_blocking(move || {
-        kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
-    })
-    .await
-    .unwrap();
-    let r2 = spawn_blocking(move || {
-        kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
-    })
-    .await
-    .unwrap();
+    let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
+        .await
+        .unwrap();
+    let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
+        .await
+        .unwrap();

    let id1 = r1
        .items
--- a/crates/kebab-app/tests/incremental_ingest.rs
+++ b/crates/kebab-app/tests/incremental_ingest.rs
@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
    // First ingest — populates the DB. Use the legacy entry so the
    // assertions cover the "previously ingested" set without needing
    // IngestOpts::default() to behave identically.
-    let first =
-        ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
-    assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
-    assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
+    assert!(
+        first.new >= 1,
+        "first ingest must create new docs: {first:?}"
+    );
+    assert_eq!(
+        first.unchanged, 0,
+        "first ingest cannot have unchanged: {first:?}"
+    );

    let scanned = first.scanned;

@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
        IngestOpts::default(),
    )
    .unwrap();
-    assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
+    assert_eq!(
+        second.scanned, scanned,
+        "second scanned matches first: {second:?}"
+    );
    assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
-    assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
+    assert_eq!(
+        second.updated, 0,
+        "nothing should be marked updated: {second:?}"
+    );
    assert_eq!(
        second.unchanged, scanned,
        "every doc must be Unchanged: {second:?}"
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
 fn force_reingest_bypasses_skip() {
    let env = TestEnv::lexical_only();

-    let first =
-        ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
-    assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
+    assert!(
+        first.new >= 1,
+        "first ingest must create new docs: {first:?}"
+    );
    let scanned = first.scanned;

    let second = ingest_with_config_opts(
--- a/crates/kebab-app/tests/ingest_cancel.rs
+++ b/crates/kebab-app/tests/ingest_cancel.rs
@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
    // ingest_with_config_progress (no cancel) runs to completion.
    let env = TestEnv::lexical_only();
    let (tx, rx) = mpsc::channel::<IngestEvent>();
-    let report = kebab_app::ingest_with_config_progress(
-        env.config.clone(),
-        env.scope(),
-        true,
-        Some(tx),
-    )
-    .unwrap();
+    let report =
+        kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
+            .unwrap();
    assert_eq!(report.scanned, 3);
    assert_eq!(report.new, 3);

--- a/crates/kebab-app/tests/ingest_file.rs
+++ b/crates/kebab-app/tests/ingest_file.rs
@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {

    let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
    assert!(err.to_string().contains("unsupported extension"), "{err}");
-    assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
+    assert!(
+        err.to_string().contains(".docx") || err.to_string().contains("docx"),
+        "{err}"
+    );
 }
--- a/crates/kebab-app/tests/ingest_lexical.rs
+++ b/crates/kebab-app/tests/ingest_lexical.rs
@@ -8,8 +8,7 @@ use common::TestEnv;
 #[test]
 fn ingest_then_list_inspects_round_trip() {
    let env = TestEnv::lexical_only();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    // The fixture has 3 markdown files; first ingest should label them
    // all as New.
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
    }

    // list_docs returns the 3 docs.
-    let docs = kebab_app::list_docs_with_config(
-        env.config.clone(),
-        kebab_core::DocFilter::default(),
-    )
-    .unwrap();
+    let docs =
+        kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
+            .unwrap();
    assert_eq!(docs.len(), 3, "docs: {docs:?}");

    // inspect_doc round-trips one of them.
    let any_doc_id = docs[0].doc_id.clone();
-    let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
-        .unwrap();
+    let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
    assert_eq!(canonical.doc_id, any_doc_id);
    assert!(!canonical.blocks.is_empty(), "blocks empty");
 }
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
 fn ingest_idempotent_on_second_run() {
    let env = TestEnv::lexical_only();

-    let r1 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(r1.new, 3);

-    let r2 =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    // Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
    // path: when checksum + parser/chunker/embedding versions all match,
    // the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
    assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");

    // list_docs still has 3 docs (no duplicates).
-    let docs = kebab_app::list_docs_with_config(
-        env.config.clone(),
-        kebab_core::DocFilter::default(),
-    )
-    .unwrap();
+    let docs =
+        kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
+            .unwrap();
    assert_eq!(docs.len(), 3);
 }

 #[test]
 fn ingest_summary_only_drops_items() {
    let env = TestEnv::lexical_only();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.scanned, 3);
    assert!(report.items.is_none(), "summary-only should null items");
 }
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
    // of every run. `summary_only=true` writes `items_json=NULL`; the
    // counts MUST still be present.
    let env = TestEnv::lexical_only();
-    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
-        .unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.scanned, 3);

-    let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
-        .join("kebab.sqlite");
+    let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
    let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
    let (scanned, new_c, updated, skipped, errors, items_json): (
        i64,
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
    // tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
    // tables under it).
    let env = TestEnv::lexical_only();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(report.errors, 0, "lexical-only run must not error");
    assert_eq!(report.new, 3);

-    let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
-        .join("lancedb");
+    let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
    if lance_dir.exists() {
        // If the dir was created (e.g., by an earlier consumer touching
        // the path), it MUST contain no `.lance` tables.
        let mut had_lance_table = false;
        for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
            let entry = entry.unwrap();
-            if entry
-                .path()
-                .extension()
-                .and_then(|s| s.to_str())
-                == Some("lance")
-            {
+            if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
                had_lance_table = true;
                break;
            }
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
        tags_any: vec!["rust".to_string()],
        ..Default::default()
    };
-    let rust_docs =
-        kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
+    let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
    // intro.md and notes/cargo.md both tag "rust".
    assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
 }
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
 #[test]
 fn inspect_doc_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();
-    let bogus =
-        kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
+    let bogus = kebab_core::DocumentId(
+        "0000000000000000000000000000000000000000000000000000000000000000".to_string(),
+    );
    let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
    let msg = format!("{err:#}");
    assert!(
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
    let bogus = kebab_core::ChunkId(
        "0000000000000000000000000000000000000000000000000000000000000000".to_string(),
    );
-    let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
-        .unwrap_err();
+    let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
    let msg = format!("{err:#}");
    assert!(msg.contains("not found"), "got: {msg}");
 }
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
 #[test]
 fn ingest_stamps_chunker_version_on_document() {
    let env = TestEnv::lexical_only();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert!(report.new >= 1, "expected at least one new doc: {report:?}");
    assert_eq!(report.errors, 0, "no errors expected: {report:?}");

-    let docs = kebab_app::list_docs_with_config(
-        env.config.clone(),
-        kebab_core::DocFilter::default(),
-    )
-    .unwrap();
+    let docs =
+        kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
+            .unwrap();
    assert!(!docs.is_empty(), "no docs after ingest");

    for doc_entry in &docs {
        let canonical =
-            kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
-                .unwrap();
+            kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
        assert!(
            canonical.last_chunker_version.is_some(),
            "last_chunker_version must be stamped for doc {}: got {:?}",
--- a/crates/kebab-app/tests/ingest_log_smoke.rs
+++ b/crates/kebab-app/tests/ingest_log_smoke.rs
@@ -0,0 +1,169 @@
+// crates/kebab-app/tests/ingest_log_smoke.rs
+//
+// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
+
+use std::path::PathBuf;
+
+use kebab_app::{IngestOpts, ingest_with_config_opts};
+use kebab_config::{Config, LoggingCfg};
+use kebab_core::SourceScope;
+use serde_json::Value;
+use tempfile::TempDir;
+
+/// Config for `workspace` with ingest logging on; creates sibling `data/` + `models/` dirs.
+fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
+    let base = workspace.parent().unwrap();
+    let (data_dir, model_dir) = (base.join("data"), base.join("models"));
+    std::fs::create_dir_all(&data_dir).unwrap();
+    std::fs::create_dir_all(&model_dir).unwrap();
+    let mut cfg = Config::defaults();
+    cfg.workspace.root = workspace.to_string_lossy().into_owned();
+    cfg.workspace.exclude.clear();
+    cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
+    cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
+    cfg.models.embedding.provider = String::from("none");
+    cfg.models.embedding.dimensions = 0;
+    cfg.chunking.target_tokens = 80;
+    cfg.chunking.overlap_tokens = 20;
+    cfg.logging = LoggingCfg {
+        ingest_log_dir: log_dir.to_path_buf(),
+        ingest_log_enabled: true,
+    };
+    cfg
+}
+
+/// AC-9: ingest → log file exists + each line valid JSON + last line kind=summary + scanned>0.
+#[test]
+fn ingest_log_smoke() {
+    let tmp = TempDir::new().unwrap();
+    let workspace = tmp.path().join("kb");
+    std::fs::create_dir_all(&workspace).unwrap();
+    let log_dir = tmp.path().join("logs");
+
+    // 1. Minimal corpus: 1 markdown + 1 scanned PDF (OCR disabled — no Ollama needed).
+    std::fs::write(
+        workspace.join("hello.md"),
+        "# Hello\n\nThis is a smoke test.\n",
+    )
+    .unwrap();
+    // Resolve the fixture from the crate manifest dir, not the process working directory.
+    let pdf_src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
+    if pdf_src.exists() {
+        std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
+    }
+
+    // 2. Config with logging enabled.
+    let cfg = minimal_config(&workspace, &log_dir);
+    let scope = SourceScope {
+        root: workspace.clone(),
+        exclude: vec![],
+        ..Default::default()
+    };
+
+    // 3. Run ingest.
+    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
+        .expect("ingest should succeed");
+
+    // 4. Assert log file exists in log_dir.
+    let log_files: Vec<_> = std::fs::read_dir(&log_dir)
+        .unwrap()
+        .filter_map(Result::ok)
+        .filter(|e| {
+            e.file_name().to_string_lossy().starts_with("ingest-")
+                && e.file_name().to_string_lossy().ends_with(".ndjson")
+        })
+        .collect();
+    assert_eq!(
+        log_files.len(),
+        1,
+        "expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
+    );
+
+    // 5. Parse each line as JSON — assert kind field present and valid.
+    let body = std::fs::read_to_string(log_files[0].path()).unwrap();
+    let lines: Vec<&str> = body.lines().collect();
+    assert!(!lines.is_empty(), "log file should not be empty");
+    let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
+    for line in &lines {
+        let v: Value = serde_json::from_str(line)
+            .unwrap_or_else(|e| panic!("line is not valid JSON: {e}\nline: {line}"));
+        let kind = v["kind"]
+            .as_str()
+            .unwrap_or_else(|| panic!("line missing 'kind' field: {line}"));
+        assert!(
+            valid_kinds.contains(&kind),
+            "unexpected kind '{kind}' in line: {line}"
+        );
+    }
+
+    // 6. Last line must be kind=summary with scanned > 0.
+    let last = lines.last().unwrap();
+    let last_v: Value = serde_json::from_str(last).unwrap();
+    assert_eq!(
+        last_v.get("kind").and_then(|k| k.as_str()),
+        Some("summary"),
+        "last line must be kind=summary, got: {last}"
+    );
+    let scanned = last_v.get("scanned").and_then(Value::as_u64).unwrap_or(0);
+    assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
+}
+
+/// AC-6: ingest_log_enabled=false → no log file created.
+#[test]
+fn ingest_log_disabled_emits_no_file() {
+    let tmp = TempDir::new().unwrap();
+    let root = tmp.path();
+    let workspace = root.join("kb");
+    let log_dir = root.join("logs");
+    let data_dir = root.join("data");
+    let model_dir = root.join("models");
+    // `log_dir` itself is not created by this test.
+    for dir in [&workspace, &data_dir, &model_dir] {
+        std::fs::create_dir_all(dir).unwrap();
+    }
+
+    std::fs::write(
+        workspace.join("hello.md"),
+        "# Hello\n\nDisabled log test.\n",
+    )
+    .unwrap();
+
+    // Embedding provider "none", ingest log switched off.
+    let mut cfg = Config::defaults();
+    cfg.workspace.root = workspace.to_string_lossy().into_owned();
+    cfg.workspace.exclude.clear();
+    cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
+    cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
+    cfg.models.embedding.provider = String::from("none");
+    cfg.models.embedding.dimensions = 0;
+    cfg.logging = LoggingCfg {
+        ingest_log_dir: log_dir.clone(),
+        ingest_log_enabled: false,
+    };
+
+    let scope = SourceScope {
+        root: workspace.clone(),
+        exclude: Vec::new(),
+        ..Default::default()
+    };
+
+    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
+        .expect("ingest should succeed");
+
+    // log_dir should either not exist or contain 0 ingest-*.ndjson files.
+    let mut log_file_count = 0;
+    if log_dir.exists() {
+        for entry in std::fs::read_dir(&log_dir).unwrap().filter_map(Result::ok) {
+            let name = entry.file_name();
+            let name = name.to_string_lossy();
+            if name.starts_with("ingest-") && name.ends_with(".ndjson") {
+                log_file_count += 1;
+            }
+        }
+    }
+    assert_eq!(
+        log_file_count, 0,
+        "no ingest-*.ndjson file should be created when disabled"
+    );
+}
--- a/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
+++ b/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
@@ -0,0 +1,117 @@
+//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
+//!
+//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
+//! Manual invoke:
+//!   KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
+//!     cargo test -p kebab-app --test ingest_pdf_ocr_smoke --ignored -j 4
+//!
+//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
+//! to verify the cancel wiring doesn't panic/deadlock.
+
+mod common;
+
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+
+use common::TestEnv;
+
+fn ollama_endpoint() -> String {
+    std::env::var("KEBAB_PDF_OCR_ENDPOINT").unwrap_or_else(|_| "http://localhost:11434".into())
+}
+
+/// `TestEnv::lexical_only()` with PDF OCR enabled and `scanned_page1.pdf` in the workspace.
+fn make_ocr_env_real() -> TestEnv {
+    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = manifest_dir.parent().unwrap();
+    let fixture = crates_dir.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
+
+    let mut env = TestEnv::lexical_only();
+    env.config.pdf.ocr.enabled = true;
+    env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
+    env.config.models.embedding.provider = String::from("none");
+    let target = env.workspace_root.join("scanned_page1.pdf");
+    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");
+
+    env
+}
+
+/// § Acceptance §9 #1 — real Ollama OCR + IngestItem.pdf_ocr_pages = Some(1).
+///
+/// NOTE(review): despite "mock" in the name, the environment comes from
+/// `make_ocr_env_real()`, so OCR goes to whatever endpoint `ollama_endpoint()`
+/// resolves — hence the `#[ignore]`.
+#[test]
+#[ignore = "real Ollama qwen2.5vl:3b dependency"]
+fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
+    let env = make_ocr_env_real();
+    let config = env.config.clone();
+    let scope = env.scope();
+
+    let report = kebab_app::ingest_with_config(config, scope, false).expect("ingest");
+    assert!(report.new >= 1, "at least one PDF ingested: {report:?}");
+
+    // Locate the PDF entry among the per-item results.
+    let items = report.items.unwrap_or_default();
+    let found = items.iter().find(|i| i.doc_path.0.ends_with(".pdf"));
+    let Some(pdf_item) = found else {
+        panic!("PDF item must appear in ingest report items: {items:?}");
+    };
+
+    // The item must carry an OCR page count, and that count must be 1.
+    let Some(page_count) = pdf_item.pdf_ocr_pages else {
+        panic!("pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}");
+    };
+    assert_eq!(page_count, 1, "scanned_page1.pdf has exactly 1 page");
+}
+
+/// § Acceptance §9 #2 — OCR text indexed and retrievable via lexical search.
+#[test]
+#[ignore = "real Ollama qwen2.5vl:3b dependency"]
+fn ocr_text_indexed_and_searchable() {
+    let env = make_ocr_env_real();
+    let (config, scope) = (env.config.clone(), env.scope());
+    kebab_app::ingest_with_config(config, scope, false).expect("ingest");
+
+    // "다음" is a high-frequency token in the page1.txt truth file, so this Korean morpheme
+    // is expected to appear in the qwen2.5vl:3b OCR output of the PoC ground-truth page.
+    let query = common::lexical_query("다음");
+    let hits = kebab_app::search_with_config(env.config.clone(), query).expect("search");
+    let found_any = !hits.is_empty();
+
+    assert!(
+        found_any,
+        "OCR-indexed text must surface in lexical search results"
+    );
+}
+
+/// Production cancel wiring smoke: the cancel flag is set before ingest starts, so the
+/// run should exit before any OCR call. The dummy endpoint (port 1 = connection-refused)
+/// would make OCR HTTP calls fail, but cancel=true keeps the loop from reaching OCR.
+/// Only the absence of a panic/deadlock is checked; Ok and Err are both accepted.
+#[test]
+fn ingest_with_cancel_aborts_mid_pdf() {
+    let mut env = TestEnv::lexical_only();
+    env.config.pdf.ocr.endpoint = Some(String::from("http://127.0.0.1:1"));
+    env.config.pdf.ocr.enabled = true;
+
+    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let fixture = manifest_dir
+        .parent()
+        .unwrap()
+        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
+    let target = env.workspace_root.join("scanned_page1.pdf");
+    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");
+
+    // pre-set — abort immediately
+    let cancel = Arc::new(AtomicBool::new(true));
+    // Both Ok (pre-cancel exit) and Err (eager OCR engine fail) are acceptable —
+    // key assertion is no panic/deadlock.
+    let _outcome = kebab_app::ingest_with_config_cancellable(
+        env.config.clone(),
+        env.scope(),
+        false,
+        None,
+        Some(cancel),
+    );
+}
--- a/crates/kebab-app/tests/ingest_progress.rs
+++ b/crates/kebab-app/tests/ingest_progress.rs
@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
 fn run_with_progress() -> Vec<IngestEvent> {
    let env = TestEnv::lexical_only();
    let (tx, rx) = mpsc::channel::<IngestEvent>();
-    let report = kebab_app::ingest_with_config_progress(
-        env.config.clone(),
-        env.scope(),
-        false,
-        Some(tx),
-    )
-    .unwrap();
+    let report =
+        kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
+            .unwrap();
    assert_eq!(report.scanned, 3);
    assert_eq!(report.new, 3);

@@ -116,13 +112,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
    // `ingest_with_config_progress(..., None)` must produce identical
    // reports modulo wall-clock duration.
    let env = TestEnv::lexical_only();
-    let r_none = kebab_app::ingest_with_config_progress(
-        env.config.clone(),
-        env.scope(),
-        true,
-        None,
-    )
-    .unwrap();
+    let r_none =
+        kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
+            .unwrap();
    assert_eq!(r_none.scanned, 3);
    assert_eq!(r_none.new, 3);
 }
@@ -134,12 +126,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
    let env = TestEnv::lexical_only();
    let (tx, rx) = mpsc::channel::<IngestEvent>();
    drop(rx);
-    let report = kebab_app::ingest_with_config_progress(
-        env.config.clone(),
-        env.scope(),
-        true,
-        Some(tx),
-    )
-    .unwrap();
+    let report =
+        kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
+            .unwrap();
    assert_eq!(report.scanned, 3);
 }
+
+/// v0.20.0 sub-item 1: verifies that an OCR-enabled ingest of a PDF asset emits
+/// `PdfOcrStarted` / `PdfOcrFinished` events. Needs a real Ollama — `#[ignore]`d by default.
+///
+/// Manual invoke (`--ignored` is a test-harness flag, so it must come after `--`):
+/// ```sh
+/// KEBAB_PDF_OCR_ENABLED=true \
+///   KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
+///   cargo test -p kebab-app --test ingest_progress \
+///   -- --ignored pdf_ocr_progress_emits_started_finished_events
+/// ```
+#[test]
+#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
+fn pdf_ocr_progress_emits_started_finished_events() {
+    // Copy the F1 fixture (DCTDecode JPEG passthrough) into the tmpdir workspace.
+    let tmpdir = tempfile::tempdir().expect("create tmpdir");
+    let workspace = tmpdir.path().join("workspace");
+    std::fs::create_dir_all(&workspace).expect("create workspace dir");
+    let f1_src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
+    let f1 = std::fs::read(&f1_src).expect("F1 fixture present");
+    std::fs::write(workspace.join("page1.pdf"), &f1).expect("copy F1");
+
+    let data_dir = tmpdir.path().join("data");
+    std::fs::create_dir_all(&data_dir).expect("create data dir");
+
+    let mut config = kebab_config::Config::defaults();
+    config.workspace.root = workspace.to_string_lossy().into_owned();
+    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
+    config.models.embedding.provider = "none".to_string();
+    config.models.embedding.dimensions = 0;
+    config.pdf.ocr.enabled = true;
+    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
+        config.pdf.ocr.endpoint = Some(endpoint);
+    }
+
+    let scope = kebab_core::SourceScope {
+        root: workspace.clone(),
+        ..Default::default()
+    };
+
+    let (tx, rx) = mpsc::channel::<IngestEvent>();
+    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
+        .expect("ingest_with_config_progress");
+
+    let events: Vec<_> = rx.iter().collect();
+
+    let started_count = events
+        .iter()
+        .filter(|e| matches!(e, IngestEvent::PdfOcrStarted { .. }))
+        .count();
+    let finished_count = events
+        .iter()
+        .filter(|e| matches!(e, IngestEvent::PdfOcrFinished { .. }))
+        .count();
+
+    assert!(
+        started_count >= 1,
+        "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
+    );
+    assert!(
+        finished_count >= 1,
+        "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
+    );
+    assert_eq!(
+        started_count, finished_count,
+        "Started 와 Finished 의 count 일치"
+    );
+}
--- a/crates/kebab-app/tests/ingest_stdin.rs
+++ b/crates/kebab-app/tests/ingest_stdin.rs
@@ -29,12 +29,14 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
        "## Body content\n\nMore.",
        "Article X",
        Some("https://example.com/x"),
-    ).unwrap();
+    )
+    .unwrap();
    assert_eq!(report.new, 1, "{report:?}");

    // _external/ contains exactly one .md file with frontmatter.
    let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
-    let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
+    let entries: Vec<_> = fs::read_dir(&ext_dir)
+        .unwrap()
        .filter_map(std::result::Result::ok)
        .collect();
    assert_eq!(entries.len(), 1);
@@ -50,16 +52,13 @@ fn ingest_stdin_without_source_uri() {
    let dir = tempfile::tempdir().unwrap();
    let cfg = fresh_cfg(dir.path());

-    let report = kebab_app::ingest_stdin_with_config(
-        cfg.clone(),
-        "## Body",
-        "Title",
-        None,
-    ).unwrap();
+    let report =
+        kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
    assert_eq!(report.new, 1);

    let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
-    let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
+    let entries: Vec<_> = fs::read_dir(&ext_dir)
+        .unwrap()
        .filter_map(std::result::Result::ok)
        .collect();
    let content = fs::read_to_string(entries[0].path()).unwrap();
--- a/crates/kebab-app/tests/init_template.rs
+++ b/crates/kebab-app/tests/init_template.rs
@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
    }
    kebab_app::init_workspace(true).expect("init_workspace");
    let cfg_path = kebab_config::Config::xdg_config_path();
-    let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
-        panic!("read config at {}: {e}", cfg_path.display())
-    });
+    let body = std::fs::read_to_string(&cfg_path)
+        .unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
    assert!(
        body.contains("처리 가능한 형식"),
        "header lists supported types section: body=\n{body}"
--- a/crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs
+++ b/crates/kebab-app/tests/multi_scanned_pdf_ingest_no_chunk_id_collision.rs
@@ -0,0 +1,122 @@
+//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
+//! v0.20.0 sub-item 1 bugfix.
+//!
+//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
+//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
+//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
+//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
+
+mod common;
+
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+use common::mock_ocr::MockOcrEngine;
+use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
+use kebab_chunk::PdfPageV1Chunker;
+use kebab_core::{
+    AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
+    MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
+};
+use kebab_parse_image::OcrEngine;
+use kebab_parse_pdf::PdfTextExtractor;
+use time::OffsetDateTime;
+
+fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
+    let fake_hash: String = hash_char.to_string().repeat(64);
+    let asset_id = id_for_asset(&fake_hash);
+    RawAsset {
+        asset_id,
+        source_uri: SourceUri::File(PathBuf::from(path)),
+        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
+        media_type: MediaType::Pdf,
+        byte_len,
+        checksum: Checksum(fake_hash),
+        discovered_at: OffsetDateTime::UNIX_EPOCH,
+        stored: AssetStorage::Copied {
+            path: PathBuf::from(path),
+        },
+    }
+}
+
+fn extract_and_ocr(
+    bytes: &[u8],
+    path: &str,
+    hash_char: char,
+    engine: &dyn OcrEngine,
+) -> kebab_core::CanonicalDocument {
+    let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
+    let workspace_root = Path::new("/");
+    let config = ExtractConfig::default();
+    let ctx = ExtractContext {
+        asset: &asset,
+        workspace_root,
+        config: &config,
+    };
+    let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+    apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
+    canonical
+}
+
+#[test]
+fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
+    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
+        .expect("F1 fixture missing");
+    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
+        .expect("F2 fixture missing");
+
+    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
+    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
+    // overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
+    // collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
+    let trigger_text = format!("{}. {}", "가".repeat(10), "나".repeat(500));
+
+    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
+    let f2_engine = MockOcrEngine::single(&trigger_text, false);
+
+    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
+    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);
+
+    let chunk_policy = ChunkPolicy {
+        target_tokens: 500,
+        overlap_tokens: 80,
+        respect_markdown_headings: false,
+        chunker_version: PdfPageV1Chunker.chunker_version(),
+    };
+
+    let f1_chunks = PdfPageV1Chunker
+        .chunk(&f1_canonical, &chunk_policy)
+        .unwrap();
+    let f2_chunks = PdfPageV1Chunker
+        .chunk(&f2_canonical, &chunk_policy)
+        .unwrap();
+
+    assert!(
+        f2_chunks.len() >= 2,
+        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
+        f2_chunks.len()
+    );
+
+    let all_ids: Vec<&str> = f1_chunks
+        .iter()
+        .chain(f2_chunks.iter())
+        .map(|c| c.chunk_id.0.as_str())
+        .collect();
+    let total = all_ids.len();
+    let unique: HashSet<&str> = all_ids.iter().copied().collect();
+    assert_eq!(
+        unique.len(),
+        total,
+        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
+        unique.len(),
+        total,
+    );
+}
--- a/crates/kebab-app/tests/pdf_ocr_apply.rs
+++ b/crates/kebab-app/tests/pdf_ocr_apply.rs
@@ -0,0 +1,358 @@
+//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
+
+mod common;
+
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+
+use common::mock_ocr::MockOcrEngine;
+use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
+use kebab_core::{
+    AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
+    Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
+};
+use kebab_parse_pdf::PdfTextExtractor;
+use time::OffsetDateTime;
+
+// ── Fixture helpers ───────────────────────────────────────────────────────
+
+fn f1_pdf_bytes() -> Vec<u8> {
+    std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
+        .expect("F1 fixture missing")
+}
+
+/// Build a `RawAsset` for `path` with a fixed all-zero fake checksum (no real hashing).
+fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
+    let fake_hash = "0".repeat(64);
+    RawAsset {
+        asset_id: id_for_asset(&fake_hash),
+        source_uri: SourceUri::File(PathBuf::from(path)),
+        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
+        media_type,
+        byte_len,
+        checksum: Checksum(fake_hash),
+        discovered_at: OffsetDateTime::UNIX_EPOCH,
+        stored: AssetStorage::Copied {
+            path: PathBuf::from(path),
+        },
+    }
+}
+
+/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
+/// F1 (scanned) returns an empty-text Block::Paragraph per page.
+fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
+    let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
+    let workspace_root = Path::new("/");
+    let config = ExtractConfig::default();
+    let ctx = ExtractContext {
+        asset: &asset,
+        workspace_root,
+        config: &config,
+    };
+    PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
+}
+
+/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
+fn canonical_with_empty_block() -> CanonicalDocument {
+    extract_canonical_from_bytes(&f1_pdf_bytes())
+}
+
+/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
+fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
+    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
+    if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
+        let char_count = text.chars().count() as u32;
+        tb.text = text.to_string();
+        tb.inlines = vec![Inline::Text {
+            text: text.to_string(),
+        }];
+        if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+            *char_end = Some(char_count);
+        }
+    }
+    canonical
+}
+
+/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
+fn canonical_with_mojibake_block() -> CanonicalDocument {
+    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
+    if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
+        let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
+        let char_count = pua.chars().count() as u32;
+        tb.text = pua.clone();
+        tb.inlines = vec![Inline::Text { text: pua }];
+        if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
+            *char_end = Some(char_count);
+        }
+    }
+    canonical
+}
+
+fn default_opts(enabled: bool) -> PdfOcrOpts {
+    PdfOcrOpts {
+        enabled,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────
+
+// Test 1: F1 + enabled=true → in-place mutate
+#[test]
+fn f1_input_with_ocr_enabled_replaces_empty_block() {
+    let bytes = f1_pdf_bytes();
+    let mut canonical = canonical_with_empty_block();
+    let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: Some(Lang("kor".into())),
+        cancel: None,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1);
+    let first_para = canonical.blocks.iter().find_map(|b| match b {
+        Block::Paragraph(tb) => Some(tb),
+        _ => None,
+    });
+    assert!(first_para.is_some());
+    assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
+}
+
+// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false)
+#[test]
+fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
+    let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
+    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
+    let mut canonical = canonical_with_filled_block(text);
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
+    let opts = default_opts(true);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
+    let first_para = canonical.blocks.iter().find_map(|b| match b {
+        Block::Paragraph(tb) => Some(tb),
+        _ => None,
+    });
+    // A missing paragraph must fail the test instead of letting it pass vacuously.
+    let tb = first_para.expect("canonical must still contain a paragraph block");
+    assert!(tb.text.starts_with("충분한"), "원본 text 보존");
+}
+
+// Test 3: F1 + enabled=false → no-op
+#[test]
+fn f1_input_with_ocr_disabled_keeps_empty_block() {
+    let bytes = f1_pdf_bytes();
+    let mut canonical = canonical_with_empty_block();
+    let engine = MockOcrEngine::single("IGNORED", false);
+    let opts = default_opts(false);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0);
+    assert_eq!(summary.ms_total, 0);
+}
+
+// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate
+#[test]
+fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
+    let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
+    let mut canonical = canonical_with_mojibake_block();
+    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
+    let first_para = canonical.blocks.iter().find_map(|b| match b {
+        Block::Paragraph(tb) => Some(tb),
+        _ => None,
+    });
+    // A missing paragraph must fail the test instead of letting it pass vacuously.
+    let tb = first_para.expect("canonical must still contain a paragraph block");
+    assert_eq!(tb.text, "OCR_MOJIBAKE_REPLACEMENT");
+}
+
+// Test 5: filled canonical + always_on=true → dual-block (+1 OCR block)
+#[test]
+fn f3_input_with_always_on_pushes_dual_blocks() {
+    let bytes = f1_pdf_bytes();
+    let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
+    let mut canonical = canonical_with_filled_block(text);
+    let original_block_count = canonical.blocks.len();
+    let engine = MockOcrEngine::single("OCR_DUAL", false);
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: true,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 1);
+    assert_eq!(
+        canonical.blocks.len(),
+        original_block_count + 1,
+        "always_on 시 새 Block::Paragraph push"
+    );
+    let texts: Vec<&str> = canonical
+        .blocks
+        .iter()
+        .filter_map(|b| match b {
+            Block::Paragraph(tb) => Some(tb.text.as_str()),
+            _ => None,
+        })
+        .collect();
+    assert!(texts.contains(&"OCR_DUAL"), "OCR block 포함");
+    assert!(
+        texts.iter().any(|t| t.starts_with("vector")),
+        "원본 text-detect block 보존"
+    );
+}
+
+// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → skip + warning
+#[test]
+fn f6_flatedecode_skipped_with_warning() {
+    let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
+        .expect("F6 fixture missing");
+    let mut canonical = canonical_with_empty_block(); // page-1 block from F1
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
+    let opts = default_opts(true);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(
+        summary.pages_ocrd, 0,
+        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
+    );
+    let warning_count = canonical
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
+        .count();
+    assert!(warning_count >= 1, "FlateDecode skip 시 Warning event 발행");
+}
+
+// Test 7: F7 CCITTFax → skip + warning (verifier M-4 split)
+#[test]
+fn f7_ccittfax_skipped_with_warning() {
+    let bytes =
+        std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
+    let mut canonical = canonical_with_empty_block(); // page-1 block from F1
+    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
+    let opts = default_opts(true);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
+    let warning_count = canonical
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
+        .count();
+    assert!(warning_count >= 1, "CCITTFax skip 시 Warning event 발행");
+}
+
+// Test 8: OCR engine failure → warning event + skip
+#[test]
+fn ocr_engine_failure_surfaces_as_warning() {
+    let bytes = f1_pdf_bytes();
+    let mut canonical = canonical_with_empty_block();
+    let engine = MockOcrEngine::single("", true);
+    let opts = default_opts(true);
+
+    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
+    let warning_with_failure = canonical.provenance.events.iter().any(|e| {
+        e.kind == kebab_core::ProvenanceKind::Warning
+            && e.note.as_deref().unwrap_or("").contains("mock failure")
+    });
+    assert!(
+        warning_with_failure,
+        "OCR failure 의 error message 가 warning event 의 note 안"
+    );
+}
+
+// Test 9: always_on dual-block — checks that two paragraphs exist and both map to page 1
+#[test]
+fn dual_block_ordinals_are_deterministic_and_unique() {
+    let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
+    let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
+    let mut canonical = canonical_with_filled_block(text);
+    let engine = MockOcrEngine::single("DUAL", false);
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: true,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: None,
+    };
+
+    apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
+
+    // Intended ordinals: text-detect=0, ocr=1 (0 + page_count). NOTE(review): not asserted below.
+    let para_count = canonical
+        .blocks
+        .iter()
+        .filter(|b| matches!(b, Block::Paragraph(_)))
+        .count();
+    assert_eq!(para_count, 2, "dual-block: text-detect + OCR");
+
+    let all_page_1 = canonical
+        .blocks
+        .iter()
+        .filter_map(|b| match b {
+            Block::Paragraph(tb) => Some(&tb.common.source_span),
+            _ => None,
+        })
+        .all(|s| matches!(s, SourceSpan::Page { page: 1, .. }));
+    assert!(all_page_1, "두 block 모두 page=1");
+}
+
+// Test 10: cancel handle aborts mid-PDF
+#[test]
+fn cancel_handle_aborts_mid_pdf() {
+    let bytes = f1_pdf_bytes();
+    let mut canonical = canonical_with_empty_block();
+    let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
+    let engine = MockOcrEngine::single("IGNORED", false);
+    let opts = PdfOcrOpts {
+        enabled: true,
+        always_on: false,
+        valid_ratio_threshold: 0.5,
+        min_char_count: 20,
+        lang_hint: None,
+        cancel: Some(cancel.clone()),
+    };
+
+    let result = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {});
+    let err = result.expect_err("cancel=true 시 error 반환");
+    assert!(
+        format!("{err}").contains("cancelled mid-PDF"),
+        "error message 가 'cancelled mid-PDF' 포함: {err}"
+    );
+}
--- a/crates/kebab-app/tests/pdf_pipeline.rs
+++ b/crates/kebab-app/tests/pdf_pipeline.rs
@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
                operations: vec![
                    Operation::new("BT", vec![]),
                    Operation::new("Tf", vec!["F1".into(), 24.into()]),
-                    Operation::new(
-                        "Td",
-                        vec![Object::Integer(100), Object::Integer(700)],
-                    ),
+                    Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
                    Operation::new("Tj", vec![Object::string_literal(*text)]),
                    Operation::new("ET", vec![]),
                ],
            };
            let stream_data = content.encode().expect("content encode");
-            let content_id =
-                doc.add_object(Stream::new(dictionary! {}, stream_data));
+            let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
            page_dict.set("Contents", content_id);
        }
        let page_id = doc.add_object(page_dict);
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
            Object::Integer(842),
        ],
    };
-    doc.objects
-        .insert(pages_id, Object::Dictionary(pages_dict));
+    doc.objects.insert(pages_id, Object::Dictionary(pages_dict));

    let catalog_id = doc.add_object(dictionary! {
        "Type" => "Catalog",
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
    write_pdf(&env.workspace_root, "three.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
-            .expect("PDF ingest must succeed");
+    let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
+        .expect("PDF ingest must succeed");

    assert_eq!(report.errors, 0);
    let items = report.items.as_ref().expect("items present");
@@ -157,23 +151,28 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
        .find(|i| i.doc_path.0.ends_with("three.pdf"))
        .expect("PDF item present");
    assert_eq!(pdf_item.kind, IngestItemKind::New);
-    assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
-    assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
+    assert_eq!(
+        pdf_item.block_count,
+        Some(3),
+        "one Block::Paragraph per page"
+    );
+    assert_eq!(
+        pdf_item.chunk_count,
+        Some(3),
+        "one chunk per non-empty page"
+    );
    assert_eq!(
        pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
        Some("pdf-text-v1")
    );
    assert_eq!(
        pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
-        Some("pdf-page-v1")
+        Some("pdf-page-v1.1")
    );

    // Inspect the stored doc to confirm SourceSpan::Page round-trip.
-    let doc = kebab_app::inspect_doc_with_config(
-        cfg,
-        pdf_item.doc_id.as_ref().unwrap(),
-    )
-    .expect("inspect_doc returns the PDF document");
+    let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
+        .expect("inspect_doc returns the PDF document");
    assert_eq!(doc.blocks.len(), 3);
    for (i, block) in doc.blocks.iter().enumerate() {
        let want_page = (i as u32) + 1;
@@ -202,8 +201,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
    write_pdf(&env.workspace_root, "stable.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report1 =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    let item1 = report1
        .items
        .as_ref()
@@ -214,8 +212,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
        .unwrap();
    assert_eq!(item1.kind, IngestItemKind::New);

-    let report2 =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    let item2 = report2
        .items
        .unwrap()
@@ -239,8 +236,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
    std::fs::write(&path, &bytes_v1).unwrap();
    let cfg = cfg_with_pdf(&env);

-    let report_v1 =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    let id_v1 = report_v1
        .items
        .as_ref()
@@ -252,12 +248,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
        .clone()
        .unwrap();

-    let bytes_v2 =
-        build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
+    let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
    std::fs::write(&path, &bytes_v2).unwrap();

-    let report_v2 =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    let item_v2 = report_v2
        .items
        .as_ref()
@@ -282,9 +276,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
    write_pdf(&env.workspace_root, "secret.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
-    assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
+    let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    assert_eq!(
+        report.errors, 1,
+        "encrypted PDF must increment errors exactly once"
+    );
    let items = report.items.as_ref().unwrap();
    let pdf_item = items
        .iter()
@@ -310,9 +306,11 @@ fn corrupt_pdf_fails_without_storing() {
    write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
-    assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
+    let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(
+        report.errors, 1,
+        "corrupt PDF must increment errors exactly once"
+    );
    let items = report.items.as_ref().unwrap();
    let pdf_item = items
        .iter()
@@ -322,11 +320,8 @@ fn corrupt_pdf_fails_without_storing() {

    // Confirm the doc was NOT stored — list_docs returns nothing for
    // this path.
-    let summaries = kebab_app::list_docs_with_config(
-        cfg,
-        kebab_core::DocFilter::default(),
-    )
-    .unwrap();
+    let summaries =
+        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
    assert!(
        !summaries
            .iter()
@@ -341,14 +336,15 @@ fn corrupt_pdf_fails_without_storing() {
 #[test]
 fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
    let env = TestEnv::lexical_only();
-    let bytes =
-        build_text_pdf(&[Some("first page"), None, Some("third page")]);
+    let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
    write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
-    assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
+    let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    assert_eq!(
+        report.errors, 0,
+        "scanned candidate is a Warning, not Error"
+    );
    let pdf_item = report
        .items
        .as_ref()
@@ -365,14 +361,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
    assert_eq!(
        pdf_item.chunk_count,
        Some(2),
-        "pdf-page-v1 emits 0 chunks for the empty page; total = 2"
+        "pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
    );

-    let doc = kebab_app::inspect_doc_with_config(
-        cfg,
-        pdf_item.doc_id.as_ref().unwrap(),
-    )
-    .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
    let warnings: Vec<_> = doc
        .provenance
        .events
@@ -419,8 +411,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
    write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
    let total = report.new + report.updated + report.skipped + report.errors;
    assert_eq!(
        report.scanned, total,
@@ -441,14 +432,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
    let pages: Vec<String> = (1..=50)
        .map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
        .collect();
-    let page_refs: Vec<Option<&str>> =
-        pages.iter().map(|s| Some(s.as_str())).collect();
+    let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
    let bytes = build_text_pdf(&page_refs);
    write_pdf(&env.workspace_root, "long.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    assert_eq!(report.errors, 0);
    let pdf_item = report
        .items
@@ -466,8 +455,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {

    // Round-trip: list_docs sees the long PDF.
    let summaries =
-        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
-            .unwrap();
+        kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
    assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
 }

@@ -476,13 +464,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
 #[test]
 fn inspect_doc_surfaces_page_spans() {
    let env = TestEnv::lexical_only();
-    let bytes =
-        build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
+    let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
    write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
    let cfg = cfg_with_pdf(&env);

-    let report =
-        kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
+    let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
    let pdf_item = report
        .items
        .as_ref()
@@ -490,19 +476,12 @@ fn inspect_doc_surfaces_page_spans() {
        .iter()
        .find(|i| i.doc_path.0.ends_with("inspect.pdf"))
        .unwrap();
-    let doc = kebab_app::inspect_doc_with_config(
-        cfg,
-        pdf_item.doc_id.as_ref().unwrap(),
-    )
-    .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
    assert_eq!(doc.parser_version.0, "pdf-text-v1");
    assert_eq!(doc.blocks.len(), 3);
    for block in &doc.blocks {
        match block {
-            Block::Paragraph(p) => assert!(matches!(
-                p.common.source_span,
-                SourceSpan::Page { .. }
-            )),
+            Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
            other => panic!("expected Paragraph, got {other:?}"),
        }
    }
--- a/crates/kebab-app/tests/reset_orphans.rs
+++ b/crates/kebab-app/tests/reset_orphans.rs
@@ -78,19 +78,15 @@ fn reset_orphans_only_purges_out_of_scope_docs() {
    narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];

    // Run orphans-only reset.
-    let report = execute(ResetScope::OrphansOnly, &narrow_cfg)
-        .expect("orphans-only reset must succeed");
+    let report =
+        execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");

    assert_eq!(
        report.orphans_purged, 2,
        "expected 2 orphans purged (b.rs + c.rs): {report:?}"
    );

-    let mut purged: Vec<String> = report
-        .purged_paths
-        .iter()
-        .map(|p| p.0.clone())
-        .collect();
+    let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
    purged.sort();
    assert_eq!(
        purged,
--- a/crates/kebab-app/tests/schema_active_versions.rs
+++ b/crates/kebab-app/tests/schema_active_versions.rs
@@ -0,0 +1,79 @@
+//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
+
+use kebab_app::schema_with_config;
+use kebab_config::Config;
+use kebab_core::SourceScope;
+
+fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
+    let mut cfg = Config::defaults(); // test config: defaults plus the overrides below
+    cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
+    cfg.workspace.exclude.clear(); // start with no exclude patterns
+    cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
+    cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
+    cfg.models.embedding.provider = "none".to_string(); // NOTE(review): presumably disables embeddings — confirm
+    cfg.models.embedding.dimensions = 0;
+    cfg.chunking.target_tokens = 80;
+    cfg.chunking.overlap_tokens = 20;
+    cfg
+}
+
+fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
+    SourceScope {
+        root: workspace_root.to_path_buf(), // scope is rooted at the test workspace
+        include: vec![], // NOTE(review): empty lists presumably mean "no pattern filtering" — confirm
+        exclude: vec![],
+    }
+}
+
+#[test]
+fn schema_models_active_arrays_empty_on_empty_corpus() {
+    let dir = tempfile::tempdir().unwrap();
+    let workspace = dir.path().join("kb");
+    std::fs::create_dir_all(&workspace).unwrap();
+    let cfg = minimal_config(dir.path(), &workspace);
+
+    let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
+    store.run_migrations().unwrap(); // migrate only; nothing is ingested in this test
+    drop(store); // release the store handle before querying the schema report
+
+    let s = schema_with_config(&cfg).unwrap();
+    assert!(
+        s.models.active_parsers.is_empty(),
+        "empty corpus → no parsers"
+    );
+    assert!(
+        s.models.active_chunkers.is_empty(),
+        "empty corpus → no chunkers"
+    );
+    // backward compat: the existing single field keeps the markdown default.
+    assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
+}
+
+#[test]
+fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
+    let dir = tempfile::tempdir().unwrap();
+    let workspace = dir.path().join("kb");
+    std::fs::create_dir_all(&workspace).unwrap();
+    std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
+    let cfg = minimal_config(dir.path(), &workspace);
+    let scope = minimal_scope(&workspace);
+
+    kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
+
+    let s = schema_with_config(&cfg).unwrap();
+    assert!(
+        !s.models.active_parsers.is_empty(),
+        "active_parsers populated after ingest"
+    );
+    assert!(
+        !s.models.active_chunkers.is_empty(),
+        "active_chunkers populated after ingest"
+    );
+    // active_parsers must be sorted (ORDER BY in SQL); active_chunkers order is not checked here.
+    let mut sorted = s.models.active_parsers.clone();
+    sorted.sort();
+    assert_eq!(
+        s.models.active_parsers, sorted,
+        "active_parsers must be sorted"
+    );
+}
--- a/crates/kebab-app/tests/schema_report.rs
+++ b/crates/kebab-app/tests/schema_report.rs
@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
        schema.wire.schemas
    );
    assert!(schema.capabilities.json_mode);
-    assert!(!schema.capabilities.streaming_ask);
+    assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
    assert!(
        schema.capabilities.mcp_server,
        "mcp_server should be true after fb-30",
--- a/crates/kebab-app/tests/search_budget_integration.rs
+++ b/crates/kebab-app/tests/search_budget_integration.rs
@@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() {

    assert_eq!(resp.hits.len(), baseline.len());
    assert!(!resp.truncated);
-    assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page");
+    assert!(
+        resp.next_cursor.is_none(),
+        "k=5 against 1 doc → no next page"
+    );
 }

 #[test]
@@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() {
 fn cursor_paginates_to_next_page() {
    let env = common::TestEnv::new();
    for i in 0..6 {
-        common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n"));
+        common::ingest_md(
+            &env,
+            &format!("d{i}.md"),
+            &format!("# T{i}\n\nrust topic {i}\n"),
+        );
    }
    let app = env.app();

@@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() {
        page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
    let p2_ids: std::collections::HashSet<_> =
        page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
-    assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits");
+    assert!(
+        p1_ids.is_disjoint(&p2_ids),
+        "page 2 must not repeat page 1 hits"
+    );
 }

 #[test]
--- a/crates/kebab-app/tests/search_korean.rs
+++ b/crates/kebab-app/tests/search_korean.rs
@@ -75,11 +75,9 @@ fn lexical_multi_token_korean_query_hits() {
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
        .expect("ingest must succeed");

-    let hits = kebab_app::search_with_config(
-        env.config.clone(),
-        common::lexical_query("해시 충돌"),
-    )
-    .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
+            .expect("search must succeed");

    assert!(
        !hits.is_empty(),
@@ -113,11 +111,9 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
        .expect("ingest must succeed");

-    let hits = kebab_app::search_with_config(
-        env.config.clone(),
-        common::lexical_query("Rust 충돌은"),
-    )
-    .expect("search must succeed");
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
+            .expect("search must succeed");

    assert!(
        !hits.is_empty(),
--- a/crates/kebab-app/tests/search_lexical.rs
+++ b/crates/kebab-app/tests/search_lexical.rs
@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
 fn lexical_search_empty_query_returns_empty() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
-    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("   "))
-        .unwrap();
+    let hits =
+        kebab_app::search_with_config(env.config.clone(), common::lexical_query("   ")).unwrap();
    assert!(hits.is_empty(), "blank query must short-circuit empty");
 }

@@ -107,17 +107,17 @@ fn search_uncached_returns_same_hits_as_cached() {
 #[test]
 fn first_ingest_bumps_corpus_revision() {
    let env = TestEnv::lexical_only();
-    let store_before =
-        kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
+    let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
    store_before.run_migrations().unwrap();
    assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");

-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
-    assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    assert!(
+        report.new + report.updated > 0,
+        "first ingest must commit ≥1 doc"
+    );

-    let store_after =
-        kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
+    let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
    assert!(
        store_after.corpus_revision() >= 1,
        "ingest commit must bump corpus_revision (got {})",
--- a/crates/kebab-app/tests/search_stale_integration.rs
+++ b/crates/kebab-app/tests/search_stale_integration.rs
@@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() {
    assert!(
        hits.iter().all(|h| !h.stale),
        "freshly-ingested doc must not be stale at default 30d threshold: {:?}",
-        hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
+        hits.iter()
+            .map(|h| (h.doc_path.0.clone(), h.stale))
+            .collect::<Vec<_>>()
    );
 }

@@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() {
    assert!(
        hits.iter().all(|h| !h.stale),
        "threshold=0 disables staleness even for year-old docs: {:?}",
-        hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
+        hits.iter()
+            .map(|h| (h.doc_path.0.clone(), h.stale))
+            .collect::<Vec<_>>()
    );
 }

--- a/crates/kebab-app/tests/search_vector.rs
+++ b/crates/kebab-app/tests/search_vector.rs
@@ -14,7 +14,8 @@ use common::TestEnv;
 fn require_avx_or_panic() {
    #[cfg(target_arch = "x86_64")]
    {
-        assert!(std::is_x86_feature_detected!("avx"), 
+        assert!(
+            std::is_x86_feature_detected!("avx"),
            "kb-app vector integration test requires AVX-capable hardware; \
             host CPU lacks AVX. Run on an AVX-capable machine."
        );
@@ -28,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
    require_avx_or_panic();

    let env = TestEnv::with_embeddings();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

@@ -55,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
    require_avx_or_panic();

    let env = TestEnv::with_embeddings();
-    let report =
-        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

--- a/crates/kebab-app/tests/skip_reason.rs
+++ b/crates/kebab-app/tests/skip_reason.rs
@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
    std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
    std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();

-    let report = kebab_app::ingest_with_config(
-        env.config.clone(),
-        env.scope(),
-        false,
-    ).unwrap();
+    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    let items = report.items.as_ref().expect("items array populated");
    let docx_item = items
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
        vec!["unsupported media type: <no-ext>".to_string()],
    );
    assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
-    assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
+    assert_eq!(
+        report.skipped_by_extension.get("<no-ext>").copied(),
+        Some(1)
+    );
 }
--- a/crates/kebab-app/tests/twin_files_fetch_span.rs
+++ b/crates/kebab-app/tests/twin_files_fetch_span.rs
@@ -44,8 +44,8 @@ fn twin_files_fetch_span_uses_correct_asset() {
    std::fs::write(dir_b.join("note.md"), content).unwrap();

    // Ingest all files (fixture workspace + our two new twins).
-    let report = ingest_with_config(env.config.clone(), env.scope(), false)
-        .expect("ingest must succeed");
+    let report =
+        ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");

    // Both twin paths must appear as New in the report.
@@ -53,8 +53,7 @@ fn twin_files_fetch_span_uses_correct_asset() {
    let twin_items: Vec<_> = items
        .iter()
        .filter(|i| {
-            i.doc_path.0.ends_with("src_a/note.md")
-                || i.doc_path.0.ends_with("src_b/note.md")
+            i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md")
        })
        .collect();
    assert_eq!(
@@ -149,7 +148,10 @@ fn twin_files_fetch_span_uses_correct_asset() {
    // at either twin, making one twin's span fetch behave incorrectly.
    let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("second ingest must succeed");
-    assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
+    assert_eq!(
+        report2.errors, 0,
+        "no ingest errors on second run; report={report2:?}"
+    );

    // Re-open app after second ingest and verify span still works on both.
    let app2 = env.app();
--- a/crates/kebab-app/tests/twin_files_idempotent.rs
+++ b/crates/kebab-app/tests/twin_files_idempotent.rs
@@ -43,9 +43,7 @@ fn twin_files_second_ingest_is_unchanged() {
    let items = first.items.as_ref().expect("items must be present");
    let twin_items: Vec<_> = items
        .iter()
-        .filter(|i| {
-            i.doc_path.0.ends_with("__init__.py")
-        })
+        .filter(|i| i.doc_path.0.ends_with("__init__.py"))
        .collect();
    assert_eq!(
        twin_items.len(),
@@ -63,8 +61,14 @@ fn twin_files_second_ingest_is_unchanged() {
    // Second ingest — same files, same content → both must be Unchanged.
    let second = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("second ingest must succeed");
-    assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}");
-    assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}");
+    assert_eq!(
+        second.errors, 0,
+        "second ingest: no errors; report={second:?}"
+    );
+    assert_eq!(
+        second.new, 0,
+        "second ingest: no new docs; report={second:?}"
+    );
    assert_eq!(
        second.updated, 0,
        "second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
--- a/crates/kebab-chunk/src/code_c_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_c_ast_v1.rs
@@ -39,17 +39,11 @@ impl Chunker for CodeCAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
-                _ => anyhow::bail!(
-                    "CodeCAstV1Chunker only handles code docs (got non-Code block)"
-                ),
+                _ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
            };
            if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
                anyhow::bail!(
@@ -68,9 +62,12 @@ impl Chunker for CodeCAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +81,13 @@ impl Chunker for CodeCAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +95,7 @@ impl Chunker for CodeCAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +103,13 @@ impl Chunker for CodeCAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +188,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,39 +211,60 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("c".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("c".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("c".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_c_ast_v1() {
-        assert_eq!(CodeCAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-c-ast-v1".into()));
+        assert_eq!(
+            CodeCAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-c-ast-v1".into())
+        );
    }

    #[test]
@@ -256,7 +282,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-c-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +297,32 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
+        let body = (0..500)
+            .map(|i| format!("\tx{i} = {i};\n"))
+            .collect::<String>();
        let code = format!("int big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +336,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCAstV1Chunker"));
@@ -304,11 +346,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
-        let base: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeCAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeCAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +366,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeCAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeCAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_cpp_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs
@@ -39,17 +39,13 @@ impl Chunker for CodeCppAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
-                _ => anyhow::bail!(
-                    "CodeCppAstV1Chunker only handles code docs (got non-Code block)"
-                ),
+                _ => {
+                    anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
+                }
            };
            if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
                anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeCppAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeCppAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeCppAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeCppAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,39 +213,60 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("cpp".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("cpp".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_cpp_ast_v1() {
-        assert_eq!(CodeCppAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-cpp-ast-v1".into()));
+        assert_eq!(
+            CodeCppAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-cpp-ast-v1".into())
+        );
    }

    #[test]
@@ -256,7 +284,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +299,32 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
+        let body = (0..500)
+            .map(|i| format!("\tx{i} = {i};\n"))
+            .collect::<String>();
        let code = format!("int big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +338,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCppAstV1Chunker"));
@@ -304,11 +348,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
-        let base: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeCppAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeCppAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +368,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeCppAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeCppAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_go_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_go_ast_v1.rs
@@ -39,17 +39,13 @@ impl Chunker for CodeGoAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
-                _ => anyhow::bail!(
-                    "CodeGoAstV1Chunker only handles code docs (got non-Code block)"
-                ),
+                _ => {
+                    anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
+                }
            };
            if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
                anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeGoAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeGoAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeGoAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeGoAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,46 +213,72 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("go".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("go".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("go".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_go_ast_v1() {
-        assert_eq!(CodeGoAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-go-ast-v1".into()));
+        assert_eq!(
+            CodeGoAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-go-ast-v1".into())
+        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "func parse() {\n\t// x\n}"),
-            ("Foo.double", 5, 7, "func double() int {\n\t//\n\treturn 0\n}"),
+            (
+                "Foo.double",
+                5,
+                7,
+                "func double() int {\n\t//\n\treturn 0\n}",
+            ),
        ]);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-go-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +304,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("\tx{i} := {i}")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("\tx{i} := {i}"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("func big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +344,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeGoAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
-        let base: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeGoAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeGoAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +374,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeGoAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeGoAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_java_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_java_ast_v1.rs
@@ -39,11 +39,7 @@ impl Chunker for CodeJavaAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeJavaAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeJavaAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeJavaAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeJavaAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,39 +213,60 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("java".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("java".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("java".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_java_ast_v1() {
-        assert_eq!(CodeJavaAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-java-ast-v1".into()));
+        assert_eq!(
+            CodeJavaAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-java-ast-v1".into())
+        );
    }

    #[test]
@@ -256,7 +284,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-java-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +299,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("\tint x{i} = {i};")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("\tint x{i} = {i};"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("void big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +339,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeJavaAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
-        let base: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeJavaAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeJavaAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +369,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeJavaAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeJavaAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_js_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_js_ast_v1.rs
@@ -39,17 +39,13 @@ impl Chunker for CodeJsAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
-                _ => anyhow::bail!(
-                    "CodeJsAstV1Chunker only handles code docs (got non-Code block)"
-                ),
+                _ => {
+                    anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
+                }
            };
            if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
                anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeJsAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeJsAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeJsAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeJsAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,46 +213,72 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("javascript".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("javascript".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_js_ast_v1() {
-        assert_eq!(CodeJsAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-js-ast-v1".into()));
+        assert_eq!(
+            CodeJsAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-js-ast-v1".into())
+        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "function parse() {\n    // x\n}"),
-            ("Foo.double", 5, 7, "function double() {\n    //\n    return 0;\n}"),
+            (
+                "Foo.double",
+                5,
+                7,
+                "function double() {\n    //\n    return 0;\n}",
+            ),
        ]);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-js-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +304,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("    const x{i} = {i};")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("    const x{i} = {i};"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("function big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +344,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeJsAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
-        let base: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeJsAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeJsAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +374,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeJsAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeJsAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs
@@ -39,11 +39,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeKotlinAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,46 +213,72 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("kotlin".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("kotlin".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_kotlin_ast_v1() {
-        assert_eq!(CodeKotlinAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-kotlin-ast-v1".into()));
+        assert_eq!(
+            CodeKotlinAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-kotlin-ast-v1".into())
+        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "fun parse() {\n\t// x\n}"),
-            ("Foo.double", 5, 7, "fun double(): Int {\n\t//\n\treturn 0\n}"),
+            (
+                "Foo.double",
+                5,
+                7,
+                "fun double(): Int {\n\t//\n\treturn 0\n}",
+            ),
        ]);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-kotlin-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +304,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("\tval x{i} = {i}")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("\tval x{i} = {i}"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("fun big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +344,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeKotlinAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
-        let base: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeKotlinAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeKotlinAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +374,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeKotlinAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeKotlinAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_python_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_python_ast_v1.rs
@@ -39,11 +39,7 @@ impl Chunker for CodePythonAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodePythonAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodePythonAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodePythonAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodePythonAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,39 +213,60 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("python".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("python".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("python".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_python_ast_v1() {
-        assert_eq!(CodePythonAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-python-ast-v1".into()));
+        assert_eq!(
+            CodePythonAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-python-ast-v1".into())
+        );
    }

    #[test]
@@ -256,7 +284,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-python-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +299,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("    x{i} = {i}")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("    x{i} = {i}"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("def big():\n{body}\n");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +339,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodePythonAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
-        let base: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodePythonAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodePythonAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +369,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodePythonAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodePythonAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_rust_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_rust_ast_v1.rs
@@ -39,11 +39,7 @@ impl Chunker for CodeRustAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeRustAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeRustAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeRustAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeRustAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,39 +213,60 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("rust".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("rust".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_rust_ast_v1() {
-        assert_eq!(CodeRustAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-rust-ast-v1".into()));
+        assert_eq!(
+            CodeRustAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-rust-ast-v1".into())
+        );
    }

    #[test]
@@ -256,7 +284,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-rust-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +299,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("    let x{i} = {i};")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("    let x{i} = {i};"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("pub fn big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +339,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeRustAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
-        let base: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeRustAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeRustAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +369,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeRustAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeRustAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/code_text_paragraph_v1.rs
+++ b/crates/kebab-chunk/src/code_text_paragraph_v1.rs
@@ -9,7 +9,7 @@

 use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
 use anyhow::Result;
-use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
+use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};

 pub const VERSION_LABEL: &str = "code-text-paragraph-v1";

--- a/crates/kebab-chunk/src/code_ts_ast_v1.rs
+++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs
@@ -39,17 +39,13 @@ impl Chunker for CodeTsAstV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        for b in &doc.blocks {
            let c = match b {
                Block::Code(c) => c,
-                _ => anyhow::bail!(
-                    "CodeTsAstV1Chunker only handles code docs (got non-Code block)"
-                ),
+                _ => {
+                    anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
+                }
            };
            if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
                anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeTsAstV1Chunker {
                _ => unreachable!("validated above"),
            };
            let (ls, le, symbol, lang) = match &cb.common.source_span {
-                SourceSpan::Code { line_start, line_end, symbol, lang } => {
-                    (*line_start, *line_end, symbol.clone(), lang.clone())
-                }
+                SourceSpan::Code {
+                    line_start,
+                    line_end,
+                    symbol,
+                    lang,
+                } => (*line_start, *line_end, symbol.clone(), lang.clone()),
                _ => unreachable!("validated above"),
            };
            let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeTsAstV1Chunker {
                    lang: lang.clone(),
                };
                out.push(make_chunk(
-                    doc, &chunker_version, &block_ids, &base_policy_hash,
-                    None, span, cb.code.clone(),
+                    doc,
+                    &chunker_version,
+                    &block_ids,
+                    &base_policy_hash,
+                    None,
+                    span,
+                    cb.code.clone(),
                ));
            } else {
                let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeTsAstV1Chunker {
                for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
                    let part_ls = ls + off_start;
                    let part_le = ls + off_end;
-                    let part_sym = symbol
-                        .as_ref()
-                        .map(|s| format!("{s} [part {}/{n}]", i + 1));
+                    let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
                    let span = SourceSpan::Code {
                        line_start: part_ls,
                        line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeTsAstV1Chunker {
                        lang: lang.clone(),
                    };
                    out.push(make_chunk(
-                        doc, &chunker_version, &block_ids, &base_policy_hash,
-                        Some(part_ls), span, text,
+                        doc,
+                        &chunker_version,
+                        &block_ids,
+                        &base_policy_hash,
+                        Some(part_ls),
+                        span,
+                        text,
                    ));
                }
            }
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
 mod tests {
    use super::*;
    use kebab_core::{
-        Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-        SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
-        SourceType, TrustLevel, WorkspacePath,
+        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

@@ -206,46 +213,72 @@ mod tests {
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
-                    common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
+                    common: CommonBlock {
+                        block_id: bid,
+                        heading_path: vec![],
+                        source_span: span,
+                    },
                    lang: Some("typescript".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
-            doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
-            lang: Lang("und".into()), blocks,
+            doc_id,
+            source_asset_id: aid,
+            workspace_path: wp,
+            title: "a".into(),
+            lang: Lang("und".into()),
+            blocks,
            metadata: Metadata {
-                aliases: vec![], tags: vec![],
+                aliases: vec![],
+                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
-                source_type: SourceType::Note, trust_level: TrustLevel::Primary,
-                user_id_alias: None, user: Default::default(),
-                repo: Some("kebab".into()), git_branch: Some("main".into()),
-                git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()),
+                source_type: SourceType::Note,
+                trust_level: TrustLevel::Primary,
+                user_id_alias: None,
+                user: Default::default(),
+                repo: Some("kebab".into()),
+                git_branch: Some("main".into()),
+                git_commit: Some("0".repeat(40)),
+                code_lang: Some("typescript".into()),
            },
            provenance: Provenance { events: vec![] },
-            parser_version: pv, schema_version: 1, doc_version: 1,
-            last_chunker_version: None, last_embedding_version: None,
+            parser_version: pv,
+            schema_version: 1,
+            doc_version: 1,
+            last_chunker_version: None,
+            last_embedding_version: None,
        }
    }
    fn policy() -> ChunkPolicy {
-        ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
+        ChunkPolicy {
+            target_tokens: 500,
+            overlap_tokens: 80,
            respect_markdown_headings: false,
-            chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
+            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
+        }
    }

    #[test]
    fn chunker_version_is_code_ts_ast_v1() {
-        assert_eq!(CodeTsAstV1Chunker.chunker_version(),
-            ChunkerVersion("code-ts-ast-v1".into()));
+        assert_eq!(
+            CodeTsAstV1Chunker.chunker_version(),
+            ChunkerVersion("code-ts-ast-v1".into())
+        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "function parse(): void {\n    // x\n}"),
-            ("Foo.double", 5, 7, "function double(): number {\n    //\n    return 0;\n}"),
+            (
+                "Foo.double",
+                5,
+                7,
+                "function double(): number {\n    //\n    return 0;\n}",
+            ),
        ]);
        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
            assert_eq!(c.chunker_version.0, "code-ts-ast-v1");
        }
        match &chunks[0].source_spans[0] {
-            SourceSpan::Code { symbol, line_start, line_end, .. } => {
+            SourceSpan::Code {
+                symbol,
+                line_start,
+                line_end,
+                ..
+            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
@@ -266,22 +304,33 @@ mod tests {

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
-        let body = (0..500).map(|i| format!("    const x{i} = {i};")).collect::<Vec<_>>().join("\n");
+        let body = (0..500)
+            .map(|i| format!("    const x{i} = {i};"))
+            .collect::<Vec<_>>()
+            .join("\n");
        let code = format!("function big(): void {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
-        assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
+        assert!(
+            chunks.len() >= 2,
+            "oversize unit must split, got {}",
+            chunks.len()
+        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
-                    assert!(symbol.as_deref().unwrap().starts_with("big [part "),
-                        "part-numbered symbol, got {symbol:?}");
+                    assert!(
+                        symbol.as_deref().unwrap().starts_with("big [part "),
+                        "part-numbered symbol, got {symbol:?}"
+                    );
                }
                _ => unreachable!(),
            }
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
-        let n = ids.len(); ids.sort_unstable(); ids.dedup();
+        let n = ids.len();
+        ids.sort_unstable();
+        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

@@ -295,7 +344,8 @@ mod tests {
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
-            text: "x".into(), inlines: vec![],
+            text: "x".into(),
+            inlines: vec![],
        })];
        let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeTsAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
-        let base: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
-            .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+        let base: Vec<String> = CodeTsAstV1Chunker
+            .chunk(&doc, &policy())
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
        for _ in 0..1000 {
-            let again: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
-                .unwrap().into_iter().map(|c| c.chunk_id.0).collect();
+            let again: Vec<String> = CodeTsAstV1Chunker
+                .chunk(&doc, &policy())
+                .unwrap()
+                .into_iter()
+                .map(|c| c.chunk_id.0)
+                .collect();
            assert_eq!(again, base);
        }
    }
@@ -316,7 +374,9 @@ mod tests {
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
-        assert_eq!(CodeTsAstV1Chunker.policy_hash(&p),
-            crate::MdHeadingV1Chunker.policy_hash(&p));
+        assert_eq!(
+            CodeTsAstV1Chunker.policy_hash(&p),
+            crate::MdHeadingV1Chunker.policy_hash(&p)
+        );
    }
 }
--- a/crates/kebab-chunk/src/dockerfile_file_v1.rs
+++ b/crates/kebab-chunk/src/dockerfile_file_v1.rs
@@ -7,7 +7,7 @@

 use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
 use anyhow::Result;
-use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
+use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};

 pub const VERSION_LABEL: &str = "dockerfile-file-v1";

--- a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
+++ b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
@@ -8,7 +8,7 @@

 use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
 use anyhow::Result;
-use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
+use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};

 pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";

@@ -49,19 +49,14 @@ impl Chunker for K8sManifestResourceV1Chunker {
                .get("apiVersion")
                .and_then(|v| v.as_str())
                .unwrap_or("");
-            let kind = mapping
-                .get("kind")
-                .and_then(|v| v.as_str())
-                .unwrap_or("");
+            let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");

            // Skip non-k8s documents.
            if api.is_empty() || kind.is_empty() {
                continue;
            }

-            let metadata = mapping
-                .get("metadata")
-                .and_then(|v| v.as_mapping());
+            let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
            let name = metadata
                .and_then(|m| m.get("name"))
                .and_then(|v| v.as_str())
@@ -118,10 +113,7 @@ fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
        .enumerate()
        .filter_map(|(i, l)| {
            let trimmed = l.trim_end();
-            if trimmed == "---"
-                || trimmed.starts_with("--- ")
-                || trimmed.starts_with("---\t")
-            {
+            if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") {
                Some(i)
            } else {
                None
--- a/crates/kebab-chunk/src/lib.rs
+++ b/crates/kebab-chunk/src/lib.rs
@@ -23,14 +23,14 @@ mod code_js_ast_v1;
 mod code_kotlin_ast_v1;
 mod code_python_ast_v1;
 mod code_rust_ast_v1;
+pub mod code_text_paragraph_v1;
 mod code_ts_ast_v1;
+pub mod dockerfile_file_v1;
+pub mod k8s_manifest_resource_v1;
+pub mod manifest_file_v1;
 mod md_heading_v1;
 mod pdf_page_v1;
 mod tier2_shared;
-pub mod k8s_manifest_resource_v1;
-pub mod dockerfile_file_v1;
-pub mod manifest_file_v1;
-pub mod code_text_paragraph_v1;

 pub use code_c_ast_v1::CodeCAstV1Chunker;
 pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
@@ -40,10 +40,10 @@ pub use code_js_ast_v1::CodeJsAstV1Chunker;
 pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
 pub use code_python_ast_v1::CodePythonAstV1Chunker;
 pub use code_rust_ast_v1::CodeRustAstV1Chunker;
+pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
 pub use code_ts_ast_v1::CodeTsAstV1Chunker;
+pub use dockerfile_file_v1::DockerfileFileV1Chunker;
+pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
+pub use manifest_file_v1::ManifestFileV1Chunker;
 pub use md_heading_v1::MdHeadingV1Chunker;
 pub use pdf_page_v1::PdfPageV1Chunker;
-pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
-pub use dockerfile_file_v1::DockerfileFileV1Chunker;
-pub use manifest_file_v1::ManifestFileV1Chunker;
-pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
--- a/crates/kebab-chunk/src/manifest_file_v1.rs
+++ b/crates/kebab-chunk/src/manifest_file_v1.rs
@@ -8,7 +8,7 @@

 use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
 use anyhow::Result;
-use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
+use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};

 pub const VERSION_LABEL: &str = "manifest-file-v1";

--- a/crates/kebab-chunk/src/md_heading_v1.rs
+++ b/crates/kebab-chunk/src/md_heading_v1.rs
@@ -1,8 +1,8 @@
 //! `md-heading-v1` — heading-aware Markdown chunker.

 use kebab_core::{
-    Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
-    ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
+    Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
+    SourceSpan, id_for_chunk,
 };

 /// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        let policy_hash = self.policy_hash(policy);
        let chunker_version = self.chunker_version();
        let mut out: Vec<Chunk> = Vec::new();
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
                    // `collect_overlap_seed` keeps seed ≤ target/2, so
                    // a flush here never produces a chunk smaller than
                    // the seed budget.
-                    let would_exceed = acc.text_tokens + next_tokens
-                        > policy.target_tokens
+                    let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
                        && acc.has_non_heading_content();
                    if would_exceed {
-                        let overlap_seed = collect_overlap_seed(
-                            &acc,
-                            policy.overlap_tokens,
-                            policy.target_tokens,
-                        );
-                        flush(
-                            &mut acc,
-                            doc,
-                            &chunker_version,
-                            &policy_hash,
-                            &mut out,
-                        );
+                        let overlap_seed =
+                            collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
+                        flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
                        // Seed next accumulator with the prior chunk's
                        // tail blocks (paragraph-level overlap). The
                        // heading is *not* re-included here — it lives
@@ -292,10 +278,11 @@ fn build_chunk(
 ) -> Chunk {
    debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");

-    let block_ids: Vec<BlockId> =
-        blocks.iter().map(|b| common(b).block_id.clone()).collect();
-    let source_spans: Vec<SourceSpan> =
-        blocks.iter().map(|b| common(b).source_span.clone()).collect();
+    let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
+    let source_spans: Vec<SourceSpan> = blocks
+        .iter()
+        .map(|b| common(b).source_span.clone())
+        .collect();

    // heading_path: pick the first non-Heading block's heading_path
    // (which already includes every parent heading per kb-normalize).
@@ -339,12 +326,7 @@ fn build_chunk(
        text.len().div_ceil(BYTES_PER_TOKEN)
    };

-    let chunk_id = id_for_chunk(
-        &doc.doc_id,
-        chunker_version,
-        &block_ids,
-        policy_hash,
-    );
+    let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);

    Chunk {
        chunk_id,
@@ -400,14 +382,8 @@ fn render_block_text(b: &Block) -> String {
            } else {
                i.alt.clone()
            };
-            let ocr = i
-                .ocr
-                .as_ref()
-                .map_or("", |o| o.joined.as_str());
-            let cap = i
-                .caption
-                .as_ref()
-                .map_or("", |c| c.text.as_str());
+            let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
+            let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
            [alt.as_str(), ocr, cap]
                .iter()
                .filter(|s| !s.is_empty())
@@ -447,9 +423,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
 mod tests {
    use super::*;
    use kebab_core::{
-        AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
-        Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
-        WorkspacePath, id_for_block,
+        AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
+        SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
    };
    use time::OffsetDateTime;

@@ -492,12 +467,7 @@ mod tests {
        SourceSpan::Line { start, end }
    }

-    fn common_for(
-        kind: &str,
-        heading_path: &[String],
-        ordinal: u32,
-        s: SourceSpan,
-    ) -> CommonBlock {
+    fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
        CommonBlock {
            block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
            heading_path: heading_path.to_vec(),
@@ -532,12 +502,7 @@ mod tests {
        })
    }

-    fn paragraph(
-        text: &str,
-        heading_path: &[&str],
-        ordinal: u32,
-        line: u32,
-    ) -> Block {
+    fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::Paragraph(TextBlock {
            common: common_for("paragraph", &hp, ordinal, span(line, line)),
@@ -546,12 +511,7 @@ mod tests {
        })
    }

-    fn code_block(
-        code: &str,
-        heading_path: &[&str],
-        ordinal: u32,
-        s: SourceSpan,
-    ) -> Block {
+    fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::Code(CodeBlock {
            common: common_for("code", &hp, ordinal, s),
@@ -578,12 +538,7 @@ mod tests {
        })
    }

-    fn image_ref(
-        alt: &str,
-        heading_path: &[&str],
-        ordinal: u32,
-        line: u32,
-    ) -> Block {
+    fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
        let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
        Block::ImageRef(ImageRefBlock {
            common: common_for("imageref", &hp, ordinal, span(line, line)),
--- a/crates/kebab-chunk/src/pdf_page_v1.rs
+++ b/crates/kebab-chunk/src/pdf_page_v1.rs
@@ -53,18 +53,21 @@
 //! one chunk per atomic block. PdfPageV1 cannot.
 //!
 //! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
-//! variant `format!("{base_policy_hash}#c{char_start}")` into the
-//! recipe's `policy_hash` slot (so distinct chunks distinguish via
-//! different policy_hash inputs), while storing the unmodified
-//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
-//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
+//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
+//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
+//! segment boundary, which is strictly increasing across a page's
+//! chunks even when the overlap walk pulls `actual_start` back to the
+//! previous chunk's own `actual_start`. The unmodified
+//! `base_policy_hash` is stored in `Chunk.policy_hash` so the field
+//! still answers "what policy was active". This is the v1.1
+//! second-iteration patch, logged in `tasks/HOTFIXES.md` (2026-05-27).

 use kebab_core::{
    Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
    SourceSpan, id_for_chunk,
 };

-const VERSION_LABEL: &str = "pdf-page-v1";
+const VERSION_LABEL: &str = "pdf-page-v1.1";
 const BYTES_PER_TOKEN: usize = 3;
 const POLICY_HASH_HEX_LEN: usize = 16;

@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
        hex[..POLICY_HASH_HEX_LEN].to_string()
    }

-    fn chunk(
-        &self,
-        doc: &CanonicalDocument,
-        policy: &ChunkPolicy,
-    ) -> anyhow::Result<Vec<Chunk>> {
+    fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
        // Validate up front — every block must be a Paragraph carrying
        // SourceSpan::Page. A mixed document signals a routing bug in
        // the caller (e.g. running this chunker on Markdown) and is
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
                ),
            };
            if !matches!(common.source_span, SourceSpan::Page { .. }) {
-                anyhow::bail!(
-                    "PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
-                );
+                anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
            }
        }

        let base_policy_hash = self.policy_hash(policy);
        let chunker_version = self.chunker_version();
-        let target_bytes = policy
-            .target_tokens
-            .saturating_mul(BYTES_PER_TOKEN)
-            .max(1);
+        let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
        // Clamp the overlap to half the target. Without this, a policy
        // with `overlap_tokens >= target_tokens` would make every chunk
        // fully re-emit the previous chunk's text — mirrors
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
                continue;
            }

-            for (char_start, char_end, slice) in
+            for (segment_start, char_start, char_end, slice) in
                chunk_page(&p.text, target_bytes, overlap_bytes)
            {
                // PDF chars-per-page comfortably fits in u32 (a single
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
                // typography); silent `as u32` truncation would only
                // surface on corrupted input, where an explicit panic
                // is preferable to an off-by-2^32 span.
-                let char_start_u32 = u32::try_from(char_start)
-                    .expect("page chars fit in u32");
-                let char_end_u32 =
-                    u32::try_from(char_end).expect("page chars fit in u32");
+                let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
+                let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
                let span = SourceSpan::Page {
                    page: page_num,
                    char_start: Some(char_start_u32),
                    char_end: Some(char_end_u32),
                };
                let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
-                // Per-chunk policy_hash variant prevents chunk_id
-                // collision when a page produces multiple chunks. See
-                // module docs for rationale.
-                let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
+                // v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
+                // variant uses `segment_start` (pre-overlap boundary,
+                // strictly increasing) instead of `char_start` (post-
+                // overlap, may collapse to prev_min). See module docs +
+                // spec §4.1 root cause + HOTFIXES.md 2026-05-27.
+                let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
                let chunk_id =
                    id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
                let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
@@ -198,18 +192,28 @@ impl Chunker for PdfPageV1Chunker {
 }

 /// Split a single page's text into ordered chunks, each represented as
-/// `(char_start, char_end, text_slice)`. Char positions are within the
-/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
+/// `(segment_start, actual_start, chunk_end, text_slice)`.
+///
+/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
+///   across the returned vec. Use this for chunk_id uniqueness suffixes.
+/// - `actual_start` = post-overlap start char index. May collapse to a
+///   previous chunk's `actual_start` under aggressive overlap policy.
+///   Use this for `SourceSpan::Page::char_start`.
+/// - `chunk_end` = chunk's end char index (exclusive).
 ///
 /// Returns an empty vector when `text` is empty or whitespace-only.
-fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
+fn chunk_page(
+    text: &str,
+    target_bytes: usize,
+    overlap_bytes: usize,
+) -> Vec<(usize, usize, usize, String)> {
    let chars: Vec<char> = text.chars().collect();
    let n = chars.len();
    if n == 0 {
        return Vec::new();
    }
    if text.len() <= target_bytes {
-        return vec![(0, n, text.to_string())];
+        return vec![(0, 0, n, text.to_string())];
    }

    // Build candidate boundary positions (char indices where a chunk
@@ -222,8 +226,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
        let c = chars[k];
        let nx = chars[k + 1];
        let is_paragraph_break = c == '\n' && nx == '\n';
-        let is_sentence_end =
-            matches!(c, '.' | '?' | '!') && nx.is_whitespace();
+        let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
        if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
            bounds.push(k + 2);
        }
@@ -235,11 +238,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
    bounds.dedup();

    // UTF-8 byte length of the slice between two char indices.
-    let byte_len = |a: usize, b: usize| -> usize {
-        chars[a..b].iter().map(|c| c.len_utf8()).sum()
-    };
+    let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };

-    let mut chunks: Vec<(usize, usize, String)> = Vec::new();
+    let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
    let mut seg_idx: usize = 0;
    while seg_idx + 1 < bounds.len() {
        let start = bounds[seg_idx];
@@ -264,7 +265,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
        // have absorbed up to `overlap_bytes` of bytes, but never past
        // the previous chunk's start (no full re-emission).
        let actual_start = if let Some(prev) = chunks.last() {
-            let prev_min = prev.0;
+            // prev tuple shape = (segment_start, actual_start, chunk_end, slice).
+            // overlap walk floor = previous chunk's actual_start (prev.1).
+            let prev_min = prev.1;
            let mut a = start;
            let mut acc_o: usize = 0;
            while a > prev_min {
@@ -281,7 +284,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
        };

        let slice: String = chars[actual_start..chunk_end].iter().collect();
-        chunks.push((actual_start, chunk_end, slice));
+        chunks.push((start, actual_start, chunk_end, slice));
        seg_idx = end_idx;
    }

@@ -390,7 +393,11 @@ mod tests {
            assert_eq!(c.heading_path, Vec::<String>::new());
            assert_eq!(c.source_spans.len(), 1);
            match c.source_spans[0] {
-                SourceSpan::Page { page, char_start, char_end } => {
+                SourceSpan::Page {
+                    page,
+                    char_start,
+                    char_end,
+                } => {
                    assert_eq!(page, (i as u32) + 1);
                    assert_eq!(char_start, Some(0));
                    assert!(char_end.unwrap() > 0);
@@ -435,11 +442,16 @@ mod tests {
        // N-1's char_end).
        for w in chunks.windows(2) {
            let prev_end = match w[0].source_spans[0] {
-                SourceSpan::Page { char_end: Some(e), .. } => e,
+                SourceSpan::Page {
+                    char_end: Some(e), ..
+                } => e,
                _ => panic!("missing char_end"),
            };
            let next_start = match w[1].source_spans[0] {
-                SourceSpan::Page { char_start: Some(s), .. } => s,
+                SourceSpan::Page {
+                    char_start: Some(s),
+                    ..
+                } => s,
                _ => panic!("missing char_start"),
            };
            assert!(
@@ -653,11 +665,17 @@ mod tests {
        // overlap) is the failure mode.
        for w in chunks.windows(2) {
            let prev_start = match w[0].source_spans[0] {
-                SourceSpan::Page { char_start: Some(s), .. } => s,
+                SourceSpan::Page {
+                    char_start: Some(s),
+                    ..
+                } => s,
                _ => panic!("missing char_start"),
            };
            let next_start = match w[1].source_spans[0] {
-                SourceSpan::Page { char_start: Some(s), .. } => s,
+                SourceSpan::Page {
+                    char_start: Some(s),
+                    ..
+                } => s,
                _ => panic!("missing char_start"),
            };
            assert!(
@@ -674,6 +692,43 @@ mod tests {
        assert_eq!(ids.len(), total, "chunk_ids must remain unique");
    }

+    #[test]
+    fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
+        // Trigger shape seen in Korean OCR text: 10 × "가" + ". " + 500 × "나".
+        // → first segment [0, 12), second segment [12, n).
+        //   page_text byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes = 1500
+        //   → multi-chunk. overlap_bytes = min(240, 750) = 240 (= 80 Hangul chars)
+        //   → the second chunk's actual_start walks back to prev_min = 0, so both
+        //     chunks used to share the `#c0` suffix (the pre-fix chunk_id collision).
+        // default_policy(500, 80): target_tokens = 500 → target_bytes = 500*3 = 1500
+        // (BYTES_PER_TOKEN = 3, same as one Hangul char); overlap_tokens = 80 → 240 bytes.
+        // Added to address verifier round 1 finding L-3.
+        let early_seg = "가".repeat(10);
+        let tail = "나".repeat(500);
+        let page_text = format!("{early_seg}. {tail}");
+
+        let doc = make_pdf_doc(&[&page_text]);
+        let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte
+        let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();
+
+        assert!(
+            chunks.len() >= 2,
+            "expected ≥2 chunks for {} byte page; got {}",
+            page_text.len(),
+            chunks.len()
+        );
+
+        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
+        ids.sort_unstable();
+        let total = ids.len();
+        ids.dedup();
+        assert_eq!(
+            ids.len(),
+            total,
+            "all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
+        );
+    }
+
    #[test]
    fn policy_hash_matches_md_heading_v1_for_identical_policy() {
        // Cross-chunker policy fingerprint identity — important so a
--- a/crates/kebab-chunk/src/tier2_shared.rs
+++ b/crates/kebab-chunk/src/tier2_shared.rs
@@ -113,7 +113,14 @@ pub(crate) fn build_chunk(
        symbol: Some(symbol.to_string()),
        lang: Some(lang.to_string()),
    };
-    build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key)
+    build_chunk_from_span(
+        doc,
+        chunker_version,
+        base_policy_hash,
+        text,
+        span,
+        split_key,
+    )
 }

 /// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).
--- a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeCAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
@@ -15,9 +15,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeCppAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use kebab_parse_code::CppAstExtractor;
 use serde_json::Value;
@@ -171,7 +171,9 @@ fn extract_cpp_fixture() -> CanonicalDocument {
        workspace_root: &root,
        config: &cfg,
    };
-    CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap()
+    CppAstExtractor::new()
+        .extract(&ctx, src.as_bytes())
+        .unwrap()
 }

 // ---------------------------------------------------------------------------
@@ -261,43 +263,61 @@ fn code_cpp_ast_extractor_snapshot() {
    let doc = extract_cpp_fixture();

    // Verify the extractor emits all expected named units.
-    let block_syms: Vec<Option<String>> = doc.blocks.iter().filter_map(|b| match b {
-        Block::Code(c) => match &c.common.source_span {
-            SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
+    let block_syms: Vec<Option<String>> = doc
+        .blocks
+        .iter()
+        .filter_map(|b| match b {
+            Block::Code(c) => match &c.common.source_span {
+                SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
+                _ => None,
+            },
            _ => None,
-        },
-        _ => None,
-    }).collect();
+        })
+        .collect();

    // Must include namespace-qualified class and its methods
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
        "class unit missing: {block_syms:?}"
    );
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
        "ctor unit missing: {block_syms:?}"
    );
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
        "dtor unit missing: {block_syms:?}"
    );
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
        "chunk_doc unit missing: {block_syms:?}"
    );
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
        "operator() unit missing: {block_syms:?}"
    );
    // Template function (inside kebab::chunk namespace in the fixture)
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::identity")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::chunk::identity")),
        "identity template fn unit missing: {block_syms:?}"
    );
    // Free function in outer namespace
    assert!(
-        block_syms.iter().any(|s| s.as_deref() == Some("kebab::global_helper")),
+        block_syms
+            .iter()
+            .any(|s| s.as_deref() == Some("kebab::global_helper")),
        "global_helper unit missing: {block_syms:?}"
    );
    // Global main
@@ -312,14 +332,23 @@ fn code_cpp_ast_extractor_snapshot() {
 fn code_cpp_ast_extractor_chunks_deterministic() {
    let doc1 = extract_cpp_fixture();
    let doc2 = extract_cpp_fixture();
-    assert_eq!(doc1.blocks, doc2.blocks, "extractor output non-deterministic");
+    assert_eq!(
+        doc1.blocks, doc2.blocks,
+        "extractor output non-deterministic"
+    );

    let policy = fixed_policy();
    let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap();
    let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap();
    assert_eq!(
-        chunks1.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
-        chunks2.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
+        chunks1
+            .iter()
+            .map(|c| c.chunk_id.0.clone())
+            .collect::<Vec<_>>(),
+        chunks2
+            .iter()
+            .map(|c| c.chunk_id.0.clone())
+            .collect::<Vec<_>>(),
        "chunker output non-deterministic"
    );
 }
--- a/crates/kebab-chunk/tests/code_go_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_go_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeGoAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_java_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_java_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeJavaAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeJsAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeKotlinAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodePythonAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeRustAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
+++ b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
@@ -13,9 +13,9 @@ use std::path::PathBuf;

 use kebab_chunk::CodeTsAstV1Chunker;
 use kebab_core::{
-    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
-    Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
-    id_for_block, id_for_doc,
+    AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
+    CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
+    WorkspacePath, id_for_block, id_for_doc,
 };
 use serde_json::Value;
 use time::OffsetDateTime;
--- a/crates/kebab-chunk/tests/dockerfile_file_v1.rs
+++ b/crates/kebab-chunk/tests/dockerfile_file_v1.rs
@@ -124,7 +124,11 @@ fn dockerfile_emits_single_chunk() {
                Some("<dockerfile>"),
                "symbol must be '<dockerfile>'"
            );
-            assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'");
+            assert_eq!(
+                lang.as_deref(),
+                Some("dockerfile"),
+                "lang must be 'dockerfile'"
+            );
        }
        other => panic!("expected SourceSpan::Code, got {other:?}"),
    }
--- a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs
+++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs
@@ -110,13 +110,11 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() {

    let symbols: Vec<&str> = chunks
        .iter()
-        .map(|c| {
-            match &c.source_spans[0] {
-                SourceSpan::Code { symbol, .. } => {
-                    symbol.as_deref().expect("symbol must be Some for k8s chunks")
-                }
-                other => panic!("expected Code span, got {other:?}"),
-            }
+        .map(|c| match &c.source_spans[0] {
+            SourceSpan::Code { symbol, .. } => symbol
+                .as_deref()
+                .expect("symbol must be Some for k8s chunks"),
+            other => panic!("expected Code span, got {other:?}"),
        })
        .collect();

@@ -270,7 +268,11 @@ fn k8s_oversize_splits_into_line_windows_sharing_symbol() {
    let ranges: Vec<(u32, u32)> = chunks
        .iter()
        .map(|c| match &c.source_spans[0] {
-            SourceSpan::Code { line_start, line_end, .. } => (*line_start, *line_end),
+            SourceSpan::Code {
+                line_start,
+                line_end,
+                ..
+            } => (*line_start, *line_end),
            other => panic!("expected Code span, got {other:?}"),
        })
        .collect();
--- a/crates/kebab-chunk/tests/long_section_snapshot.rs
+++ b/crates/kebab-chunk/tests/long_section_snapshot.rs
@@ -15,7 +15,7 @@ use std::path::PathBuf;

 use kebab_chunk::MdHeadingV1Chunker;
 use kebab_core::{
-    AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType,
+    AssetId, AssetStorage, Checksum, ChunkPolicy, Chunker, ChunkerVersion, MediaType,
    ParserVersion, RawAsset, SourceUri, WorkspacePath,
 };
 use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
@@ -65,8 +65,7 @@ fn long_section_chunks_snapshot() {
        Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
        None => 1,
    };
-    let (blocks, parse_warns) =
-        parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
+    let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse");

    // Pin parser_version so doc_id / block_ids are reproducible.
    let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
@@ -74,9 +73,8 @@ fn long_section_chunks_snapshot() {
    metadata.aliases.sort();
    metadata.tags.sort();

-    let doc =
-        build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
-            .expect("build_canonical_document");
+    let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
+        .expect("build_canonical_document");

    // Pin policy so policy_hash and chunk_ids are reproducible.
    let policy = ChunkPolicy {
@@ -102,8 +100,7 @@ fn long_section_chunks_snapshot() {
            baseline_path.display()
        ),
    };
-    let expected: Value =
-        serde_json::from_str(&baseline_text).expect("baseline parses as json");
+    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual != expected {
        if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
@@ -154,14 +151,8 @@ fn long_section_chunks_are_deterministic() {
        let mut metadata = metadata;
        metadata.aliases.sort();
        metadata.tags.sort();
-        let doc = build_canonical_document(
-            &asset,
-            metadata,
-            blocks,
-            &parser_version,
-            parse_warns,
-        )
-        .expect("build_canonical_document");
+        let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
+            .expect("build_canonical_document");
        let ids: Vec<String> = MdHeadingV1Chunker
            .chunk(&doc, &policy)
            .unwrap()
--- a/crates/kebab-chunk/tests/manifest_file_v1.rs
+++ b/crates/kebab-chunk/tests/manifest_file_v1.rs
@@ -107,9 +107,7 @@ fn cargo_toml_single_chunk_with_toml_lang() {
        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));

    let doc = manifest_doc("toml", &text);
-    let chunks = ManifestFileV1Chunker
-        .chunk(&doc, &policy())
-        .expect("chunk");
+    let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");

    assert_eq!(
        chunks.len(),
@@ -149,9 +147,7 @@ fn package_json_single_chunk_with_json_lang() {
        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));

    let doc = manifest_doc("json", &text);
-    let chunks = ManifestFileV1Chunker
-        .chunk(&doc, &policy())
-        .expect("chunk");
+    let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");

    assert_eq!(
        chunks.len(),
@@ -191,9 +187,7 @@ fn pom_xml_single_chunk_with_xml_lang() {
        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));

    let doc = manifest_doc("xml", &text);
-    let chunks = ManifestFileV1Chunker
-        .chunk(&doc, &policy())
-        .expect("chunk");
+    let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");

    assert_eq!(
        chunks.len(),
@@ -233,9 +227,7 @@ fn go_mod_single_chunk_with_go_mod_lang() {
        .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));

    let doc = manifest_doc("go-mod", &text);
-    let chunks = ManifestFileV1Chunker
-        .chunk(&doc, &policy())
-        .expect("chunk");
+    let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");

    assert_eq!(
        chunks.len(),
--- a/crates/kebab-cli/src/main.rs
+++ b/crates/kebab-cli/src/main.rs
@@ -156,7 +156,7 @@ enum Cmd {

        /// p9-fb-36: filter by `assets.media_type` kind. Comma-separated.
        /// Aliases: `md` → `markdown`. Other accepted: `markdown`, `pdf`,
-        /// `image`, `audio`, `other`. Unknown values match nothing.
+        /// `image`, `audio`, `code`, `other`. Unknown values match nothing.
        #[arg(long, value_delimiter = ',')]
        media: Vec<String>,

@@ -179,7 +179,12 @@ enum Cmd {
        /// canonical).  Repeatable or comma-separated.
        /// Examples: `rust`, `python`, `typescript`.
        /// Unknown values produce empty hits.
-        #[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')]
+        #[arg(
+            long = "code-lang",
+            value_name = "LANG",
+            num_args = 1,
+            value_delimiter = ','
+        )]
        code_lang: Vec<String>,

        /// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
@@ -464,7 +469,9 @@ fn parse_bool_env(s: &str) -> Result<bool, String> {
    match s.to_ascii_lowercase().as_str() {
        "1" | "true" | "yes" | "on" => Ok(true),
        "0" | "false" | "no" | "off" => Ok(false),
-        other => Err(format!("expected 1/0/true/false/yes/no/on/off, got {other:?}")),
+        other => Err(format!(
+            "expected 1/0/true/false/yes/no/on/off, got {other:?}"
+        )),
    }
 }

@@ -551,8 +558,14 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    "created  {}",
                    kebab_config::Config::xdg_config_path().display()
                );
-                println!("created  {}", kebab_config::Config::xdg_data_dir().display());
-                println!("created  {}", kebab_config::Config::xdg_state_dir().display());
+                println!(
+                    "created  {}",
+                    kebab_config::Config::xdg_data_dir().display()
+                );
+                println!(
+                    "created  {}",
+                    kebab_config::Config::xdg_state_dir().display()
+                );
                println!("hint     edit the config above, then `kebab ingest`");
            }
            Ok(())
@@ -565,7 +578,9 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
        } => {
            let cfg = kebab_config::Config::load(cli.config.as_deref())?;
            let scope = kebab_core::SourceScope {
-                root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
+                root: root
+                    .clone()
+                    .unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
                exclude: cfg.workspace.exclude.clone(),
                ..Default::default()
            };
@@ -580,9 +595,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                .unwrap_or(false);
            let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env);
            let (tx, rx) = std::sync::mpsc::channel::<kebab_app::IngestEvent>();
-            let display_handle = std::thread::spawn(move || {
-                progress::ProgressDisplay::new(mode).run(rx)
-            });
+            let display_handle =
+                std::thread::spawn(move || progress::ProgressDisplay::new(mode).run(rx));

            // p9-fb-04: register a Ctrl-C handler that flips the same
            // AtomicBool the facade polls at each step boundary. The
@@ -614,7 +628,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
            if cli.json {
                println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
            } else {
-                let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
+                let skipped_breakdown =
+                    kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
                let purged_suffix = if report.purged_deleted_files > 0 {
                    format!("  purged {}", report.purged_deleted_files)
                } else {
@@ -640,7 +655,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                let cfg = kebab_config::Config::load(cli.config.as_deref())?;
                let docs = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())?;
                if cli.json {
-                    println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?);
+                    println!(
+                        "{}",
+                        serde_json::to_string(&wire::wire_doc_summaries(&docs))?
+                    );
                } else {
                    for d in &docs {
                        println!("{}\t{}", d.doc_id, d.doc_path.0);
@@ -667,7 +685,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                let cfg = kebab_config::Config::load(cli.config.as_deref())?;
                let chunk_id: kebab_core::ChunkId = id.parse()?;
                let chunk = kebab_app::inspect_chunk_with_config(cfg, &chunk_id)?;
-                println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?);
+                println!(
+                    "{}",
+                    serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?
+                );
                Ok(())
            }
        },
@@ -708,7 +729,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
            };
            let result = kebab_app::fetch_with_config(cfg, query, opts)?;
            if cli.json {
-                println!("{}", serde_json::to_string(&wire::wire_fetch_result(&result))?);
+                println!(
+                    "{}",
+                    serde_json::to_string(&wire::wire_fetch_result(&result))?
+                );
            } else {
                render_fetch_plain(&result);
            }
@@ -752,30 +776,21 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    if line.trim().is_empty() {
                        continue;
                    }
-                    let v: serde_json::Value =
-                        serde_json::from_str(&line).map_err(|e| {
-                            anyhow::Error::new(kebab_app::StructuredError(
-                                kebab_app::ErrorV1 {
-                                    schema_version: kebab_app::ERROR_V1_ID
-                                        .to_string(),
-                                    code: "config_invalid".to_string(),
-                                    message: format!(
-                                        "stdin ndjson line {} parse error: {e}",
-                                        lineno + 1
-                                    ),
-                                    details: serde_json::Value::Null,
-                                    hint: Some(
-                                        "each line must be a JSON object with at least `query`"
-                                            .to_string(),
-                                    ),
-                                },
-                            ))
-                        })?;
+                    let v: serde_json::Value = serde_json::from_str(&line).map_err(|e| {
+                        anyhow::Error::new(kebab_app::StructuredError(kebab_app::ErrorV1 {
+                            schema_version: kebab_app::ERROR_V1_ID.to_string(),
+                            code: "config_invalid".to_string(),
+                            message: format!("stdin ndjson line {} parse error: {e}", lineno + 1),
+                            details: serde_json::Value::Null,
+                            hint: Some(
+                                "each line must be a JSON object with at least `query`".to_string(),
+                            ),
+                        }))
+                    })?;
                    raw_items.push(v);
                }

-                let (items, summary) =
-                    kebab_app::bulk_search_with_config(cfg, raw_items)?;
+                let (items, summary) = kebab_app::bulk_search_with_config(cfg, raw_items)?;

                if cli.json {
                    let mut stdout = std::io::stdout().lock();
@@ -799,11 +814,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                        if let Some(err) = &item.error {
                            writeln!(stdout, "error: {err}")?;
                        } else if let Some(resp) = &item.response {
-                            writeln!(
-                                stdout,
-                                "{}",
-                                serde_json::to_string_pretty(resp)?
-                            )?;
+                            writeln!(stdout, "{}", serde_json::to_string_pretty(resp)?)?;
                        }
                        writeln!(stdout)?;
                    }
@@ -819,6 +830,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {

            // p9-fb-42: bulk mode requires no query; single-query mode requires query.
            let query_text = match query.as_ref() {
+                Some(q) if q.trim().is_empty() => {
+                    return Err(anyhow::Error::new(kebab_app::StructuredError(
+                        kebab_app::ErrorV1 {
+                            schema_version: kebab_app::ERROR_V1_ID.to_string(),
+                            code: "invalid_input".to_string(),
+                            message: "query is empty; provide a non-empty search term or use --bulk".into(),
+                            details: serde_json::Value::Null,
+                            hint: Some("e.g. `kebab search 'rust async'` or `kebab search --bulk < queries.ndjson`".into()),
+                        },
+                    )));
+                }
                Some(q) => q.clone(),
                None => {
                    return Err(anyhow::anyhow!("query is required unless --bulk is set"));
@@ -832,8 +854,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    other => other.to_string(),
                }
            }
-            let media_norm: Vec<String> =
-                media.iter().map(|s| normalize_media_alias(s)).collect();
+            let media_norm: Vec<String> = media.iter().map(|s| normalize_media_alias(s)).collect();

            // p9-fb-36: parse --ingested-after as RFC3339; structured error on failure.
            let ingested_after_parsed: Option<time::OffsetDateTime> =
@@ -845,8 +866,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                        ) {
                            Ok(ts) => Some(ts),
                            Err(e) => {
-                                return Err(anyhow::Error::new(
-                                    kebab_app::StructuredError(kebab_app::ErrorV1 {
+                                return Err(anyhow::Error::new(kebab_app::StructuredError(
+                                    kebab_app::ErrorV1 {
                                        schema_version: kebab_app::ERROR_V1_ID.to_string(),
                                        code: "config_invalid".to_string(),
                                        message: format!(
@@ -856,8 +877,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                                        hint: Some(
                                            "expected format like 2026-04-01T00:00:00Z".to_string(),
                                        ),
-                                    }),
-                                ));
+                                    },
+                                )));
                            }
                        }
                    }
@@ -932,11 +953,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    };
                    println!(
                        "{:>2}. {:.4}  {}{}{}",
-                        h.rank,
-                        h.retrieval.fusion_score,
-                        stale_tag,
-                        h.doc_path.0,
-                        heading,
+                        h.rank, h.retrieval.fusion_score, stale_tag, h.doc_path.0, heading,
                    );
                }
                // p9-fb-34: truncation hint goes to stderr so it
@@ -958,15 +975,33 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    if let Some(t) = &resp.trace {
                        eprintln!();
                        eprintln!("Trace:");
-                        eprintln!("  lexical ({} hits, {}ms):", t.lexical.len(), t.timing.lexical_ms);
+                        eprintln!(
+                            "  lexical ({} hits, {}ms):",
+                            t.lexical.len(),
+                            t.timing.lexical_ms
+                        );
                        for c in t.lexical.iter().take(3) {
-                            eprintln!("    rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
+                            eprintln!(
+                                "    rank={} score={:.4} chunk={}",
+                                c.rank, c.score, c.chunk_id.0
+                            );
                        }
-                        eprintln!("  vector ({} hits, {}ms):", t.vector.len(), t.timing.vector_ms);
+                        eprintln!(
+                            "  vector ({} hits, {}ms):",
+                            t.vector.len(),
+                            t.timing.vector_ms
+                        );
                        for c in t.vector.iter().take(3) {
-                            eprintln!("    rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
+                            eprintln!(
+                                "    rank={} score={:.4} chunk={}",
+                                c.rank, c.score, c.chunk_id.0
+                            );
                        }
-                        eprintln!("  fusion ({} inputs, {}ms)", t.rrf_inputs.len(), t.timing.fusion_ms);
+                        eprintln!(
+                            "  fusion ({} inputs, {}ms)",
+                            t.rrf_inputs.len(),
+                            t.timing.fusion_ms
+                        );
                        eprintln!("  total: {}ms", t.timing.total_ms);
                    }
                }
@@ -988,6 +1023,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
            multi_hop,
        } => {
            let cfg = kebab_config::Config::load(cli.config.as_deref())?;
+            if query.trim().is_empty() {
+                return Err(anyhow::Error::new(kebab_app::StructuredError(
+                    kebab_app::ErrorV1 {
+                        schema_version: kebab_app::ERROR_V1_ID.to_string(),
+                        code: "invalid_input".to_string(),
+                        message: "query is empty; provide a non-empty prompt".into(),
+                        details: serde_json::Value::Null,
+                        hint: Some("e.g. `kebab ask \"explain this code\"`".into()),
+                    },
+                )));
+            }
            if *stream {
                // p9-fb-33: streaming branch. Background thread runs
                // ask_with_config (which calls into the rag pipeline);
@@ -1017,16 +1063,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                let cfg2 = cfg.clone();
                let q = query.clone();
                let session2 = session.clone();
-                let handle = std::thread::spawn(
-                    move || -> anyhow::Result<kebab_core::Answer> {
-                        match session2.as_deref() {
-                            Some(sid) => kebab_app::ask_with_session_with_config(
-                                cfg2, sid, &q, opts,
-                            ),
-                            None => kebab_app::ask_with_config(cfg2, &q, opts),
-                        }
-                    },
-                );
+                let handle = std::thread::spawn(move || -> anyhow::Result<kebab_core::Answer> {
+                    match session2.as_deref() {
+                        Some(sid) => kebab_app::ask_with_session_with_config(cfg2, sid, &q, opts),
+                        None => kebab_app::ask_with_config(cfg2, &q, opts),
+                    }
+                });

                // Drain receiver, write ndjson to stderr until
                // completion or BrokenPipe.
@@ -1302,9 +1344,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                    println!("{}", serde_json::to_string_pretty(&agg)?);
                } else {
                    println!("run_id: {run_id}");
-                    println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries);
-                    println!("hit@1:   {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0));
-                    println!("hit@5:   {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0));
+                    println!(
+                        "queries: {} ({} failed)",
+                        agg.total_queries, agg.failed_queries
+                    );
+                    println!(
+                        "hit@1:   {:.4}",
+                        agg.hit_at_k.get(&1).copied().unwrap_or(0.0)
+                    );
+                    println!(
+                        "hit@5:   {:.4}",
+                        agg.hit_at_k.get(&5).copied().unwrap_or(0.0)
+                    );
                    println!("MRR:     {:.4}", agg.mrr);
                }
                Ok(())
@@ -1354,8 +1405,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
            } else {
                println!(
                    "ingest-file: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
-                    report.scanned, report.new, report.updated,
-                    report.unchanged, report.skipped, report.errors
+                    report.scanned,
+                    report.new,
+                    report.updated,
+                    report.unchanged,
+                    report.skipped,
+                    report.errors
                );
            }
            Ok(())
@@ -1368,20 +1423,20 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
                .read_to_string(&mut body)
                .context("kebab ingest-stdin: read stdin")?;
            let cfg = kebab_config::Config::load(cli.config.as_deref())?;
-            let report = kebab_app::ingest_stdin_with_config(
-                cfg,
-                &body,
-                title,
-                source_uri.as_deref(),
-            )?;
+            let report =
+                kebab_app::ingest_stdin_with_config(cfg, &body, title, source_uri.as_deref())?;
            if cli.json {
                let v = wire::wire_ingest(&report);
                println!("{}", serde_json::to_string(&v)?);
            } else {
                println!(
                    "ingest-stdin: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
-                    report.scanned, report.new, report.updated,
-                    report.unchanged, report.skipped, report.errors
+                    report.scanned,
+                    report.new,
+                    report.updated,
+                    report.unchanged,
+                    report.skipped,
+                    report.errors
                );
            }
            Ok(())
@@ -1410,10 +1465,7 @@ fn render_ask_plain_citations(
    writeln!(w)?;
    writeln!(w, "근거:")?;
    for (idx, c) in ans.citations.iter().enumerate() {
-        let marker = c
-            .marker
-            .clone()
-            .unwrap_or_else(|| format!("{}", idx + 1));
+        let marker = c.marker.clone().unwrap_or_else(|| format!("{}", idx + 1));
        // p9-fb-32: `[stale]` prefix on the URI for citations whose
        // `stale: true`. Yellow on TTY, plain otherwise — mirrors the
        // search-plain renderer in `Cmd::Search`.
@@ -1474,7 +1526,10 @@ fn print_schema_text(s: &kebab_app::SchemaV1) {
    println!("  parser_version          {}", s.models.parser_version);
    println!("  chunker_version         {}", s.models.chunker_version);
    println!("  embedding_version       {}", s.models.embedding_version);
-    println!("  prompt_template_version {}", s.models.prompt_template_version);
+    println!(
+        "  prompt_template_version {}",
+        s.models.prompt_template_version
+    );
    println!("  index_version           {}", s.models.index_version);
    println!("  corpus_revision         {}", s.models.corpus_revision);
    println!();
@@ -1523,9 +1578,7 @@ fn confirm_destructive(
 /// Confirm prompt for `--orphans-only`: shows the orphan count + a
 /// sample of up to 5 paths so the user knows what will be purged before
 /// committing. No filesystem paths are removed — only store records.
-fn confirm_orphans_only(
-    orphan_paths: &[kebab_core::WorkspacePath],
-) -> anyhow::Result<bool> {
+fn confirm_orphans_only(orphan_paths: &[kebab_core::WorkspacePath]) -> anyhow::Result<bool> {
    use std::io::Write;
    let n = orphan_paths.len();
    let mut out = std::io::stderr().lock();
@@ -1538,11 +1591,7 @@ fn confirm_orphans_only(
        return Ok(true);
    }

-    let sample: Vec<&str> = orphan_paths
-        .iter()
-        .take(5)
-        .map(|p| p.0.as_str())
-        .collect();
+    let sample: Vec<&str> = orphan_paths.iter().take(5).map(|p| p.0.as_str()).collect();
    let sample_str = sample.join(", ");
    let ellipsis = if n > 5 { ", …" } else { "" };

@@ -1571,19 +1620,28 @@ fn render_fetch_plain(r: &kebab_core::FetchResult) {
            if !r.context_before.is_empty() {
                println!("\n=== before ===");
                for c in &r.context_before {
-                    let heading = c.heading_path.last().map_or("", std::string::String::as_str);
+                    let heading = c
+                        .heading_path
+                        .last()
+                        .map_or("", std::string::String::as_str);
                    println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
                }
            }
            if let Some(c) = &r.chunk {
                println!("\n=== target ===");
-                let heading = c.heading_path.last().map_or("", std::string::String::as_str);
+                let heading = c
+                    .heading_path
+                    .last()
+                    .map_or("", std::string::String::as_str);
                println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
            }
            if !r.context_after.is_empty() {
                println!("\n=== after ===");
                for c in &r.context_after {
-                    let heading = c.heading_path.last().map_or("", std::string::String::as_str);
+                    let heading = c
+                        .heading_path
+                        .last()
+                        .map_or("", std::string::String::as_str);
                    println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
                }
            }
@@ -1615,8 +1673,8 @@ mod tests {
    //! against a synthetic `Answer` instead.
    use super::*;
    use kebab_core::{
-        Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef,
-        PromptTemplateVersion, SearchMode, TokenUsage, TraceId, WorkspacePath,
+        Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, PromptTemplateVersion,
+        SearchMode, TokenUsage, TraceId, WorkspacePath,
    };
    use time::OffsetDateTime;

@@ -1712,4 +1770,3 @@ mod tests {
        );
    }
 }
-
--- a/crates/kebab-cli/src/progress.rs
+++ b/crates/kebab-cli/src/progress.rs
@@ -124,11 +124,9 @@ impl ProgressDisplay {
                    bar.set_length(u64::from(*total));
                    bar.set_position(0);
                    bar.set_style(
-                        ProgressStyle::with_template(
-                            "ingest [{bar:30}] {pos}/{len} {wide_msg}",
-                        )
-                        .unwrap()
-                        .progress_chars("=> "),
+                        ProgressStyle::with_template("ingest [{bar:30}] {pos}/{len} {wide_msg}")
+                            .unwrap()
+                            .progress_chars("=> "),
                    );
                    bar.set_message("");
                }
@@ -170,11 +168,7 @@ impl ProgressDisplay {
                    let _ = writeln!(
                        err,
                        "ingest: complete (scanned={} new={} updated={} skipped={} errors={})",
-                        counts.scanned,
-                        counts.new,
-                        counts.updated,
-                        counts.skipped,
-                        counts.errors,
+                        counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
                    );
                }
            }
@@ -193,14 +187,42 @@ impl ProgressDisplay {
                    let _ = writeln!(
                        err,
                        "ingest: aborted (scanned={} new={} updated={} skipped={} errors={})",
-                        counts.scanned,
-                        counts.new,
-                        counts.updated,
-                        counts.skipped,
-                        counts.errors,
+                        counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
                    );
                }
            }
+            // v0.20.0 sub-item 1: per-page PDF OCR events — sub-progress lines
+            // under AssetStarted for scanned PDF. spec §4.6.1 line 1085-1086.
+            // When skipped=true (no DCTDecode, or engine failure), print a skip line.
+            IngestEvent::PdfOcrStarted { page } => {
+                if !quiet {
+                    let mut err = std::io::stderr().lock();
+                    let _ = writeln!(err, "  📷 OCR page {page}...");
+                }
+            }
+            IngestEvent::PdfOcrFinished {
+                page,
+                ms,
+                chars,
+                ocr_engine,
+                skipped,
+                ..
+            } => {
+                if !quiet {
+                    let mut err = std::io::stderr().lock();
+                    if *skipped {
+                        let _ = writeln!(
+                            err,
+                            "  ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)"
+                        );
+                    } else {
+                        let _ = writeln!(
+                            err,
+                            "  ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})"
+                        );
+                    }
+                }
+            }
        }
        Ok(())
    }
@@ -231,7 +253,10 @@ mod tests {

    #[test]
    fn from_flags_json_takes_priority_over_tty() {
-        assert_eq!(ProgressMode::from_flags(true, false, false), ProgressMode::Json);
+        assert_eq!(
+            ProgressMode::from_flags(true, false, false),
+            ProgressMode::Json
+        );
    }

    #[test]
--- a/crates/kebab-cli/src/wire.rs
+++ b/crates/kebab-cli/src/wire.rs
@@ -114,10 +114,7 @@ pub fn wire_answer(a: &Answer) -> Value {
 /// The timestamp is added at emit time (caller fills `ts`), since the
 /// pipeline doesn't carry one in the in-process enum — mirrors the
 /// `wire_ingest_progress` pattern (§2 ingest_progress.v1).
-pub fn wire_answer_event(
-    ev: &kebab_app::StreamEvent,
-    ts: time::OffsetDateTime,
-) -> Value {
+pub fn wire_answer_event(ev: &kebab_app::StreamEvent, ts: time::OffsetDateTime) -> Value {
    let mut v = serde_json::to_value(ev).expect("StreamEvent serializes");
    let ts_str = ts
        .format(&time::format_description::well_known::Rfc3339)
@@ -161,9 +158,7 @@ pub fn wire_reset(r: &kebab_app::ResetReport) -> Value {
 /// wall-clock — the emit site is the only place that knows the moment
 /// of emission, so the timestamp is stamped here rather than carried
 /// on the event itself.
-pub fn wire_ingest_progress(
-    event: &kebab_app::IngestEvent,
-) -> anyhow::Result<Value> {
+pub fn wire_ingest_progress(event: &kebab_app::IngestEvent) -> anyhow::Result<Value> {
    let mut v = serde_json::to_value(event)?;
    if let Value::Object(ref mut map) = v {
        map.insert(
@@ -305,15 +300,15 @@ mod tests {
        let v = wire_search_response(&r);
        assert_eq!(schema_of(&v), Some("search_response.v1"));
        assert!(v.get("hits").and_then(|h| h.as_array()).is_some());
-        assert_eq!(
-            v.get("hits").and_then(|h| h.as_array()).unwrap().len(),
-            0
-        );
+        assert_eq!(v.get("hits").and_then(|h| h.as_array()).unwrap().len(), 0);
        assert_eq!(
            v.get("next_cursor").and_then(|c| c.as_str()),
            Some("opaque-cursor-abc")
        );
-        assert_eq!(v.get("truncated").and_then(serde_json::Value::as_bool), Some(true));
+        assert_eq!(
+            v.get("truncated").and_then(serde_json::Value::as_bool),
+            Some(true)
+        );
    }

    #[test]
@@ -322,23 +317,36 @@ mod tests {
        let schema = SchemaV1 {
            schema_version: "schema.v1".to_string(),
            kebab_version: "0.2.1".to_string(),
-            wire: WireBlock { schemas: vec!["answer.v1".to_string()] },
+            wire: WireBlock {
+                schemas: vec!["answer.v1".to_string()],
+            },
            capabilities: Capabilities {
-                json_mode: true, ingest_progress: true, ingest_cancellation: true,
-                rag_multi_turn: true, search_cache: true, incremental_ingest: true,
-                streaming_ask: false, http_daemon: false, mcp_server: false,
-                single_file_ingest: false, bulk_search: true,
+                json_mode: true,
+                ingest_progress: true,
+                ingest_cancellation: true,
+                rag_multi_turn: true,
+                search_cache: true,
+                incremental_ingest: true,
+                streaming_ask: false,
+                http_daemon: false,
+                mcp_server: false,
+                single_file_ingest: false,
+                bulk_search: true,
            },
            models: Models {
                parser_version: "x".to_string(),
                chunker_version: "y".to_string(),
+                active_parsers: vec![],
+                active_chunkers: vec![],
                embedding_version: "z".to_string(),
                prompt_template_version: "w".to_string(),
                index_version: "v".to_string(),
                corpus_revision: 7,
            },
            stats: Stats {
-                doc_count: 1, chunk_count: 2, asset_count: 1,
+                doc_count: 1,
+                chunk_count: 2,
+                asset_count: 1,
                last_ingest_at: None,
                media_breakdown: Default::default(),
                lang_breakdown: Default::default(),
@@ -350,7 +358,10 @@ mod tests {
        };
        let v = wire_schema(&schema);
        assert_eq!(schema_of(&v), Some("schema.v1"));
-        assert_eq!(v.get("kebab_version").and_then(Value::as_str), Some("0.2.1"));
+        assert_eq!(
+            v.get("kebab_version").and_then(Value::as_str),
+            Some("0.2.1")
+        );
    }

    #[test]
@@ -365,7 +376,10 @@ mod tests {
        };
        let v = wire_error_v1(&err);
        assert_eq!(schema_of(&v), Some("error.v1"));
-        assert_eq!(v.get("code").and_then(Value::as_str), Some("config_invalid"));
+        assert_eq!(
+            v.get("code").and_then(Value::as_str),
+            Some("config_invalid")
+        );
    }

    #[test]
@@ -391,8 +405,10 @@ mod tests {

    #[test]
    fn search_response_with_trace_serializes_trace_field() {
-        use kebab_core::{SearchTrace, TraceCandidate, TraceFusionInput,
-                         TraceTiming, ChunkId, DocumentId, WorkspacePath};
+        use kebab_core::{
+            ChunkId, DocumentId, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming,
+            WorkspacePath,
+        };
        let r = kebab_app::SearchResponse {
            hits: vec![],
            next_cursor: None,
@@ -412,7 +428,12 @@ mod tests {
                    vector_rank: None,
                    fusion_score: 0.0,
                }],
-                timing: TraceTiming { lexical_ms: 5, vector_ms: 0, fusion_ms: 1, total_ms: 7 },
+                timing: TraceTiming {
+                    lexical_ms: 5,
+                    vector_ms: 0,
+                    fusion_ms: 1,
+                    total_ms: 7,
+                },
            }),
            hint: None,
        };
--- a/crates/kebab-cli/tests/cli_config_not_found.rs
+++ b/crates/kebab-cli/tests/cli_config_not_found.rs
@@ -0,0 +1,64 @@
+//! Integration tests for Bug #10: explicit --config <path> that does not exist
+//! must fail with exit≠0 and error.v1 code=config_not_found (not silently fall
+//! back to XDG defaults).
+
+use serde_json::Value;
+use std::process::Command;
+
+fn kebab_bin() -> String {
+    env!("CARGO_BIN_EXE_kebab").to_string()
+}
+
+fn parse_error_v1(stderr: &str) -> Value {
+    let last = stderr
+        .lines()
+        .last()
+        .expect("expected error.v1 ndjson on stderr");
+    serde_json::from_str(last)
+        .unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}"))
+}
+
+#[test]
+fn invalid_config_path_emits_error_v1_with_nonzero_exit() {
+    let absent = "/tmp/__kebab_bugfix3_absolute_nonexistent.toml";
+    assert!(!std::path::Path::new(absent).exists());
+
+    let out = Command::new(kebab_bin())
+        .args(["search", "rust", "--config", absent, "--json"])
+        .output()
+        .expect("spawn kebab");
+
+    assert_ne!(
+        out.status.code(),
+        Some(0),
+        "exit must be nonzero on missing --config"
+    );
+    let stderr = String::from_utf8_lossy(&out.stderr);
+    let v = parse_error_v1(&stderr);
+    assert_eq!(v["schema_version"], "error.v1");
+    assert_eq!(v["code"], "config_not_found");
+    assert!(v["hint"].is_string(), "hint must be present");
+}
+
+#[test]
+fn invalid_relative_config_path_emits_config_not_found() {
+    // Bug #10 spec §6 R-1: relative path も cwd-relative で cover.
+    let tmp = tempfile::tempdir().unwrap();
+    let out = Command::new(kebab_bin())
+        .args([
+            "search",
+            "rust",
+            "--config",
+            "nonexistent-rel.toml",
+            "--json",
+        ])
+        .current_dir(tmp.path())
+        .output()
+        .expect("spawn kebab");
+
+    assert_ne!(out.status.code(), Some(0));
+    let stderr = String::from_utf8_lossy(&out.stderr);
+    let v = parse_error_v1(&stderr);
+    assert_eq!(v["schema_version"], "error.v1");
+    assert_eq!(v["code"], "config_not_found");
+}
--- a/crates/kebab-cli/tests/cli_empty_query.rs
+++ b/crates/kebab-cli/tests/cli_empty_query.rs
@@ -0,0 +1,50 @@
+//! Integration tests for Bug #14: empty or whitespace-only query must emit
+//! error.v1 code=invalid_input and exit nonzero (not silent 0-hit return).
+
+use serde_json::Value;
+use std::process::Command;
+
+fn kebab_bin() -> String {
+    env!("CARGO_BIN_EXE_kebab").to_string()
+}
+
+fn parse_error_v1(stderr: &str) -> Value {
+    let last = stderr
+        .lines()
+        .last()
+        .expect("expected error.v1 ndjson on stderr");
+    serde_json::from_str(last)
+        .unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}"))
+}
+
+#[test]
+fn search_empty_query_emits_invalid_input() {
+    for q in ["", "   "] {
+        let out = Command::new(kebab_bin())
+            .args(["search", q, "--json"])
+            .output()
+            .expect("spawn kebab");
+        assert_ne!(
+            out.status.code(),
+            Some(0),
+            "empty/whitespace query must fail (q={q:?})"
+        );
+        let stderr = String::from_utf8_lossy(&out.stderr);
+        let v = parse_error_v1(&stderr);
+        assert_eq!(v["schema_version"], "error.v1", "stderr={stderr}");
+        assert_eq!(v["code"], "invalid_input", "stderr={stderr}");
+    }
+}
+
+#[test]
+fn ask_empty_query_emits_invalid_input() {
+    let out = Command::new(kebab_bin())
+        .args(["ask", "", "--json"])
+        .output()
+        .expect("spawn kebab");
+    assert_ne!(out.status.code(), Some(0));
+    let stderr = String::from_utf8_lossy(&out.stderr);
+    let v = parse_error_v1(&stderr);
+    assert_eq!(v["schema_version"], "error.v1");
+    assert_eq!(v["code"], "invalid_input");
+}
--- a/crates/kebab-cli/tests/cli_error_wire.rs
+++ b/crates/kebab-cli/tests/cli_error_wire.rs
@@ -2,11 +2,10 @@
 //! on stderr while non-json mode emits the legacy `error:` text prefix.
 //!
 //! The `config_invalid` code is triggered by supplying an *existing* but
-//! malformed TOML file via `--config`. Note: supplying a *non-existent*
-//! path does NOT trigger this error — Config::load silently falls back to
-//! defaults when the specified config file is absent (by design, so that
-//! `kebab doctor` runs before `kebab init` is ever called). A file that
-//! exists but fails TOML parsing is the reliable path to `config_invalid`.
+//! malformed TOML file via `--config`. A file that exists but fails TOML
+//! parsing is the reliable path to `config_invalid`. Supplying a path that
+//! does not exist emits `config_not_found` instead (Bug #10 fix, v0.20.0
+//! bugfix3); see `cli_config_not_found.rs` for those tests.

 use std::process::Command;

@@ -37,12 +36,7 @@ fn json_mode_emits_error_v1_on_config_invalid() {
    std::fs::write(&bad_config, b"this is not { valid toml !!!").unwrap();

    let mut cmd = Command::new(kebab_bin());
-    cmd.args([
-        "--json",
-        "--config",
-        bad_config.to_str().unwrap(),
-        "ingest",
-    ]);
+    cmd.args(["--json", "--config", bad_config.to_str().unwrap(), "ingest"]);
    for (k, v) in xdg_envs(tmp.path()) {
        cmd.env(k, v);
    }
@@ -56,7 +50,10 @@ fn json_mode_emits_error_v1_on_config_invalid() {
    assert_eq!(exit_code, 2, "expected exit code 2, got {exit_code}");

    let stderr = String::from_utf8(out.stderr).unwrap();
-    let first_line = stderr.lines().next().expect("stderr must have at least one line");
+    let first_line = stderr
+        .lines()
+        .next()
+        .expect("stderr must have at least one line");
    let v: serde_json::Value =
        serde_json::from_str(first_line).expect("stderr first line must be valid JSON");

--- a/crates/kebab-cli/tests/cli_help_smoke.rs
+++ b/crates/kebab-cli/tests/cli_help_smoke.rs
@@ -0,0 +1,17 @@
+// crates/kebab-cli/tests/cli_help_smoke.rs
+//
+// Regression pin — `kebab search --help` 의 `--media` value list 가
+// `code` 를 노출. Bug #7 (v0.20.0 bugfix round 2 spec §4.4).
+
+#[test]
+fn search_help_lists_code_in_media_values() {
+    let out = std::process::Command::new(env!("CARGO_BIN_EXE_kebab"))
+        .args(["search", "--help"])
+        .output()
+        .expect("kebab search --help");
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    assert!(
+        stdout.contains("`code`"),
+        "search --help must list 'code' as accepted --media value; stdout = {stdout}"
+    );
+}
--- a/crates/kebab-cli/tests/cli_ingest_file.rs
+++ b/crates/kebab-cli/tests/cli_ingest_file.rs
@@ -72,21 +72,34 @@ max_context_tokens = 8000
            workspace = workspace.display(),
            data = data.display(),
        ),
-    ).unwrap();
+    )
+    .unwrap();

    let src = dir.path().join("doc.md");
    fs::write(&src, "# A\n\nbody.").unwrap();

    let bin = env!("CARGO_BIN_EXE_kebab");
    let out = Command::new(bin)
-        .args(["--json", "--config", cfg_path.to_str().unwrap(), "ingest-file"])
+        .args([
+            "--json",
+            "--config",
+            cfg_path.to_str().unwrap(),
+            "ingest-file",
+        ])
        .arg(&src)
        .output()
        .unwrap();
-    assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
+    assert!(
+        out.status.success(),
+        "stderr: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );

    let stdout = String::from_utf8_lossy(&out.stdout);
    let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
-    assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
+    assert_eq!(
+        v.get("schema_version").and_then(|s| s.as_str()),
+        Some("ingest_report.v1")
+    );
    assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
 }
--- a/crates/kebab-cli/tests/cli_ingest_stdin.rs
+++ b/crates/kebab-cli/tests/cli_ingest_stdin.rs
@@ -73,13 +73,18 @@ max_context_tokens = 8000
            workspace = workspace.display(),
            data = data.display(),
        ),
-    ).unwrap();
+    )
+    .unwrap();

    let bin = env!("CARGO_BIN_EXE_kebab");
    let mut child = Command::new(bin)
        .args([
-            "--json", "--config", cfg_path.to_str().unwrap(),
-            "ingest-stdin", "--title", "X",
+            "--json",
+            "--config",
+            cfg_path.to_str().unwrap(),
+            "ingest-stdin",
+            "--title",
+            "X",
        ])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
@@ -91,10 +96,17 @@ max_context_tokens = 8000
        stdin.write_all(b"## Body\n\nbody text.\n").unwrap();
    }
    let out = child.wait_with_output().unwrap();
-    assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
+    assert!(
+        out.status.success(),
+        "stderr: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );

    let stdout = String::from_utf8_lossy(&out.stdout);
    let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
-    assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
+    assert_eq!(
+        v.get("schema_version").and_then(|s| s.as_str()),
+        Some("ingest_report.v1")
+    );
    assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
 }
--- a/crates/kebab-cli/tests/cli_readonly_quiet.rs
+++ b/crates/kebab-cli/tests/cli_readonly_quiet.rs
@@ -112,7 +112,13 @@ fn kebab_readonly_env_blocks_ingest() {
 fn readonly_json_mode_emits_error_v1() {
    let (tmp, ws) = fixture_workspace();
    let out = Command::new(kebab_bin())
-        .args(["--readonly", "--json", "ingest", "--root", ws.to_str().unwrap()])
+        .args([
+            "--readonly",
+            "--json",
+            "ingest",
+            "--root",
+            ws.to_str().unwrap(),
+        ])
        .envs(xdg_envs(tmp.path()))
        .output()
        .unwrap();
@@ -164,12 +170,22 @@ fn quiet_flag_suppresses_progress_stderr() {
 fn quiet_with_json_stdout_has_report_stderr_is_empty() {
    let (tmp, ws) = fixture_workspace();
    let out = Command::new(kebab_bin())
-        .args(["--quiet", "--json", "ingest", "--root", ws.to_str().unwrap()])
+        .args([
+            "--quiet",
+            "--json",
+            "ingest",
+            "--root",
+            ws.to_str().unwrap(),
+        ])
        .envs(xdg_envs(tmp.path()))
        .output()
        .unwrap();

-    assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
+    assert!(
+        out.status.success(),
+        "stderr: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert!(stderr.is_empty(), "expected empty stderr, got: {stderr}");
    let stdout = String::from_utf8_lossy(&out.stdout);
--- a/crates/kebab-cli/tests/ingest_progress_cli.rs
+++ b/crates/kebab-cli/tests/ingest_progress_cli.rs
@@ -90,12 +90,7 @@ fn ingest_human_non_tty_emits_progress_lines_to_stderr() {
    // target is `hidden` and progress lines go to stderr instead.
    let (tmp, ws) = fixture_workspace();
    let mut cmd = Command::new(kebab_bin());
-    cmd.args([
-        "ingest",
-        "--root",
-        ws.to_str().unwrap(),
-        "--summary-only",
-    ]);
+    cmd.args(["ingest", "--root", ws.to_str().unwrap(), "--summary-only"]);
    for (k, v) in xdg_envs(tmp.path()) {
        cmd.env(k, v);
    }
@@ -155,8 +150,14 @@ fn ingest_json_progress_lines_carry_kind_and_ts() {
            saw_completed = true;
            // Counts mirror the report.
            let counts = v.get("counts").unwrap();
-            assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(2));
-            assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
+            assert_eq!(
+                counts.get("scanned").and_then(serde_json::Value::as_u64),
+                Some(2)
+            );
+            assert_eq!(
+                counts.get("new").and_then(serde_json::Value::as_u64),
+                Some(2)
+            );
        }
    }
    assert!(saw_scan_started, "missing scan_started event");
--- a/crates/kebab-cli/tests/reset_cli.rs
+++ b/crates/kebab-cli/tests/reset_cli.rs
@@ -50,9 +50,18 @@ fn reset_data_only_yes_removes_data_dir_and_keeps_config() {
    );

    assert!(!xdg_data.join("kebab").exists(), "data dir should be gone");
-    assert!(!xdg_cache.join("kebab").exists(), "cache dir should be gone");
-    assert!(!xdg_state.join("kebab").exists(), "state dir should be gone");
-    assert!(xdg_cfg.join("kebab/marker").exists(), "config dir preserved");
+    assert!(
+        !xdg_cache.join("kebab").exists(),
+        "cache dir should be gone"
+    );
+    assert!(
+        !xdg_state.join("kebab").exists(),
+        "state dir should be gone"
+    );
+    assert!(
+        xdg_cfg.join("kebab/marker").exists(),
+        "config dir preserved"
+    );
 }

 #[test]
@@ -101,7 +110,11 @@ fn reset_data_only_yes_json_emits_reset_report_v1() {
        .env("XDG_STATE_HOME", tmp.path().join("state"))
        .output()
        .unwrap();
-    assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
+    assert!(
+        out.status.success(),
+        "stderr: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );

    let v: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap();
    assert_eq!(
--- a/crates/kebab-cli/tests/wire_ask_multi_hop.rs
+++ b/crates/kebab-cli/tests/wire_ask_multi_hop.rs
@@ -32,10 +32,9 @@ fn schema_path(name: &str) -> PathBuf {
 }

 fn parse_schema(name: &str) -> serde_json::Value {
-    let text = std::fs::read_to_string(schema_path(name))
-        .unwrap_or_else(|e| panic!("read {name}: {e}"));
-    serde_json::from_str(&text)
-        .unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
+    let text =
+        std::fs::read_to_string(schema_path(name)).unwrap_or_else(|e| panic!("read {name}: {e}"));
+    serde_json::from_str(&text).unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
 }

 #[test]
--- a/crates/kebab-cli/tests/wire_ask_stream.rs
+++ b/crates/kebab-cli/tests/wire_ask_stream.rs
@@ -41,8 +41,7 @@ fn relax_score_gate(cfg: &Path) {
 #[ignore = "requires real Ollama on 127.0.0.1:11434"]
 fn stream_emits_ndjson_events_on_stderr() {
    let dir = tempfile::tempdir().unwrap();
-    let (cfg, workspace, _data) =
-        common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
+    let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
    relax_score_gate(&cfg);
    fs::write(
        workspace.join("a.md"),
@@ -93,12 +92,8 @@ fn stream_emits_ndjson_events_on_stderr() {
    // stdout: last line is answer.v1 (backwards compat with the
    // non-streaming path — same wire shape, just emitted after the
    // ndjson event stream rather than instead of it).
-    let final_line = stdout
-        .lines()
-        .last()
-        .expect("stdout has at least one line");
-    let answer: Value =
-        serde_json::from_str(final_line).expect("stdout final line = answer.v1");
+    let final_line = stdout.lines().last().expect("stdout has at least one line");
+    let answer: Value = serde_json::from_str(final_line).expect("stdout final line = answer.v1");
    assert_eq!(answer["schema_version"], "answer.v1");
 }

@@ -109,8 +104,7 @@ fn non_stream_path_unchanged() {
    // emits a single `answer.v1` line on stdout — fb-33 must not
    // perturb the existing wire surface.
    let dir = tempfile::tempdir().unwrap();
-    let (cfg, workspace, _data) =
-        common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
+    let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
    relax_score_gate(&cfg);
    fs::write(
        workspace.join("a.md"),
@@ -140,8 +134,7 @@ fn stream_cancels_when_stderr_closes() {
    use std::process::{Command, Stdio};

    let dir = tempfile::tempdir().unwrap();
-    let (cfg, workspace, _data) =
-        common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
+    let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
    relax_score_gate(&cfg);
    fs::write(
        workspace.join("a.md"),
@@ -198,15 +191,10 @@ fn stream_cancels_when_stderr_closes() {
 #[ignore = "requires real Ollama on 127.0.0.1:11434"]
 fn stream_score_gate_refusal_emits_only_retrieval_done() {
    let dir = tempfile::tempdir().unwrap();
-    let (cfg, workspace, _data) =
-        common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
+    let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
    // Intentionally NO relax_score_gate — keep the default 0.30
    // so the thin-doc + unrelated-query combo trips refusal.
-    fs::write(
-        workspace.join("a.md"),
-        "# Title\n\nrust is a language.\n",
-    )
-    .unwrap();
+    fs::write(workspace.join("a.md"), "# Title\n\nrust is a language.\n").unwrap();
    common::ingest(&cfg, &workspace);

    let (stdout, stderr) =
@@ -230,12 +218,8 @@ fn stream_score_gate_refusal_emits_only_retrieval_done() {
    );

    // Stdout still has answer.v1 with grounded=false.
-    let final_line = stdout
-        .lines()
-        .last()
-        .expect("stdout has at least one line");
-    let answer: Value =
-        serde_json::from_str(final_line).expect("answer.v1");
+    let final_line = stdout.lines().last().expect("stdout has at least one line");
+    let answer: Value = serde_json::from_str(final_line).expect("answer.v1");
    assert_eq!(answer["schema_version"], "answer.v1");
    assert_eq!(answer["grounded"], false);
 }
--- a/crates/kebab-cli/tests/wire_bulk_search.rs
+++ b/crates/kebab-cli/tests/wire_bulk_search.rs
@@ -21,7 +21,11 @@ fn cargo_bin() -> &'static str {
    env!("CARGO_BIN_EXE_kebab")
 }

-fn run_bulk_with_stdin(cfg: &std::path::Path, stdin_body: &str, json: bool) -> std::process::Output {
+fn run_bulk_with_stdin(
+    cfg: &std::path::Path,
+    stdin_body: &str,
+    json: bool,
+) -> std::process::Output {
    let mut cmd = Command::new(cargo_bin());
    cmd.arg("--config").arg(cfg).arg("search").arg("--bulk");
    if json {
@@ -94,7 +98,10 @@ fn empty_stdin_returns_empty_results_with_zero_summary() {
    let out = run_bulk_with_stdin(&cfg, "", true);
    assert!(out.status.success());
    let stdout = String::from_utf8_lossy(&out.stdout);
-    assert!(stdout.trim().is_empty(), "expected empty stdout, got: {stdout}");
+    assert!(
+        stdout.trim().is_empty(),
+        "expected empty stdout, got: {stdout}"
+    );
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert!(stderr.contains("bulk_summary: total=0 succeeded=0 failed=0"));
 }
--- a/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs
+++ b/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs
@@ -19,7 +19,10 @@ fn line_variant_serialization_unchanged() {
    assert_eq!(v["end"], 2);
    assert_eq!(v["section"], "§14");
    // Must not bleed Code-variant keys.
-    assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
+    assert!(
+        v.get("line_start").is_none(),
+        "line_start must be absent: {v}"
+    );
    assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
    assert!(v.get("code").is_none(), "code must be absent: {v}");
 }
@@ -48,7 +51,10 @@ fn page_variant_serialization_unchanged() {
    let v = serde_json::to_value(&c).unwrap();
    assert_eq!(v["kind"], "page");
    assert_eq!(v["page"], 13);
-    assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
+    assert!(
+        v.get("line_start").is_none(),
+        "line_start must be absent: {v}"
+    );
    assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
 }

@@ -67,7 +73,10 @@ fn region_variant_serialization_unchanged() {
    assert_eq!(v["y"], 20);
    assert_eq!(v["w"], 100);
    assert_eq!(v["h"], 200);
-    assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
+    assert!(
+        v.get("line_start").is_none(),
+        "line_start must be absent: {v}"
+    );
 }

 #[test]
@@ -79,7 +88,10 @@ fn caption_variant_serialization_unchanged() {
    let v = serde_json::to_value(&c).unwrap();
    assert_eq!(v["kind"], "caption");
    assert_eq!(v["model"], "qwen2.5-vl:7b");
-    assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
+    assert!(
+        v.get("line_start").is_none(),
+        "line_start must be absent: {v}"
+    );
 }

 #[test]
@@ -95,6 +107,9 @@ fn time_variant_serialization_unchanged() {
    assert_eq!(v["start_ms"], 1000);
    assert_eq!(v["end_ms"], 5000);
    assert_eq!(v["speaker"], "Alice");
-    assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
+    assert!(
+        v.get("line_start").is_none(),
+        "line_start must be absent: {v}"
+    );
    assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
 }
--- a/crates/kebab-cli/tests/wire_fetch.rs
+++ b/crates/kebab-cli/tests/wire_fetch.rs
@@ -24,10 +24,8 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
    common::ingest(&cfg, &workspace);

    // Find chunk_id via search.
-    let (search_stdout, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "--k", "1", "apples"],
-    );
+    let (search_stdout, _) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "apples"]);
    let search: Value = serde_json::from_str(search_stdout.trim())
        .unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
    let chunk_id = search["hits"][0]["chunk_id"]
@@ -35,10 +33,7 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
        .expect("chunk_id on first hit")
        .to_string();

-    let (stdout, _) = common::run_fetch_with_args(
-        &cfg,
-        &["--json", "chunk", &chunk_id],
-    );
+    let (stdout, _) = common::run_fetch_with_args(&cfg, &["--json", "chunk", &chunk_id]);
    let v: Value = serde_json::from_str(stdout.trim())
        .unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
    assert_eq!(v["schema_version"], "fetch_result.v1");
@@ -59,10 +54,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
    common::ingest(&cfg, &workspace);

    // Find doc_id via search.
-    let (search_stdout, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "--k", "1", "Lorem"],
-    );
+    let (search_stdout, _) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "Lorem"]);
    let search: Value = serde_json::from_str(search_stdout.trim())
        .unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
    let doc_id = search["hits"][0]["doc_id"]
@@ -70,10 +63,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
        .expect("doc_id on first hit")
        .to_string();

-    let (stdout, _) = common::run_fetch_with_args(
-        &cfg,
-        &["--json", "doc", &doc_id, "--max-tokens", "20"],
-    );
+    let (stdout, _) =
+        common::run_fetch_with_args(&cfg, &["--json", "doc", &doc_id, "--max-tokens", "20"]);
    let v: Value = serde_json::from_str(stdout.trim())
        .unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
    assert_eq!(v["kind"], "doc");
--- a/crates/kebab-cli/tests/wire_search_filters.rs
+++ b/crates/kebab-cli/tests/wire_search_filters.rs
@@ -32,12 +32,9 @@ fn search_with_doc_id_filter_returns_only_target_doc() {
    common::ingest(&cfg, &workspace);

    // First, search without a doc-id filter to find what doc_ids exist.
-    let (stdout, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "rust"],
-    );
-    let resp: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let (stdout, _) = common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
+    let resp: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    let hits = resp["hits"].as_array().expect("hits array");
    assert!(
        hits.len() >= 2,
@@ -147,15 +144,19 @@ fn search_with_media_filter_md_alias_normalizes_to_markdown() {
    let (cfg, workspace, _data) = common::write_config(dir.path(), 30);

    // Only a markdown file — the `md` alias should match it.
-    fs::write(workspace.join("notes.md"), "# Notes\n\nrust async programming\n").unwrap();
+    fs::write(
+        workspace.join("notes.md"),
+        "# Notes\n\nrust async programming\n",
+    )
+    .unwrap();
    common::ingest(&cfg, &workspace);

    let (stdout, _) = common::run_search_with_args(
        &cfg,
        &["--json", "--mode", "lexical", "--media", "md", "rust"],
    );
-    let resp: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let resp: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    let hits = resp["hits"].as_array().expect("hits array");

    assert!(
@@ -189,10 +190,8 @@ fn search_with_tag_filter_matches_frontmatter_tags() {
    common::ingest(&cfg, &workspace);

    // Without filter — both docs must produce hits.
-    let (unfiltered, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "rust"],
-    );
+    let (unfiltered, _) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
    let uresp: Value = serde_json::from_str(unfiltered.trim())
        .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
    let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
@@ -254,10 +253,8 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
    common::ingest(&cfg, &workspace);

    // Without filter: all three docs produce hits.
-    let (unfiltered, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "rust"],
-    );
+    let (unfiltered, _) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
    let uresp: Value = serde_json::from_str(unfiltered.trim())
        .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
    let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
@@ -270,10 +267,7 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
    let (filtered, _) = common::run_search_with_args(
        &cfg,
        &[
-            "--json", "--mode", "lexical",
-            "--tag", "rust",
-            "--tag", "async",
-            "rust",
+            "--json", "--mode", "lexical", "--tag", "rust", "--tag", "async", "rust",
        ],
    );
    let fresp: Value = serde_json::from_str(filtered.trim())
@@ -301,6 +295,12 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
        .collect();
    let has_a = paths.iter().any(|p| p.ends_with("a.md"));
    let has_b = paths.iter().any(|p| p.ends_with("b.md"));
-    assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}");
-    assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}");
+    assert!(
+        has_a,
+        "--tag rust must include a.md (rust-tagged): paths={paths:?}"
+    );
+    assert!(
+        has_b,
+        "--tag async must include b.md (async-tagged): paths={paths:?}"
+    );
 }
--- a/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs
+++ b/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs
@@ -5,7 +5,7 @@
 //! inject spurious keys into the existing markdown corpus wire shape.

 use kebab_core::{
-    Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
+    ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
    SearchHit, WorkspacePath,
 };

--- a/crates/kebab-cli/tests/wire_search_response.rs
+++ b/crates/kebab-cli/tests/wire_search_response.rs
@@ -23,12 +23,10 @@ fn search_json_emits_search_response_v1_wrapper() {
    fs::write(workspace.join("a.md"), "# T\n\napples are red.\n").unwrap();
    common::ingest(&cfg, &workspace);

-    let (stdout, _stderr) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "apples"],
-    );
-    let v: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let (stdout, _stderr) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "apples"]);
+    let v: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    assert_eq!(v["schema_version"], "search_response.v1");
    assert!(v["hits"].is_array(), "hits must be array, got {v}");
    assert!(
@@ -67,8 +65,8 @@ fn search_json_truncates_with_max_tokens() {
        &cfg,
        &["--json", "--mode", "lexical", "--max-tokens", "30", "rust"],
    );
-    let v: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let v: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    assert_eq!(
        v["truncated"], true,
        "30-token cap must trip truncation: {v}"
@@ -88,10 +86,8 @@ fn search_json_cursor_paginates() {
    }
    common::ingest(&cfg, &workspace);

-    let (page1, _) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "--k", "2", "rust"],
-    );
+    let (page1, _) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "2", "rust"]);
    let v1: Value = serde_json::from_str(page1.trim())
        .unwrap_or_else(|e| panic!("page1 not JSON: {page1:?}: {e}"));
    let cursor = v1["next_cursor"]
@@ -101,14 +97,7 @@ fn search_json_cursor_paginates() {
    let (page2, _) = common::run_search_with_args(
        &cfg,
        &[
-            "--json",
-            "--mode",
-            "lexical",
-            "--k",
-            "2",
-            "--cursor",
-            cursor,
-            "rust",
+            "--json", "--mode", "lexical", "--k", "2", "--cursor", cursor, "rust",
        ],
    );
    let v2: Value = serde_json::from_str(page2.trim())
@@ -118,23 +107,13 @@ fn search_json_cursor_paginates() {
        .as_array()
        .expect("page1 hits array")
        .iter()
-        .map(|h| {
-            h["chunk_id"]
-                .as_str()
-                .expect("chunk_id string")
-                .to_string()
-        })
+        .map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
        .collect();
    let p2_ids: Vec<String> = v2["hits"]
        .as_array()
        .expect("page2 hits array")
        .iter()
-        .map(|h| {
-            h["chunk_id"]
-                .as_str()
-                .expect("chunk_id string")
-                .to_string()
-        })
+        .map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
        .collect();
    assert!(
        !p2_ids.is_empty(),
@@ -161,10 +140,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
    common::ingest(&cfg, &workspace);

    // Get a valid cursor first.
-    let (page1_stdout, _) = common::run_search_with_args(
-        &cfg,
-        &["--mode", "lexical", "--json", "--k", "1", "apples"],
-    );
+    let (page1_stdout, _) =
+        common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "--k", "1", "apples"]);
    let v1: Value = serde_json::from_str(page1_stdout.trim()).expect("json");
    let cursor = v1["next_cursor"]
        .as_str()
@@ -181,16 +158,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
    let cfg_str = cfg.to_str().expect("utf8");
    let out = std::process::Command::new(exe)
        .args([
-            "--config",
-            cfg_str,
-            "--json",
-            "search",
-            "--mode",
-            "lexical",
-            "--json",
-            "--cursor",
-            &cursor,
-            "apples",
+            "--config", cfg_str, "--json", "search", "--mode", "lexical", "--json", "--cursor",
+            &cursor, "apples",
        ])
        .output()
        .expect("kebab search --cursor");
@@ -234,10 +203,8 @@ fn search_plain_emits_truncated_hint_to_stderr() {
    }
    common::ingest(&cfg, &workspace);

-    let (_stdout, stderr) = common::run_search_with_args(
-        &cfg,
-        &["--mode", "lexical", "--max-tokens", "30", "rust"],
-    );
+    let (_stdout, stderr) =
+        common::run_search_with_args(&cfg, &["--mode", "lexical", "--max-tokens", "30", "rust"]);
    assert!(
        stderr.contains("[truncated;"),
        "stderr must carry truncated hint: {stderr:?}"
@@ -254,10 +221,7 @@ fn search_plain_emits_short_query_hint_to_stderr() {
    let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
    common::ingest(&cfg, &workspace);

-    let (_stdout, stderr) = common::run_search_with_args(
-        &cfg,
-        &["--mode", "lexical", "ab"],
-    );
+    let (_stdout, stderr) = common::run_search_with_args(&cfg, &["--mode", "lexical", "ab"]);
    assert!(
        stderr.contains("[hint]"),
        "stderr must carry short-query hint: {stderr:?}"
@@ -278,18 +242,18 @@ fn search_json_emits_hint_field_for_short_query() {
    let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
    common::ingest(&cfg, &workspace);

-    let (stdout, _stderr) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "ab"],
-    );
-    let v: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let (stdout, _stderr) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "ab"]);
+    let v: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    assert!(
        v["hits"].as_array().unwrap().is_empty(),
        "empty hits expected for short query in empty KB: {v}"
    );
    assert_eq!(
-        v["hint"].as_str().expect("hint field set on short empty result"),
+        v["hint"]
+            .as_str()
+            .expect("hint field set on short empty result"),
        "3자 이상 키워드 권장 (trigram tokenizer 제약)",
        "hint must carry the standard advisory: {v}"
    );
@@ -305,12 +269,10 @@ fn search_json_omits_hint_field_when_query_is_long_enough() {
    let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
    common::ingest(&cfg, &workspace);

-    let (stdout, _stderr) = common::run_search_with_args(
-        &cfg,
-        &["--json", "--mode", "lexical", "abc"],
-    );
-    let v: Value = serde_json::from_str(stdout.trim())
-        .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+    let (stdout, _stderr) =
+        common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "abc"]);
+    let v: Value =
+        serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
    assert!(
        v.get("hint").is_none(),
        "hint must be absent for ≥3-char queries: {v}"
--- a/crates/kebab-cli/tests/wire_search_score_kind.rs
+++ b/crates/kebab-cli/tests/wire_search_score_kind.rs
@@ -16,10 +16,8 @@ fn lexical_mode_hits_carry_bm25_score_kind() {
    doc_with_term(&workspace);
    common::ingest(&cfg, &workspace);

-    let (stdout, _stderr) = common::run_search_with_args(
-        &cfg,
-        &["--mode", "lexical", "--json", "rust"],
-    );
+    let (stdout, _stderr) =
+        common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
    let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
    let hits = v["hits"].as_array().expect("hits array");
    assert!(!hits.is_empty(), "expected at least 1 hit");
@@ -40,10 +38,8 @@ fn old_wire_reader_compat_score_kind_optional_field() {
    doc_with_term(&workspace);
    common::ingest(&cfg, &workspace);

-    let (stdout, _stderr) = common::run_search_with_args(
-        &cfg,
-        &["--mode", "lexical", "--json", "rust"],
-    );
+    let (stdout, _stderr) =
+        common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
    let v: Value = serde_json::from_str(stdout.trim()).unwrap();
    let hit = &v["hits"][0];
    assert!(hit.get("score_kind").is_some(), "score_kind always emitted");
--- a/crates/kebab-cli/tests/wire_search_stale.rs
+++ b/crates/kebab-cli/tests/wire_search_stale.rs
@@ -59,15 +59,14 @@ fn search_json_includes_indexed_at_and_stale() {
        .get("hits")
        .and_then(|h| h.as_array())
        .unwrap_or_else(|| panic!("expected hits array, got {stdout}"));
-    let first = arr.first().unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
+    let first = arr
+        .first()
+        .unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
    assert!(
        first.get("indexed_at").is_some(),
        "missing indexed_at in {first}"
    );
-    assert!(
-        first.get("stale").is_some(),
-        "missing stale in {first}"
-    );
+    assert!(first.get("stale").is_some(), "missing stale in {first}");
    assert_eq!(
        first["stale"], false,
        "freshly ingested doc must not be stale at default 30d threshold"
--- a/Show More
+++ b/Show More