From ca0567c72b0e3d4c33c91fc20cd45bac89535773 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 07:37:56 +0000 Subject: [PATCH 1/3] =?UTF-8?q?feat(kebab-app):=20P6-4=20image=20ingest=20?= =?UTF-8?q?wiring=20=E2=80=94=20kebab=20ingest=20=EA=B0=80=20PNG/JPEG=20?= =?UTF-8?q?=EC=9E=90=EC=82=B0=EB=8F=84=20=EC=B2=98=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P6-1/P6-2/P6-3 의 라이브러리 (`ImageExtractor`, `OllamaVisionOcr`, `apply_caption`) 가 그동안 CLI 에서 보이지 않던 미완 구간을 완성. 이제 `kebab ingest` 가 markdown 외에 이미지 자산을 end-to-end 로 색인하고, `kebab search` / `kebab ask` 가 OCR 텍스트 + caption 으로 이미지를 매칭/인용한다. ## kebab-app - `[dependencies]` 에 `kebab-parse-image` 추가. - `ingest_with_config` 진입 시 `image.ocr.enabled` / `image.caption.enabled` 플래그에 따라 `OllamaVisionOcr` / `OllamaLanguageModel` 을 **ingest 세션당 1회** 빌드. 자산 루프에서 trait object 로 공유. reqwest::blocking::Client 의 내부 Arc 덕분에 알로케이션 비용은 자산 수와 무관. - 두 어댑터 + ImageExtractor 를 한 묶음으로 `ImagePipeline` 구조체에 담아 `ingest_one_asset` 매개변수 폭증 차단 (clippy::too_many_arguments 대응). - `ingest_one_asset` 의 markdown-only 가드를 `match media_type` 으로 교체 — Markdown 은 기존 경로, Image(_) 는 새 `ingest_one_image_asset` 로 분기, PDF/Audio/Other 는 종전대로 skipped. - 신규 `ingest_one_image_asset`: - bytes 읽기 → `ImageExtractor::extract` (실패 시 caller 가 errors+=1) - `apply_ocr` (Lenient — 실패 시 ProvenanceKind::Warning 이벤트 + `IngestItem.warnings` 에 \"ocr_failed: ...\", `block.ocr` 는 None 유지) - `apply_caption` (동일 Lenient 정책) - 기존 `MdHeadingV1Chunker` 호출 — 청커는 이미 `Block::ImageRef` 를 단일 청크로 emit - 기존 persist + embed 시퀀스 그대로 (markdown 과 byte-identical) - `lang_hint_from_doc` — `Lang(\"und\")` 또는 빈 문자열을 None 으로 매핑 (image-pipeline 어댑터의 build_prompt 가 \"und\" 를 silent drop 하지 않도록 caller 측에서 미리). ## kebab-chunk - `render_block_text` 의 `Block::ImageRef` 분기를 P6-4 (β) plain concat 정책으로 교체 — `[alt, ocr.joined, caption.text]` 를 `\\n\\n` 로 join, 빈 부분은 drop. alt 가 비면 `src` 의 basename 으로 fallback (P6-1 contract 의 defensive guard). 
- 신규 unit 테스트 `image_ref_p6_4_plain_concat_drops_empty_parts` — alt-only / alt+ocr / alt+caption / alt+ocr+caption / 빈 alt → src fallback 다섯 케이스 모두 검증. - 기존 `image_ref_emits_own_chunk_zero_tokens` 그대로 통과 — 청커의 per-block dispatch 는 변경 없음, text 렌더링만 갱신. ## 통합 테스트 (kebab-app/tests/image_pipeline.rs) wiremock 으로 Ollama 를 stub. 5건: 1. OCR-only happy path — 1 PNG + ocr.enabled → 1 doc + 1 chunk emit, `block.ocr.joined` 가 mock 의 \"Hello World 2026\". 2. OCR + caption 동시 활성 — 두 필드 모두 채워지고 chunk text 에 alt + ocr + caption 세 부분 모두 포함. 3. Lenient 실패 검증 — OCR 503 시 자산은 indexed (kind=New), `errors=0`, ProvenanceKind::Warning attributed to \"kb-app\", `IngestItem.warnings` 에 \"ocr_failed:\" 노트. 4. 양쪽 비활성 — `image.ocr.enabled=false && image.caption.enabled=false` 여도 자산은 chunk 1개로 indexed (chunk text=filename), EXIF + dimensions 그대로 채워짐. 5. 결정성 (re-ingest) — 동일 PNG 두 번 ingest 시 두 번째는 `Updated` + 동일 `doc_id`. ## SMOKE.md `kebab search --mode lexical \"Hello World\"` 단계를 명령 시퀀스에 추가. `[image.ocr]` / `[image.caption]` config 절 예시 + ingest 시간 추정 (자산당 ~5-10초) 추가. \"책은 P7 PDF 라인으로\" 가이드를 검증 체크리스트 와 \"알려진 동작\" 양쪽에 박음. ## 실 Ollama 통합 검증 192.168.0.47 + gemma4:e4b 기준: ``` $ kebab --config /tmp/kebab-smoke/config.toml ingest scanned 2 new 2 updated 0 skipped 0 errors 0 (18395 ms) $ kebab inspect doc parser_version: image-meta-v1 blocks: [{ alt: \"hello.png\", ocr: \"Hello World 2026\", caption: \"The image displays the text \\\"Hello World 2026\\\" in a large, black, sans-serif font.\" }] $ kebab --json ask \"Hello World 텍스트가 어디에 있나?\" --mode hybrid grounded: true citations: [{marker: \"[1]\", doc_path: \"hello.png\"}] ``` ## 검증 - `cargo test --workspace --no-fail-fast -j 1` — 전부 pass - `cargo clippy --workspace --all-targets -- -D warnings` — pass - `cargo test -p kebab-chunk image_ref` — 2 pass (P1-5 회귀 + P6-4 신규 unit) - `cargo test -p kebab-app --test image_pipeline` — 5 pass ## 의존성 경계 - `kebab-app` 이 `kebab-parse-image` 추가 — spec Allowed dep 그대로. 
- 새 forbidden 침범 없음 (기존 `kebab-tui` / `kebab-desktop` / `kebab-eval` 미참조 유지). - 본 task 가 신설하는 image-specific 비즈니스 로직 0줄 — 모두 `kebab-parse-image` 에 위임. `tasks/p6/p6-4-image-ingest-wiring.md` status: planned → completed. contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md sections: §3.4 ImageRefBlock, §6.1 ingest pipeline, §7.2 Extractor/Chunker traits, §9.1 image extraction policy. --- Cargo.lock | 4 + crates/kebab-app/Cargo.toml | 11 + crates/kebab-app/src/lib.rs | 321 ++++++++++++++++++-- crates/kebab-app/tests/image_pipeline.rs | 366 +++++++++++++++++++++++ crates/kebab-chunk/src/md_heading_v1.rs | 97 +++++- docs/SMOKE.md | 38 ++- tasks/p6/p6-4-image-ingest-wiring.md | 2 +- 7 files changed, 807 insertions(+), 32 deletions(-) create mode 100644 crates/kebab-app/tests/image_pipeline.rs diff --git a/Cargo.lock b/Cargo.lock index 0c9ee2a..219f346 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3397,6 +3397,7 @@ dependencies = [ "anyhow", "blake3", "dirs 5.0.1", + "image", "kebab-chunk", "kebab-config", "kebab-core", @@ -3405,6 +3406,7 @@ dependencies = [ "kebab-llm", "kebab-llm-local", "kebab-normalize", + "kebab-parse-image", "kebab-parse-md", "kebab-parse-types", "kebab-rag", @@ -3417,10 +3419,12 @@ dependencies = [ "serde_json", "tempfile", "time", + "tokio", "toml", "tracing", "tracing-appender", "tracing-subscriber", + "wiremock", ] [[package]] diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index e8fae60..c50ae8e 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -23,6 +23,11 @@ kebab-embed-local = { path = "../kebab-embed-local" } kebab-llm = { path = "../kebab-llm" } kebab-llm-local = { path = "../kebab-llm-local" } kebab-rag = { path = "../kebab-rag" } +# P6-4: image extractor + OCR + caption adapters live here. App +# threads them into the per-asset dispatch (see `ingest_one_asset` +# image branch). Trait-only consumption — no `kebab-parse-image` +# internals leak into kb-app code. 
+kebab-parse-image = { path = "../kebab-parse-image" } anyhow = { workspace = true } blake3 = { workspace = true } serde = { workspace = true } @@ -37,3 +42,9 @@ dirs = "5" [dev-dependencies] rusqlite = { workspace = true } tempfile = { workspace = true } +# Image-pipeline integration tests use wiremock to stub Ollama for OCR +# / caption HTTP calls. Async runtime to host the mock server only; +# the kb-app code under test stays sync. +wiremock = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread"] } +image = { version = "0.25", default-features = false, features = ["png"] } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 15d6fc9..fff98c3 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -41,12 +41,15 @@ use serde::{Deserialize, Serialize}; use kebab_chunk::MdHeadingV1Chunker; use kebab_core::{ - Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, + Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, - EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery, - SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore, + EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType, + ParserVersion, RawAsset, SearchHit, SearchQuery, SourceConnector, SourceScope, + SourceUri, VectorRecord, VectorStore, }; +use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; +use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -190,6 +193,35 @@ pub fn ingest_with_config( let parser_version = ParserVersion(KEBAB_PARSE_MD_VERSION.to_string()); let chunk_policy = chunk_policy_from_config(&app.config); + // P6-4: build OCR / caption 
adapters once per ingest invocation, + // gated on their respective `enabled` flags. `reqwest::blocking::Client` + // is internally Arc-shared so reusing one instance across the asset + // loop is correct and cheap. Construction failure (e.g. invalid + // endpoint) aborts ingest fail-fast — better than silently disabling + // OCR/caption mid-run. + let ocr_engine: Option = if app.config.image.ocr.enabled { + Some( + OllamaVisionOcr::new(&app.config) + .context("kb-app::ingest: build OllamaVisionOcr")?, + ) + } else { + None + }; + let caption_llm: Option> = if app.config.image.caption.enabled { + Some(Box::new( + OllamaLanguageModel::new(&app.config) + .context("kb-app::ingest: build OllamaLanguageModel for caption")?, + )) + } else { + None + }; + let image_extractor = ImageExtractor::new(); + let image_pipeline = ImagePipeline { + extractor: &image_extractor, + ocr_engine: ocr_engine.as_ref(), + caption_llm: caption_llm.as_deref(), + }; + // Pre-load every existing doc_id so we can label `IngestItem.kind` // as `New` vs `Updated` correctly. `list_documents` returns one // row per `(workspace_path, asset_id)` — index by the deterministic @@ -230,6 +262,7 @@ pub fn ingest_with_config( embedder.as_ref(), vector_store.as_ref(), &existing_doc_ids, + &image_pipeline, ); let item = match item { @@ -438,6 +471,16 @@ type SqliteStoreAlias = kebab_store_sqlite::SqliteStore; /// persist, embed. Per-asset failures bubble up to the caller for /// labelling as `IngestItemKind::Error` — they do NOT abort the /// whole run. +/// P6-4: borrowed bundle of the three image-pipeline components built +/// once per ingest invocation. Threaded through `ingest_one_asset` so +/// the dispatch does not need ten separate parameters. 
+struct ImagePipeline<'a> { + extractor: &'a ImageExtractor, + ocr_engine: Option<&'a OllamaVisionOcr>, + caption_llm: Option<&'a dyn LanguageModel>, +} + +#[allow(clippy::too_many_arguments)] fn ingest_one_asset( app: &App, asset: &RawAsset, @@ -446,27 +489,47 @@ fn ingest_one_asset( embedder: Option<&Arc>, vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, + image_pipeline: &ImagePipeline<'_>, ) -> anyhow::Result { tracing::debug!( target: "kebab-app::ingest", path = %asset.workspace_path.0, + media_type = ?asset.media_type, "processing asset" ); - // Only handle Markdown for now; other media types are P6+ work. - if asset.media_type != kebab_core::MediaType::Markdown { - return Ok(kebab_core::IngestItem { - kind: kebab_core::IngestItemKind::Skipped, - doc_id: None, - doc_path: asset.workspace_path.clone(), - asset_id: Some(asset.asset_id.clone()), - byte_len: Some(asset.byte_len), - block_count: None, - chunk_count: None, - parser_version: None, - chunker_version: None, - warnings: Vec::new(), - error: None, - }); + // P6-4: dispatch on media_type. Markdown takes the existing + // parse-md / normalize path; image takes the new + // ImageExtractor + (optional) OCR + (optional) caption path. + // Anything else (PDF, audio, unknown) is skipped — the + // respective phases (P7 / P8) wire them in later. 
+ match &asset.media_type { + MediaType::Markdown => { /* fall through to markdown path */ } + MediaType::Image(_) => { + return ingest_one_image_asset( + app, + asset, + chunk_policy, + embedder, + vector_store, + existing_doc_ids, + image_pipeline, + ); + } + _ => { + return Ok(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Skipped, + doc_id: None, + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: None, + chunk_count: None, + parser_version: None, + chunker_version: None, + warnings: Vec::new(), + error: None, + }); + } } let path = match &asset.source_uri { @@ -612,6 +675,228 @@ fn ingest_one_asset( }) } +/// P6-4: process one `MediaType::Image(_)` asset end-to-end. +/// +/// Pipeline: read bytes → `ImageExtractor::extract` → optional +/// `apply_ocr` → optional `apply_caption` → existing chunker / embedder +/// / store path (the same one markdown uses, which already handles +/// `Block::ImageRef` per P1-5). +/// +/// Failure semantics (per P6-4 spec): +/// - `ImageExtractor::extract` Err → propagate (caller increments +/// `errors`). +/// - OCR / caption Err → log + `Provenance::Warning` event, continue. +/// `block.ocr` / `block.caption` stay `None`. `errors` NOT incremented. 
+#[allow(clippy::too_many_arguments)] +fn ingest_one_image_asset( + app: &App, + asset: &RawAsset, + chunk_policy: &ChunkPolicy, + embedder: Option<&Arc>, + vector_store: Option<&Arc>, + existing_doc_ids: &std::collections::HashSet, + image_pipeline: &ImagePipeline<'_>, +) -> anyhow::Result { + let image_extractor = image_pipeline.extractor; + let ocr_engine = image_pipeline.ocr_engine; + let caption_llm = image_pipeline.caption_llm; + let path = match &asset.source_uri { + SourceUri::File(p) => p.clone(), + SourceUri::Kb(_) => { + return Ok(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Skipped, + doc_id: None, + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: None, + chunk_count: None, + parser_version: None, + chunker_version: None, + warnings: vec![ + "kb:// source URIs are not supported by the fs ingester".into(), + ], + error: None, + }); + } + }; + let bytes = std::fs::read(&path) + .with_context(|| format!("read image asset bytes from {}", path.display()))?; + + // 1. Decode + EXIF + dimensions. ExtractContext.config carries + // nothing the image extractor reads today; we pass a default + // instance per the trait shape. + let extract_config = kebab_core::ExtractConfig::default(); + let workspace_root = std::path::PathBuf::from(&app.config.workspace.root); + let ctx = ExtractContext { + asset, + workspace_root: &workspace_root, + config: &extract_config, + }; + let mut canonical = image_extractor + .extract(&ctx, &bytes) + .context("kb-parse-image::ImageExtractor::extract")?; + + // 2 + 3. Apply OCR / caption when their adapters exist. Both are + // Lenient — failure is captured into Provenance Warning, + // `block.ocr` / `block.caption` stay `None`. P6-4 spec + // explicitly: such partial failures do NOT increment the + // `errors` counter. 
+ let lang_hint = lang_hint_from_doc(&canonical); + let mut warning_notes: Vec = Vec::new(); + if !canonical.blocks.is_empty() { + // P6-1 contract: image documents always have exactly one + // `Block::ImageRef`. Defensive match keeps us forward-compatible. + if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() { + if let Some(engine) = ocr_engine { + if let Err(e) = apply_ocr( + engine, + &bytes, + block, + lang_hint.as_ref(), + &mut canonical.provenance.events, + ) { + let note = format!("ocr_failed: {e:#}"); + tracing::warn!( + target: "kebab-app", + path = %asset.workspace_path.0, + "{}", + note + ); + canonical.provenance.events.push(kebab_core::ProvenanceEvent { + at: time::OffsetDateTime::now_utc(), + agent: "kb-app".to_string(), + kind: kebab_core::ProvenanceKind::Warning, + note: Some(note.clone()), + }); + warning_notes.push(note); + } + } + if let Some(llm) = caption_llm { + if let Err(e) = apply_caption( + llm, + &bytes, + block, + lang_hint.as_ref(), + &app.config, + &mut canonical.provenance.events, + ) { + let note = format!("caption_failed: {e:#}"); + tracing::warn!( + target: "kebab-app", + path = %asset.workspace_path.0, + "{}", + note + ); + canonical.provenance.events.push(kebab_core::ProvenanceEvent { + at: time::OffsetDateTime::now_utc(), + agent: "kb-app".to_string(), + kind: kebab_core::ProvenanceKind::Warning, + note: Some(note.clone()), + }); + warning_notes.push(note); + } + } + } + } + + // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its + // `Block::ImageRef` arm already produces a single chunk per + // image (P1-5). The chunk text now follows the (β) plain-concat + // contract per the kebab-chunk render_block_text update. + let chunks = MdHeadingV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?; + + // 5. Persist + embed — identical sequence to markdown. 
+ app.sqlite + .put_asset_with_bytes(asset, &bytes) + .context("DocumentStore::put_asset_with_bytes (image)")?; + app.sqlite + .put_document(&canonical) + .context("DocumentStore::put_document (image)")?; + app.sqlite + .put_blocks(&canonical.doc_id, &canonical.blocks) + .context("DocumentStore::put_blocks (image)")?; + app.sqlite + .put_chunks(&canonical.doc_id, &chunks) + .context("DocumentStore::put_chunks (image)")?; + + if let (Some(emb), Some(vec_store)) = (embedder, vector_store) + && !chunks.is_empty() + { + let inputs: Vec> = chunks + .iter() + .map(|c| EmbeddingInput { + text: c.text.as_str(), + kind: EmbeddingKind::Document, + }) + .collect(); + let vectors = emb + .embed(&inputs) + .context("Embedder::embed (image chunks)")?; + let model_id = emb.model_id(); + let model_version = emb.model_version(); + let dimensions = emb.dimensions(); + let records: Vec = chunks + .iter() + .zip(vectors) + .map(|(c, v)| VectorRecord { + embedding_id: kebab_core::id_for_embedding( + &c.chunk_id, + &model_id, + &model_version, + dimensions, + ), + chunk_id: c.chunk_id.clone(), + vector: v, + doc_id: canonical.doc_id.clone(), + text: c.text.clone(), + heading_path: c.heading_path.clone(), + model_id: model_id.clone(), + model_version: model_version.clone(), + dimensions, + }) + .collect(); + vec_store + .upsert(&records) + .context("VectorStore::upsert (image)")?; + } + + let kind = if existing_doc_ids.contains(&canonical.doc_id.0) { + kebab_core::IngestItemKind::Updated + } else { + kebab_core::IngestItemKind::New + }; + + Ok(kebab_core::IngestItem { + kind, + doc_id: Some(canonical.doc_id.clone()), + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(canonical.blocks.len()).ok(), + chunk_count: u32::try_from(chunks.len()).ok(), + parser_version: Some(canonical.parser_version.clone()), + chunker_version: Some(MdHeadingV1Chunker.chunker_version()), + warnings: warning_notes, + 
error: None, + }) +} + +/// Pull the BCP-47 language hint from the canonical document. Returns +/// `None` when `doc.lang` is empty or `"und"` (the default P6-1 stamps), +/// so no language hint reaches the OCR / caption adapters; any other +/// value is returned as a clone of `doc.lang`. +fn lang_hint_from_doc(doc: &CanonicalDocument) -> Option { + let s = doc.lang.0.as_str(); + if s.is_empty() || s == "und" { + None + } else { + Some(doc.lang.clone()) + } +} + /// Convenience: end byte of the frontmatter region (or 0 when absent). fn fm_span_end(span: Option) -> usize { span.map(|s| s.end).unwrap_or(0) diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs new file mode 100644 index 0000000..60dea3e --- /dev/null +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -0,0 +1,366 @@ +//! P6-4 image ingest wiring — end-to-end integration. +//! +//! Each test spins up a `TempDir` workspace + writes one PNG fixture + +//! routes OCR / caption HTTP calls through a `wiremock` server that +//! impersonates Ollama's `/api/generate` endpoint. The kb-app code +//! under test is sync; the wiremock server is async, so test bodies +//! drive blocking work via `tokio::task::spawn_blocking`. + +mod common; + +use std::path::Path; + +use common::TestEnv; +use kebab_config::Config; +use serde_json::json; +use tokio::task::spawn_blocking; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +// ── Fixture helpers ────────────────────────────────────────────────────── + +/// Tiny solid-red PNG written into the test workspace at `<root>/<name>`. +/// 100×50 — small enough to skip downscale by default but non-trivially +/// inspectable in stored DB rows.
+fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf { + use image::{ImageBuffer, Rgb}; + let img: ImageBuffer, _> = + ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); + let path = root.join(name); + img.save(&path).expect("write PNG fixture"); + path +} + +fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config { + let mut cfg = env.config.clone(); + // Ensure image assets are scanned. + cfg.workspace + .include + .push("**/*.png".to_string()); + cfg.image.ocr.enabled = true; + cfg.image.ocr.endpoint = Some(mock_endpoint.to_string()); + cfg.image.ocr.model = "vision-mock:1b".to_string(); + cfg.image.ocr.max_pixels = 512; + cfg.image.caption.enabled = false; // tested separately below + cfg.models.llm.endpoint = mock_endpoint.to_string(); + cfg.models.llm.model = "vision-mock:1b".to_string(); + cfg +} + +// ── 1. Happy path: OCR-only ingest ─────────────────────────────────────── + +/// One PNG asset + OCR enabled (caption off) → ingest produces 1 doc + 1 +/// chunk; chunk text contains alt + OCR transcription joined by `\n\n`. 
+#[tokio::test] +async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "model": "vision-mock:1b", + "response": "Hello World 2026", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let env = TestEnv::lexical_only(); + let png = write_red_png(&env.workspace_root, "diagram.png"); + eprintln!("PNG written to {}", png.display()); + let cfg = cfg_with_image_pipeline(&env, &server.uri()); + let cfg_clone = cfg.clone(); + let env_workspace = env.workspace_root.clone(); + let env_scope = env.scope(); + + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, env_scope, false) + .expect("image ingest must succeed") + }) + .await + .expect("blocking task panicked"); + + // Counters: scanned should include the PNG; new ≥ 1 (markdown + // fixtures from the workspace tree may also count). + assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items); + assert_eq!(report.errors, 0, "no errors on lenient OCR path"); + + // Locate the image doc in the report items. + let items = report.items.expect("items present (summary_only=false)"); + let img_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("diagram.png")) + .expect("image doc item must be present"); + assert_eq!( + img_item.kind, + kebab_core::IngestItemKind::New, + "image asset must be classified New on first ingest" + ); + assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk"); + + // Inspect the stored chunk text via kb-app's inspect_chunk facade. 
+ let doc_id = img_item.doc_id.clone().expect("image doc id"); + let doc = kebab_app::inspect_doc_with_config(cfg.clone(), &doc_id) + .expect("inspect_doc returns the image document"); + let block = match doc.blocks.first() { + Some(kebab_core::Block::ImageRef(b)) => b, + other => panic!("expected ImageRef, got {other:?}"), + }; + assert!(block.ocr.is_some(), "block.ocr populated by apply_ocr"); + assert_eq!( + block.ocr.as_ref().unwrap().joined, + "Hello World 2026", + "OCR text from mock" + ); + assert!( + block.caption.is_none(), + "caption disabled in cfg → block.caption stays None" + ); + + // Sanity: the doc was actually persisted into SQLite (kb-app's + // list_docs facade reads the same store the chunker writes to). + let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()) + .expect("list_docs"); + assert!( + summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")), + "image doc must appear in list_docs" + ); + + drop(env_workspace); // keep TempDir alive until here + drop(env); +} + +// ── 2. OCR + caption together ──────────────────────────────────────────── + +/// Both OCR and caption enabled. The mock returns the same JSON body +/// for every `/api/generate` POST — wiremock has no per-prompt routing +/// on the default `Mock` so we treat both calls as equivalent. We then +/// verify both `block.ocr` and `block.caption` are populated, and the +/// chunk text contains both fragments separated by `\n\n`. 
+#[tokio::test] +async fn ingest_image_with_ocr_and_caption_populates_both_fields() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "response": "shared mock body", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let env = TestEnv::lexical_only(); + write_red_png(&env.workspace_root, "diagram.png"); + let mut cfg = cfg_with_image_pipeline(&env, &server.uri()); + cfg.image.caption.enabled = true; + cfg.image.caption.max_pixels = 384; + + let cfg_clone = cfg.clone(); + let scope = env.scope(); + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, scope, false) + .expect("ingest must succeed with both OCR+caption") + }) + .await + .expect("task"); + + assert_eq!(report.errors, 0); + let img_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("diagram.png")) + .unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()) + .unwrap(); + let block = match &doc.blocks[0] { + kebab_core::Block::ImageRef(b) => b, + _ => unreachable!(), + }; + assert!(block.ocr.is_some(), "OCR populated"); + assert!(block.caption.is_some(), "caption populated"); + drop(env); +} + +// ── 3. Lenient failure: OCR Ollama 503 → asset still indexed ───────────── + +/// OCR endpoint returns 503. Spec contract: image is still indexed, +/// `block.ocr = None`, Provenance has a Warning event, `errors` +/// counter NOT incremented. 
+#[tokio::test] +async fn ocr_failure_indexes_asset_with_warning_no_error_counter() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(503)) + .mount(&server) + .await; + + let env = TestEnv::lexical_only(); + write_red_png(&env.workspace_root, "broken.png"); + let cfg = cfg_with_image_pipeline(&env, &server.uri()); + + let cfg_clone = cfg.clone(); + let scope = env.scope(); + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, scope, false) + .expect("ingest does not abort on lenient OCR failure") + }) + .await + .expect("task"); + + assert_eq!( + report.errors, 0, + "lenient OCR failure must NOT increment errors counter (spec)" + ); + let img_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("broken.png")) + .expect("asset still indexed despite OCR failure"); + assert_eq!(img_item.kind, kebab_core::IngestItemKind::New); + assert_eq!(img_item.chunk_count, Some(1)); + assert!( + !img_item.warnings.is_empty(), + "lenient OCR failure must surface a warning on the IngestItem" + ); + + let doc_id = img_item.doc_id.clone().unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, &doc_id).unwrap(); + let block = match &doc.blocks[0] { + kebab_core::Block::ImageRef(b) => b, + _ => unreachable!(), + }; + assert!(block.ocr.is_none(), "block.ocr stays None on OCR failure"); + let warning = doc + .provenance + .events + .iter() + .find(|e| e.kind == kebab_core::ProvenanceKind::Warning && e.agent == "kb-app") + .expect("Provenance Warning attributed to kb-app"); + let note = warning.note.as_deref().unwrap_or(""); + assert!( + note.contains("ocr_failed"), + "warning note must describe OCR failure: {note}" + ); +} + +// ── 4. Both image.ocr.enabled and image.caption.enabled = false ────────── + +/// When both adapters are disabled, the image is still extracted + +/// chunked. Chunk text falls back to the filename. 
EXIF + dimensions +/// are populated by the extractor regardless. +#[tokio::test] +async fn image_indexed_with_filename_when_ocr_and_caption_disabled() { + // No mock server needed — neither HTTP path is touched. + let env = TestEnv::lexical_only(); + write_red_png(&env.workspace_root, "raw.png"); + let mut cfg = env.config.clone(); + cfg.workspace.include.push("**/*.png".to_string()); + cfg.image.ocr.enabled = false; + cfg.image.caption.enabled = false; + + let cfg_clone = cfg.clone(); + let scope = env.scope(); + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, scope, false) + .expect("ingest with no OCR/caption") + }) + .await + .expect("task"); + + assert_eq!(report.errors, 0); + let img_item = report + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("raw.png")) + .unwrap(); + assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk"); + let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()) + .unwrap(); + let block = match &doc.blocks[0] { + kebab_core::Block::ImageRef(b) => b, + _ => unreachable!(), + }; + assert!(block.ocr.is_none() && block.caption.is_none()); + // EXIF + dimensions still populated by the extractor. + let dims = doc + .metadata + .user + .get("dimensions") + .and_then(|v: &serde_json::Value| v.as_object()) + .expect("dimensions object present"); + assert_eq!( + dims.get("w").and_then(|v: &serde_json::Value| v.as_u64()), + Some(100) + ); + assert_eq!( + dims.get("h").and_then(|v: &serde_json::Value| v.as_u64()), + Some(50) + ); +} + +// ── 5. Determinism: re-ingest produces identical doc_id / chunk_id ─────── + +/// Idempotency contract — running the same ingest twice should mark +/// the asset Updated on the second run with byte-identical IDs. 
+#[tokio::test] +async fn re_ingest_image_produces_updated_with_same_doc_id() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "response": "stable", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let env = TestEnv::lexical_only(); + write_red_png(&env.workspace_root, "diagram.png"); + let cfg = cfg_with_image_pipeline(&env, &server.uri()); + + let scope = env.scope(); + let cfg1 = cfg.clone(); + let cfg2 = cfg.clone(); + let scope1 = scope.clone(); + let scope2 = scope.clone(); + + let r1 = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg1, scope1, false).unwrap() + }) + .await + .unwrap(); + let r2 = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg2, scope2, false).unwrap() + }) + .await + .unwrap(); + + let id1 = r1 + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("diagram.png")) + .unwrap() + .doc_id + .clone() + .unwrap(); + let img2 = r2 + .items + .as_ref() + .unwrap() + .iter() + .find(|i| i.doc_path.0.ends_with("diagram.png")) + .unwrap(); + assert_eq!(img2.kind, kebab_core::IngestItemKind::Updated); + assert_eq!(img2.doc_id.as_ref().unwrap(), &id1); +} diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs index f29a4d4..1279ebf 100644 --- a/crates/kebab-chunk/src/md_heading_v1.rs +++ b/crates/kebab-chunk/src/md_heading_v1.rs @@ -381,17 +381,41 @@ fn render_block_text(b: &Block) -> String { } s } - // ImageRef text portion = alt (per task spec). Fall back to - // model caption text if alt is empty. + // ImageRef text portion follows the P6-4 (β) plain-concat + // contract — `[alt, ocr.joined, caption.text]` joined by + // `\n\n`, dropping empty parts. Filename fallback for empty + // alt keeps lexical search hits on filenames working even when + // P6-1's filename auto-fill is bypassed. 
Block::ImageRef(i) => { - if !i.alt.is_empty() { + let alt = if !i.alt.is_empty() { i.alt.clone() } else { - i.caption - .as_ref() - .map(|c| c.text.clone()) - .unwrap_or_default() - } + // P6-1 falls back to filename so this branch is + // defensive — keep it lest a future test fixture or + // synthetic block path skip the auto-fill. + i.src + .rsplit('/') + .next() + .filter(|s| !s.is_empty()) + .unwrap_or("[image]") + .to_string() + }; + let ocr = i + .ocr + .as_ref() + .map(|o| o.joined.as_str()) + .unwrap_or(""); + let cap = i + .caption + .as_ref() + .map(|c| c.text.as_str()) + .unwrap_or(""); + [alt.as_str(), ocr, cap] + .iter() + .filter(|s| !s.is_empty()) + .copied() + .collect::>() + .join("\n\n") } // AudioRef has no caption preview yet (transcript joins land // in P8). Empty string per task spec. @@ -700,6 +724,63 @@ mod tests { } } + /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text + /// joined by `\n\n`, dropping empty parts. Verifies five shapes: alt-only, + /// alt+ocr, alt+caption, alt+ocr+caption, and empty alt → `src` basename. + #[test] + fn image_ref_p6_4_plain_concat_drops_empty_parts() { + use kebab_core::{ModelCaption, OcrText}; + + let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| { + Block::ImageRef(ImageRefBlock { + common: common_for("imageref", &[], 0, span(1, 1)), + asset_id: None, + src: "img.png".into(), + alt: alt.into(), + ocr: ocr.map(|t| OcrText { + joined: t.into(), + regions: vec![], + engine: "test".into(), + engine_version: "v1".into(), + }), + caption: cap.map(|t| ModelCaption { + text: t.into(), + model: "m".into(), + model_version: "v".into(), + }), + }) + }; + + // alt-only — no separators between empty parts. + assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png"); + + // alt + ocr — joined by exactly one `\n\n`. + assert_eq!( + render_block_text(&mk("photo.png", Some("Hello"), None)), + "photo.png\n\nHello" + ); + + // alt + caption.
+ assert_eq!( + render_block_text(&mk("photo.png", None, Some("a red square"))), + "photo.png\n\na red square" + ); + + // alt + ocr + caption — three parts joined by `\n\n` each. + assert_eq!( + render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))), + "photo.png\n\nHello\n\na red square" + ); + + // empty alt — falls back to filename derived from `src`. + let blk = mk("", Some("text from image"), None); + assert_eq!( + render_block_text(&blk), + "img.png\n\ntext from image", + "empty alt must fall back to the basename of `src`" + ); + } + /// ImageRef → own chunk, token_estimate=0. #[test] fn image_ref_emits_own_chunk_zero_tokens() { diff --git a/docs/SMOKE.md b/docs/SMOKE.md index e3cdf79..3a17012 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -118,16 +118,41 @@ max_context_tokens = 6000 KEBAB() { ./target/debug/kebab --config /tmp/kebab-smoke/config.toml "$@"; } KB doctor # 1. health check -KB ingest # 2. 워크스페이스 색인 -KB list docs # 3. 색인 결과 목록 +KB ingest # 2. 워크스페이스 색인 (markdown + image) +KB list docs # 3. 색인 결과 목록 (markdown + image 모두 표시) KB search --mode lexical "코루틴" --k 3 # 4. lexical 검색 KB search --mode vector "memory safety" --k 3 # 5. vector 검색 KB search --mode hybrid "Cargo workspace" --k 3 # 6. hybrid 검색 -KB inspect chunk # 7. raw chunk 보기 -KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 8. RAG 답변 (Ollama 필요) -KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검증 +KB search --mode lexical "Hello World" --k 3 # 7. image OCR 텍스트 검색 (P6-4) +KB inspect chunk # 8. raw chunk 보기 +KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 9. RAG 답변 (Ollama 필요) +KB --json ask "..." --mode hybrid # 10. 
기계 친화 출력 검증 ``` +## P6-4 이미지 ingestion 옵션 + +`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략): + +```toml +[workspace] +include = ["**/*.md", "**/*.png", "**/*.jpg"] + +[image.ocr] +enabled = true # vision LM 으로 이미지 안 텍스트 전사 +engine = "ollama-vision" +model = "gemma4:e4b" # 사용자 환경의 비전 모델 +endpoint = "http://192.168.0.47:11434" # 비우면 models.llm.endpoint fallback +languages = ["eng", "kor"] +max_pixels = 1600 # long-edge cap + +[image.caption] +enabled = true # vision LM 으로 한 문장 객관 설명 생성 +max_pixels = 768 +prompt_template_version = "caption-v1" +``` + +이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후). + 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작. ## 검증 체크리스트 @@ -138,6 +163,8 @@ KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검 - `kebab search --mode hybrid` 의 `fusion_score` 가 `[0, 1]` 범위 (top-1 종종 1.0 — 두 retriever 모두 rank 1 일 때). - `kebab ask` JSON 응답에 `model.id` 가 config 의 모델 (`gemma4:26b` 등) 과 일치, `embedding.id = multilingual-e5-small`, `citations[].marker` 가 `[1]` / `[2]` 형식 (square-bracketed bare index). - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`. +- (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc ` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical ""` 가 그 image chunk 를 반환하면 wiring 정상. +- OCR / caption 부분 실패는 `errors` 카운터 미증가 — `kebab inspect doc ` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인. ## 정리 @@ -154,5 +181,6 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초. - `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작). 
- 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init. +- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후). 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조. diff --git a/tasks/p6/p6-4-image-ingest-wiring.md b/tasks/p6/p6-4-image-ingest-wiring.md index 716a2fb..04f87db 100644 --- a/tasks/p6/p6-4-image-ingest-wiring.md +++ b/tasks/p6/p6-4-image-ingest-wiring.md @@ -3,7 +3,7 @@ phase: P6 component: kebab-app (image ingest dispatch + chunking) task_id: p6-4 title: "Wire ImageExtractor + OCR + caption into kebab-app::ingest end-to-end" -status: planned +status: completed depends_on: [p6-1, p6-2, p6-3, p1-6, p3-5] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md -- 2.49.1 From 469a1a34ecde78f4c7236cf52ab4d7f12d2013fd Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 07:42:44 +0000 Subject: [PATCH 2/3] =?UTF-8?q?review(p6-4):=20=ED=9A=8C=EC=B0=A8=201=20?= =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - src/lib.rs: • `ingest_one_asset` 의 doc-comment 가 새 `ImagePipeline` struct 와 합쳐지던 (rustdoc 가 두 doc 을 struct 의 것으로 합치던) 문제 해소 — 두 doc-comment 위치 교환 + 빈 줄 분리. • `if let Some(Block::ImageRef(...)) = blocks.first_mut()` 의 silent-skip 분기를 `match` 의 `other` arm 으로 명시 — 미래에 P6-1 contract 가 깨지면 `tracing::warn!` + Provenance Warning + `IngestItem.warnings` 에 \"ImageDispatchAnomaly\" 노트로 즉시 가시화. 운영 디버깅 단서 제공. • OCR 실패 분기 + caption 실패 분기의 ~25줄 boilerplate 를 `record_image_analysis_failure` 헬퍼로 추출 — 두 호출이 한 줄로 줄고 미래 ProvenanceEvent 필드 변경이 한 곳에서 끝남. 
• 분석 단계 Warning 이벤트가 fn 진입 시 캡처한 단일 `OffsetDateTime::now_utc()` 를 공유 — spec Risks/notes 의 \"Determinism stress: must not introduce a second `now()` call between extract and apply_ocr/caption\" 약속 회복. • 경고 라벨을 markdown 경로의 `WarningKind` 컨벤션 (`{kind}: {note}`) 에 맞춤 — `\"ocr_failed: ...\"` → `\"OcrFailed: ...\"`, `\"caption_failed: ...\"` → `\"CaptionFailed: ...\"`. 같은 wire 필드 (`IngestItem.warnings`) 가 두 갈래의 다른 형식을 갖던 inconsistency 해소. - tests/image_pipeline.rs: • 회귀 테스트의 \"ocr_failed\" assertion 을 \"OcrFailed\" 로 갱신. cargo test -p kebab-app -p kebab-chunk — 전부 pass. cargo clippy --workspace --all-targets -- -D warnings — pass. --- crates/kebab-app/src/lib.rs | 137 +++++++++++++++-------- crates/kebab-app/tests/image_pipeline.rs | 5 +- 2 files changed, 96 insertions(+), 46 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index fff98c3..3cf31fc 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -467,10 +467,6 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String { /// `<… as JobRepo>` to be explicit. type SqliteStoreAlias = kebab_store_sqlite::SqliteStore; -/// Process a single asset: read bytes, parse, normalize, chunk, -/// persist, embed. Per-asset failures bubble up to the caller for -/// labelling as `IngestItemKind::Error` — they do NOT abort the -/// whole run. /// P6-4: borrowed bundle of the three image-pipeline components built /// once per ingest invocation. Threaded through `ingest_one_asset` so /// the dispatch does not need ten separate parameters. @@ -480,6 +476,10 @@ struct ImagePipeline<'a> { caption_llm: Option<&'a dyn LanguageModel>, } +/// Process a single asset: read bytes, parse, normalize, chunk, +/// persist, embed. Per-asset failures bubble up to the caller for +/// labelling as `IngestItemKind::Error` — they do NOT abort the +/// whole run. 
#[allow(clippy::too_many_arguments)] fn ingest_one_asset( app: &App, @@ -742,62 +742,78 @@ fn ingest_one_image_asset( // `block.ocr` / `block.caption` stay `None`. P6-4 spec // explicitly: such partial failures do NOT increment the // `errors` counter. + // + // Determinism stress (per spec Risks): the per-document + // Provenance timestamps for any analysis-stage Warning + // events share a single `now_utc()` reading taken once + // here, mirroring `kb-normalize::build_canonical_document`. let lang_hint = lang_hint_from_doc(&canonical); + let now = time::OffsetDateTime::now_utc(); let mut warning_notes: Vec = Vec::new(); - if !canonical.blocks.is_empty() { - // P6-1 contract: image documents always have exactly one - // `Block::ImageRef`. Defensive match keeps us forward-compatible. - if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() { - if let Some(engine) = ocr_engine { - if let Err(e) = apply_ocr( + match canonical.blocks.first_mut() { + Some(Block::ImageRef(block)) => { + if let Some(engine) = ocr_engine + && let Err(e) = apply_ocr( engine, &bytes, block, lang_hint.as_ref(), &mut canonical.provenance.events, - ) { - let note = format!("ocr_failed: {e:#}"); - tracing::warn!( - target: "kebab-app", - path = %asset.workspace_path.0, - "{}", - note - ); - canonical.provenance.events.push(kebab_core::ProvenanceEvent { - at: time::OffsetDateTime::now_utc(), - agent: "kb-app".to_string(), - kind: kebab_core::ProvenanceKind::Warning, - note: Some(note.clone()), - }); - warning_notes.push(note); - } + ) + { + record_image_analysis_failure( + asset, + &mut canonical.provenance.events, + &mut warning_notes, + "OcrFailed", + e, + now, + ); } - if let Some(llm) = caption_llm { - if let Err(e) = apply_caption( + if let Some(llm) = caption_llm + && let Err(e) = apply_caption( llm, &bytes, block, lang_hint.as_ref(), &app.config, &mut canonical.provenance.events, - ) { - let note = format!("caption_failed: {e:#}"); - tracing::warn!( - target: "kebab-app", - 
path = %asset.workspace_path.0, - "{}", - note - ); - canonical.provenance.events.push(kebab_core::ProvenanceEvent { - at: time::OffsetDateTime::now_utc(), - agent: "kb-app".to_string(), - kind: kebab_core::ProvenanceKind::Warning, - note: Some(note.clone()), - }); - warning_notes.push(note); - } + ) + { + record_image_analysis_failure( + asset, + &mut canonical.provenance.events, + &mut warning_notes, + "CaptionFailed", + e, + now, + ); } } + // P6-1 contract: image documents always have exactly one + // `Block::ImageRef`. If a future task introduces multi-block + // image documents the silent-skip would mask a real bug, so + // this arm surfaces the divergence loudly. + other => { + tracing::warn!( + target: "kebab-app", + path = %asset.workspace_path.0, + blocks = canonical.blocks.len(), + "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})", + other.map(|b| std::mem::discriminant(b)) + ); + canonical.provenance.events.push(kebab_core::ProvenanceEvent { + at: now, + agent: "kb-app".to_string(), + kind: kebab_core::ProvenanceKind::Warning, + note: Some( + "image document missing leading ImageRef block — OCR/caption skipped" + .to_string(), + ), + }); + warning_notes + .push("ImageDispatchAnomaly: missing ImageRef block".to_string()); + } } // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its @@ -884,6 +900,39 @@ fn ingest_one_image_asset( }) } +/// Centralised handling for image-analysis (OCR / caption) failures. +/// Emits a `tracing::warn!`, appends a `ProvenanceKind::Warning` +/// event sharing the caller's per-document `now`, and pushes a +/// `: ` note onto the `IngestItem.warnings` slot +/// using the same shape the markdown path uses (so downstream wire +/// readers don't have to learn two formats — see kb-normalize's +/// `warning_agent`). 
+fn record_image_analysis_failure( + asset: &RawAsset, + events: &mut Vec, + warning_notes: &mut Vec, + kind_label: &str, + err: anyhow::Error, + now: time::OffsetDateTime, +) { + let detail = format!("{err:#}"); + let note = format!("{kind_label}: {detail}"); + tracing::warn!( + target: "kebab-app", + path = %asset.workspace_path.0, + "image analysis stage {} failed: {}", + kind_label, + detail + ); + events.push(kebab_core::ProvenanceEvent { + at: now, + agent: "kb-app".to_string(), + kind: kebab_core::ProvenanceKind::Warning, + note: Some(note.clone()), + }); + warning_notes.push(note); +} + /// Pull the BCP-47 language hint from the canonical document. P6-1 /// stamps `Lang("und")` by default; image-pipeline OCR / caption /// adapters special-case "und" so the hint is intentionally dropped diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs index 60dea3e..2dfa557 100644 --- a/crates/kebab-app/tests/image_pipeline.rs +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -244,8 +244,9 @@ async fn ocr_failure_indexes_asset_with_warning_no_error_counter() { .expect("Provenance Warning attributed to kb-app"); let note = warning.note.as_deref().unwrap_or(""); assert!( - note.contains("ocr_failed"), - "warning note must describe OCR failure: {note}" + note.contains("OcrFailed"), + "warning note must describe OCR failure with OcrFailed prefix \ + (markdown-style WarningKind format): {note}" ); } -- 2.49.1 From 6e4884aff802a5fc2b9784614761b817ab7d4691 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 08:13:41 +0000 Subject: [PATCH 3/3] =?UTF-8?q?fix(kebab-app):=20IngestReport.errors=20dou?= =?UTF-8?q?ble-count=20regression=20=E2=80=94=20increment=20only=20in=20`m?= =?UTF-8?q?atch=20item.kind=20{=20Error=20=3D>=20...=20}`=20arm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 수동 스모크 검증 (12 PNG + 손상 PNG) 중 발견. 
`IngestReport.errors` 가 자산 한 장당 2회 증가해서 `scanned = new + updated + skipped + errors` invariant 가 깨짐: - `garbage.png` (이미지 아닌 바이트, .png 확장자만) 1장 + 정상 자산 3장 → 기대 `scanned=4 errors=1`, 실제 `scanned=4 errors=2`. - 원인: `match item { Err(e) => { error_count += 1; IngestItem {...} } }` 에서 1회 증가 후, 직후 `match item.kind { Error => { error_count += 1 } }` arm 에서 또 1회 증가. - markdown 경로의 `ingest_one_asset` Err 가 거의 발생 안 해서 P6-4 머지 전까지 표면화 안 됐던 기존 결함. 이미지 dispatch 가 garbage bytes 를 Err 로 흘려보내며 처음으로 노출. 수정: `Err(e)` 분기의 `error_count.saturating_add(1)` 제거. 단일 증가 지점은 `match item.kind { Error => ... }` arm. 코멘트로 의도 명시. 회귀 테스트 추가 (`tests/image_pipeline.rs`): - `garbage_png_increments_errors_counter_exactly_once` — 정확히 1 증가 + `scanned == new + updated + skipped + errors` invariant 검증. 검증 — release binary + 실 Ollama (192.168.0.47 / gemma4:e4b): ``` $ kebab --json ingest scanned=4 new=3 updated=0 skipped=0 errors=1 error garbage.png (extract Err — unrecognised format) new intro.md new normal.png (OCR success) new truncated.png (OcrFailed warning, asset still indexed) ``` cargo test --workspace --no-fail-fast -j 1 — 전부 pass. cargo clippy --workspace --all-targets -- -D warnings — pass. cargo test -p kebab-app --test image_pipeline — 6 pass (5 기존 + 1 회귀). --- crates/kebab-app/src/lib.rs | 7 ++- crates/kebab-app/tests/image_pipeline.rs | 56 +++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 3cf31fc..d916f7e 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -274,7 +274,12 @@ pub fn ingest_with_config( error = %e, "kb-app::ingest: per-file fatal" ); - error_count = error_count.saturating_add(1); + // Note: `error_count += 1` happens below in the + // `match item.kind { Error => ... 
}` arm — incrementing + // here too would double-count (a regression first + // surfaced by P6-4 image dispatch where Err returns + // are common; markdown rarely propagated Err so the + // bug went unnoticed). kebab_core::IngestItem { kind: kebab_core::IngestItemKind::Error, doc_id: None, diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs index 2dfa557..4d12a8b 100644 --- a/crates/kebab-app/tests/image_pipeline.rs +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -307,7 +307,61 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() { ); } -// ── 5. Determinism: re-ingest produces identical doc_id / chunk_id ─────── +// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ────────── + +/// `kebab-source-fs` classifies a `.png` extension as +/// `MediaType::Image(Png)` regardless of content. When the bytes don't +/// decode as any image format, `ImageExtractor::extract` returns Err +/// and the asset must be classified as `IngestItemKind::Error` with +/// the `errors` counter incremented **exactly once** (regression for +/// the double-count bug surfaced during P6-4 manual smoke). +#[tokio::test] +async fn garbage_png_increments_errors_counter_exactly_once() { + // No mock server needed — extract fails before any HTTP call. + let env = TestEnv::lexical_only(); + // Single non-image asset with .png extension. 
+ std::fs::write( + env.workspace_root.join("garbage.png"), + b"this is not an image at all", + ) + .expect("write garbage fixture"); + let mut cfg = env.config.clone(); + cfg.workspace.include.push("**/*.png".to_string()); + cfg.image.ocr.enabled = false; + cfg.image.caption.enabled = false; + + let cfg_clone = cfg.clone(); + let scope = env.scope(); + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, scope, false) + .expect("ingest does not abort on per-asset failure") + }) + .await + .expect("task"); + + // Exactly-once: scanned counts the asset, errors counts it once, + // and (scanned == new + updated + skipped + errors) holds. + assert_eq!( + report.errors, 1, + "garbage PNG must increment errors exactly once, not twice (double-count regression)" + ); + assert_eq!( + report.scanned, + report.new + report.updated + report.skipped + report.errors, + "counter sum must equal scanned — invariant of the IngestReport contract" + ); + + // The single Error item carries the propagated extract error. + let items = report.items.expect("items present"); + let err_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("garbage.png")) + .expect("garbage item present"); + assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error); + assert!(err_item.error.is_some(), "Error item carries error string"); +} + +// ── 6. Determinism: re-ingest produces identical doc_id / chunk_id ─────── /// Idempotency contract — running the same ingest twice should mark /// the asset Updated on the second run with byte-identical IDs. -- 2.49.1