diff --git a/Cargo.lock b/Cargo.lock index 0c9ee2a..219f346 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3397,6 +3397,7 @@ dependencies = [ "anyhow", "blake3", "dirs 5.0.1", + "image", "kebab-chunk", "kebab-config", "kebab-core", @@ -3405,6 +3406,7 @@ dependencies = [ "kebab-llm", "kebab-llm-local", "kebab-normalize", + "kebab-parse-image", "kebab-parse-md", "kebab-parse-types", "kebab-rag", @@ -3417,10 +3419,12 @@ dependencies = [ "serde_json", "tempfile", "time", + "tokio", "toml", "tracing", "tracing-appender", "tracing-subscriber", + "wiremock", ] [[package]] diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index e8fae60..c50ae8e 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -23,6 +23,11 @@ kebab-embed-local = { path = "../kebab-embed-local" } kebab-llm = { path = "../kebab-llm" } kebab-llm-local = { path = "../kebab-llm-local" } kebab-rag = { path = "../kebab-rag" } +# P6-4: image extractor + OCR + caption adapters live here. App +# threads them into the per-asset dispatch (see `ingest_one_asset` +# image branch). Trait-only consumption — no `kebab-parse-image` +# internals leak into kb-app code. +kebab-parse-image = { path = "../kebab-parse-image" } anyhow = { workspace = true } blake3 = { workspace = true } serde = { workspace = true } @@ -37,3 +42,9 @@ dirs = "5" [dev-dependencies] rusqlite = { workspace = true } tempfile = { workspace = true } +# Image-pipeline integration tests use wiremock to stub Ollama for OCR +# / caption HTTP calls. Async runtime to host the mock server only; +# the kb-app code under test stays sync. 
+wiremock = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread"] } +image = { version = "0.25", default-features = false, features = ["png"] } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 15d6fc9..d916f7e 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -41,12 +41,15 @@ use serde::{Deserialize, Serialize}; use kebab_chunk::MdHeadingV1Chunker; use kebab_core::{ - Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, + Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, - EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery, - SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore, + EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType, + ParserVersion, RawAsset, SearchHit, SearchQuery, SourceConnector, SourceScope, + SourceUri, VectorRecord, VectorStore, }; +use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; +use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -190,6 +193,35 @@ pub fn ingest_with_config( let parser_version = ParserVersion(KEBAB_PARSE_MD_VERSION.to_string()); let chunk_policy = chunk_policy_from_config(&app.config); + // P6-4: build OCR / caption adapters once per ingest invocation, + // gated on their respective `enabled` flags. `reqwest::blocking::Client` + // is internally Arc-shared so reusing one instance across the asset + // loop is correct and cheap. Construction failure (e.g. invalid + // endpoint) aborts ingest fail-fast — better than silently disabling + // OCR/caption mid-run. 
+    let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
+        Some(
+            OllamaVisionOcr::new(&app.config)
+                .context("kb-app::ingest: build OllamaVisionOcr")?,
+        )
+    } else {
+        None
+    };
+    let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
+        Some(Box::new(
+            OllamaLanguageModel::new(&app.config)
+                .context("kb-app::ingest: build OllamaLanguageModel for caption")?,
+        ))
+    } else {
+        None
+    };
+    let image_extractor = ImageExtractor::new();
+    let image_pipeline = ImagePipeline {
+        extractor: &image_extractor,
+        ocr_engine: ocr_engine.as_ref(),
+        caption_llm: caption_llm.as_deref(),
+    };
+
     // Pre-load every existing doc_id so we can label `IngestItem.kind`
     // as `New` vs `Updated` correctly. `list_documents` returns one
     // row per `(workspace_path, asset_id)` — index by the deterministic
@@ -230,6 +262,7 @@ pub fn ingest_with_config(
             embedder.as_ref(),
             vector_store.as_ref(),
             &existing_doc_ids,
+            &image_pipeline,
         );
 
         let item = match item {
@@ -241,7 +274,12 @@ pub fn ingest_with_config(
                     error = %e,
                     "kb-app::ingest: per-file fatal"
                 );
-                error_count = error_count.saturating_add(1);
+                // Note: `error_count += 1` happens below in the
+                // `match item.kind { Error => ... }` arm — incrementing
+                // here too would double-count (a regression first
+                // surfaced by P6-4 image dispatch where Err returns
+                // are common; markdown rarely propagated Err so the
+                // bug went unnoticed).
                 kebab_core::IngestItem {
                     kind: kebab_core::IngestItemKind::Error,
                     doc_id: None,
@@ -434,10 +472,20 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
 /// `<… as JobRepo>` to be explicit.
 type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
 
+/// P6-4: borrowed bundle of the three image-pipeline components built
+/// once per ingest invocation. Threaded through `ingest_one_asset` so
+/// the dispatch takes one extra parameter instead of three.
+struct ImagePipeline<'a> {
+    extractor: &'a ImageExtractor,
+    ocr_engine: Option<&'a OllamaVisionOcr>,
+    caption_llm: Option<&'a dyn LanguageModel>,
+}
+
 /// Process a single asset: read bytes, parse, normalize, chunk,
 /// persist, embed. Per-asset failures bubble up to the caller for
 /// labelling as `IngestItemKind::Error` — they do NOT abort the
 /// whole run.
+#[allow(clippy::too_many_arguments)]
 fn ingest_one_asset(
     app: &App,
     asset: &RawAsset,
@@ -446,27 +494,47 @@ fn ingest_one_asset(
     embedder: Option<&Arc<dyn Embedder>>,
     vector_store: Option<&Arc<dyn VectorStore>>,
     existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
 ) -> anyhow::Result<kebab_core::IngestItem> {
     tracing::debug!(
         target: "kebab-app::ingest",
         path = %asset.workspace_path.0,
+        media_type = ?asset.media_type,
         "processing asset"
     );
-    // Only handle Markdown for now; other media types are P6+ work.
-    if asset.media_type != kebab_core::MediaType::Markdown {
-        return Ok(kebab_core::IngestItem {
-            kind: kebab_core::IngestItemKind::Skipped,
-            doc_id: None,
-            doc_path: asset.workspace_path.clone(),
-            asset_id: Some(asset.asset_id.clone()),
-            byte_len: Some(asset.byte_len),
-            block_count: None,
-            chunk_count: None,
-            parser_version: None,
-            chunker_version: None,
-            warnings: Vec::new(),
-            error: None,
-        });
+    // P6-4: dispatch on media_type. Markdown takes the existing
+    // parse-md / normalize path; image takes the new
+    // ImageExtractor + (optional) OCR + (optional) caption path.
+    // Anything else (PDF, audio, unknown) is skipped — the
+    // respective phases (P7 / P8) wire them in later.
+    match &asset.media_type {
+        MediaType::Markdown => { /* fall through to markdown path */ }
+        MediaType::Image(_) => {
+            return ingest_one_image_asset(
+                app,
+                asset,
+                chunk_policy,
+                embedder,
+                vector_store,
+                existing_doc_ids,
+                image_pipeline,
+            );
+        }
+        _ => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: Vec::new(),
+                error: None,
+            });
+        }
     }
 
     let path = match &asset.source_uri {
@@ -612,6 +680,277 @@ fn ingest_one_asset(
     })
 }
 
+/// P6-4: process one `MediaType::Image(_)` asset end-to-end.
+///
+/// Pipeline: read bytes → `ImageExtractor::extract` → optional
+/// `apply_ocr` → optional `apply_caption` → existing chunker / embedder
+/// / store path (the same one markdown uses, which already handles
+/// `Block::ImageRef` per P1-5).
+///
+/// Failure semantics (per P6-4 spec):
+/// - `ImageExtractor::extract` Err → propagate (caller increments
+///   `errors`).
+/// - OCR / caption Err → log + `ProvenanceKind::Warning` event, continue.
+///   `block.ocr` / `block.caption` stay `None`. `errors` NOT incremented.
+#[allow(clippy::too_many_arguments)]
+fn ingest_one_image_asset(
+    app: &App,
+    asset: &RawAsset,
+    chunk_policy: &ChunkPolicy,
+    embedder: Option<&Arc<dyn Embedder>>,
+    vector_store: Option<&Arc<dyn VectorStore>>,
+    existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
+) -> anyhow::Result<kebab_core::IngestItem> {
+    let image_extractor = image_pipeline.extractor;
+    let ocr_engine = image_pipeline.ocr_engine;
+    let caption_llm = image_pipeline.caption_llm;
+    let path = match &asset.source_uri {
+        SourceUri::File(p) => p.clone(),
+        SourceUri::Kb(_) => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: vec![
+                    "kb:// source URIs are not supported by the fs ingester".into(),
+                ],
+                error: None,
+            });
+        }
+    };
+    let bytes = std::fs::read(&path)
+        .with_context(|| format!("read image asset bytes from {}", path.display()))?;
+
+    // 1. Decode + EXIF + dimensions. ExtractContext.config carries
+    //    nothing the image extractor reads today; we pass a default
+    //    instance per the trait shape.
+    let extract_config = kebab_core::ExtractConfig::default();
+    let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
+    let ctx = ExtractContext {
+        asset,
+        workspace_root: &workspace_root,
+        config: &extract_config,
+    };
+    let mut canonical = image_extractor
+        .extract(&ctx, &bytes)
+        .context("kb-parse-image::ImageExtractor::extract")?;
+
+    // 2 + 3. Apply OCR / caption when their adapters exist. Both are
+    //        Lenient — failure is captured into Provenance Warning,
+    //        `block.ocr` / `block.caption` stay `None`. P6-4 spec
+    //        explicitly: such partial failures do NOT increment the
+    //        `errors` counter.
+    //
+    //        Determinism stress (per spec Risks): the per-document
+    //        Provenance timestamps for any analysis-stage Warning
+    //        events share a single `now_utc()` reading taken once
+    //        here, mirroring `kb-normalize::build_canonical_document`.
+    let lang_hint = lang_hint_from_doc(&canonical);
+    let now = time::OffsetDateTime::now_utc();
+    let mut warning_notes: Vec<String> = Vec::new();
+    match canonical.blocks.first_mut() {
+        Some(Block::ImageRef(block)) => {
+            if let Some(engine) = ocr_engine
+                && let Err(e) = apply_ocr(
+                    engine,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &mut canonical.provenance.events,
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "OcrFailed",
+                    e,
+                    now,
+                );
+            }
+            if let Some(llm) = caption_llm
+                && let Err(e) = apply_caption(
+                    llm,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &app.config,
+                    &mut canonical.provenance.events,
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "CaptionFailed",
+                    e,
+                    now,
+                );
+            }
+        }
+        // P6-1 contract: image documents always have exactly one
+        // `Block::ImageRef`. If a future task introduces multi-block
+        // image documents the silent-skip would mask a real bug, so
+        // this arm surfaces the divergence loudly.
+        other => {
+            tracing::warn!(
+                target: "kebab-app",
+                path = %asset.workspace_path.0,
+                blocks = canonical.blocks.len(),
+                "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
+                other.map(|b| std::mem::discriminant(b))
+            );
+            canonical.provenance.events.push(kebab_core::ProvenanceEvent {
+                at: now,
+                agent: "kb-app".to_string(),
+                kind: kebab_core::ProvenanceKind::Warning,
+                note: Some(
+                    "image document missing leading ImageRef block — OCR/caption skipped"
+                        .to_string(),
+                ),
+            });
+            warning_notes
+                .push("ImageDispatchAnomaly: missing ImageRef block".to_string());
+        }
+    }
+
+    // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
+    //    `Block::ImageRef` arm already produces a single chunk per
+    //    image (P1-5). The chunk text now follows the (β) plain-concat
+    //    contract per the kebab-chunk render_block_text update.
+    let chunks = MdHeadingV1Chunker
+        .chunk(&canonical, chunk_policy)
+        .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
+
+    // 5. Persist + embed — identical sequence to markdown.
+    app.sqlite
+        .put_asset_with_bytes(asset, &bytes)
+        .context("DocumentStore::put_asset_with_bytes (image)")?;
+    app.sqlite
+        .put_document(&canonical)
+        .context("DocumentStore::put_document (image)")?;
+    app.sqlite
+        .put_blocks(&canonical.doc_id, &canonical.blocks)
+        .context("DocumentStore::put_blocks (image)")?;
+    app.sqlite
+        .put_chunks(&canonical.doc_id, &chunks)
+        .context("DocumentStore::put_chunks (image)")?;
+
+    if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
+        && !chunks.is_empty()
+    {
+        let inputs: Vec<EmbeddingInput<'_>> = chunks
+            .iter()
+            .map(|c| EmbeddingInput {
+                text: c.text.as_str(),
+                kind: EmbeddingKind::Document,
+            })
+            .collect();
+        let vectors = emb
+            .embed(&inputs)
+            .context("Embedder::embed (image chunks)")?;
+        let model_id = emb.model_id();
+        let model_version = emb.model_version();
+        let dimensions = emb.dimensions();
+        let records: Vec<VectorRecord> = chunks
+            .iter()
+            .zip(vectors)
+            .map(|(c, v)| VectorRecord {
+                embedding_id: kebab_core::id_for_embedding(
+                    &c.chunk_id,
+                    &model_id,
+                    &model_version,
+                    dimensions,
+                ),
+                chunk_id: c.chunk_id.clone(),
+                vector: v,
+                doc_id: canonical.doc_id.clone(),
+                text: c.text.clone(),
+                heading_path: c.heading_path.clone(),
+                model_id: model_id.clone(),
+                model_version: model_version.clone(),
+                dimensions,
+            })
+            .collect();
+        vec_store
+            .upsert(&records)
+            .context("VectorStore::upsert (image)")?;
+    }
+
+    let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
+        kebab_core::IngestItemKind::Updated
+    } else {
+        kebab_core::IngestItemKind::New
+    };
+
+    Ok(kebab_core::IngestItem {
+        kind,
+        doc_id: Some(canonical.doc_id.clone()),
+        doc_path: asset.workspace_path.clone(),
+        asset_id: Some(asset.asset_id.clone()),
+        byte_len: Some(asset.byte_len),
+        block_count: u32::try_from(canonical.blocks.len()).ok(),
+        chunk_count: u32::try_from(chunks.len()).ok(),
+        parser_version: Some(canonical.parser_version.clone()),
+        chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
+        warnings: warning_notes,
+        error: None,
+    })
+}
+
+/// Centralised handling for image-analysis (OCR / caption) failures.
+/// Emits a `tracing::warn!`, appends a `ProvenanceKind::Warning`
+/// event sharing the caller's per-document `now`, and pushes a
+/// `<kind_label>: <detail>` note onto the `IngestItem.warnings` slot
+/// using the same shape the markdown path uses (so downstream wire
+/// readers don't have to learn two formats — see kb-normalize's
+/// `warning_agent`).
+fn record_image_analysis_failure(
+    asset: &RawAsset,
+    events: &mut Vec<kebab_core::ProvenanceEvent>,
+    warning_notes: &mut Vec<String>,
+    kind_label: &str,
+    err: anyhow::Error,
+    now: time::OffsetDateTime,
+) {
+    let detail = format!("{err:#}");
+    let note = format!("{kind_label}: {detail}");
+    tracing::warn!(
+        target: "kebab-app",
+        path = %asset.workspace_path.0,
+        "image analysis stage {} failed: {}",
+        kind_label,
+        detail
+    );
+    events.push(kebab_core::ProvenanceEvent {
+        at: now,
+        agent: "kb-app".to_string(),
+        kind: kebab_core::ProvenanceKind::Warning,
+        note: Some(note.clone()),
+    });
+    warning_notes.push(note);
+}
+
+/// Pull the BCP-47 language hint from the canonical document. P6-1
+/// stamps `Lang("und")` by default; image-pipeline OCR / caption
+/// adapters special-case "und" so the hint is intentionally dropped
+/// from prompts.
+fn lang_hint_from_doc(doc: &CanonicalDocument) -> Option<Lang> {
+    let s = doc.lang.0.as_str();
+    if s.is_empty() || s == "und" {
+        None
+    } else {
+        Some(doc.lang.clone())
+    }
+}
+
 /// Convenience: end byte of the frontmatter region (or 0 when absent).
fn fm_span_end(span: Option) -> usize { span.map(|s| s.end).unwrap_or(0) diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs new file mode 100644 index 0000000..4d12a8b --- /dev/null +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -0,0 +1,421 @@ +//! P6-4 image ingest wiring — end-to-end integration. +//! +//! Each test spins up a `TempDir` workspace + writes one PNG fixture + +//! routes OCR / caption HTTP calls through a `wiremock` server that +//! impersonates Ollama's `/api/generate` endpoint. The kb-app code +//! under test is sync; the wiremock server is async, so test bodies +//! drive blocking work via `tokio::task::spawn_blocking`. + +mod common; + +use std::path::Path; + +use common::TestEnv; +use kebab_config::Config; +use serde_json::json; +use tokio::task::spawn_blocking; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +// ── Fixture helpers ────────────────────────────────────────────────────── + +/// Tiny solid-red PNG written into the test workspace at `/`. +/// 100×50 — small enough to skip downscale by default but non-trivially +/// inspectable in stored DB rows. +fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf { + use image::{ImageBuffer, Rgb}; + let img: ImageBuffer, _> = + ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); + let path = root.join(name); + img.save(&path).expect("write PNG fixture"); + path +} + +fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config { + let mut cfg = env.config.clone(); + // Ensure image assets are scanned. 
+ cfg.workspace + .include + .push("**/*.png".to_string()); + cfg.image.ocr.enabled = true; + cfg.image.ocr.endpoint = Some(mock_endpoint.to_string()); + cfg.image.ocr.model = "vision-mock:1b".to_string(); + cfg.image.ocr.max_pixels = 512; + cfg.image.caption.enabled = false; // tested separately below + cfg.models.llm.endpoint = mock_endpoint.to_string(); + cfg.models.llm.model = "vision-mock:1b".to_string(); + cfg +} + +// ── 1. Happy path: OCR-only ingest ─────────────────────────────────────── + +/// One PNG asset + OCR enabled (caption off) → ingest produces 1 doc + 1 +/// chunk; chunk text contains alt + OCR transcription joined by `\n\n`. +#[tokio::test] +async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "model": "vision-mock:1b", + "response": "Hello World 2026", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let env = TestEnv::lexical_only(); + let png = write_red_png(&env.workspace_root, "diagram.png"); + eprintln!("PNG written to {}", png.display()); + let cfg = cfg_with_image_pipeline(&env, &server.uri()); + let cfg_clone = cfg.clone(); + let env_workspace = env.workspace_root.clone(); + let env_scope = env.scope(); + + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, env_scope, false) + .expect("image ingest must succeed") + }) + .await + .expect("blocking task panicked"); + + // Counters: scanned should include the PNG; new ≥ 1 (markdown + // fixtures from the workspace tree may also count). + assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items); + assert_eq!(report.errors, 0, "no errors on lenient OCR path"); + + // Locate the image doc in the report items. 
+ let items = report.items.expect("items present (summary_only=false)"); + let img_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("diagram.png")) + .expect("image doc item must be present"); + assert_eq!( + img_item.kind, + kebab_core::IngestItemKind::New, + "image asset must be classified New on first ingest" + ); + assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk"); + + // Inspect the stored chunk text via kb-app's inspect_chunk facade. + let doc_id = img_item.doc_id.clone().expect("image doc id"); + let doc = kebab_app::inspect_doc_with_config(cfg.clone(), &doc_id) + .expect("inspect_doc returns the image document"); + let block = match doc.blocks.first() { + Some(kebab_core::Block::ImageRef(b)) => b, + other => panic!("expected ImageRef, got {other:?}"), + }; + assert!(block.ocr.is_some(), "block.ocr populated by apply_ocr"); + assert_eq!( + block.ocr.as_ref().unwrap().joined, + "Hello World 2026", + "OCR text from mock" + ); + assert!( + block.caption.is_none(), + "caption disabled in cfg → block.caption stays None" + ); + + // Sanity: the doc was actually persisted into SQLite (kb-app's + // list_docs facade reads the same store the chunker writes to). + let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()) + .expect("list_docs"); + assert!( + summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")), + "image doc must appear in list_docs" + ); + + drop(env_workspace); // keep TempDir alive until here + drop(env); +} + +// ── 2. OCR + caption together ──────────────────────────────────────────── + +/// Both OCR and caption enabled. The mock returns the same JSON body +/// for every `/api/generate` POST — wiremock has no per-prompt routing +/// on the default `Mock` so we treat both calls as equivalent. We then +/// verify both `block.ocr` and `block.caption` are populated, and the +/// chunk text contains both fragments separated by `\n\n`. 
+#[tokio::test]
+async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST")) // one stub answers both the OCR and the caption request
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "shared mock body",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let mut cfg = cfg_with_image_pipeline(&env, &server.uri());
+    cfg.image.caption.enabled = true; // the shared cfg helper leaves caption off
+    cfg.image.caption.max_pixels = 384;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest must succeed with both OCR+caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(), // panics if the first stored block is not an ImageRef
+    };
+    assert!(block.ocr.is_some(), "OCR populated");
+    assert!(block.caption.is_some(), "caption populated");
+    drop(env);
+}
+
+// ── 3. Lenient failure: OCR Ollama 503 → asset still indexed ─────────────
+
+/// OCR endpoint returns 503. Spec contract: image is still indexed with
+/// `block.ocr = None`, a kb-app Provenance Warning plus an `IngestItem`
+/// warning are recorded, and the `errors` counter is NOT incremented.
+#[tokio::test]
+async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(503)) // every generate call fails
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "broken.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri()); // OCR on, caption off
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on lenient OCR failure")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(
+        report.errors, 0,
+        "lenient OCR failure must NOT increment errors counter (spec)"
+    );
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("broken.png"))
+        .expect("asset still indexed despite OCR failure");
+    assert_eq!(img_item.kind, kebab_core::IngestItemKind::New);
+    assert_eq!(img_item.chunk_count, Some(1));
+    assert!(
+        !img_item.warnings.is_empty(),
+        "lenient OCR failure must surface a warning on the IngestItem"
+    );
+
+    let doc_id = img_item.doc_id.clone().unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, &doc_id).unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(), // panics if the first stored block is not an ImageRef
+    };
+    assert!(block.ocr.is_none(), "block.ocr stays None on OCR failure");
+    let warning = doc
+        .provenance
+        .events
+        .iter()
+        .find(|e| e.kind == kebab_core::ProvenanceKind::Warning && e.agent == "kb-app") // first kb-app warning
+        .expect("Provenance Warning attributed to kb-app");
+    let note = warning.note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("OcrFailed"),
+        "warning note must describe OCR failure with OcrFailed prefix \
+         (markdown-style WarningKind format): {note}"
+    );
+}
+
+// ── 4.
Both image.ocr.enabled and image.caption.enabled = false ──────────
+
+/// When both adapters are disabled, the image is still extracted and
+/// chunked (one chunk), `block.ocr` / `block.caption` stay `None`, and
+/// the extractor still records `dimensions` in the user metadata.
+#[tokio::test]
+async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
+    // No mock server needed — neither HTTP path is touched.
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "raw.png");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string());
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest with no OCR/caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("raw.png"))
+        .unwrap();
+    assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_none() && block.caption.is_none());
+    // Dimensions of the 100×50 fixture are still populated by the extractor.
+    let dims = doc
+        .metadata
+        .user
+        .get("dimensions")
+        .and_then(|v: &serde_json::Value| v.as_object())
+        .expect("dimensions object present");
+    assert_eq!(
+        dims.get("w").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(100)
+    );
+    assert_eq!(
+        dims.get("h").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(50)
+    );
+}
+
+// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ──────────
+
+/// `kebab-source-fs` classifies a `.png` extension as
+/// `MediaType::Image(Png)` regardless of content.
When the bytes don't
+/// decode as any image format, `ImageExtractor::extract` returns Err
+/// and the asset must be classified as `IngestItemKind::Error` with
+/// the `errors` counter incremented **exactly once** (regression for
+/// the double-count bug surfaced during P6-4 manual smoke).
+#[tokio::test]
+async fn garbage_png_increments_errors_counter_exactly_once() {
+    // No mock server needed — OCR / caption are disabled below.
+    let env = TestEnv::lexical_only();
+    // Single non-image asset with .png extension.
+    std::fs::write(
+        env.workspace_root.join("garbage.png"),
+        b"this is not an image at all",
+    )
+    .expect("write garbage fixture");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string());
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on per-asset failure")
+    })
+    .await
+    .expect("task");
+
+    // Exactly-once: errors counts the garbage asset once, and the
+    // invariant (scanned == new + updated + skipped + errors) holds.
+    assert_eq!(
+        report.errors, 1,
+        "garbage PNG must increment errors exactly once, not twice (double-count regression)"
+    );
+    assert_eq!(
+        report.scanned,
+        report.new + report.updated + report.skipped + report.errors,
+        "counter sum must equal scanned — invariant of the IngestReport contract"
+    );
+
+    // The Error item for the garbage file carries an error string.
+    let items = report.items.expect("items present");
+    let err_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("garbage.png"))
+        .expect("garbage item present");
+    assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error);
+    assert!(err_item.error.is_some(), "Error item carries error string");
+}
+
+// ── 6.
Determinism: re-ingest produces identical doc_id / chunk_id ───────
+
+/// Idempotency contract — running the same ingest twice should mark
+/// the asset Updated on the second run with the same `doc_id` (chunk IDs are not compared here).
+#[tokio::test]
+async fn re_ingest_image_produces_updated_with_same_doc_id() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "stable",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+
+    let scope = env.scope();
+    let cfg1 = cfg.clone();
+    let cfg2 = cfg.clone();
+    let scope1 = scope.clone();
+    let scope2 = scope.clone();
+
+    let r1 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
+    })
+    .await
+    .unwrap();
+    let r2 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
+    })
+    .await
+    .unwrap();
+
+    let id1 = r1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap()
+        .doc_id
+        .clone()
+        .unwrap();
+    let img2 = r2
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    assert_eq!(img2.kind, kebab_core::IngestItemKind::Updated);
+    assert_eq!(img2.doc_id.as_ref().unwrap(), &id1);
+}
diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs
index f29a4d4..1279ebf 100644
--- a/crates/kebab-chunk/src/md_heading_v1.rs
+++ b/crates/kebab-chunk/src/md_heading_v1.rs
@@ -381,17 +381,41 @@ fn render_block_text(b: &Block) -> String {
             }
             s
         }
-        // ImageRef text portion = alt (per task spec). Fall back to
-        // model caption text if alt is empty.
+ // ImageRef text portion follows the P6-4 (β) plain-concat + // contract — `[alt, ocr.joined, caption.text]` joined by + // `\n\n`, dropping empty parts. Filename fallback for empty + // alt keeps lexical search hits on filenames working even when + // P6-1's filename auto-fill is bypassed. Block::ImageRef(i) => { - if !i.alt.is_empty() { + let alt = if !i.alt.is_empty() { i.alt.clone() } else { - i.caption - .as_ref() - .map(|c| c.text.clone()) - .unwrap_or_default() - } + // P6-1 falls back to filename so this branch is + // defensive — keep it lest a future test fixture or + // synthetic block path skip the auto-fill. + i.src + .rsplit('/') + .next() + .filter(|s| !s.is_empty()) + .unwrap_or("[image]") + .to_string() + }; + let ocr = i + .ocr + .as_ref() + .map(|o| o.joined.as_str()) + .unwrap_or(""); + let cap = i + .caption + .as_ref() + .map(|c| c.text.as_str()) + .unwrap_or(""); + [alt.as_str(), ocr, cap] + .iter() + .filter(|s| !s.is_empty()) + .copied() + .collect::>() + .join("\n\n") } // AudioRef has no caption preview yet (transcript joins land // in P8). Empty string per task spec. @@ -700,6 +724,63 @@ mod tests { } } + /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text + /// joined by `\n\n`, dropping empty parts. Verifies all four + /// (alt-only, alt+ocr, alt+caption, alt+ocr+caption) shapes. + #[test] + fn image_ref_p6_4_plain_concat_drops_empty_parts() { + use kebab_core::{ModelCaption, OcrText}; + + let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| { + Block::ImageRef(ImageRefBlock { + common: common_for("imageref", &[], 0, span(1, 1)), + asset_id: None, + src: "img.png".into(), + alt: alt.into(), + ocr: ocr.map(|t| OcrText { + joined: t.into(), + regions: vec![], + engine: "test".into(), + engine_version: "v1".into(), + }), + caption: cap.map(|t| ModelCaption { + text: t.into(), + model: "m".into(), + model_version: "v".into(), + }), + }) + }; + + // alt-only — no separators between empty parts. 
+ assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png"); + + // alt + ocr — joined by exactly one `\n\n`. + assert_eq!( + render_block_text(&mk("photo.png", Some("Hello"), None)), + "photo.png\n\nHello" + ); + + // alt + caption. + assert_eq!( + render_block_text(&mk("photo.png", None, Some("a red square"))), + "photo.png\n\na red square" + ); + + // alt + ocr + caption — three parts joined by `\n\n` each. + assert_eq!( + render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))), + "photo.png\n\nHello\n\na red square" + ); + + // empty alt — falls back to filename derived from `src`. + let blk = mk("", Some("text from image"), None); + assert_eq!( + render_block_text(&blk), + "img.png\n\ntext from image", + "empty alt must fall back to the basename of `src`" + ); + } + /// ImageRef → own chunk, token_estimate=0. #[test] fn image_ref_emits_own_chunk_zero_tokens() { diff --git a/docs/SMOKE.md b/docs/SMOKE.md index e3cdf79..3a17012 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -118,16 +118,41 @@ max_context_tokens = 6000 KEBAB() { ./target/debug/kebab --config /tmp/kebab-smoke/config.toml "$@"; } KB doctor # 1. health check -KB ingest # 2. 워크스페이스 색인 -KB list docs # 3. 색인 결과 목록 +KB ingest # 2. 워크스페이스 색인 (markdown + image) +KB list docs # 3. 색인 결과 목록 (markdown + image 모두 표시) KB search --mode lexical "코루틴" --k 3 # 4. lexical 검색 KB search --mode vector "memory safety" --k 3 # 5. vector 검색 KB search --mode hybrid "Cargo workspace" --k 3 # 6. hybrid 검색 -KB inspect chunk <chunk_id> # 7. raw chunk 보기 -KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 8. RAG 답변 (Ollama 필요) -KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검증 +KB search --mode lexical "Hello World" --k 3 # 7. image OCR 텍스트 검색 (P6-4) +KB inspect chunk <chunk_id> # 8. raw chunk 보기 +KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 9. RAG 답변 (Ollama 필요) +KB --json ask "..." --mode hybrid # 10. 
기계 친화 출력 검증 ``` +## P6-4 이미지 ingestion 옵션 + +`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략): + +```toml +[workspace] +include = ["**/*.md", "**/*.png", "**/*.jpg"] + +[image.ocr] +enabled = true # vision LM 으로 이미지 안 텍스트 전사 +engine = "ollama-vision" +model = "gemma4:e4b" # 사용자 환경의 비전 모델 +endpoint = "http://192.168.0.47:11434" # 비우면 models.llm.endpoint fallback +languages = ["eng", "kor"] +max_pixels = 1600 # long-edge cap + +[image.caption] +enabled = true # vision LM 으로 한 문장 객관 설명 생성 +max_pixels = 768 +prompt_template_version = "caption-v1" +``` + +이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후). + 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작. ## 검증 체크리스트 @@ -138,6 +163,8 @@ KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검 - `kebab search --mode hybrid` 의 `fusion_score` 가 `[0, 1]` 범위 (top-1 종종 1.0 — 두 retriever 모두 rank 1 일 때). - `kebab ask` JSON 응답에 `model.id` 가 config 의 모델 (`gemma4:26b` 등) 과 일치, `embedding.id = multilingual-e5-small`, `citations[].marker` 가 `[1]` / `[2]` 형식 (square-bracketed bare index). - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`. +- (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc <doc_id>` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical "<OCR 텍스트>"` 가 그 image chunk 를 반환하면 wiring 정상. +- OCR / caption 부분 실패는 `errors` 카운터 미증가 — `kebab inspect doc <doc_id>` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인. ## 정리 @@ -154,5 +181,6 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초. - `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작). 
- 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init. +- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후). 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조. diff --git a/tasks/p6/p6-4-image-ingest-wiring.md b/tasks/p6/p6-4-image-ingest-wiring.md index 716a2fb..04f87db 100644 --- a/tasks/p6/p6-4-image-ingest-wiring.md +++ b/tasks/p6/p6-4-image-ingest-wiring.md @@ -3,7 +3,7 @@ phase: P6 component: kebab-app (image ingest dispatch + chunking) task_id: p6-4 title: "Wire ImageExtractor + OCR + caption into kebab-app::ingest end-to-end" -status: planned +status: completed depends_on: [p6-1, p6-2, p6-3, p1-6, p3-5] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md