diff --git a/Cargo.lock b/Cargo.lock index f595476..0c9ee2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3576,6 +3576,8 @@ dependencies = [ "kamadak-exif", "kebab-config", "kebab-core", + "kebab-llm", + "kebab-llm-local", "reqwest", "serde", "serde_json", diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index f9a737c..898fd67 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -105,18 +105,20 @@ pub struct RagCfg { } /// Settings for the image ingest pipeline (P6). `ocr` controls OCR -/// behaviour; future fields (e.g. `caption`) will join here as P6-3 -/// lands. +/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3). #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct ImageCfg { #[serde(default = "OcrCfg::defaults")] pub ocr: OcrCfg, + #[serde(default = "CaptionCfg::defaults")] + pub caption: CaptionCfg, } impl ImageCfg { pub fn defaults() -> Self { Self { ocr: OcrCfg::defaults(), + caption: CaptionCfg::defaults(), } } } @@ -162,6 +164,36 @@ impl OcrCfg { } } +/// Caption settings (P6-3). Caption uses the same Ollama-vision / +/// `LanguageModel` pipeline as the rest of the workspace; the trait +/// abstraction is the part the spec demands. `enabled` defaults to +/// `false` because captioning costs one model call per asset and the +/// output is model-generated (low trust). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CaptionCfg { + /// Run captioning on every image during ingest. Default `false`. + pub enabled: bool, + /// Cap the long edge of the image (in pixels) before sending. The + /// spec recommends an aggressive 768×768 cap because larger + /// vision-LM inputs translate directly into prompt cost. Default + /// `768`. + pub max_pixels: u32, + /// Caption prompt template version pinned into wire output via + /// `ModelCaption.model_version`. Bump when the prompt changes so + /// downstream eval can detect regressions. + pub prompt_template_version: String, +} + +impl CaptionCfg { + pub fn defaults() -> Self { + Self { + enabled: false, + max_pixels: 768, + prompt_template_version: "caption-v1".to_string(), + } + } +} + impl Config { /// Defaults per design §6.4. pub fn defaults() -> Self { @@ -417,6 +449,19 @@ impl Config { } } + // image.caption (P6-3) + "KEBAB_IMAGE_CAPTION_ENABLED" => { + self.image.caption.enabled = parse_bool(v); + } + "KEBAB_IMAGE_CAPTION_MAX_PIXELS" => { + if let Ok(n) = v.parse::() { + self.image.caption.max_pixels = n; + } + } + "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => { + self.image.caption.prompt_template_version = v.clone(); + } + // Unknown KEBAB_* keys are silently ignored — see // `env_unknown_key_is_ignored` test. _ => {} @@ -608,6 +653,35 @@ mod tests { /// Pre-P6 config files don't have an `[image]` section. The /// `#[serde(default)]` attribute on `Config::image` must let those /// files load with `ImageCfg::defaults()` instead of erroring. + #[test] + fn image_caption_defaults_disabled() { + let c = Config::defaults(); + assert!(!c.image.caption.enabled); + assert_eq!(c.image.caption.max_pixels, 768); + assert_eq!(c.image.caption.prompt_template_version, "caption-v1"); + } + + #[test] + fn image_caption_env_overrides() { + let mut env = HashMap::new(); + env.insert( + "KEBAB_IMAGE_CAPTION_ENABLED".to_string(), + "true".to_string(), + ); + env.insert( + "KEBAB_IMAGE_CAPTION_MAX_PIXELS".to_string(), + "1024".to_string(), + ); + env.insert( + "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION".to_string(), + "caption-v2".to_string(), + ); + let c = Config::defaults().apply_env(&env); + assert!(c.image.caption.enabled); + assert_eq!(c.image.caption.max_pixels, 1024); + assert_eq!(c.image.caption.prompt_template_version, "caption-v2"); + } + /// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None` /// rather than to `Some("")` so the fallback to `models.llm.endpoint` /// kicks in. Covers the env-equivalent of a missing TOML key. diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs index dcf2024..bbdf663 100644 --- a/crates/kebab-core/src/traits.rs +++ b/crates/kebab-core/src/traits.rs @@ -69,6 +69,17 @@ pub struct GenerateRequest { pub max_tokens: usize, pub temperature: f32, pub seed: Option, + /// Vision inputs (base64-encoded, one per image). Empty for the + /// text-only path that P4-2 / P4-3 / RAG uses; non-empty when a + /// vision-capable adapter (P6-3 caption, future multimodal RAG) + /// drives the call. The LM adapter is responsible for routing + /// these onto the wire — Ollama uses `images: [base64, ...]`, + /// other backends may differ. + /// + /// Defaulted on deserialization so older `*.json` payloads / + /// snapshots that predate the field still parse. + #[serde(default)] + pub images: Vec, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] diff --git a/crates/kebab-llm-local/src/ollama.rs b/crates/kebab-llm-local/src/ollama.rs index 51e2a49..42a9b13 100644 --- a/crates/kebab-llm-local/src/ollama.rs +++ b/crates/kebab-llm-local/src/ollama.rs @@ -140,9 +140,15 @@ impl LanguageModel for OllamaLanguageModel { format!("{}\n\n{}", req.system, req.user) }; + // Vision inputs (P6-3) flow through the request via Ollama's + // `images: [base64, ...]` field. Empty for the text-only RAG + // path so older snapshots and JSON dumps stay byte-identical + // (the field is `#[serde(default)]` here so it's omitted from + // the wire when empty). let body = OllamaRequest { model: &self.model_id, prompt, + images: &req.images, stream: true, options: OllamaOptions { temperature: effective_temperature, @@ -188,6 +194,13 @@ impl LanguageModel for OllamaLanguageModel { struct OllamaRequest<'a> { model: &'a str, prompt: String, + /// Skipped from the JSON when empty so the text-only path keeps + /// the same on-the-wire shape it had pre-P6-3 (`{"model": ..., + /// "prompt": ..., "stream": ..., "options": ...}` — no `images` + /// key). Vision-capable callers populate this with one or more + /// base64-encoded images. + #[serde(skip_serializing_if = "<[String]>::is_empty")] + images: &'a [String], stream: bool, options: OllamaOptions<'a>, } diff --git a/crates/kebab-llm-local/tests/integration.rs b/crates/kebab-llm-local/tests/integration.rs index a0836e2..66270e6 100644 --- a/crates/kebab-llm-local/tests/integration.rs +++ b/crates/kebab-llm-local/tests/integration.rs @@ -31,6 +31,7 @@ fn real_ollama_streams_non_empty_response() { max_tokens: 8, temperature: 0.0, seed: Some(0), + images: Vec::new(), }; let stream = llm.generate_stream(req).expect("stream should start"); diff --git a/crates/kebab-llm-local/tests/streaming.rs b/crates/kebab-llm-local/tests/streaming.rs index cc25940..337b434 100644 --- a/crates/kebab-llm-local/tests/streaming.rs +++ b/crates/kebab-llm-local/tests/streaming.rs @@ -35,6 +35,7 @@ fn sample_request() -> GenerateRequest { max_tokens: 64, temperature: 0.0, seed: Some(0), + images: Vec::new(), } } diff --git a/crates/kebab-llm/tests/mock.rs b/crates/kebab-llm/tests/mock.rs index 2d5bbbd..97fe2e0 100644 --- a/crates/kebab-llm/tests/mock.rs +++ b/crates/kebab-llm/tests/mock.rs @@ -26,6 +26,7 @@ fn req_with_stop(stop: Vec<&str>) -> GenerateRequest { max_tokens: 64, temperature: 0.0, seed: None, + images: Vec::new(), } } diff --git a/crates/kebab-llm/tests/reexports.rs b/crates/kebab-llm/tests/reexports.rs index 8df88d5..0e479de 100644 --- a/crates/kebab-llm/tests/reexports.rs +++ b/crates/kebab-llm/tests/reexports.rs @@ -55,6 +55,7 @@ fn dyn_dispatch_via_box_works() { max_tokens: 16, temperature: 0.0, seed: None, + images: Vec::new(), }; let stream = m.generate_stream(req).expect("stream"); let chunks: Vec = stream.map(|r| r.expect("ok chunk")).collect(); diff --git a/crates/kebab-parse-image/Cargo.toml b/crates/kebab-parse-image/Cargo.toml index e596d6c..4e78465 100644 --- a/crates/kebab-parse-image/Cargo.toml +++ b/crates/kebab-parse-image/Cargo.toml @@ -10,6 +10,12 @@ description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipe [dependencies] kebab-core = { path = "../kebab-core" } kebab-config = { path = "../kebab-config" } +# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`) +# under a stable surface; the caption adapter consumes any +# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` (forbidden +# by p6-3 design §8) — the trait abstraction is exactly what spec +# requires. +kebab-llm = { path = "../kebab-llm" } anyhow = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -42,3 +48,10 @@ tokio = { workspace = true, features = ["rt-multi-thread"] } # font rendering. ab_glyph = "0.2" base64 = "0.22" +# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption +# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is +# only allowed at the dev-dep level here — the runtime crate stays +# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local` +# at runtime) is preserved. +kebab-llm = { path = "../kebab-llm", features = ["mock"] } +kebab-llm-local = { path = "../kebab-llm-local" } diff --git a/crates/kebab-parse-image/src/caption.rs b/crates/kebab-parse-image/src/caption.rs new file mode 100644 index 0000000..3e3597b --- /dev/null +++ b/crates/kebab-parse-image/src/caption.rs @@ -0,0 +1,281 @@ +//! Caption adapter (P6-3). +//! +//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an +//! image and produces a [`ModelCaption`]. [`apply_caption`] is the +//! helper that mutates an [`ImageRefBlock`] in place and emits a +//! [`ProvenanceKind::CaptionApplied`] event. +//! +//! ## Trust note +//! +//! Captions are **model-generated** (`TrustLevel::Generated`), not +//! observed text. Vision LMs hallucinate; the system prompt explicitly +//! forbids guessing but expect false captions. Downstream UI / RAG +//! must label captions as model-generated and surface the model id + +//! prompt template version (carried in `ModelCaption.model_version`) +//! so a regression in either is auditable. +//! +//! ## Spec deviation (cargo `caption` feature dropped) +//! +//! The original P6-3 spec asked for a cargo feature `caption` (default +//! OFF at compile time). We collapse this into a single runtime gate +//! (`config.image.caption.enabled = false`, default OFF). Reasoning: +//! the captioning module's only extra deps are `base64` + `image` + +//! `kebab-llm` trait — all already pulled in by the rest of the +//! crate. A cargo feature would only complicate the build matrix +//! without saving meaningful binary weight. See `tasks/HOTFIXES.md` +//! (2026-05-02) for the deviation log. + +use std::io::Cursor; + +use anyhow::{Context, Result}; +use base64::Engine as _; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; +use image::{ImageFormat, ImageReader}; +use kebab_core::{ + FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption, + ProvenanceEvent, ProvenanceKind, TokenChunk, +}; +use time::OffsetDateTime; + +/// Long-edge clamp range for caption inputs. Smaller than OCR's +/// `[256, 4096]` because vision LMs charge proportionally to input +/// dimension — captions tolerate aggressive downscale better than +/// OCR. +const MIN_CAPTION_LONG_EDGE: u32 = 128; +const MAX_CAPTION_LONG_EDGE: u32 = 1536; + +/// Token budget for captions. Captions are one-sentence by spec — 96 +/// tokens covers a 50-word English sentence or a 30-token Korean one +/// with headroom for the LM's preamble before the stop sequence. +const CAPTION_MAX_TOKENS: usize = 96; + +/// Run a caption pass and return the resulting `ModelCaption`. Honours +/// `config.image.caption.enabled` — when disabled the function is a +/// no-op and returns an `Err` so the caller can route the asset +/// through `apply_caption` instead, which knows to short-circuit. +/// +/// Direct callers should prefer [`apply_caption`] for end-to-end +/// pipeline integration; this lower-level entry exists so tests can +/// pin the produced `ModelCaption` independent of block mutation. +pub fn caption_image( + llm: &dyn LanguageModel, + image_bytes: &[u8], + lang_hint: Option<&Lang>, + cfg: &kebab_config::Config, +) -> Result { + if !cfg.image.caption.enabled { + anyhow::bail!( + "captioning is disabled (set image.caption.enabled = true in config to enable)" + ); + } + + let max_pixels = cfg + .image + .caption + .max_pixels + .clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE); + if max_pixels != cfg.image.caption.max_pixels { + tracing::warn!( + target: "kebab-parse-image", + "image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])", + cfg.image.caption.max_pixels, + max_pixels, + MIN_CAPTION_LONG_EDGE, + MAX_CAPTION_LONG_EDGE + ); + } + + let prepared = downscale_to_png(image_bytes, max_pixels) + .context("preparing image for caption")?; + let b64 = BASE64_STANDARD.encode(&prepared); + + let lang = lang_hint + .map(|l| l.0.as_str()) + .filter(|s| !s.is_empty() && *s != "und"); + let (system, user) = build_prompt(lang); + + // Determinism — temperature 0.0 + seed 0, same convention as RAG + // and OCR. The LM adapter routes the base64 image via its + // provider-specific channel (Ollama: `images: [base64]`). + let req = GenerateRequest { + system, + user, + stop: vec!["\n\n".to_string()], + max_tokens: CAPTION_MAX_TOKENS, + temperature: 0.0, + seed: Some(0), + images: vec![b64], + }; + + let stream = llm + .generate_stream(req) + .context("captioning LM call failed")?; + + let mut text = String::new(); + let mut saw_done = false; + for chunk in stream { + match chunk? { + TokenChunk::Token(t) => { + text.push_str(&t); + } + TokenChunk::Done { finish_reason, .. } => { + saw_done = true; + if let FinishReason::Error(e) = finish_reason { + anyhow::bail!("captioning LM ended with error: {e}"); + } + break; + } + } + } + if !saw_done { + anyhow::bail!("captioning LM stream ended without a Done frame"); + } + + let caption_text = text.trim().to_string(); + + let model_ref = llm.model_ref(); + let prompt_v = &cfg.image.caption.prompt_template_version; + let model_version = format!( + "{provider}/{prompt}", + provider = model_ref.provider, + prompt = prompt_v + ); + + tracing::debug!( + target: "kebab-parse-image", + "caption ok (model={}, prompt={}, chars={})", + model_ref.id, + prompt_v, + caption_text.chars().count() + ); + + Ok(ModelCaption { + text: caption_text, + model: model_ref.id, + model_version, + }) +} + +/// Mutate `block.caption` in place by running `caption_image` over +/// `image_bytes`. When `config.image.caption.enabled = false` the +/// function is a clean no-op (returns `Ok(())` without invoking the +/// LM and without writing a Provenance event). +/// +/// On LM failure, `block.caption` stays `None` — partial state is +/// never written. The caller decides whether to skip the asset or +/// surface the error. +pub fn apply_caption( + llm: &dyn LanguageModel, + image_bytes: &[u8], + block: &mut ImageRefBlock, + lang_hint: Option<&Lang>, + cfg: &kebab_config::Config, + events: &mut Vec, +) -> Result<()> { + if !cfg.image.caption.enabled { + tracing::debug!( + target: "kebab-parse-image", + "captioning skipped — image.caption.enabled = false" + ); + return Ok(()); + } + let caption = caption_image(llm, image_bytes, lang_hint, cfg)?; + let model_label = caption.model.clone(); + let model_version_label = caption.model_version.clone(); + block.caption = Some(caption); + events.push(ProvenanceEvent { + at: OffsetDateTime::now_utc(), + agent: "kb-parse-image".to_string(), + kind: ProvenanceKind::CaptionApplied, + note: Some(format!( + "model={model_label} model_version={model_version_label}" + )), + }); + Ok(()) +} + +/// Compose the `(system, user)` prompt pair for the caption call. +/// Korean / English split keeps the model on the requested output +/// language; everything else falls through to English. +fn build_prompt(lang_hint: Option<&str>) -> (String, String) { + match lang_hint { + Some("ko") | Some("kor") => ( + "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \ + 보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \ + 한 문장만 출력." + .to_string(), + "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(), + ), + _ => ( + "Describe the image in one objective sentence. Do not \ + speculate; describe only what is visible. No markdown, \ + no quotes, no commentary — output a single sentence." + .to_string(), + "Describe the image above in one English sentence.".to_string(), + ), + } +} + +/// Decode `bytes`, downscale long-edge to `max_long_edge`, re-encode as +/// PNG. Mirrors the OCR pipeline's pattern but with the caption-side +/// long-edge bounds. PNG sources within the cap pass through without +/// re-encode. +fn downscale_to_png(bytes: &[u8], max_long_edge: u32) -> Result> { + let reader = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("reading image header for caption")?; + let format = reader.format(); + let (w, h) = reader + .into_dimensions() + .context("reading image dimensions for caption")?; + + let long = w.max(h); + if long <= max_long_edge && format == Some(ImageFormat::Png) { + return Ok(bytes.to_vec()); + } + + let img = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("re-reading image for caption decode")? + .decode() + .context("decoding image for caption")?; + let final_img = if long <= max_long_edge { + img + } else { + let scale = max_long_edge as f32 / long as f32; + let mut new_w = ((w as f32) * scale).round().max(1.0) as u32; + let mut new_h = ((h as f32) * scale).round().max(1.0) as u32; + if w >= h { + new_w = new_w.min(max_long_edge); + } else { + new_h = new_h.min(max_long_edge); + } + img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle) + }; + + let mut out = Cursor::new(Vec::new()); + final_img + .write_to(&mut out, ImageFormat::Png) + .context("encoding image as PNG for caption")?; + Ok(out.into_inner()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_prompt_korean_for_ko_hint() { + let (sys, user) = build_prompt(Some("ko")); + assert!(sys.contains("이미지를 한 문장으로")); + assert!(user.contains("한국어로")); + } + + #[test] + fn build_prompt_english_for_no_hint_or_und() { + let (sys, _) = build_prompt(None); + assert!(sys.contains("Describe the image")); + let (sys2, _) = build_prompt(Some("en")); + assert!(sys2.contains("Describe the image")); + } +} diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index b5d826b..c5e1b87 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -13,14 +13,24 @@ //! consumers can branch trust by engine (Tesseract / Apple Vision //! adapters, when added, will write a different `engine` string). //! +//! P6-3 adds the [`caption`] module: [`caption_image`] / +//! [`apply_caption`] route an image through any vision-capable +//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware +//! and will surface a model-side error). Captions are explicitly +//! marked **model-generated** — the trust gap between OCR (observed, +//! engine-tagged) and caption (generated, prompt-tagged) is the +//! workspace's central trust contract. +//! //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText / //! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption //! provenance), §9 (versioning). mod dims; mod exif_extract; +pub mod caption; pub mod ocr; +pub use caption::{apply_caption, caption_image}; pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr}; use anyhow::{Context, Result}; diff --git a/crates/kebab-parse-image/tests/caption.rs b/crates/kebab-parse-image/tests/caption.rs new file mode 100644 index 0000000..dc8c78e --- /dev/null +++ b/crates/kebab-parse-image/tests/caption.rs @@ -0,0 +1,368 @@ +//! Integration tests for the caption adapter (P6-3). +//! +//! All hermetic tests use `MockLanguageModel` from `kebab-llm/mock` +//! which captures `req.images` indirectly via the canned response. A +//! single opt-in test (`#[ignore]`) wires the real +//! `kebab-llm-local::OllamaLanguageModel` against the workspace's +//! Ollama daemon to verify the `images: [base64]` round-trip. + +mod common; + +use std::sync::{Arc, Mutex}; + +use kebab_config::Config; +use kebab_core::{ + AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang, + LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk, + TokenUsage, +}; +use kebab_llm::MockLanguageModel; +use kebab_parse_image::{apply_caption, caption_image}; + +use crate::common::red_100x50_png; + +fn cfg_with_caption_enabled() -> Config { + let mut cfg = Config::defaults(); + cfg.image.caption.enabled = true; + cfg.image.caption.max_pixels = 512; + cfg +} + +fn empty_image_block() -> ImageRefBlock { + ImageRefBlock { + common: CommonBlock { + block_id: BlockId("0".repeat(32)), + heading_path: Vec::new(), + source_span: SourceSpan::Region { + x: 0, + y: 0, + w: 100, + h: 50, + }, + }, + asset_id: Some(AssetId("a".repeat(32))), + src: "img/x.png".to_string(), + alt: "x.png".to_string(), + ocr: None, + caption: None, + } +} + +fn mk_mock(canned: &str) -> MockLanguageModel { + MockLanguageModel { + model_id: "vision-mock:1b".to_string(), + provider: "mock".to_string(), + context_tokens: 4096, + canned_response: canned.to_string(), + canned_finish: FinishReason::Stop, + canned_usage: TokenUsage { + prompt_tokens: 0, + completion_tokens: 0, + latency_ms: 0, + }, + } +} + +// ── Disabled feature gate ───────────────────────────────────────────────── + +#[test] +fn apply_caption_no_op_when_feature_disabled() { + let mut cfg = Config::defaults(); + cfg.image.caption.enabled = false; + let mock = mk_mock("ignored"); + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + let bytes = red_100x50_png(); + apply_caption(&mock, &bytes, &mut block, None, &cfg, &mut events) + .expect("disabled apply_caption must return Ok(())"); + assert!( + block.caption.is_none(), + "disabled apply_caption must not write caption" + ); + assert!( + events.is_empty(), + "disabled apply_caption must not append a Provenance event" + ); +} + +#[test] +fn caption_image_errors_when_feature_disabled() { + let cfg = Config::defaults(); // enabled = false + let mock = mk_mock("ignored"); + let bytes = red_100x50_png(); + let r = caption_image(&mock, &bytes, None, &cfg); + assert!( + r.is_err(), + "caption_image must Err when image.caption.enabled = false" + ); + let msg = format!("{:#}", r.unwrap_err()); + assert!( + msg.contains("disabled"), + "error must mention disabled state: {msg}" + ); +} + +// ── Happy path ──────────────────────────────────────────────────────────── + +#[test] +fn apply_caption_sets_block_caption_and_appends_provenance() { + let cfg = cfg_with_caption_enabled(); + let mock = mk_mock("사진 한 장"); + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + let bytes = red_100x50_png(); + apply_caption( + &mock, + &bytes, + &mut block, + Some(&Lang("ko".to_string())), + &cfg, + &mut events, + ) + .expect("apply_caption must succeed"); + + let cap = block.caption.as_ref().expect("caption Some"); + assert_eq!(cap.text, "사진 한 장"); + assert_eq!(cap.model, "vision-mock:1b"); + assert_eq!(cap.model_version, "mock/caption-v1"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].kind, ProvenanceKind::CaptionApplied); + assert_eq!(events[0].agent, "kb-parse-image"); + let note = events[0].note.as_deref().unwrap_or(""); + assert!(note.contains("vision-mock:1b") && note.contains("caption-v1"), "{note}"); +} + +// ── Empty token stream → empty caption text ────────────────────────────── + +#[test] +fn caption_image_empty_stream_yields_empty_text() { + let cfg = cfg_with_caption_enabled(); + let mock = mk_mock(""); + let bytes = red_100x50_png(); + let cap = caption_image(&mock, &bytes, None, &cfg).expect("empty stream must succeed"); + assert_eq!(cap.text, ""); + // Spec contract: caller can distinguish "captioning attempted, no + // result" from "captioning never attempted" by `caption.is_some()`. + // The text being empty does not erase the attempt. + assert!(!cap.model.is_empty()); +} + +// ── Korean vs English prompt selection ─────────────────────────────────── + +/// `LanguageModel` impl that captures the `system` prompt sent to it +/// so tests can verify the language branch picked by `build_prompt` +/// (the function is private; this is the cleanest observable signal). +struct CapturingMock { + captured_system: Arc>>, + captured_images: Arc>>, +} + +impl LanguageModel for CapturingMock { + fn model_ref(&self) -> ModelRef { + ModelRef { + id: "capture:1".to_string(), + provider: "mock".to_string(), + dimensions: None, + } + } + fn context_tokens(&self) -> usize { + 4096 + } + fn generate_stream( + &self, + req: GenerateRequest, + ) -> anyhow::Result> + Send>> { + *self.captured_system.lock().unwrap() = Some(req.system); + *self.captured_images.lock().unwrap() = req.images; + let chunks: Vec = vec![ + TokenChunk::Token("ok".to_string()), + TokenChunk::Done { + finish_reason: FinishReason::Stop, + usage: TokenUsage { + prompt_tokens: 0, + completion_tokens: 0, + latency_ms: 0, + }, + }, + ]; + Ok(Box::new(chunks.into_iter().map(Ok))) + } +} + +#[test] +fn caption_image_routes_image_into_request_images_field() { + let cfg = cfg_with_caption_enabled(); + let captured_system: Arc>> = Arc::new(Mutex::new(None)); + let captured_images: Arc>> = Arc::new(Mutex::new(Vec::new())); + let mock = CapturingMock { + captured_system: captured_system.clone(), + captured_images: captured_images.clone(), + }; + let bytes = red_100x50_png(); + let _ = caption_image(&mock, &bytes, Some(&Lang("ko".to_string())), &cfg) + .expect("caption succeeds"); + + let imgs = captured_images.lock().unwrap(); + assert_eq!(imgs.len(), 1, "exactly one base64 image routed"); + use base64::Engine as _; + let decoded = base64::engine::general_purpose::STANDARD + .decode(&imgs[0]) + .expect("base64 decodes"); + assert!( + !decoded.is_empty(), + "decoded image bytes must be non-empty" + ); + + let sys = captured_system.lock().unwrap().clone().unwrap(); + assert!( + sys.contains("이미지를 한 문장으로"), + "Korean hint must produce Korean system prompt: {sys}" + ); +} + +#[test] +fn caption_image_uses_english_prompt_for_undetermined_lang() { + let cfg = cfg_with_caption_enabled(); + let captured_system: Arc>> = Arc::new(Mutex::new(None)); + let mock = CapturingMock { + captured_system: captured_system.clone(), + captured_images: Arc::new(Mutex::new(Vec::new())), + }; + let bytes = red_100x50_png(); + let _ = caption_image(&mock, &bytes, Some(&Lang("und".to_string())), &cfg) + .expect("caption succeeds"); + let sys = captured_system.lock().unwrap().clone().unwrap(); + assert!(sys.contains("Describe the image"), "{sys}"); +} + +// ── LM error propagates ────────────────────────────────────────────────── + +/// LM that returns Err immediately from `generate_stream` (before any +/// token). +struct FailingLm; +impl LanguageModel for FailingLm { + fn model_ref(&self) -> ModelRef { + ModelRef { + id: "fail".into(), + provider: "mock".into(), + dimensions: None, + } + } + fn context_tokens(&self) -> usize { + 0 + } + fn generate_stream( + &self, + _req: GenerateRequest, + ) -> anyhow::Result> + Send>> { + Err(anyhow::anyhow!("simulated LM connection refused")) + } +} + +#[test] +fn apply_caption_lm_error_leaves_block_untouched() { + let cfg = cfg_with_caption_enabled(); + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + let bytes = red_100x50_png(); + let r = apply_caption(&FailingLm, &bytes, &mut block, None, &cfg, &mut events); + assert!(r.is_err()); + assert!( + block.caption.is_none(), + "caption stays None when LM fails — partial state must not leak" + ); + assert!(events.is_empty(), "no provenance event when LM fails"); +} + +// ── Determinism — identical mock input → identical caption ─────────────── + +#[test] +fn caption_image_deterministic_with_identical_inputs() { + let cfg = cfg_with_caption_enabled(); + let bytes = red_100x50_png(); + let mock1 = mk_mock("a deterministic caption"); + let mock2 = mk_mock("a deterministic caption"); + let cap1 = caption_image(&mock1, &bytes, None, &cfg).unwrap(); + let cap2 = caption_image(&mock2, &bytes, None, &cfg).unwrap(); + assert_eq!(cap1, cap2); +} + +// ── max_pixels clamp ───────────────────────────────────────────────────── + +/// Out-of-range `max_pixels` is silently clamped at construction so a +/// bad config can't kill ingest. The captured `images` field's +/// decoded long edge confirms the clamp engaged. +#[test] +fn caption_image_clamps_oversized_max_pixels() { + let mut cfg = Config::defaults(); + cfg.image.caption.enabled = true; + cfg.image.caption.max_pixels = 99_999; // way over MAX_CAPTION_LONG_EDGE + let captured_images: Arc>> = Arc::new(Mutex::new(Vec::new())); + let mock = CapturingMock { + captured_system: Arc::new(Mutex::new(None)), + captured_images: captured_images.clone(), + }; + // 4000×3000 PNG well above the 1536 cap. + let bytes = common::large_blue_4000x3000_png(); + let _ = caption_image(&mock, &bytes, None, &cfg).expect("caption succeeds"); + let imgs = captured_images.lock().unwrap(); + use base64::Engine as _; + let decoded = base64::engine::general_purpose::STANDARD + .decode(&imgs[0]) + .unwrap(); + let reader = image::ImageReader::new(std::io::Cursor::new(decoded)) + .with_guessed_format() + .unwrap(); + let (w, h) = reader.into_dimensions().unwrap(); + let long = w.max(h); + assert!( + long <= 1536, + "max_pixels must clamp to MAX_CAPTION_LONG_EDGE=1536, got {long}" + ); +} + +// ── Real Ollama integration (opt-in) ───────────────────────────────────── + +/// End-to-end captioning against the workspace's real Ollama daemon +/// via `kebab-llm-local::OllamaLanguageModel` (dev-dep). Skipped by +/// default via `#[ignore]`; opt in with `--ignored`. +/// +/// Run with: +/// +/// ```sh +/// KEBAB_MODELS_LLM_ENDPOINT=http://192.168.0.47:11434 \ +/// KEBAB_MODELS_LLM_MODEL=gemma4:e4b \ +/// cargo test -p kebab-parse-image --test caption \ +/// caption_integration -- --ignored --nocapture +/// ``` +#[test] +#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"] +fn caption_integration_real_ollama_describes_image() { + use kebab_llm_local::OllamaLanguageModel; + + let mut cfg = Config::defaults(); + cfg.image.caption.enabled = true; + cfg.image.caption.max_pixels = 768; + if let Ok(ep) = std::env::var("KEBAB_MODELS_LLM_ENDPOINT") { + cfg.models.llm.endpoint = ep; + } else { + cfg.models.llm.endpoint = "http://192.168.0.47:11434".to_string(); + } + if let Ok(m) = std::env::var("KEBAB_MODELS_LLM_MODEL") { + cfg.models.llm.model = m; + } else { + cfg.models.llm.model = "gemma4:e4b".to_string(); + } + cfg.models.llm.provider = "ollama".to_string(); + + let llm = OllamaLanguageModel::new(&cfg).expect("OllamaLanguageModel::new"); + let bytes = red_100x50_png(); + let cap = caption_image(&llm, &bytes, Some(&Lang("en".to_string())), &cfg) + .expect("real-Ollama caption_image must succeed"); + eprintln!("integration caption: {}", cap.text); + assert!(!cap.text.is_empty(), "caption must be non-empty"); + assert_eq!(cap.model, "gemma4:e4b"); + assert!(cap.model_version.contains("ollama")); + assert!(cap.model_version.contains("caption-v1")); +} diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs index 3271f63..af53265 100644 --- a/crates/kebab-rag/src/pipeline.rs +++ b/crates/kebab-rag/src/pipeline.rs @@ -195,6 +195,9 @@ impl RagPipeline { max_tokens: max_completion, temperature, seed, + // RAG is text-only — vision inputs only flow when a + // future multimodal pipeline injects images here. + images: Vec::new(), }; let mut acc = String::new(); diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index e11346a..d1863b0 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,30 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped + +**Discovered**: P6-3 implementation start. + +**Symptom 1**: `tasks/p6/p6-3-caption-adapter.md` § Public surface declares `caption_image(llm: &dyn kebab_core::LanguageModel, ...)`, but the frozen `LanguageModel` trait + `GenerateRequest` from p4-1 carry no vision input. The spec's behavior contract ("the adapter is responsible for rendering the prompt to wire") implicitly relied on a trait extension that p4-1 never specced. + +**Symptom 2**: Spec § Definition of Done asks for `cargo check -p kebab-parse-image --features caption` — i.e. a cargo feature gate. The captioning module's only extra deps are `base64` + `image` + the `kebab-llm` trait, all already pulled in by P6-2. A cargo feature would only complicate the build matrix without saving meaningful binary weight. + +**Root cause**: Two small spec gaps that resolve cleanly together — extend the `LanguageModel` trait once for vision routing, and collapse compile-time + runtime gating into a single runtime gate. + +**Fix** (PR #34, feat/p6-3-caption-adapter): +- `kebab-core::GenerateRequest` gains an `images: Vec` field (`#[serde(default)]` for backward compat with pre-P6 wire payloads / snapshots). Empty for the text-only RAG path; populated with one or more base64 strings by vision-aware callers. +- `kebab-llm-local::OllamaLanguageModel` routes `req.images` onto the wire as `images: [base64, ...]` (Ollama's vision channel). The wire shape stays byte-identical for empty `images` because the field uses `#[serde(skip_serializing_if = "<[String]>::is_empty")]`. +- `kebab-parse-image::caption` module: `caption_image` / `apply_caption` build `GenerateRequest { images: vec![b64], temperature: 0.0, seed: 0, ... }` and accept any `&dyn LanguageModel`. Korean / English prompt branch picked from `lang_hint`. +- Cargo feature `caption` is **not** introduced — the runtime gate `config.image.caption.enabled = false` (default OFF) suffices. +- All existing `GenerateRequest { ... }` literals (kebab-rag, kebab-llm tests, kebab-llm-local tests) gained `images: Vec::new()` to satisfy the new field. + +**Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"/"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire. + +**Amends**: +- tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec`). +- tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`). +- tasks/p6/p6-3-caption-adapter.md ("Definition of Done" cargo feature `caption` dropped; runtime gate is the only feature gate). + ## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision **Discovered**: P6-2 implementation start. diff --git a/tasks/p6/p6-3-caption-adapter.md b/tasks/p6/p6-3-caption-adapter.md index c4a99fd..8f8a4fb 100644 --- a/tasks/p6/p6-3-caption-adapter.md +++ b/tasks/p6/p6-3-caption-adapter.md @@ -3,7 +3,7 @@ phase: P6 component: kebab-parse-image (caption adapter) task_id: p6-3 title: "ModelCaption adapter (LanguageModel-driven, feature-gated)" -status: planned +status: completed depends_on: [p6-1, p4-2] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md