//! Caption adapter (P6-3). //! //! [`caption_image`] runs a vision-capable [`LanguageModel`] over an //! image and produces a [`ModelCaption`]. [`apply_caption`] is the //! helper that mutates an [`ImageRefBlock`] in place and emits a //! [`ProvenanceKind::CaptionApplied`] event. //! //! ## Trust note //! //! Captions are **model-generated** (`TrustLevel::Generated`), not //! observed text. Vision LMs hallucinate; the system prompt explicitly //! forbids guessing but expect false captions. Downstream UI / RAG //! must label captions as model-generated and surface the model id + //! prompt template version (carried in `ModelCaption.model_version`) //! so a regression in either is auditable. //! //! ## Spec deviation (cargo `caption` feature dropped) //! //! The original P6-3 spec asked for a cargo feature `caption` (default //! OFF at compile time). We collapse this into a single runtime gate //! (`config.ingest.image.caption.enabled = false`, default OFF). Reasoning: //! the captioning module's only extra deps are `base64` + `image` + //! `kebab-llm` trait — all already pulled in by the rest of the //! crate. A cargo feature would only complicate the build matrix //! without saving meaningful binary weight. See `tasks/HOTFIXES.md` //! (2026-05-02) for the deviation log. use anyhow::{Context, Result}; use base64::Engine as _; use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use kebab_core::{ FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption, ProvenanceEvent, ProvenanceKind, TokenChunk, }; use time::OffsetDateTime; use crate::image_prep; /// Long-edge clamp range for caption inputs. Smaller than OCR's /// `[256, 4096]` because vision LMs charge proportionally to input /// dimension — captions tolerate aggressive downscale better than /// OCR. pub const MIN_CAPTION_LONG_EDGE: u32 = 128; pub const MAX_CAPTION_LONG_EDGE: u32 = 1536; /// Token budget for captions. Captions are one-sentence by spec — 96 /// tokens covers a 50-word English sentence or a 30-token Korean one /// with headroom for the LM's preamble before the stop sequence. const CAPTION_MAX_TOKENS: usize = 96; /// Run a caption pass and return the resulting `ModelCaption`. /// /// Pure raw operation — does **not** consult `config.ingest.image.caption.enabled`. /// The runtime feature gate lives in [`apply_caption`]; this entry /// always invokes the LM. Tests pinning the produced `ModelCaption` /// shape can call this directly without flipping the config flag. /// /// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp /// on `config.ingest.image.caption.max_pixels` so a hostile config cannot /// blow up prompt cost. pub fn caption_image( llm: &dyn LanguageModel, image_bytes: &[u8], lang_hint: Option<&Lang>, cfg: &kebab_config::Config, ) -> Result { let max_pixels = cfg .ingest .image .caption .max_pixels .clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE); if max_pixels != cfg.ingest.image.caption.max_pixels { tracing::warn!( target: "kebab-parse-image", "image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])", cfg.ingest.image.caption.max_pixels, max_pixels, MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE ); } let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels) .context("preparing image for caption")?; let b64 = BASE64_STANDARD.encode(&prepared); let lang = lang_hint .map(|l| l.0.as_str()) .filter(|s| !s.is_empty() && *s != "und"); let (system, user) = build_prompt(lang); // Determinism — temperature 0.0 + seed 0, same convention as RAG // and OCR. The LM adapter routes the base64 image via its // provider-specific channel (Ollama: `images: [base64]`). let req = GenerateRequest { system, user, stop: vec!["\n\n".to_string()], max_tokens: CAPTION_MAX_TOKENS, temperature: 0.0, seed: Some(0), images: vec![b64], }; let stream = llm .generate_stream(req) .context("captioning LM call failed")?; let mut text = String::new(); let mut saw_done = false; for chunk in stream { match chunk? { TokenChunk::Token(t) => { text.push_str(&t); } TokenChunk::Done { finish_reason, .. } => { saw_done = true; if let FinishReason::Error(e) = finish_reason { anyhow::bail!("captioning LM ended with error: {e}"); } break; } } } if !saw_done { anyhow::bail!("captioning LM stream ended without a Done frame"); } let caption_text = text.trim().to_string(); let model_ref = llm.model_ref(); let prompt_v = &cfg.ingest.image.caption.prompt_template_version; let model_version = format!( "{provider}/{prompt}", provider = model_ref.provider, prompt = prompt_v ); tracing::debug!( target: "kebab-parse-image", "caption ok (model={}, prompt={}, chars={})", model_ref.id, prompt_v, caption_text.chars().count() ); Ok(ModelCaption { text: caption_text, model: model_ref.id, model_version, }) } /// Pipeline entry point — gate-checks `config.ingest.image.caption.enabled` /// then mutates `block.caption` in place via [`caption_image`]. /// /// When `enabled = false` the function is a clean no-op (returns /// `Ok(())` without invoking the LM and without writing a Provenance /// event). On LM failure `block.caption` stays `None` — partial state /// is never written. The caller decides whether to skip the asset or /// surface the error. pub fn apply_caption( llm: &dyn LanguageModel, image_bytes: &[u8], block: &mut ImageRefBlock, lang_hint: Option<&Lang>, cfg: &kebab_config::Config, events: &mut Vec, ) -> Result<()> { if !cfg.ingest.image.caption.enabled { tracing::debug!( target: "kebab-parse-image", "captioning skipped — image.caption.enabled = false" ); return Ok(()); } let caption = caption_image(llm, image_bytes, lang_hint, cfg)?; // Build the Provenance note BEFORE moving `caption` into // `block.caption` so we sidestep the per-call `String::clone` of // `caption.model` + `caption.model_version`. Tight ingest loops // (thousands of images) save two allocations per asset. let note = format!( "model={} model_version={}", caption.model, caption.model_version ); block.caption = Some(caption); events.push(ProvenanceEvent { at: OffsetDateTime::now_utc(), agent: "kb-parse-image".to_string(), kind: ProvenanceKind::CaptionApplied, note: Some(note), }); Ok(()) } /// Compose the `(system, user)` prompt pair for the caption call. /// Korean / English split keeps the model on the requested output /// language; everything else falls through to English. fn build_prompt(lang_hint: Option<&str>) -> (String, String) { match lang_hint { Some("ko" | "kor") => ( "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \ 보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \ 한 문장만 출력." .to_string(), "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(), ), _ => ( "Describe the image in one objective sentence. Do not \ speculate; describe only what is visible. No markdown, \ no quotes, no commentary — output a single sentence." .to_string(), "Describe the image above in one English sentence.".to_string(), ), } } #[cfg(test)] mod tests { use super::*; #[test] fn build_prompt_korean_for_ko_hint() { let (sys, user) = build_prompt(Some("ko")); assert!(sys.contains("이미지를 한 문장으로")); assert!(user.contains("한국어로")); } #[test] fn build_prompt_english_for_no_hint_or_und() { let (sys, _) = build_prompt(None); assert!(sys.contains("Describe the image")); let (sys2, _) = build_prompt(Some("en")); assert!(sys2.contains("Describe the image")); } }