Files
kebab/crates/kebab-parse-image/src/caption.rs
altair823 d5c69f6715 refactor(config): v3 경로 call-site sweep (kebab-app/kebab-eval/kebab-parse-image)
부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부.
kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 12:40:06 +00:00

238 lines
8.4 KiB
Rust

//! Caption adapter (P6-3).
//!
//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
//! helper that mutates an [`ImageRefBlock`] in place and emits a
//! [`ProvenanceKind::CaptionApplied`] event.
//!
//! ## Trust note
//!
//! Captions are **model-generated** (`TrustLevel::Generated`), not
//! observed text. Vision LMs hallucinate: the system prompt explicitly
//! forbids guessing, but false captions must still be expected. Downstream UI / RAG
//! must label captions as model-generated and surface the model id +
//! prompt template version (carried in `ModelCaption.model_version`)
//! so a regression in either is auditable.
//!
//! ## Spec deviation (cargo `caption` feature dropped)
//!
//! The original P6-3 spec asked for a cargo feature `caption` (default
//! OFF at compile time). We collapse this into a single runtime gate
//! (`config.ingest.image.caption.enabled = false`, default OFF). Reasoning:
//! the captioning module's only extra deps are `base64` + `image` +
//! `kebab-llm` trait — all already pulled in by the rest of the
//! crate. A cargo feature would only complicate the build matrix
//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
//! (2026-05-02) for the deviation log.
use anyhow::{Context, Result};
use base64::Engine as _;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use kebab_core::{
FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
ProvenanceEvent, ProvenanceKind, TokenChunk,
};
use time::OffsetDateTime;
use crate::image_prep;
/// Lower bound of the long-edge clamp range for caption inputs.
///
/// Together with [`MAX_CAPTION_LONG_EDGE`] this forms the legal range
/// that [`caption_image`] clamps
/// `config.ingest.image.caption.max_pixels` into. The range is smaller
/// than OCR's `[256, 4096]` because vision LMs charge proportionally to
/// input dimension — captions tolerate aggressive downscale better than
/// OCR.
pub const MIN_CAPTION_LONG_EDGE: u32 = 128;
/// Upper bound of the long-edge clamp range for caption inputs; see
/// [`MIN_CAPTION_LONG_EDGE`] for the rationale behind the range.
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;
/// Token budget for captions. Captions are one-sentence by spec — 96
/// tokens covers a 50-word English sentence or a 30-token Korean one
/// with headroom for the LM's preamble before the stop sequence.
const CAPTION_MAX_TOKENS: usize = 96;
/// Caption a single image with a vision-capable LM and return the
/// resulting [`ModelCaption`].
///
/// This is the ungated primitive: `config.ingest.image.caption.enabled`
/// is never read here, so the LM is invoked on every call. The runtime
/// gate lives in [`apply_caption`]. Tests that only care about the
/// shape of the produced `ModelCaption` can call this directly.
///
/// `config.ingest.image.caption.max_pixels` is clamped into
/// `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` before the image is
/// downscaled, and a warning is logged whenever the configured value
/// had to be adjusted, so an out-of-range config cannot inflate prompt
/// cost.
///
/// A `lang_hint` that is empty or `"und"` is treated as "no hint".
///
/// # Errors
///
/// Fails when image preparation fails, when the LM call cannot be
/// started, when any stream chunk is an error, when the terminal frame
/// carries `FinishReason::Error`, or when the stream runs dry without
/// ever yielding a `Done` frame.
pub fn caption_image(
    llm: &dyn LanguageModel,
    image_bytes: &[u8],
    lang_hint: Option<&Lang>,
    cfg: &kebab_config::Config,
) -> Result<ModelCaption> {
    let caption_cfg = &cfg.ingest.image.caption;

    // Clamp the configured long edge and tell the operator if we had to.
    let requested_edge = caption_cfg.max_pixels;
    let long_edge = requested_edge.clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
    if long_edge != requested_edge {
        tracing::warn!(
            target: "kebab-parse-image",
            "image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
            requested_edge,
            long_edge,
            MIN_CAPTION_LONG_EDGE,
            MAX_CAPTION_LONG_EDGE
        );
    }

    // Downscale to PNG; the reported dimensions are not needed here.
    let (png_bytes, _w, _h) = image_prep::downscale_to_png(image_bytes, long_edge)
        .context("preparing image for caption")?;
    let encoded_image = BASE64_STANDARD.encode(&png_bytes);

    // Empty and "und" hints carry no usable language information.
    let effective_lang = match lang_hint.map(|l| l.0.as_str()) {
        None | Some("") | Some("und") => None,
        tag => tag,
    };
    let (system, user) = build_prompt(effective_lang);

    // Temperature 0.0 with a fixed seed keeps the output as
    // deterministic as the backend allows. The base64 payload travels
    // in `images`; delivering it to the provider is the adapter's job.
    let request = GenerateRequest {
        system,
        user,
        stop: vec![String::from("\n\n")],
        max_tokens: CAPTION_MAX_TOKENS,
        temperature: 0.0,
        seed: Some(0),
        images: vec![encoded_image],
    };
    let stream = llm
        .generate_stream(request)
        .context("captioning LM call failed")?;

    // Accumulate tokens until the terminal frame shows up.
    let mut raw = String::new();
    let mut finished = false;
    for frame in stream {
        let frame = frame?;
        match frame {
            TokenChunk::Token(piece) => raw.push_str(&piece),
            TokenChunk::Done { finish_reason, .. } => {
                if let FinishReason::Error(e) = finish_reason {
                    anyhow::bail!("captioning LM ended with error: {e}");
                }
                finished = true;
                break;
            }
        }
    }
    if !finished {
        anyhow::bail!("captioning LM stream ended without a Done frame");
    }

    let text = raw.trim().to_string();
    let model_ref = llm.model_ref();
    let prompt_version = &caption_cfg.prompt_template_version;
    // `model_version` pairs the provider with the prompt template
    // version so a change in either shows up on the caption record.
    let model_version = format!("{}/{}", model_ref.provider, prompt_version);

    tracing::debug!(
        target: "kebab-parse-image",
        "caption ok (model={}, prompt={}, chars={})",
        model_ref.id,
        prompt_version,
        text.chars().count()
    );

    Ok(ModelCaption {
        text,
        model: model_ref.id,
        model_version,
    })
}
/// Pipeline entry point: caption `block` in place when captioning is
/// enabled in the config.
///
/// With `config.ingest.image.caption.enabled = false` this logs a
/// debug line and returns `Ok(())` — the LM is not called, `block` is
/// not touched and no provenance event is recorded.
///
/// When enabled, [`caption_image`] runs first; only after it succeeds
/// is `block.caption` set and a [`ProvenanceKind::CaptionApplied`]
/// event appended to `events`. If the caption call fails the error is
/// propagated and neither `block` nor `events` has been modified, so
/// the caller can choose between skipping the asset and surfacing the
/// error.
pub fn apply_caption(
    llm: &dyn LanguageModel,
    image_bytes: &[u8],
    block: &mut ImageRefBlock,
    lang_hint: Option<&Lang>,
    cfg: &kebab_config::Config,
    events: &mut Vec<ProvenanceEvent>,
) -> Result<()> {
    if cfg.ingest.image.caption.enabled {
        let generated = caption_image(llm, image_bytes, lang_hint, cfg)?;

        // Render the note while `generated` is still borrowable, then
        // move it into the block — no clones of the model strings.
        let note = format!(
            "model={} model_version={}",
            generated.model, generated.model_version
        );
        block.caption = Some(generated);

        let event = ProvenanceEvent {
            at: OffsetDateTime::now_utc(),
            agent: String::from("kb-parse-image"),
            kind: ProvenanceKind::CaptionApplied,
            note: Some(note),
        };
        events.push(event);
    } else {
        tracing::debug!(
            target: "kebab-parse-image",
            "captioning skipped — image.caption.enabled = false"
        );
    }
    Ok(())
}
/// Compose the `(system, user)` prompt pair for the caption call.
///
/// The Korean / English split keeps the model on the requested output
/// language. Korean is selected when the hint's primary subtag — the
/// part before the first `-` or `_`, with surrounding whitespace
/// ignored — equals `ko` or `kor`, compared ASCII-case-insensitively.
/// So `ko`, `kor`, `KO`, `ko-KR` and `ko_KR` all pick the Korean
/// prompt. Every other hint, including `None`, the empty string and
/// `und`, falls through to English.
///
/// Returns owned strings because the caller moves them straight into
/// the generate request.
fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
    // Only the primary subtag decides the language; region / script
    // suffixes ("ko-KR", "ko_Kore_KR") must not demote Korean to English.
    let wants_korean = lang_hint.is_some_and(|tag| {
        let primary = tag.trim().split(['-', '_']).next().unwrap_or_default();
        primary.eq_ignore_ascii_case("ko") || primary.eq_ignore_ascii_case("kor")
    });

    if wants_korean {
        (
            "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
             보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
             한 문장만 출력."
                .to_string(),
            "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(),
        )
    } else {
        (
            "Describe the image in one objective sentence. Do not \
             speculate; describe only what is visible. No markdown, \
             no quotes, no commentary — output a single sentence."
                .to_string(),
            "Describe the image above in one English sentence.".to_string(),
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both spellings of the Korean hint (`ko` and `kor`) must yield
    /// the Korean system and user prompts.
    #[test]
    fn build_prompt_korean_for_ko_hint() {
        for tag in ["ko", "kor"] {
            let (sys, user) = build_prompt(Some(tag));
            assert!(sys.contains("이미지를 한 문장으로"), "hint {tag:?}");
            assert!(user.contains("한국어로"), "hint {tag:?}");
        }
    }

    /// No hint, the undetermined tag `und`, and an explicit non-Korean
    /// hint must all fall through to the English prompts.
    #[test]
    fn build_prompt_english_for_no_hint_or_und() {
        for hint in [None, Some("und"), Some("en")] {
            let (sys, user) = build_prompt(hint);
            assert!(sys.contains("Describe the image"), "hint {hint:?}");
            assert!(user.contains("English"), "hint {hint:?}");
        }
    }
}