부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부. kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
238 lines
8.4 KiB
Rust
238 lines
8.4 KiB
Rust
//! Caption adapter (P6-3).
|
|
//!
|
|
//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
|
|
//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
|
|
//! helper that mutates an [`ImageRefBlock`] in place and emits a
|
|
//! [`ProvenanceKind::CaptionApplied`] event.
|
|
//!
|
|
//! ## Trust note
|
|
//!
|
|
//! Captions are **model-generated** (`TrustLevel::Generated`), not
|
|
//! observed text. Vision LMs hallucinate; the system prompt explicitly
|
|
//! forbids guessing, but false captions should still be expected. Downstream UI / RAG
|
|
//! must label captions as model-generated and surface the model id +
|
|
//! prompt template version (carried in `ModelCaption.model_version`)
|
|
//! so a regression in either is auditable.
|
|
//!
|
|
//! ## Spec deviation (cargo `caption` feature dropped)
|
|
//!
|
|
//! The original P6-3 spec asked for a cargo feature `caption` (default
|
|
//! OFF at compile time). We collapse this into a single runtime gate
|
|
//! (`config.ingest.image.caption.enabled = false`, default OFF). Reasoning:
|
|
//! the captioning module's only extra deps are `base64` + `image` +
|
|
//! `kebab-llm` trait — all already pulled in by the rest of the
|
|
//! crate. A cargo feature would only complicate the build matrix
|
|
//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
|
|
//! (2026-05-02) for the deviation log.
|
|
|
|
use anyhow::{Context, Result};
|
|
use base64::Engine as _;
|
|
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
|
use kebab_core::{
|
|
FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
|
|
ProvenanceEvent, ProvenanceKind, TokenChunk,
|
|
};
|
|
use time::OffsetDateTime;
|
|
|
|
use crate::image_prep;
|
|
|
|
/// Lower bound of the long-edge clamp applied to caption inputs.
///
/// The caption range is narrower than OCR's `[256, 4096]`: vision LMs
/// charge proportionally to input dimension, and captions tolerate an
/// aggressive downscale better than OCR does.
pub const MIN_CAPTION_LONG_EDGE: u32 = 128;

/// Upper bound of the long-edge clamp applied to caption inputs.
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;

/// Maximum number of tokens requested from the LM for one caption.
///
/// Captions are one sentence by spec, so the budget only has to cover
/// a single sentence plus headroom for any preamble the LM emits
/// before the stop sequence.
const CAPTION_MAX_TOKENS: usize = 96;
|
|
|
|
/// Run a caption pass and return the resulting `ModelCaption`.
|
|
///
|
|
/// Pure raw operation — does **not** consult `config.ingest.image.caption.enabled`.
|
|
/// The runtime feature gate lives in [`apply_caption`]; this entry
|
|
/// always invokes the LM. Tests pinning the produced `ModelCaption`
|
|
/// shape can call this directly without flipping the config flag.
|
|
///
|
|
/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp
|
|
/// on `config.ingest.image.caption.max_pixels` so a hostile config cannot
|
|
/// blow up prompt cost.
|
|
pub fn caption_image(
|
|
llm: &dyn LanguageModel,
|
|
image_bytes: &[u8],
|
|
lang_hint: Option<&Lang>,
|
|
cfg: &kebab_config::Config,
|
|
) -> Result<ModelCaption> {
|
|
let max_pixels = cfg
|
|
.ingest
|
|
.image
|
|
.caption
|
|
.max_pixels
|
|
.clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
|
|
if max_pixels != cfg.ingest.image.caption.max_pixels {
|
|
tracing::warn!(
|
|
target: "kebab-parse-image",
|
|
"image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
|
|
cfg.ingest.image.caption.max_pixels,
|
|
max_pixels,
|
|
MIN_CAPTION_LONG_EDGE,
|
|
MAX_CAPTION_LONG_EDGE
|
|
);
|
|
}
|
|
|
|
let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels)
|
|
.context("preparing image for caption")?;
|
|
let b64 = BASE64_STANDARD.encode(&prepared);
|
|
|
|
let lang = lang_hint
|
|
.map(|l| l.0.as_str())
|
|
.filter(|s| !s.is_empty() && *s != "und");
|
|
let (system, user) = build_prompt(lang);
|
|
|
|
// Determinism — temperature 0.0 + seed 0, same convention as RAG
|
|
// and OCR. The LM adapter routes the base64 image via its
|
|
// provider-specific channel (Ollama: `images: [base64]`).
|
|
let req = GenerateRequest {
|
|
system,
|
|
user,
|
|
stop: vec!["\n\n".to_string()],
|
|
max_tokens: CAPTION_MAX_TOKENS,
|
|
temperature: 0.0,
|
|
seed: Some(0),
|
|
images: vec![b64],
|
|
};
|
|
|
|
let stream = llm
|
|
.generate_stream(req)
|
|
.context("captioning LM call failed")?;
|
|
|
|
let mut text = String::new();
|
|
let mut saw_done = false;
|
|
for chunk in stream {
|
|
match chunk? {
|
|
TokenChunk::Token(t) => {
|
|
text.push_str(&t);
|
|
}
|
|
TokenChunk::Done { finish_reason, .. } => {
|
|
saw_done = true;
|
|
if let FinishReason::Error(e) = finish_reason {
|
|
anyhow::bail!("captioning LM ended with error: {e}");
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if !saw_done {
|
|
anyhow::bail!("captioning LM stream ended without a Done frame");
|
|
}
|
|
|
|
let caption_text = text.trim().to_string();
|
|
|
|
let model_ref = llm.model_ref();
|
|
let prompt_v = &cfg.ingest.image.caption.prompt_template_version;
|
|
let model_version = format!(
|
|
"{provider}/{prompt}",
|
|
provider = model_ref.provider,
|
|
prompt = prompt_v
|
|
);
|
|
|
|
tracing::debug!(
|
|
target: "kebab-parse-image",
|
|
"caption ok (model={}, prompt={}, chars={})",
|
|
model_ref.id,
|
|
prompt_v,
|
|
caption_text.chars().count()
|
|
);
|
|
|
|
Ok(ModelCaption {
|
|
text: caption_text,
|
|
model: model_ref.id,
|
|
model_version,
|
|
})
|
|
}
|
|
|
|
/// Pipeline entry point — gate-checks `config.ingest.image.caption.enabled`
|
|
/// then mutates `block.caption` in place via [`caption_image`].
|
|
///
|
|
/// When `enabled = false` the function is a clean no-op (returns
|
|
/// `Ok(())` without invoking the LM and without writing a Provenance
|
|
/// event). On LM failure `block.caption` stays `None` — partial state
|
|
/// is never written. The caller decides whether to skip the asset or
|
|
/// surface the error.
|
|
pub fn apply_caption(
|
|
llm: &dyn LanguageModel,
|
|
image_bytes: &[u8],
|
|
block: &mut ImageRefBlock,
|
|
lang_hint: Option<&Lang>,
|
|
cfg: &kebab_config::Config,
|
|
events: &mut Vec<ProvenanceEvent>,
|
|
) -> Result<()> {
|
|
if !cfg.ingest.image.caption.enabled {
|
|
tracing::debug!(
|
|
target: "kebab-parse-image",
|
|
"captioning skipped — image.caption.enabled = false"
|
|
);
|
|
return Ok(());
|
|
}
|
|
let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
|
|
// Build the Provenance note BEFORE moving `caption` into
|
|
// `block.caption` so we sidestep the per-call `String::clone` of
|
|
// `caption.model` + `caption.model_version`. Tight ingest loops
|
|
// (thousands of images) save two allocations per asset.
|
|
let note = format!(
|
|
"model={} model_version={}",
|
|
caption.model, caption.model_version
|
|
);
|
|
block.caption = Some(caption);
|
|
events.push(ProvenanceEvent {
|
|
at: OffsetDateTime::now_utc(),
|
|
agent: "kb-parse-image".to_string(),
|
|
kind: ProvenanceKind::CaptionApplied,
|
|
note: Some(note),
|
|
});
|
|
Ok(())
|
|
}
|
|
|
|
/// Build the `(system, user)` prompt pair sent with a caption request.
///
/// A `ko` or `kor` hint selects the Korean prompts so the model answers
/// in Korean. Every other hint, including `None`, selects the English
/// prompts.
fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
    const KOREAN_SYSTEM: &str = "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
        보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
        한 문장만 출력.";
    const KOREAN_USER: &str = "위 이미지를 한국어로 한 문장으로 설명하라.";
    const ENGLISH_SYSTEM: &str = "Describe the image in one objective sentence. Do not \
        speculate; describe only what is visible. No markdown, \
        no quotes, no commentary — output a single sentence.";
    const ENGLISH_USER: &str = "Describe the image above in one English sentence.";

    let wants_korean = matches!(lang_hint, Some("ko" | "kor"));
    let (system, user) = if wants_korean {
        (KOREAN_SYSTEM, KOREAN_USER)
    } else {
        (ENGLISH_SYSTEM, ENGLISH_USER)
    };

    (system.to_owned(), user.to_owned())
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A `ko` hint must select the Korean system and user prompts.
    #[test]
    fn build_prompt_korean_for_ko_hint() {
        let (system, user) = build_prompt(Some("ko"));
        assert!(system.contains("이미지를 한 문장으로"));
        assert!(user.contains("한국어로"));
    }

    /// No hint, an `en` hint and an `und` hint must all select the
    /// English prompts. The `und` case was named by this test but never
    /// exercised before.
    #[test]
    fn build_prompt_english_for_no_hint_or_und() {
        for hint in [None, Some("en"), Some("und")] {
            let (system, _user) = build_prompt(hint);
            assert!(
                system.contains("Describe the image"),
                "expected English prompt for hint {hint:?}"
            );
        }
    }
}
|