//! `kebab-parse-image` — image extractor (P6-1) + OCR adapter (P6-2).
//!
//! P6-1 implements [`kebab_core::Extractor`] for `MediaType::Image(_)`,
//! producing a single-block [`CanonicalDocument`] (`ImageRefBlock` with
//! EXIF + dimensions in `metadata.user`). OCR / caption fields stay
//! `None` until populated by the OCR / caption adapters.
//!
//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an
//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable
//! Ollama model. [`apply_ocr`] is the helper that mutates an
//! [`ImageRefBlock`] in place. Trust note — the LLM-driven default
//! can hallucinate; `OcrText.engine` carries the source identity so
//! consumers can branch trust by engine (Tesseract / Apple Vision
//! adapters, when added, will write a different `engine` string).
//!
//! P6-3 adds the [`caption`] module: [`caption_image`] /
//! [`apply_caption`] route an image through any vision-capable
//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
//! and will surface a model-side error). Captions are explicitly
//! marked **model-generated** — the trust gap between OCR (observed,
//! engine-tagged) and caption (generated, prompt-tagged) is the
//! workspace's central trust contract.
//!
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
//! provenance), §9 (versioning).
pub mod caption; mod dims; mod exif_extract; mod image_prep; pub mod ocr; pub mod paddle_onnx; pub use caption::{apply_caption, caption_image}; pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr}; pub use paddle_onnx::{ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config}; use anyhow::{Context, Result}; use kebab_core::{ Block, CanonicalDocument, CommonBlock, Extractor, ImageRefBlock, Lang, MediaType, Metadata, ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc, }; use serde_json::{Map, Value}; use time::OffsetDateTime; /// Parser version label for the image extractor (§9 versioning). pub const PARSER_VERSION: &str = "image-meta-v1"; /// Maximum decode dimension (per axis) before we refuse to read the image. /// Matches the §9.1 "cap decode at ~16k" policy in the design doc. pub const MAX_DECODE_DIM: u32 = 16_384; /// Image extractor — produces a single-block `CanonicalDocument` whose body /// is exactly one [`ImageRefBlock`]. pub struct ImageExtractor; impl ImageExtractor { pub fn new() -> Self { Self } } impl Default for ImageExtractor { fn default() -> Self { Self::new() } } impl Extractor for ImageExtractor { fn supports(&self, m: &MediaType) -> bool { matches!(m, MediaType::Image(_)) } fn parser_version(&self) -> ParserVersion { ParserVersion(PARSER_VERSION.to_string()) } fn extract( &self, ctx: &kebab_core::ExtractContext<'_>, bytes: &[u8], ) -> Result { let asset = ctx.asset; if !self.supports(&asset.media_type) { anyhow::bail!( "kebab-parse-image: unsupported media_type for ImageExtractor: {:?}", asset.media_type ); } let parser_version = self.parser_version(); let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); // Dimensions / format. 
`Err` here means the bytes don't even resolve // to a known image format — we propagate so the caller can skip the // asset (per spec failure modes: "Unsupported format → anyhow::Error"). let dim_outcome = dims::probe(bytes).context("guessing image format")?; // EXIF is best-effort regardless of dimension outcome. A corrupt // pixel stream may still carry a readable EXIF block (and vice // versa), so the two probes are independent. let exif_map = exif_extract::extract_whitelisted(bytes); let (span, dims_value, dim_warning) = match &dim_outcome { dims::DimOutcome::Ok { width, height, format, } => { let mut dims = Map::new(); dims.insert("w".into(), Value::Number((*width).into())); dims.insert("h".into(), Value::Number((*height).into())); dims.insert("format".into(), Value::String(format.to_string())); ( SourceSpan::Region { x: 0, y: 0, w: *width, h: *height, }, Value::Object(dims), None, ) } dims::DimOutcome::Failed { reason } => ( SourceSpan::Region { x: 0, y: 0, w: 0, h: 0, }, Value::Null, Some(reason.clone()), ), }; let block_id = id_for_block(&doc_id, "imageref", &[], 0, &span); let workspace_path_str = asset.workspace_path.0.clone(); let filename = filename_from_workspace_path(&workspace_path_str); let title = strip_extension(&filename); let block = Block::ImageRef(ImageRefBlock { common: CommonBlock { block_id, heading_path: Vec::new(), source_span: span, }, asset_id: Some(asset.asset_id.clone()), src: workspace_path_str, alt: filename, ocr: None, caption: None, }); let now = OffsetDateTime::now_utc(); // Discovered + Parsed (always) + optional Warning when the // dim probe failed. 
let mut events: Vec = Vec::with_capacity(if dim_warning.is_some() { 3 } else { 2 }); events.push(ProvenanceEvent { at: asset.discovered_at, agent: "kb-source-fs".to_string(), kind: ProvenanceKind::Discovered, note: None, }); events.push(ProvenanceEvent { at: now, agent: "kb-parse-image".to_string(), kind: ProvenanceKind::Parsed, note: Some(format!("parser_version={}", parser_version.0)), }); if let Some(reason) = dim_warning { events.push(ProvenanceEvent { at: now, agent: "kb-parse-image".to_string(), kind: ProvenanceKind::Warning, note: Some(reason), }); } // Metadata. `created_at` / `updated_at` are sourced from the asset's // `discovered_at` so the wire form does not embed a fresh timestamp // for every extract call (which would break determinism). let mut user = Map::new(); user.insert("exif".into(), Value::Object(exif_map)); user.insert("dimensions".into(), dims_value); let metadata = Metadata { aliases: Vec::new(), tags: Vec::new(), created_at: asset.discovered_at, updated_at: asset.discovered_at, source_type: SourceType::Reference, trust_level: TrustLevel::Primary, user_id_alias: None, user, repo: None, git_branch: None, git_commit: None, code_lang: None, }; tracing::debug!( target: "kebab-parse-image", "extracted image doc_id={} workspace_path={} dim_ok={}", doc_id.0, asset.workspace_path.0, matches!(dim_outcome, dims::DimOutcome::Ok { .. }) ); Ok(CanonicalDocument { doc_id, source_asset_id: asset.asset_id.clone(), workspace_path: asset.workspace_path.clone(), title, lang: Lang("und".to_string()), blocks: vec![block], metadata, provenance: Provenance { events }, parser_version, schema_version: 1, doc_version: 1, last_chunker_version: None, last_embedding_version: None, }) } } fn filename_from_workspace_path(p: &str) -> String { p.rsplit('/').next().unwrap_or(p).to_string() } fn strip_extension(filename: &str) -> String { match filename.rfind('.') { Some(0) => filename.to_string(), Some(idx) => filename[..idx].to_string(), None => filename.to_string(), } }