Files
kebab/crates/kebab-parse-image/src/lib.rs
th-kim0823 bf4ebf8d2a feat(p10-1a-1): add Metadata.repo / git_branch / git_commit / code_lang
Four optional, serde-skipped-when-None fields added to `Metadata` for
code ingest context. All 11 downstream construction sites patched with
`repo: None, git_branch: None, git_commit: None, code_lang: None`.
Full workspace check (`--tests`) and per-crate test suite pass clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:44:18 +09:00

236 lines
8.3 KiB
Rust

//! `kebab-parse-image` — image extractor (P6-1) + OCR adapter (P6-2).
//!
//! P6-1 implements [`kebab_core::Extractor`] for `MediaType::Image(_)`,
//! producing a single-block [`CanonicalDocument`]: one `ImageRefBlock`,
//! with EXIF + dimensions stored on the document under `metadata.user`
//! (keys `exif` and `dimensions`). The block's `ocr` / `caption` fields
//! stay `None` until populated by the OCR / caption adapters.
//!
//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an
//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable
//! Ollama model. [`apply_ocr`] is the helper that mutates an
//! [`ImageRefBlock`] in place. Trust note — the LLM-driven default
//! can hallucinate; `OcrText.engine` carries the source identity so
//! consumers can branch trust by engine (Tesseract / Apple Vision
//! adapters, when added, will write a different `engine` string).
//!
//! P6-3 adds the [`caption`] module: [`caption_image`] /
//! [`apply_caption`] route an image through any vision-capable
//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
//! and will surface a model-side error). Captions are explicitly
//! marked **model-generated** — the trust gap between OCR (observed,
//! engine-tagged) and caption (generated, prompt-tagged) is the
//! workspace's central trust contract.
//!
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
//! provenance), §9 (versioning).
mod dims;
mod exif_extract;
mod image_prep;
pub mod caption;
pub mod ocr;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
use anyhow::{Context, Result};
use kebab_core::{
Block, CanonicalDocument, CommonBlock, Extractor, ImageRefBlock, Lang, MediaType, Metadata,
ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType,
TrustLevel, id_for_block, id_for_doc,
};
use serde_json::{Map, Value};
use time::OffsetDateTime;
/// Parser version label for the image extractor (§9 versioning).
///
/// `ImageExtractor::parser_version` wraps this string in a `ParserVersion`.
/// `ImageExtractor::extract` then passes that value to `id_for_doc`, stores
/// it on the returned document, and records it in the `Parsed` provenance
/// note as `parser_version=<label>`.
pub const PARSER_VERSION: &str = "image-meta-v1";
/// Maximum decode dimension (per axis) before we refuse to read the image.
/// Matches the §9.1 "cap decode at ~16k" policy in the design doc.
// NOTE(review): nothing in this file reads this constant; presumably the
// limit is enforced inside the `dims` / `image_prep` modules — confirm.
pub const MAX_DECODE_DIM: u32 = 16_384;
/// Image extractor — produces a single-block `CanonicalDocument` whose body
/// is exactly one [`ImageRefBlock`].
///
/// This is a stateless unit struct: [`ImageExtractor::new`] and
/// `ImageExtractor::default()` yield the same value. `Debug` is derived so
/// the extractor can be logged or embedded in other `Debug` types.
#[derive(Debug, Default)]
pub struct ImageExtractor;

impl ImageExtractor {
    /// Creates a new extractor.
    ///
    /// Equivalent to `ImageExtractor::default()`. The function is `const`,
    /// so it may also be used to initialise `const` / `static` items.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
impl Extractor for ImageExtractor {
fn supports(&self, m: &MediaType) -> bool {
matches!(m, MediaType::Image(_))
}
fn parser_version(&self) -> ParserVersion {
ParserVersion(PARSER_VERSION.to_string())
}
fn extract(
&self,
ctx: &kebab_core::ExtractContext<'_>,
bytes: &[u8],
) -> Result<CanonicalDocument> {
let asset = ctx.asset;
if !self.supports(&asset.media_type) {
anyhow::bail!(
"kebab-parse-image: unsupported media_type for ImageExtractor: {:?}",
asset.media_type
);
}
let parser_version = self.parser_version();
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version);
// Dimensions / format. `Err` here means the bytes don't even resolve
// to a known image format — we propagate so the caller can skip the
// asset (per spec failure modes: "Unsupported format → anyhow::Error").
let dim_outcome = dims::probe(bytes).context("guessing image format")?;
// EXIF is best-effort regardless of dimension outcome. A corrupt
// pixel stream may still carry a readable EXIF block (and vice
// versa), so the two probes are independent.
let exif_map = exif_extract::extract_whitelisted(bytes);
let (span, dims_value, dim_warning) = match &dim_outcome {
dims::DimOutcome::Ok { width, height, format } => {
let mut dims = Map::new();
dims.insert("w".into(), Value::Number((*width).into()));
dims.insert("h".into(), Value::Number((*height).into()));
dims.insert("format".into(), Value::String(format.to_string()));
(
SourceSpan::Region {
x: 0,
y: 0,
w: *width,
h: *height,
},
Value::Object(dims),
None,
)
}
dims::DimOutcome::Failed { reason } => (
SourceSpan::Region {
x: 0,
y: 0,
w: 0,
h: 0,
},
Value::Null,
Some(reason.clone()),
),
};
let block_id = id_for_block(&doc_id, "imageref", &[], 0, &span);
let workspace_path_str = asset.workspace_path.0.clone();
let filename = filename_from_workspace_path(&workspace_path_str);
let title = strip_extension(&filename);
let block = Block::ImageRef(ImageRefBlock {
common: CommonBlock {
block_id,
heading_path: Vec::new(),
source_span: span,
},
asset_id: Some(asset.asset_id.clone()),
src: workspace_path_str,
alt: filename,
ocr: None,
caption: None,
});
let now = OffsetDateTime::now_utc();
// Discovered + Parsed (always) + optional Warning when the
// dim probe failed.
let mut events: Vec<ProvenanceEvent> =
Vec::with_capacity(if dim_warning.is_some() { 3 } else { 2 });
events.push(ProvenanceEvent {
at: asset.discovered_at,
agent: "kb-source-fs".to_string(),
kind: ProvenanceKind::Discovered,
note: None,
});
events.push(ProvenanceEvent {
at: now,
agent: "kb-parse-image".to_string(),
kind: ProvenanceKind::Parsed,
note: Some(format!("parser_version={}", parser_version.0)),
});
if let Some(reason) = dim_warning {
events.push(ProvenanceEvent {
at: now,
agent: "kb-parse-image".to_string(),
kind: ProvenanceKind::Warning,
note: Some(reason),
});
}
// Metadata. `created_at` / `updated_at` are sourced from the asset's
// `discovered_at` so the wire form does not embed a fresh timestamp
// for every extract call (which would break determinism).
let mut user = Map::new();
user.insert("exif".into(), Value::Object(exif_map));
user.insert("dimensions".into(), dims_value);
let metadata = Metadata {
aliases: Vec::new(),
tags: Vec::new(),
created_at: asset.discovered_at,
updated_at: asset.discovered_at,
source_type: SourceType::Reference,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user,
repo: None,
git_branch: None,
git_commit: None,
code_lang: None,
};
tracing::debug!(
target: "kebab-parse-image",
"extracted image doc_id={} workspace_path={} dim_ok={}",
doc_id.0,
asset.workspace_path.0,
matches!(dim_outcome, dims::DimOutcome::Ok { .. })
);
Ok(CanonicalDocument {
doc_id,
source_asset_id: asset.asset_id.clone(),
workspace_path: asset.workspace_path.clone(),
title,
lang: Lang("und".to_string()),
blocks: vec![block],
metadata,
provenance: Provenance { events },
parser_version,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
})
}
}
/// Returns the last `/`-separated segment of `p` as an owned `String`.
///
/// A path containing no `/` is returned whole; a path that ends in `/`
/// yields an empty string.
fn filename_from_workspace_path(p: &str) -> String {
    let last_segment = match p.rsplit_once('/') {
        Some((_parent, tail)) => tail,
        None => p,
    };
    last_segment.to_owned()
}
/// Returns `filename` with everything from its last `.` onward removed.
///
/// The name is returned unchanged when it contains no `.`, or when its
/// only/last `.` is the very first character (dotfiles such as `.hidden`).
fn strip_extension(filename: &str) -> String {
    let stem = match filename.rsplit_once('.') {
        // An empty stem means the last dot is at index 0: keep the name whole.
        Some((stem, _extension)) if !stem.is_empty() => stem,
        _ => filename,
    };
    stem.to_owned()
}