diff --git a/Cargo.lock b/Cargo.lock index 43199b7..9a560fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3365,6 +3365,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "kamadak-exif" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1130d80c7374efad55a117d715a3af9368f0fa7a2c54573afc15a188cd984837" +dependencies = [ + "mutate_once", +] + [[package]] name = "kebab-app" version = "0.1.0" @@ -3539,6 +3548,23 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kebab-parse-image" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "image", + "kamadak-exif", + "kebab-core", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.18", + "time", + "tracing", +] + [[package]] name = "kebab-parse-md" version = "0.1.0" @@ -4723,6 +4749,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "mutate_once" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af" + [[package]] name = "native-tls" version = "0.2.18" diff --git a/Cargo.toml b/Cargo.toml index ef10989..5b6cce7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ members = [ "crates/kebab-app", "crates/kebab-cli", "crates/kebab-eval", + "crates/kebab-parse-image", ] [workspace.package] diff --git a/crates/kebab-parse-image/Cargo.toml b/crates/kebab-parse-image/Cargo.toml new file mode 100644 index 0000000..1445307 --- /dev/null +++ b/crates/kebab-parse-image/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kebab-parse-image" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Image extractor — produces a single-block CanonicalDocument with EXIF metadata (P6-1)" + +[dependencies] +kebab-core = { path = "../kebab-core" } +anyhow = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +tracing = { workspace = true } +thiserror = { workspace = true } +# `image` ships a wide format menagerie under default features (BMP, DDS, +# Farbfeld, …). We only need PNG / JPEG / WebP / GIF / TIFF for v1 (per +# task spec out-of-scope HEIC/RAW). Trim defaults to keep the dep +# closure small. +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "webp", "gif", "tiff"] } +# kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag +# extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software). +kamadak-exif = "0.6" + +[dev-dependencies] +tempfile = { workspace = true } +blake3 = { workspace = true } +serde_json = { workspace = true } diff --git a/crates/kebab-parse-image/src/dims.rs b/crates/kebab-parse-image/src/dims.rs new file mode 100644 index 0000000..55dbcb7 --- /dev/null +++ b/crates/kebab-parse-image/src/dims.rs @@ -0,0 +1,82 @@ +//! Image-dimension probing for the `ImageExtractor` (P6-1). +//! +//! Reads just enough of the file header to obtain `(width, height, format)`. +//! The contract is: +//! +//! * `Err(_)` — the bytes don't resolve to any known image format. The +//! caller propagates this so the asset is skipped (per task spec +//! "Unsupported format → anyhow::Error"). +//! * `Ok(DimOutcome::Failed { reason })` — the format is recognised but +//! dimensions cannot be read (truncated header, oversized image, +//! decoder error). The caller emits a Warning provenance event and +//! stores `dimensions = null` in user metadata. +//! * `Ok(DimOutcome::Ok { .. })` — width/height/format read successfully. + +use std::io::Cursor; + +use anyhow::Result; +use image::{ImageFormat, ImageReader}; + +use crate::MAX_DECODE_DIM; + +#[derive(Debug, Clone)] +pub(crate) enum DimOutcome { + Ok { + width: u32, + height: u32, + /// Lowercase format string — `"png"`, `"jpeg"`, `"webp"`, … + format: &'static str, + }, + Failed { + reason: String, + }, +} + +pub(crate) fn probe(bytes: &[u8]) -> Result { + let reader = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .map_err(|e| anyhow::anyhow!("io error guessing format: {e}"))?; + + let format = match reader.format() { + Some(f) => f, + None => { + anyhow::bail!("unsupported or unrecognised image format"); + } + }; + let format_str = format_label(format); + + match reader.into_dimensions() { + Ok((w, h)) => { + if w > MAX_DECODE_DIM || h > MAX_DECODE_DIM { + Ok(DimOutcome::Failed { + reason: format!( + "image dimensions {w}x{h} exceed cap {MAX_DECODE_DIM}x{MAX_DECODE_DIM}" + ), + }) + } else { + Ok(DimOutcome::Ok { + width: w, + height: h, + format: format_str, + }) + } + } + Err(e) => Ok(DimOutcome::Failed { + reason: format!("decode error: {e}"), + }), + } +} + +fn format_label(f: ImageFormat) -> &'static str { + match f { + ImageFormat::Png => "png", + ImageFormat::Jpeg => "jpeg", + ImageFormat::WebP => "webp", + ImageFormat::Gif => "gif", + ImageFormat::Tiff => "tiff", + // The `image` crate's enum is non-exhaustive and may grow new + // variants in minor versions. Map anything else to a stable + // catch-all so callers see a deterministic label. + _ => "other", + } +} diff --git a/crates/kebab-parse-image/src/exif_extract.rs b/crates/kebab-parse-image/src/exif_extract.rs new file mode 100644 index 0000000..1939f91 --- /dev/null +++ b/crates/kebab-parse-image/src/exif_extract.rs @@ -0,0 +1,189 @@ +//! EXIF whitelist extraction for the `ImageExtractor` (P6-1). +//! +//! Only the small set of tags listed in the task spec is captured into +//! `metadata.user["exif"]`. Everything else (thumbnails, maker notes, full +//! camera state) is dropped on the floor so the on-disk wire form keeps a +//! tight PII surface. +//! +//! Whitelisted tags: +//! +//! | tag | output JSON shape | +//! |--------------------|----------------------------| +//! | DateTimeOriginal | `"YYYY-MM-DDTHH:MM:SS"` | +//! | GPSLatitude / Ref | merged into `gps_lat: f64` | +//! | GPSLongitude / Ref | merged into `gps_lon: f64` | +//! | Make | `String` | +//! | Model | `String` | +//! | Orientation | `u32` (1..=8) | +//! | Software | `String` | +//! +//! Any tag whose source value cannot be parsed into the documented shape +//! is silently dropped — extractor failure must never fail the whole +//! document. + +use std::io::Cursor; + +use exif::{In, Reader, Tag, Value}; +use serde_json::{Map, Value as JsonValue}; + +/// Read EXIF from `bytes` (any container the `exif` crate understands — +/// JPEG APP1, PNG eXIf, TIFF, HEIF). Always returns a map; if there is no +/// EXIF block (or parsing fails), the map is empty. +pub(crate) fn extract_whitelisted(bytes: &[u8]) -> Map { + let mut out = Map::new(); + let exif = match Reader::new().read_from_container(&mut Cursor::new(bytes)) { + Ok(e) => e, + Err(_) => return out, + }; + + if let Some(s) = ascii_field(&exif, Tag::DateTimeOriginal, In::PRIMARY) { + if let Some(iso) = exif_datetime_to_iso(&s) { + out.insert("DateTimeOriginal".into(), JsonValue::String(iso)); + } + } + + if let Some(lat) = gps_decimal(&exif, Tag::GPSLatitude, Tag::GPSLatitudeRef) { + if let Some(num) = serde_json::Number::from_f64(lat) { + out.insert("gps_lat".into(), JsonValue::Number(num)); + } + } + if let Some(lon) = gps_decimal(&exif, Tag::GPSLongitude, Tag::GPSLongitudeRef) { + if let Some(num) = serde_json::Number::from_f64(lon) { + out.insert("gps_lon".into(), JsonValue::Number(num)); + } + } + + if let Some(s) = ascii_field(&exif, Tag::Make, In::PRIMARY) { + out.insert("Make".into(), JsonValue::String(s)); + } + if let Some(s) = ascii_field(&exif, Tag::Model, In::PRIMARY) { + out.insert("Model".into(), JsonValue::String(s)); + } + if let Some(o) = u32_field(&exif, Tag::Orientation, In::PRIMARY) { + out.insert("Orientation".into(), JsonValue::Number(o.into())); + } + if let Some(s) = ascii_field(&exif, Tag::Software, In::PRIMARY) { + out.insert("Software".into(), JsonValue::String(s)); + } + + out +} + +fn ascii_field(exif: &exif::Exif, tag: Tag, ifd: In) -> Option { + let f = exif.get_field(tag, ifd)?; + match &f.value { + Value::Ascii(parts) => { + // The EXIF 2.x ASCII type is one or more null-terminated C + // strings. We concatenate without separators since the + // whitelisted tags here (Make, Model, Software, DateTime) + // never legitimately split into multiple parts. + let mut s = String::new(); + for part in parts { + s.push_str(&String::from_utf8_lossy(part)); + } + let trimmed = s.trim_matches(char::from(0)).trim().to_string(); + if trimmed.is_empty() { + None + } else { + Some(trimmed) + } + } + _ => None, + } +} + +fn u32_field(exif: &exif::Exif, tag: Tag, ifd: In) -> Option { + let f = exif.get_field(tag, ifd)?; + match &f.value { + Value::Short(v) => v.first().map(|x| *x as u32), + Value::Long(v) => v.first().copied(), + _ => None, + } +} + +/// EXIF datetime tags use `"YYYY:MM:DD HH:MM:SS"`. We rewrite to ISO-8601 +/// `"YYYY-MM-DDTHH:MM:SS"` for downstream consumers (no timezone — EXIF +/// stores local time, and there's a separate OffsetTime tag we don't read). +fn exif_datetime_to_iso(raw: &str) -> Option { + let raw = raw.trim(); + if raw.len() != 19 { + return None; + } + let bytes = raw.as_bytes(); + if bytes[4] != b':' || bytes[7] != b':' || bytes[10] != b' ' { + return None; + } + // Replace the three structural separators; leave digits + ':' in time + // section untouched. + let mut out = String::with_capacity(19); + out.push_str(&raw[..4]); + out.push('-'); + out.push_str(&raw[5..7]); + out.push('-'); + out.push_str(&raw[8..10]); + out.push('T'); + out.push_str(&raw[11..]); + Some(out) +} + +/// Convert a GPS DMS triple (degrees / minutes / seconds, each +/// `Rational`) into a signed decimal degree using the matching N/S/E/W +/// reference tag. Returns `None` if either tag is missing or shaped +/// unexpectedly. +fn gps_decimal(exif: &exif::Exif, value_tag: Tag, ref_tag: Tag) -> Option { + let f = exif.get_field(value_tag, In::PRIMARY)?; + let dms = match &f.value { + Value::Rational(r) if r.len() == 3 => r, + _ => return None, + }; + let deg = rational_to_f64(&dms[0])?; + let min = rational_to_f64(&dms[1])?; + let sec = rational_to_f64(&dms[2])?; + let mut decimal = deg + min / 60.0 + sec / 3600.0; + if let Some(reference) = ascii_field(exif, ref_tag, In::PRIMARY) { + let r = reference.to_ascii_uppercase(); + if r.starts_with('S') || r.starts_with('W') { + decimal = -decimal; + } + } + if decimal.is_finite() { + Some(decimal) + } else { + None + } +} + +fn rational_to_f64(r: &exif::Rational) -> Option { + if r.denom == 0 { + None + } else { + Some(r.num as f64 / r.denom as f64) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn datetime_well_formed_converts_to_iso() { + let iso = exif_datetime_to_iso("2024:08:15 12:34:56").unwrap(); + assert_eq!(iso, "2024-08-15T12:34:56"); + } + + #[test] + fn datetime_wrong_separator_rejected() { + assert!(exif_datetime_to_iso("2024-08-15 12:34:56").is_none()); + } + + #[test] + fn datetime_short_string_rejected() { + assert!(exif_datetime_to_iso("2024:08:15").is_none()); + } + + #[test] + fn extract_on_empty_bytes_yields_empty_map() { + let m = extract_whitelisted(&[]); + assert!(m.is_empty()); + } +} diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs new file mode 100644 index 0000000..70cd0fc --- /dev/null +++ b/crates/kebab-parse-image/src/lib.rs @@ -0,0 +1,205 @@ +//! `kebab-parse-image` — image extractor (P6-1). +//! +//! Implements [`kebab_core::Extractor`] for `MediaType::Image(_)`. One asset +//! produces one [`CanonicalDocument`] with a single +//! [`Block::ImageRef`](kebab_core::Block::ImageRef). EXIF is captured into +//! `metadata.user["exif"]`, dimensions into `metadata.user["dimensions"]`. +//! OCR / caption fields stay `None`; later tasks (P6-2 / P6-3) populate +//! them. +//! +//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText / +//! ModelCaption stubs), §9.1 (image extraction policy), §9 (versioning). + +mod dims; +mod exif_extract; + +use anyhow::{Context, Result}; +use kebab_core::{ + Block, CanonicalDocument, CommonBlock, Extractor, ImageRefBlock, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, + TrustLevel, id_for_block, id_for_doc, +}; +use serde_json::{Map, Value}; +use time::OffsetDateTime; + +/// Parser version label for the image extractor (§9 versioning). +pub const PARSER_VERSION: &str = "image-meta-v1"; + +/// Maximum decode dimension (per axis) before we refuse to read the image. +/// Matches the §9.1 "cap decode at ~16k" policy in the design doc. +pub const MAX_DECODE_DIM: u32 = 16_384; + +/// Image extractor — produces a single-block `CanonicalDocument` whose body +/// is exactly one [`ImageRefBlock`]. +pub struct ImageExtractor; + +impl ImageExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for ImageExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for ImageExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Image(_)) + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-image: unsupported media_type for ImageExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + // Dimensions / format. `Err` here means the bytes don't even resolve + // to a known image format — we propagate so the caller can skip the + // asset (per spec failure modes: "Unsupported format → anyhow::Error"). + let dim_outcome = dims::probe(bytes).context("guessing image format")?; + + // EXIF is best-effort regardless of dimension outcome. A corrupt + // pixel stream may still carry a readable EXIF block (and vice + // versa), so the two probes are independent. + let exif_map = exif_extract::extract_whitelisted(bytes); + + let (span, dims_value, decode_warning) = match &dim_outcome { + dims::DimOutcome::Ok { width, height, format } => { + let mut dims = Map::new(); + dims.insert("w".into(), Value::Number((*width).into())); + dims.insert("h".into(), Value::Number((*height).into())); + dims.insert("format".into(), Value::String((*format).to_string())); + ( + SourceSpan::Region { + x: 0, + y: 0, + w: *width, + h: *height, + }, + Value::Object(dims), + None, + ) + } + dims::DimOutcome::Failed { reason } => ( + SourceSpan::Region { + x: 0, + y: 0, + w: 0, + h: 0, + }, + Value::Null, + Some(reason.clone()), + ), + }; + + let block_id = id_for_block(&doc_id, "imageref", &[], 0, &span); + + let workspace_path_str = asset.workspace_path.0.clone(); + let filename = filename_from_workspace_path(&workspace_path_str); + let title = strip_extension(&filename); + + let block = Block::ImageRef(ImageRefBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + asset_id: Some(asset.asset_id.clone()), + src: workspace_path_str, + alt: filename, + ocr: None, + caption: None, + }); + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(3); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-image".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!("parser_version={}", parser_version.0)), + }); + if let Some(reason) = decode_warning { + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-image".to_string(), + kind: ProvenanceKind::Warning, + note: Some(reason), + }); + } + + // Metadata. `created_at` / `updated_at` are sourced from the asset's + // `discovered_at` so the wire form does not embed a fresh timestamp + // for every extract call (which would break determinism). + let mut user = Map::new(); + user.insert("exif".into(), Value::Object(exif_map)); + user.insert("dimensions".into(), dims_value); + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Reference, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user, + }; + + tracing::debug!( + target: "kebab-parse-image", + "extracted image doc_id={} workspace_path={} dim_ok={}", + doc_id.0, + asset.workspace_path.0, + matches!(dim_outcome, dims::DimOutcome::Ok { .. }) + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks: vec![block], + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + }) + } +} + +fn filename_from_workspace_path(p: &str) -> String { + p.rsplit('/').next().unwrap_or(p).to_string() +} + +fn strip_extension(filename: &str) -> String { + match filename.rfind('.') { + Some(0) => filename.to_string(), + Some(idx) => filename[..idx].to_string(), + None => filename.to_string(), + } +} diff --git a/crates/kebab-parse-image/tests/common/mod.rs b/crates/kebab-parse-image/tests/common/mod.rs new file mode 100644 index 0000000..3d05418 --- /dev/null +++ b/crates/kebab-parse-image/tests/common/mod.rs @@ -0,0 +1,241 @@ +//! Test fixture builders for `kebab-parse-image`. +//! +//! Images are generated in-memory at test time rather than committed as +//! binary fixtures so: +//! +//! * The test binary stays self-contained — no `include_bytes!` paths to +//! keep in sync with the workspace layout. +//! * Fixture provenance is auditable from source (anyone reading this +//! module can see exactly what bytes the tests run against). +//! +//! All builders are deterministic (no time / RNG dependence). + +#![allow(dead_code)] + +use std::io::Cursor; + +use exif::experimental::Writer as ExifWriter; +use exif::{Field, In, Rational, Tag, Value}; +use image::{ImageBuffer, Rgb}; +use kebab_core::{ + AssetStorage, Checksum, ExtractConfig, ExtractContext, ImageType, MediaType, RawAsset, + SourceUri, WorkspacePath, +}; +use std::path::{Path, PathBuf}; +use time::OffsetDateTime; + +/// 100×50 solid-red PNG, no EXIF. +pub fn red_100x50_png() -> Vec { + let img: ImageBuffer, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Png) + .expect("encoding tiny PNG must not fail"); + buf.into_inner() +} + +/// 10×10 solid-blue PNG, no EXIF (smaller fixture for cases where +/// dimensions don't matter). +pub fn no_exif_png() -> Vec { + let img: ImageBuffer, _> = ImageBuffer::from_fn(10, 10, |_, _| Rgb([0, 0, 255])); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Png) + .expect("encoding tiny PNG must not fail"); + buf.into_inner() +} + +/// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model + +/// DateTimeOriginal + Orientation + Software. The base image is a 4×4 +/// solid white square — pixel content is irrelevant; the test cares about +/// the EXIF tags. +/// +/// Construction: encode JPEG via the `image` crate, then splice an EXIF +/// APP1 segment immediately after SOI (FF D8). The EXIF blob is built +/// with `exif::experimental::Writer`. +pub fn exif_with_gps_jpg() -> Vec { + let base = encode_tiny_jpeg(); + let exif_blob = build_exif_blob_gps(); + + let mut out = Vec::with_capacity(base.len() + exif_blob.len() + 16); + // SOI: FF D8. + out.push(0xFF); + out.push(0xD8); + // APP1 marker: FF E1. + out.push(0xFF); + out.push(0xE1); + // APP1 segment length (BE): 2 (length field itself) + 6 ("Exif\0\0") + // + exif_blob.len(). Pre-validated against the 0xFFFF segment limit. + let app1_payload_len = 2 + 6 + exif_blob.len(); + assert!( + app1_payload_len <= u16::MAX as usize, + "EXIF segment too large for a single APP1" + ); + out.extend_from_slice(&(app1_payload_len as u16).to_be_bytes()); + out.extend_from_slice(b"Exif\x00\x00"); + out.extend_from_slice(&exif_blob); + // Append the rest of the JPEG starting just after the original SOI. + out.extend_from_slice(&base[2..]); + out +} + +fn encode_tiny_jpeg() -> Vec { + let img: ImageBuffer, _> = ImageBuffer::from_fn(4, 4, |_, _| Rgb([255, 255, 255])); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Jpeg) + .expect("encoding tiny JPEG must not fail"); + buf.into_inner() +} + +fn build_exif_blob_gps() -> Vec { + let make = Field { + tag: Tag::Make, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"KebabCam\0".to_vec()]), + }; + let model = Field { + tag: Tag::Model, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"X1\0".to_vec()]), + }; + let software = Field { + tag: Tag::Software, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"kebab-test\0".to_vec()]), + }; + let datetime = Field { + tag: Tag::DateTimeOriginal, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"2024:08:15 12:34:56\0".to_vec()]), + }; + let orientation = Field { + tag: Tag::Orientation, + ifd_num: In::PRIMARY, + value: Value::Short(vec![1]), + }; + // GPS — 37.5 N, 127.0 E (Seoul-ish). DMS triple: 37°30'0" N, + // 127°0'0" E. Each component is num/denom rationals. + let lat = Field { + tag: Tag::GPSLatitude, + ifd_num: In::PRIMARY, + value: Value::Rational(vec![ + Rational { num: 37, denom: 1 }, + Rational { num: 30, denom: 1 }, + Rational { num: 0, denom: 1 }, + ]), + }; + let lat_ref = Field { + tag: Tag::GPSLatitudeRef, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"N\0".to_vec()]), + }; + let lon = Field { + tag: Tag::GPSLongitude, + ifd_num: In::PRIMARY, + value: Value::Rational(vec![ + Rational { num: 127, denom: 1 }, + Rational { num: 0, denom: 1 }, + Rational { num: 0, denom: 1 }, + ]), + }; + let lon_ref = Field { + tag: Tag::GPSLongitudeRef, + ifd_num: In::PRIMARY, + value: Value::Ascii(vec![b"E\0".to_vec()]), + }; + + let mut writer = ExifWriter::new(); + writer.push_field(&make); + writer.push_field(&model); + writer.push_field(&software); + writer.push_field(&datetime); + writer.push_field(&orientation); + writer.push_field(&lat); + writer.push_field(&lat_ref); + writer.push_field(&lon); + writer.push_field(&lon_ref); + + let mut blob = Cursor::new(Vec::new()); + writer + .write(&mut blob, false) + .expect("EXIF writer must succeed for the small whitelisted set"); + blob.into_inner() +} + +/// PNG header magic followed by truncated payload. The format guess +/// succeeds (eight-byte PNG signature is intact) but `into_dimensions` +/// fails because the IHDR chunk is missing. +pub fn corrupt_png() -> Vec { + // 8-byte PNG signature only — every byte after is missing. + vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A] +} + +/// Build a `RawAsset` + matching workspace_root + `ExtractContext` for +/// the test. `bytes_for_id` is hashed (BLAKE3) to produce the AssetId +/// per §4.2 — this matches what `kebab-source-fs` does in production. +pub struct ImageFixture { + pub asset: RawAsset, + pub workspace_root: PathBuf, + pub config: ExtractConfig, +} + +impl ImageFixture { + pub fn ctx(&self) -> ExtractContext<'_> { + ExtractContext { + asset: &self.asset, + workspace_root: &self.workspace_root, + config: &self.config, + } + } +} + +pub fn fixture_for(workspace_path: &str, image_type: ImageType, bytes: &[u8]) -> ImageFixture { + let blake = blake3::hash(bytes); + let full_hex = blake.to_hex().to_string(); + let asset_id = kebab_core::id_for_asset(&full_hex); + let workspace_path = WorkspacePath::new(workspace_path.to_string()).unwrap(); + // Fixed timestamp so determinism tests can compare outputs across runs. + let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(); + let asset = RawAsset { + asset_id, + source_uri: SourceUri::File(PathBuf::from(format!("/tmp/{}", workspace_path.0))), + workspace_path, + media_type: MediaType::Image(image_type), + byte_len: bytes.len() as u64, + checksum: Checksum(full_hex), + discovered_at, + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/fake"), + sha: Checksum("0".repeat(64)), + }, + }; + ImageFixture { + asset, + workspace_root: PathBuf::from("/tmp/fake-root"), + config: ExtractConfig::default(), + } +} + +/// Strip the two non-deterministic provenance timestamps (Parsed + +/// optional Warning) so determinism / snapshot tests can compare JSON +/// without worrying about wall-clock jitter. +pub fn strip_dynamic_at(json: &mut serde_json::Value) { + if let Some(events) = json + .get_mut("provenance") + .and_then(|p| p.get_mut("events")) + .and_then(|e| e.as_array_mut()) + { + for (i, ev) in events.iter_mut().enumerate() { + if i > 0 + && let Some(obj) = ev.as_object_mut() + { + obj.insert("at".into(), serde_json::Value::String("".into())); + } + } + } +} + +/// Stable ASCII path constant — avoids depending on `Path::new` or the +/// host's path separator in the call sites. +#[allow(dead_code)] +pub fn fake_path(p: &str) -> &Path { + Path::new(p) +} diff --git a/crates/kebab-parse-image/tests/extractor.rs b/crates/kebab-parse-image/tests/extractor.rs new file mode 100644 index 0000000..637b8ce9 --- /dev/null +++ b/crates/kebab-parse-image/tests/extractor.rs @@ -0,0 +1,249 @@ +//! Integration tests for `kebab_parse_image::ImageExtractor` (P6-1). + +mod common; + +use kebab_core::{Block, Extractor, ImageType, ProvenanceKind, SourceSpan}; +use kebab_parse_image::ImageExtractor; +use serde_json::Value; + +use crate::common::{ + corrupt_png, exif_with_gps_jpg, fixture_for, no_exif_png, red_100x50_png, strip_dynamic_at, +}; + +fn extract_block(doc: &kebab_core::CanonicalDocument) -> &kebab_core::ImageRefBlock { + assert_eq!(doc.blocks.len(), 1, "exactly one block expected"); + match &doc.blocks[0] { + Block::ImageRef(b) => b, + other => panic!("expected ImageRef, got {other:?}"), + } +} + +#[test] +fn png_decode_produces_correct_dimensions() { + let bytes = red_100x50_png(); + let fx = fixture_for("photos/red-100x50.png", ImageType::Png, &bytes); + let doc = ImageExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("PNG extraction must succeed"); + + assert_eq!(doc.title, "red-100x50"); + assert_eq!(doc.lang.0, "und"); + assert_eq!(doc.parser_version.0, kebab_parse_image::PARSER_VERSION); + + let dims = doc + .metadata + .user + .get("dimensions") + .expect("dimensions key present"); + let obj = dims.as_object().expect("dimensions is an object"); + assert_eq!(obj.get("w"), Some(&Value::Number(100.into()))); + assert_eq!(obj.get("h"), Some(&Value::Number(50.into()))); + assert_eq!(obj.get("format"), Some(&Value::String("png".into()))); + + let block = extract_block(&doc); + assert_eq!(block.alt, "red-100x50.png"); + assert_eq!(block.src, "photos/red-100x50.png"); + assert_eq!(block.asset_id, Some(fx.asset.asset_id.clone())); + assert!(block.ocr.is_none()); + assert!(block.caption.is_none()); + match &block.common.source_span { + SourceSpan::Region { x, y, w, h } => { + assert_eq!((*x, *y, *w, *h), (0, 0, 100, 50)); + } + other => panic!("expected Region span, got {other:?}"), + } +} + +#[test] +fn jpeg_with_exif_gps_captures_whitelisted_tags() { + let bytes = exif_with_gps_jpg(); + let fx = fixture_for("img/seoul.jpg", ImageType::Jpeg, &bytes); + let doc = ImageExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("JPEG extraction must succeed"); + + let exif = doc + .metadata + .user + .get("exif") + .and_then(|v| v.as_object()) + .expect("exif object present"); + assert_eq!(exif.get("Make"), Some(&Value::String("KebabCam".into()))); + assert_eq!(exif.get("Model"), Some(&Value::String("X1".into()))); + assert_eq!( + exif.get("Software"), + Some(&Value::String("kebab-test".into())) + ); + assert_eq!( + exif.get("DateTimeOriginal"), + Some(&Value::String("2024-08-15T12:34:56".into())) + ); + assert_eq!(exif.get("Orientation"), Some(&Value::Number(1.into()))); + let lat = exif.get("gps_lat").and_then(|v| v.as_f64()).expect("gps_lat"); + let lon = exif.get("gps_lon").and_then(|v| v.as_f64()).expect("gps_lon"); + assert!((lat - 37.5).abs() < 1e-6, "lat={lat}"); + assert!((lon - 127.0).abs() < 1e-6, "lon={lon}"); + + // Maker notes / thumbnails / unrelated tags must NOT have leaked in. + let allowed: std::collections::HashSet<&str> = [ + "Make", + "Model", + "Software", + "DateTimeOriginal", + "Orientation", + "gps_lat", + "gps_lon", + ] + .into_iter() + .collect(); + for k in exif.keys() { + assert!( + allowed.contains(k.as_str()), + "non-whitelisted EXIF key leaked: {k}" + ); + } +} + +#[test] +fn no_exif_image_yields_empty_exif_map() { + let bytes = no_exif_png(); + let fx = fixture_for("img/blank.png", ImageType::Png, &bytes); + let doc = ImageExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("PNG extraction must succeed"); + let exif = doc + .metadata + .user + .get("exif") + .and_then(|v| v.as_object()) + .expect("exif object present"); + assert!(exif.is_empty(), "no-EXIF PNG must yield empty exif map: {exif:?}"); +} + +#[test] +fn corrupt_image_emits_warning_no_panic() { + let bytes = corrupt_png(); + let fx = fixture_for("img/corrupt.png", ImageType::Png, &bytes); + let doc = ImageExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("corrupt PNG must NOT cause an Err — warning provenance event instead"); + + // dimensions = null + assert_eq!( + doc.metadata.user.get("dimensions"), + Some(&Value::Null), + "corrupt image must record dimensions = null" + ); + // exif = {} + let exif = doc + .metadata + .user + .get("exif") + .and_then(|v| v.as_object()) + .expect("exif object present"); + assert!(exif.is_empty()); + // Span is Region(0,0,0,0). + let block = extract_block(&doc); + assert!(matches!( + block.common.source_span, + SourceSpan::Region { x: 0, y: 0, w: 0, h: 0 } + )); + // Warning provenance event. + let warnings: Vec<_> = doc + .provenance + .events + .iter() + .filter(|e| e.kind == ProvenanceKind::Warning) + .collect(); + assert_eq!(warnings.len(), 1, "expected exactly one Warning event"); + assert_eq!(warnings[0].agent, "kb-parse-image"); +} + +#[test] +fn unsupported_bytes_return_err() { + let bytes = b"not an image at all".to_vec(); + let fx = fixture_for("img/garbage.png", ImageType::Png, &bytes); + let r = ImageExtractor::new().extract(&fx.ctx(), &bytes); + assert!( + r.is_err(), + "unrecognised format must propagate Err so caller skips" + ); +} + +#[test] +fn provenance_events_are_in_order() { + let bytes = red_100x50_png(); + let fx = fixture_for("a/b.png", ImageType::Png, &bytes); + let doc = ImageExtractor::new().extract(&fx.ctx(), &bytes).unwrap(); + let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect(); + assert_eq!( + kinds, + vec![ProvenanceKind::Discovered, ProvenanceKind::Parsed] + ); + assert_eq!(doc.provenance.events[0].agent, "kb-source-fs"); + assert_eq!(doc.provenance.events[0].at, fx.asset.discovered_at); + assert_eq!(doc.provenance.events[1].agent, "kb-parse-image"); +} + +#[test] +fn determinism_identical_bytes_produce_identical_ids() { + let bytes = red_100x50_png(); + let fx_a = fixture_for("a/b.png", ImageType::Png, &bytes); + let fx_b = fixture_for("a/b.png", ImageType::Png, &bytes); + let extractor = ImageExtractor::new(); + let doc1 = extractor.extract(&fx_a.ctx(), &bytes).unwrap(); + let doc2 = extractor.extract(&fx_b.ctx(), &bytes).unwrap(); + assert_eq!(doc1.doc_id, doc2.doc_id); + let id1 = &extract_block(&doc1).common.block_id; + let id2 = &extract_block(&doc2).common.block_id; + assert_eq!(id1, id2); +} + +#[test] +fn snapshot_red_100x50_canonical_document_stable() { + let bytes = red_100x50_png(); + let fx = fixture_for("photos/red-100x50.png", ImageType::Png, &bytes); + let extractor = ImageExtractor::new(); + let doc1 = extractor.extract(&fx.ctx(), &bytes).unwrap(); + let doc2 = extractor.extract(&fx.ctx(), &bytes).unwrap(); + + let mut j1 = serde_json::to_value(&doc1).unwrap(); + let mut j2 = serde_json::to_value(&doc2).unwrap(); + strip_dynamic_at(&mut j1); + strip_dynamic_at(&mut j2); + assert_eq!( + j1, j2, + "two extractions of identical bytes must serialise byte-for-byte equal (modulo dynamic timestamps)" + ); + + // Pin a few fields by exact value so a future regression in the + // ID recipe / serialisation order surfaces here, not at the JSON + // diff level only. + assert_eq!(j1["title"], "red-100x50"); + assert_eq!(j1["lang"], "und"); + assert_eq!(j1["parser_version"], kebab_parse_image::PARSER_VERSION); + assert_eq!(j1["schema_version"], 1); + assert_eq!(j1["doc_version"], 1); + assert_eq!(j1["blocks"].as_array().unwrap().len(), 1); + assert_eq!(j1["blocks"][0]["kind"], "imageref"); + assert_eq!(j1["metadata"]["source_type"], "reference"); + assert_eq!(j1["metadata"]["trust_level"], "primary"); +} + +#[test] +fn supports_only_image_media_type() { + let e = ImageExtractor::new(); + assert!(e.supports(&kebab_core::MediaType::Image(ImageType::Png))); + assert!(e.supports(&kebab_core::MediaType::Image(ImageType::Jpeg))); + assert!(!e.supports(&kebab_core::MediaType::Markdown)); + assert!(!e.supports(&kebab_core::MediaType::Pdf)); +} + +#[test] +fn rejects_extract_when_media_type_mismatches() { + let bytes = red_100x50_png(); + let mut fx = fixture_for("a/b.md", ImageType::Png, &bytes); + fx.asset.media_type = kebab_core::MediaType::Markdown; + let r = ImageExtractor::new().extract(&fx.ctx(), &bytes); + assert!(r.is_err()); +} diff --git a/tasks/p6/p6-1-image-extractor-exif.md b/tasks/p6/p6-1-image-extractor-exif.md index ff6449a..39af1be 100644 --- a/tasks/p6/p6-1-image-extractor-exif.md +++ b/tasks/p6/p6-1-image-extractor-exif.md @@ -3,7 +3,7 @@ phase: P6 component: kebab-parse-image (image extractor + EXIF) task_id: p6-1 title: "Image Extractor producing single-block CanonicalDocument + EXIF metadata" -status: planned +status: completed depends_on: [p0-1, p1-6] unblocks: [p6-2, p6-3] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md