feat(kebab-parse-image): P6-1 image extractor + EXIF whitelist
- 새 crate kebab-parse-image 추가 (workspace 19개째). MediaType::Image(_)
자산을 단일-블록 CanonicalDocument 로 변환하는 ImageExtractor 구현.
- parser_version "image-meta-v1" (§9 versioning).
- 본문은 Block::ImageRef 1건만 포함 — OCR / caption 필드는 None 으로
남겨 두고 P6-2 / P6-3 에서 채운다.
- EXIF 화이트리스트 (§9.1, PII 표면 최소화):
Make / Model / Software / DateTimeOriginal / Orientation /
GPSLatitude(+Ref) / GPSLongitude(+Ref). MakerNote / Thumbnail / 기타
태그는 폐기. DateTime 은 EXIF "YYYY:MM:DD HH:MM:SS" → ISO-8601 변환.
GPS DMS triple + N/S/E/W ref → signed decimal degree.
- 차원: image::ImageReader 헤더만 읽어 (w, h, format) 획득. 16k×16k cap
초과 또는 디코드 실패 → metadata.user.dimensions = null + Provenance
Warning 이벤트 (Err 아님). 포맷 자체 인식 실패 → anyhow::Error
(caller skip).
- SourceSpan::Region { 0, 0, w, h } 으로 전체 이미지 영역 표기. 결정성:
동일 bytes + 동일 parser_version → 동일 doc_id + block_id (§4.2 ID
recipe 그대로 사용).
- metadata.source_type = Reference, trust_level = Primary, lang = "und".
title = 확장자 제외 파일명, alt = 파일명.
- 의존성 경계 (§8): kebab-core 만 + image 0.25 (default features off,
png/jpeg/webp/gif/tiff 만), kamadak-exif 0.6, anyhow / serde /
serde_json / time / tracing / thiserror. kebab-source-fs · parse-md ·
store-* · embed* · llm* · rag · UI crate 미참조.
- 테스트 14개 (4 unit + 10 integration):
• PNG 차원 추출, JPEG EXIF GPS 추출 (DMS → decimal 변환 정확도 1e-6),
EXIF 없는 PNG → 빈 map, 손상 PNG → warning + null dims (panic 없음),
인식 불가 bytes → Err, 결정성, 스냅샷, supports() 매칭, media_type
불일치 거부.
• 픽스처는 in-memory 생성 (PNG 는 image crate, EXIF JPEG 는 kamadak
Writer 로 EXIF blob 만든 뒤 SOI 직후 APP1 splice) — 바이너리
fixture 커밋 없음.
- HEIC / RAW 는 spec 상 v1 out of scope (image crate 미지원, Apple
Vision sidecar 가 추후 P+ 에서 채움).
- tasks/p6/p6-1-image-extractor-exif.md status: planned → completed.
contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
sections: §3.4 Block::ImageRef + ImageRefBlock, §3.7a OcrText /
ModelCaption stubs, §9.1 image extraction policy, §9 versioning.
This commit is contained in:
32
Cargo.lock
generated
32
Cargo.lock
generated
@@ -3365,6 +3365,15 @@ dependencies = [
|
||||
"zmij",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kamadak-exif"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1130d80c7374efad55a117d715a3af9368f0fa7a2c54573afc15a188cd984837"
|
||||
dependencies = [
|
||||
"mutate_once",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.1.0"
|
||||
@@ -3539,6 +3548,23 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"image",
|
||||
"kamadak-exif",
|
||||
"kebab-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.1.0"
|
||||
@@ -4723,6 +4749,12 @@ version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
|
||||
|
||||
[[package]]
|
||||
name = "mutate_once"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af"
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.18"
|
||||
|
||||
@@ -19,6 +19,7 @@ members = [
|
||||
"crates/kebab-app",
|
||||
"crates/kebab-cli",
|
||||
"crates/kebab-eval",
|
||||
"crates/kebab-parse-image",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
30
crates/kebab-parse-image/Cargo.toml
Normal file
30
crates/kebab-parse-image/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kebab-parse-image"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Image extractor — produces a single-block CanonicalDocument with EXIF metadata (P6-1)"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
# `image` ships a wide format menagerie under default features (BMP, DDS,
|
||||
# Farbfeld, …). We only need PNG / JPEG / WebP / GIF / TIFF for v1 (per
|
||||
# task spec out-of-scope HEIC/RAW). Trim defaults to keep the dep
|
||||
# closure small.
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg", "webp", "gif", "tiff"] }
|
||||
# kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag
|
||||
# extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software).
|
||||
kamadak-exif = "0.6"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
82
crates/kebab-parse-image/src/dims.rs
Normal file
82
crates/kebab-parse-image/src/dims.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
//! Image-dimension probing for the `ImageExtractor` (P6-1).
|
||||
//!
|
||||
//! Reads just enough of the file header to obtain `(width, height, format)`.
|
||||
//! The contract is:
|
||||
//!
|
||||
//! * `Err(_)` — the bytes don't resolve to any known image format. The
|
||||
//! caller propagates this so the asset is skipped (per task spec
|
||||
//! "Unsupported format → anyhow::Error").
|
||||
//! * `Ok(DimOutcome::Failed { reason })` — the format is recognised but
|
||||
//! dimensions cannot be read (truncated header, oversized image,
|
||||
//! decoder error). The caller emits a Warning provenance event and
|
||||
//! stores `dimensions = null` in user metadata.
|
||||
//! * `Ok(DimOutcome::Ok { .. })` — width/height/format read successfully.
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use anyhow::Result;
|
||||
use image::{ImageFormat, ImageReader};
|
||||
|
||||
use crate::MAX_DECODE_DIM;
|
||||
|
||||
/// Result of probing an image whose container format was recognised.
///
/// Unrecognised formats are not represented here; `probe` reports those
/// through its `Err` channel instead.
#[derive(Debug, Clone)]
pub(crate) enum DimOutcome {
    /// Width, height and format were all read successfully.
    Ok {
        /// Pixel width as reported by the image reader.
        width: u32,
        /// Pixel height as reported by the image reader.
        height: u32,
        /// Lowercase format string — `"png"`, `"jpeg"`, `"webp"`, …
        format: &'static str,
    },
    /// The format was recognised but no usable dimensions were obtained.
    Failed {
        /// Human-readable explanation, suitable for a provenance note.
        reason: String,
    },
}
|
||||
|
||||
pub(crate) fn probe(bytes: &[u8]) -> Result<DimOutcome> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.map_err(|e| anyhow::anyhow!("io error guessing format: {e}"))?;
|
||||
|
||||
let format = match reader.format() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
anyhow::bail!("unsupported or unrecognised image format");
|
||||
}
|
||||
};
|
||||
let format_str = format_label(format);
|
||||
|
||||
match reader.into_dimensions() {
|
||||
Ok((w, h)) => {
|
||||
if w > MAX_DECODE_DIM || h > MAX_DECODE_DIM {
|
||||
Ok(DimOutcome::Failed {
|
||||
reason: format!(
|
||||
"image dimensions {w}x{h} exceed cap {MAX_DECODE_DIM}x{MAX_DECODE_DIM}"
|
||||
),
|
||||
})
|
||||
} else {
|
||||
Ok(DimOutcome::Ok {
|
||||
width: w,
|
||||
height: h,
|
||||
format: format_str,
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => Ok(DimOutcome::Failed {
|
||||
reason: format!("decode error: {e}"),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
fn format_label(f: ImageFormat) -> &'static str {
|
||||
match f {
|
||||
ImageFormat::Png => "png",
|
||||
ImageFormat::Jpeg => "jpeg",
|
||||
ImageFormat::WebP => "webp",
|
||||
ImageFormat::Gif => "gif",
|
||||
ImageFormat::Tiff => "tiff",
|
||||
// The `image` crate's enum is non-exhaustive and may grow new
|
||||
// variants in minor versions. Map anything else to a stable
|
||||
// catch-all so callers see a deterministic label.
|
||||
_ => "other",
|
||||
}
|
||||
}
|
||||
189
crates/kebab-parse-image/src/exif_extract.rs
Normal file
189
crates/kebab-parse-image/src/exif_extract.rs
Normal file
@@ -0,0 +1,189 @@
|
||||
//! EXIF whitelist extraction for the `ImageExtractor` (P6-1).
|
||||
//!
|
||||
//! Only the small set of tags listed in the task spec is captured into
|
||||
//! `metadata.user["exif"]`. Everything else (thumbnails, maker notes, full
|
||||
//! camera state) is dropped on the floor so the on-disk wire form keeps a
|
||||
//! tight PII surface.
|
||||
//!
|
||||
//! Whitelisted tags:
|
||||
//!
|
||||
//! | tag | output JSON shape |
|
||||
//! |--------------------|----------------------------|
|
||||
//! | DateTimeOriginal | `"YYYY-MM-DDTHH:MM:SS"` |
|
||||
//! | GPSLatitude / Ref | merged into `gps_lat: f64` |
|
||||
//! | GPSLongitude / Ref | merged into `gps_lon: f64` |
|
||||
//! | Make | `String` |
|
||||
//! | Model | `String` |
|
||||
//! | Orientation | `u32` (1..=8) |
|
||||
//! | Software | `String` |
|
||||
//!
|
||||
//! Any tag whose source value cannot be parsed into the documented shape
|
||||
//! is silently dropped — extractor failure must never fail the whole
|
||||
//! document.
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use exif::{In, Reader, Tag, Value};
|
||||
use serde_json::{Map, Value as JsonValue};
|
||||
|
||||
/// Read EXIF from `bytes` (any container the `exif` crate understands —
|
||||
/// JPEG APP1, PNG eXIf, TIFF, HEIF). Always returns a map; if there is no
|
||||
/// EXIF block (or parsing fails), the map is empty.
|
||||
pub(crate) fn extract_whitelisted(bytes: &[u8]) -> Map<String, JsonValue> {
|
||||
let mut out = Map::new();
|
||||
let exif = match Reader::new().read_from_container(&mut Cursor::new(bytes)) {
|
||||
Ok(e) => e,
|
||||
Err(_) => return out,
|
||||
};
|
||||
|
||||
if let Some(s) = ascii_field(&exif, Tag::DateTimeOriginal, In::PRIMARY) {
|
||||
if let Some(iso) = exif_datetime_to_iso(&s) {
|
||||
out.insert("DateTimeOriginal".into(), JsonValue::String(iso));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(lat) = gps_decimal(&exif, Tag::GPSLatitude, Tag::GPSLatitudeRef) {
|
||||
if let Some(num) = serde_json::Number::from_f64(lat) {
|
||||
out.insert("gps_lat".into(), JsonValue::Number(num));
|
||||
}
|
||||
}
|
||||
if let Some(lon) = gps_decimal(&exif, Tag::GPSLongitude, Tag::GPSLongitudeRef) {
|
||||
if let Some(num) = serde_json::Number::from_f64(lon) {
|
||||
out.insert("gps_lon".into(), JsonValue::Number(num));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(s) = ascii_field(&exif, Tag::Make, In::PRIMARY) {
|
||||
out.insert("Make".into(), JsonValue::String(s));
|
||||
}
|
||||
if let Some(s) = ascii_field(&exif, Tag::Model, In::PRIMARY) {
|
||||
out.insert("Model".into(), JsonValue::String(s));
|
||||
}
|
||||
if let Some(o) = u32_field(&exif, Tag::Orientation, In::PRIMARY) {
|
||||
out.insert("Orientation".into(), JsonValue::Number(o.into()));
|
||||
}
|
||||
if let Some(s) = ascii_field(&exif, Tag::Software, In::PRIMARY) {
|
||||
out.insert("Software".into(), JsonValue::String(s));
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
fn ascii_field(exif: &exif::Exif, tag: Tag, ifd: In) -> Option<String> {
|
||||
let f = exif.get_field(tag, ifd)?;
|
||||
match &f.value {
|
||||
Value::Ascii(parts) => {
|
||||
// The EXIF 2.x ASCII type is one or more null-terminated C
|
||||
// strings. We concatenate without separators since the
|
||||
// whitelisted tags here (Make, Model, Software, DateTime)
|
||||
// never legitimately split into multiple parts.
|
||||
let mut s = String::new();
|
||||
for part in parts {
|
||||
s.push_str(&String::from_utf8_lossy(part));
|
||||
}
|
||||
let trimmed = s.trim_matches(char::from(0)).trim().to_string();
|
||||
if trimmed.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(trimmed)
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn u32_field(exif: &exif::Exif, tag: Tag, ifd: In) -> Option<u32> {
|
||||
let f = exif.get_field(tag, ifd)?;
|
||||
match &f.value {
|
||||
Value::Short(v) => v.first().map(|x| *x as u32),
|
||||
Value::Long(v) => v.first().copied(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an EXIF datetime (`"YYYY:MM:DD HH:MM:SS"`) to ISO-8601
/// `"YYYY-MM-DDTHH:MM:SS"`.
///
/// No timezone is attached: the EXIF string carries none, and no offset
/// tag is consulted here.
///
/// Surrounding whitespace is ignored. Returns `None` unless the trimmed
/// input matches the EXIF layout exactly — 19 bytes, ASCII digits in every
/// numeric position, `:` / space separators in the date part and `:`
/// separators in the time part. Field ranges (month `1..=12`, etc.) are
/// not checked.
fn exif_datetime_to_iso(raw: &str) -> Option<String> {
    // Position-by-position template: `d` stands for an ASCII digit, any
    // other byte must match literally.
    const TEMPLATE: &[u8; 19] = b"dddd:dd:dd dd:dd:dd";

    let bytes = raw.trim().as_bytes();
    if bytes.len() != TEMPLATE.len() {
        return None;
    }

    let well_formed = bytes.iter().zip(TEMPLATE.iter()).all(|(&actual, &slot)| {
        if slot == b'd' {
            actual.is_ascii_digit()
        } else {
            actual == slot
        }
    });
    if !well_formed {
        return None;
    }

    // Every byte is now known to be ASCII, so a byte-wise rewrite is safe:
    // only the two date separators and the date/time gap change.
    let iso = bytes
        .iter()
        .enumerate()
        .map(|(idx, &b)| match idx {
            4 | 7 => '-',
            10 => 'T',
            _ => char::from(b),
        })
        .collect();
    Some(iso)
}
|
||||
|
||||
/// Convert a GPS DMS triple (degrees / minutes / seconds, each
|
||||
/// `Rational`) into a signed decimal degree using the matching N/S/E/W
|
||||
/// reference tag. Returns `None` if either tag is missing or shaped
|
||||
/// unexpectedly.
|
||||
fn gps_decimal(exif: &exif::Exif, value_tag: Tag, ref_tag: Tag) -> Option<f64> {
|
||||
let f = exif.get_field(value_tag, In::PRIMARY)?;
|
||||
let dms = match &f.value {
|
||||
Value::Rational(r) if r.len() == 3 => r,
|
||||
_ => return None,
|
||||
};
|
||||
let deg = rational_to_f64(&dms[0])?;
|
||||
let min = rational_to_f64(&dms[1])?;
|
||||
let sec = rational_to_f64(&dms[2])?;
|
||||
let mut decimal = deg + min / 60.0 + sec / 3600.0;
|
||||
if let Some(reference) = ascii_field(exif, ref_tag, In::PRIMARY) {
|
||||
let r = reference.to_ascii_uppercase();
|
||||
if r.starts_with('S') || r.starts_with('W') {
|
||||
decimal = -decimal;
|
||||
}
|
||||
}
|
||||
if decimal.is_finite() {
|
||||
Some(decimal)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn rational_to_f64(r: &exif::Rational) -> Option<f64> {
|
||||
if r.denom == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(r.num as f64 / r.denom as f64)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for the datetime rewrite and the empty-input path.

    use super::*;

    /// A canonical EXIF timestamp is rewritten to its ISO-8601 form.
    #[test]
    fn datetime_well_formed_converts_to_iso() {
        let converted = exif_datetime_to_iso("2024:08:15 12:34:56");
        assert_eq!(converted.unwrap(), "2024-08-15T12:34:56");
    }

    /// Dashes in the date part are not the EXIF layout and are refused.
    #[test]
    fn datetime_wrong_separator_rejected() {
        let converted = exif_datetime_to_iso("2024-08-15 12:34:56");
        assert!(converted.is_none());
    }

    /// A date without a time component is too short to convert.
    #[test]
    fn datetime_short_string_rejected() {
        let converted = exif_datetime_to_iso("2024:08:15");
        assert!(converted.is_none());
    }

    /// Input with no EXIF container at all yields an empty map, not an error.
    #[test]
    fn extract_on_empty_bytes_yields_empty_map() {
        let tags = extract_whitelisted(&[]);
        assert!(tags.is_empty());
    }
}
|
||||
205
crates/kebab-parse-image/src/lib.rs
Normal file
205
crates/kebab-parse-image/src/lib.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
//! `kebab-parse-image` — image extractor (P6-1).
|
||||
//!
|
||||
//! Implements [`kebab_core::Extractor`] for `MediaType::Image(_)`. One asset
|
||||
//! produces one [`CanonicalDocument`] with a single
|
||||
//! [`Block::ImageRef`](kebab_core::Block::ImageRef). EXIF is captured into
|
||||
//! `metadata.user["exif"]`, dimensions into `metadata.user["dimensions"]`.
|
||||
//! OCR / caption fields stay `None`; later tasks (P6-2 / P6-3) populate
|
||||
//! them.
|
||||
//!
|
||||
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
|
||||
//! ModelCaption stubs), §9.1 (image extraction policy), §9 (versioning).
|
||||
|
||||
mod dims;
|
||||
mod exif_extract;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Extractor, ImageRefBlock, Lang, MediaType, Metadata,
|
||||
ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType,
|
||||
TrustLevel, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::{Map, Value};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Version label reported by `ImageExtractor::parser_version` and fed into
/// the document-id recipe (§9 versioning).
pub const PARSER_VERSION: &str = "image-meta-v1";

/// Largest width or height, in pixels, that dimension probing accepts.
/// Follows the §9.1 "cap decode at ~16k" policy in the design doc.
pub const MAX_DECODE_DIM: u32 = 16 * 1024;
|
||||
|
||||
/// Stateless image extractor. Each extracted asset becomes a single-block
/// `CanonicalDocument` whose body is exactly one [`ImageRefBlock`].
pub struct ImageExtractor;

impl ImageExtractor {
    /// Create an extractor. The type carries no configuration or state.
    pub fn new() -> Self {
        ImageExtractor
    }
}

impl Default for ImageExtractor {
    /// Same as [`ImageExtractor::new`].
    fn default() -> Self {
        ImageExtractor
    }
}
|
||||
|
||||
impl Extractor for ImageExtractor {
|
||||
fn supports(&self, m: &MediaType) -> bool {
|
||||
matches!(m, MediaType::Image(_))
|
||||
}
|
||||
|
||||
fn parser_version(&self) -> ParserVersion {
|
||||
ParserVersion(PARSER_VERSION.to_string())
|
||||
}
|
||||
|
||||
fn extract(
|
||||
&self,
|
||||
ctx: &kebab_core::ExtractContext<'_>,
|
||||
bytes: &[u8],
|
||||
) -> Result<CanonicalDocument> {
|
||||
let asset = ctx.asset;
|
||||
if !self.supports(&asset.media_type) {
|
||||
anyhow::bail!(
|
||||
"kebab-parse-image: unsupported media_type for ImageExtractor: {:?}",
|
||||
asset.media_type
|
||||
);
|
||||
}
|
||||
|
||||
let parser_version = self.parser_version();
|
||||
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version);
|
||||
|
||||
// Dimensions / format. `Err` here means the bytes don't even resolve
|
||||
// to a known image format — we propagate so the caller can skip the
|
||||
// asset (per spec failure modes: "Unsupported format → anyhow::Error").
|
||||
let dim_outcome = dims::probe(bytes).context("guessing image format")?;
|
||||
|
||||
// EXIF is best-effort regardless of dimension outcome. A corrupt
|
||||
// pixel stream may still carry a readable EXIF block (and vice
|
||||
// versa), so the two probes are independent.
|
||||
let exif_map = exif_extract::extract_whitelisted(bytes);
|
||||
|
||||
let (span, dims_value, decode_warning) = match &dim_outcome {
|
||||
dims::DimOutcome::Ok { width, height, format } => {
|
||||
let mut dims = Map::new();
|
||||
dims.insert("w".into(), Value::Number((*width).into()));
|
||||
dims.insert("h".into(), Value::Number((*height).into()));
|
||||
dims.insert("format".into(), Value::String((*format).to_string()));
|
||||
(
|
||||
SourceSpan::Region {
|
||||
x: 0,
|
||||
y: 0,
|
||||
w: *width,
|
||||
h: *height,
|
||||
},
|
||||
Value::Object(dims),
|
||||
None,
|
||||
)
|
||||
}
|
||||
dims::DimOutcome::Failed { reason } => (
|
||||
SourceSpan::Region {
|
||||
x: 0,
|
||||
y: 0,
|
||||
w: 0,
|
||||
h: 0,
|
||||
},
|
||||
Value::Null,
|
||||
Some(reason.clone()),
|
||||
),
|
||||
};
|
||||
|
||||
let block_id = id_for_block(&doc_id, "imageref", &[], 0, &span);
|
||||
|
||||
let workspace_path_str = asset.workspace_path.0.clone();
|
||||
let filename = filename_from_workspace_path(&workspace_path_str);
|
||||
let title = strip_extension(&filename);
|
||||
|
||||
let block = Block::ImageRef(ImageRefBlock {
|
||||
common: CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span,
|
||||
},
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
src: workspace_path_str,
|
||||
alt: filename,
|
||||
ocr: None,
|
||||
caption: None,
|
||||
});
|
||||
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let mut events: Vec<ProvenanceEvent> = Vec::with_capacity(3);
|
||||
events.push(ProvenanceEvent {
|
||||
at: asset.discovered_at,
|
||||
agent: "kb-source-fs".to_string(),
|
||||
kind: ProvenanceKind::Discovered,
|
||||
note: None,
|
||||
});
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-parse-image".to_string(),
|
||||
kind: ProvenanceKind::Parsed,
|
||||
note: Some(format!("parser_version={}", parser_version.0)),
|
||||
});
|
||||
if let Some(reason) = decode_warning {
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-parse-image".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(reason),
|
||||
});
|
||||
}
|
||||
|
||||
// Metadata. `created_at` / `updated_at` are sourced from the asset's
|
||||
// `discovered_at` so the wire form does not embed a fresh timestamp
|
||||
// for every extract call (which would break determinism).
|
||||
let mut user = Map::new();
|
||||
user.insert("exif".into(), Value::Object(exif_map));
|
||||
user.insert("dimensions".into(), dims_value);
|
||||
let metadata = Metadata {
|
||||
aliases: Vec::new(),
|
||||
tags: Vec::new(),
|
||||
created_at: asset.discovered_at,
|
||||
updated_at: asset.discovered_at,
|
||||
source_type: SourceType::Reference,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-parse-image",
|
||||
"extracted image doc_id={} workspace_path={} dim_ok={}",
|
||||
doc_id.0,
|
||||
asset.workspace_path.0,
|
||||
matches!(dim_outcome, dims::DimOutcome::Ok { .. })
|
||||
);
|
||||
|
||||
Ok(CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: asset.asset_id.clone(),
|
||||
workspace_path: asset.workspace_path.clone(),
|
||||
title,
|
||||
lang: Lang("und".to_string()),
|
||||
blocks: vec![block],
|
||||
metadata,
|
||||
provenance: Provenance { events },
|
||||
parser_version,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the text after the last `/` in `p`, or all of `p` when it
/// contains no `/`. A trailing slash therefore yields an empty string.
fn filename_from_workspace_path(p: &str) -> String {
    let name = match p.rsplit_once('/') {
        Some((_, tail)) => tail,
        None => p,
    };
    name.to_string()
}
|
||||
|
||||
/// Drop the final `.ext` suffix from `filename`.
///
/// Only the last dot counts (`"a.tar.gz"` → `"a.tar"`). Names without a
/// dot, and names whose only dot is the leading character (dotfiles such
/// as `".gitignore"`), are returned unchanged.
fn strip_extension(filename: &str) -> String {
    let stem = match filename.rsplit_once('.') {
        Some((before_dot, _)) if !before_dot.is_empty() => before_dot,
        _ => filename,
    };
    stem.to_string()
}
|
||||
241
crates/kebab-parse-image/tests/common/mod.rs
Normal file
241
crates/kebab-parse-image/tests/common/mod.rs
Normal file
@@ -0,0 +1,241 @@
|
||||
//! Test fixture builders for `kebab-parse-image`.
|
||||
//!
|
||||
//! Images are generated in-memory at test time rather than committed as
|
||||
//! binary fixtures so:
|
||||
//!
|
||||
//! * The test binary stays self-contained — no `include_bytes!` paths to
|
||||
//! keep in sync with the workspace layout.
|
||||
//! * Fixture provenance is auditable from source (anyone reading this
|
||||
//! module can see exactly what bytes the tests run against).
|
||||
//!
|
||||
//! All builders are deterministic (no time / RNG dependence).
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use exif::experimental::Writer as ExifWriter;
|
||||
use exif::{Field, In, Rational, Tag, Value};
|
||||
use image::{ImageBuffer, Rgb};
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ExtractConfig, ExtractContext, ImageType, MediaType, RawAsset,
|
||||
SourceUri, WorkspacePath,
|
||||
};
|
||||
use std::path::{Path, PathBuf};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// 100×50 solid-red PNG, no EXIF.
|
||||
pub fn red_100x50_png() -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, image::ImageFormat::Png)
|
||||
.expect("encoding tiny PNG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
/// 10×10 solid-blue PNG, no EXIF (smaller fixture for cases where
|
||||
/// dimensions don't matter).
|
||||
pub fn no_exif_png() -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(10, 10, |_, _| Rgb([0, 0, 255]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, image::ImageFormat::Png)
|
||||
.expect("encoding tiny PNG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
/// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model +
|
||||
/// DateTimeOriginal + Orientation + Software. The base image is a 4×4
|
||||
/// solid white square — pixel content is irrelevant; the test cares about
|
||||
/// the EXIF tags.
|
||||
///
|
||||
/// Construction: encode JPEG via the `image` crate, then splice an EXIF
|
||||
/// APP1 segment immediately after SOI (FF D8). The EXIF blob is built
|
||||
/// with `exif::experimental::Writer`.
|
||||
pub fn exif_with_gps_jpg() -> Vec<u8> {
|
||||
let base = encode_tiny_jpeg();
|
||||
let exif_blob = build_exif_blob_gps();
|
||||
|
||||
let mut out = Vec::with_capacity(base.len() + exif_blob.len() + 16);
|
||||
// SOI: FF D8.
|
||||
out.push(0xFF);
|
||||
out.push(0xD8);
|
||||
// APP1 marker: FF E1.
|
||||
out.push(0xFF);
|
||||
out.push(0xE1);
|
||||
// APP1 segment length (BE): 2 (length field itself) + 6 ("Exif\0\0")
|
||||
// + exif_blob.len(). Pre-validated against the 0xFFFF segment limit.
|
||||
let app1_payload_len = 2 + 6 + exif_blob.len();
|
||||
assert!(
|
||||
app1_payload_len <= u16::MAX as usize,
|
||||
"EXIF segment too large for a single APP1"
|
||||
);
|
||||
out.extend_from_slice(&(app1_payload_len as u16).to_be_bytes());
|
||||
out.extend_from_slice(b"Exif\x00\x00");
|
||||
out.extend_from_slice(&exif_blob);
|
||||
// Append the rest of the JPEG starting just after the original SOI.
|
||||
out.extend_from_slice(&base[2..]);
|
||||
out
|
||||
}
|
||||
|
||||
fn encode_tiny_jpeg() -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(4, 4, |_, _| Rgb([255, 255, 255]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, image::ImageFormat::Jpeg)
|
||||
.expect("encoding tiny JPEG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
fn build_exif_blob_gps() -> Vec<u8> {
|
||||
let make = Field {
|
||||
tag: Tag::Make,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"KebabCam\0".to_vec()]),
|
||||
};
|
||||
let model = Field {
|
||||
tag: Tag::Model,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"X1\0".to_vec()]),
|
||||
};
|
||||
let software = Field {
|
||||
tag: Tag::Software,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"kebab-test\0".to_vec()]),
|
||||
};
|
||||
let datetime = Field {
|
||||
tag: Tag::DateTimeOriginal,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"2024:08:15 12:34:56\0".to_vec()]),
|
||||
};
|
||||
let orientation = Field {
|
||||
tag: Tag::Orientation,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Short(vec![1]),
|
||||
};
|
||||
// GPS — 37.5 N, 127.0 E (Seoul-ish). DMS triple: 37°30'0" N,
|
||||
// 127°0'0" E. Each component is num/denom rationals.
|
||||
let lat = Field {
|
||||
tag: Tag::GPSLatitude,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Rational(vec![
|
||||
Rational { num: 37, denom: 1 },
|
||||
Rational { num: 30, denom: 1 },
|
||||
Rational { num: 0, denom: 1 },
|
||||
]),
|
||||
};
|
||||
let lat_ref = Field {
|
||||
tag: Tag::GPSLatitudeRef,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"N\0".to_vec()]),
|
||||
};
|
||||
let lon = Field {
|
||||
tag: Tag::GPSLongitude,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Rational(vec![
|
||||
Rational { num: 127, denom: 1 },
|
||||
Rational { num: 0, denom: 1 },
|
||||
Rational { num: 0, denom: 1 },
|
||||
]),
|
||||
};
|
||||
let lon_ref = Field {
|
||||
tag: Tag::GPSLongitudeRef,
|
||||
ifd_num: In::PRIMARY,
|
||||
value: Value::Ascii(vec![b"E\0".to_vec()]),
|
||||
};
|
||||
|
||||
let mut writer = ExifWriter::new();
|
||||
writer.push_field(&make);
|
||||
writer.push_field(&model);
|
||||
writer.push_field(&software);
|
||||
writer.push_field(&datetime);
|
||||
writer.push_field(&orientation);
|
||||
writer.push_field(&lat);
|
||||
writer.push_field(&lat_ref);
|
||||
writer.push_field(&lon);
|
||||
writer.push_field(&lon_ref);
|
||||
|
||||
let mut blob = Cursor::new(Vec::new());
|
||||
writer
|
||||
.write(&mut blob, false)
|
||||
.expect("EXIF writer must succeed for the small whitelisted set");
|
||||
blob.into_inner()
|
||||
}
|
||||
|
||||
/// The eight-byte PNG signature with nothing after it.
///
/// Intended as a "recognised format, unreadable dimensions" fixture: the
/// signature is intact, but no IHDR chunk (or any other data) follows.
pub fn corrupt_png() -> Vec<u8> {
    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
    PNG_SIGNATURE.to_vec()
}
|
||||
|
||||
/// Build a `RawAsset` + matching workspace_root + `ExtractContext` for
|
||||
/// the test. `bytes_for_id` is hashed (BLAKE3) to produce the AssetId
|
||||
/// per §4.2 — this matches what `kebab-source-fs` does in production.
|
||||
pub struct ImageFixture {
|
||||
pub asset: RawAsset,
|
||||
pub workspace_root: PathBuf,
|
||||
pub config: ExtractConfig,
|
||||
}
|
||||
|
||||
impl ImageFixture {
|
||||
pub fn ctx(&self) -> ExtractContext<'_> {
|
||||
ExtractContext {
|
||||
asset: &self.asset,
|
||||
workspace_root: &self.workspace_root,
|
||||
config: &self.config,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fixture_for(workspace_path: &str, image_type: ImageType, bytes: &[u8]) -> ImageFixture {
|
||||
let blake = blake3::hash(bytes);
|
||||
let full_hex = blake.to_hex().to_string();
|
||||
let asset_id = kebab_core::id_for_asset(&full_hex);
|
||||
let workspace_path = WorkspacePath::new(workspace_path.to_string()).unwrap();
|
||||
// Fixed timestamp so determinism tests can compare outputs across runs.
|
||||
let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
|
||||
let asset = RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(format!("/tmp/{}", workspace_path.0))),
|
||||
workspace_path,
|
||||
media_type: MediaType::Image(image_type),
|
||||
byte_len: bytes.len() as u64,
|
||||
checksum: Checksum(full_hex),
|
||||
discovered_at,
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("/tmp/fake"),
|
||||
sha: Checksum("0".repeat(64)),
|
||||
},
|
||||
};
|
||||
ImageFixture {
|
||||
asset,
|
||||
workspace_root: PathBuf::from("/tmp/fake-root"),
|
||||
config: ExtractConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Strip the two non-deterministic provenance timestamps (Parsed +
|
||||
/// optional Warning) so determinism / snapshot tests can compare JSON
|
||||
/// without worrying about wall-clock jitter.
|
||||
pub fn strip_dynamic_at(json: &mut serde_json::Value) {
|
||||
if let Some(events) = json
|
||||
.get_mut("provenance")
|
||||
.and_then(|p| p.get_mut("events"))
|
||||
.and_then(|e| e.as_array_mut())
|
||||
{
|
||||
for (i, ev) in events.iter_mut().enumerate() {
|
||||
if i > 0
|
||||
&& let Some(obj) = ev.as_object_mut()
|
||||
{
|
||||
obj.insert("at".into(), serde_json::Value::String("<stripped>".into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// View a string slice as a `Path` without allocating, so call sites can
/// pass plain string literals where a `&Path` is needed.
#[allow(dead_code)]
pub fn fake_path(p: &str) -> &Path {
    p.as_ref()
}
|
||||
249
crates/kebab-parse-image/tests/extractor.rs
Normal file
249
crates/kebab-parse-image/tests/extractor.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
//! Integration tests for `kebab_parse_image::ImageExtractor` (P6-1).
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_core::{Block, Extractor, ImageType, ProvenanceKind, SourceSpan};
|
||||
use kebab_parse_image::ImageExtractor;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::common::{
|
||||
corrupt_png, exif_with_gps_jpg, fixture_for, no_exif_png, red_100x50_png, strip_dynamic_at,
|
||||
};
|
||||
|
||||
fn extract_block(doc: &kebab_core::CanonicalDocument) -> &kebab_core::ImageRefBlock {
|
||||
assert_eq!(doc.blocks.len(), 1, "exactly one block expected");
|
||||
match &doc.blocks[0] {
|
||||
Block::ImageRef(b) => b,
|
||||
other => panic!("expected ImageRef, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// A 100x50 PNG yields the expected title, language, parser version,
/// `dimensions` metadata, and a single full-image `ImageRef` block.
#[test]
fn png_decode_produces_correct_dimensions() {
    let png = red_100x50_png();
    let fx = fixture_for("photos/red-100x50.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &png)
        .expect("PNG extraction must succeed");

    // Document-level fields.
    assert_eq!(doc.title, "red-100x50");
    assert_eq!(doc.lang.0, "und");
    assert_eq!(doc.parser_version.0, kebab_parse_image::PARSER_VERSION);

    // `dimensions` metadata: width, height and format.
    let dimensions = doc
        .metadata
        .user
        .get("dimensions")
        .expect("dimensions key present")
        .as_object()
        .expect("dimensions is an object");
    let expected_dims = [
        ("w", Value::Number(100.into())),
        ("h", Value::Number(50.into())),
        ("format", Value::String("png".into())),
    ];
    for (key, want) in &expected_dims {
        assert_eq!(dimensions.get(*key), Some(want));
    }

    // The one block references the asset and carries no OCR / caption yet.
    let image_ref = extract_block(&doc);
    assert_eq!(image_ref.alt, "red-100x50.png");
    assert_eq!(image_ref.src, "photos/red-100x50.png");
    assert_eq!(image_ref.asset_id, Some(fx.asset.asset_id.clone()));
    assert!(image_ref.ocr.is_none());
    assert!(image_ref.caption.is_none());

    // The span must cover the whole image.
    let span = &image_ref.common.source_span;
    let SourceSpan::Region { x, y, w, h } = span else {
        panic!("expected Region span, got {span:?}");
    };
    assert_eq!((*x, *y, *w, *h), (0, 0, 100, 50));
}
|
||||
|
||||
/// A JPEG carrying EXIF + GPS data exposes the expected tag values under
/// `metadata.user.exif`, and no key outside the allow-list appears there.
#[test]
fn jpeg_with_exif_gps_captures_whitelisted_tags() {
    let jpeg = exif_with_gps_jpg();
    let fx = fixture_for("img/seoul.jpg", ImageType::Jpeg, &jpeg);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &jpeg)
        .expect("JPEG extraction must succeed");

    let exif = doc
        .metadata
        .user
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");

    // String-valued tags.
    let expected_text = [
        ("Make", "KebabCam"),
        ("Model", "X1"),
        ("Software", "kebab-test"),
        ("DateTimeOriginal", "2024-08-15T12:34:56"),
    ];
    for (tag, want) in expected_text {
        assert_eq!(exif.get(tag), Some(&Value::String(want.into())));
    }
    assert_eq!(exif.get("Orientation"), Some(&Value::Number(1.into())));

    // GPS coordinates are compared as decimal degrees with a 1e-6 tolerance.
    let lat = exif
        .get("gps_lat")
        .and_then(Value::as_f64)
        .expect("gps_lat");
    let lon = exif
        .get("gps_lon")
        .and_then(Value::as_f64)
        .expect("gps_lon");
    assert!((lat - 37.5).abs() < 1e-6, "lat={lat}");
    assert!((lon - 127.0).abs() < 1e-6, "lon={lon}");

    // Maker notes / thumbnails / unrelated tags must NOT have leaked in.
    const ALLOWED: [&str; 7] = [
        "Make",
        "Model",
        "Software",
        "DateTimeOriginal",
        "Orientation",
        "gps_lat",
        "gps_lon",
    ];
    for key in exif.keys() {
        assert!(
            ALLOWED.contains(&key.as_str()),
            "non-whitelisted EXIF key leaked: {key}"
        );
    }
}
|
||||
|
||||
/// A PNG without EXIF data still gets an `exif` metadata object, and that
/// object is empty.
#[test]
fn no_exif_image_yields_empty_exif_map() {
    let png = no_exif_png();
    let fx = fixture_for("img/blank.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &png)
        .expect("PNG extraction must succeed");

    let exif = doc
        .metadata
        .user
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");
    assert!(exif.is_empty(), "no-EXIF PNG must yield empty exif map: {exif:?}");
}
|
||||
|
||||
/// A corrupt PNG must still extract successfully: `dimensions` is null,
/// `exif` is empty, the span is a zero-sized region, and exactly one
/// Warning provenance event is recorded by `kb-parse-image`.
#[test]
fn corrupt_image_emits_warning_no_panic() {
    let broken = corrupt_png();
    let fx = fixture_for("img/corrupt.png", ImageType::Png, &broken);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &broken)
        .expect("corrupt PNG must NOT cause an Err — warning provenance event instead");

    let user_meta = &doc.metadata.user;

    // dimensions = null
    assert_eq!(
        user_meta.get("dimensions"),
        Some(&Value::Null),
        "corrupt image must record dimensions = null"
    );

    // exif = {}
    let exif = user_meta
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");
    assert!(exif.is_empty());

    // Span is Region(0,0,0,0).
    let image_ref = extract_block(&doc);
    let span_is_empty_region = matches!(
        image_ref.common.source_span,
        SourceSpan::Region { x: 0, y: 0, w: 0, h: 0 }
    );
    assert!(span_is_empty_region);

    // Warning provenance event.
    let warning_events: Vec<_> = doc
        .provenance
        .events
        .iter()
        .filter(|event| event.kind == ProvenanceKind::Warning)
        .collect();
    assert_eq!(warning_events.len(), 1, "expected exactly one Warning event");
    assert_eq!(warning_events[0].agent, "kb-parse-image");
}
|
||||
|
||||
/// Bytes that are not an image at all must make `extract` return `Err`.
#[test]
fn unsupported_bytes_return_err() {
    let garbage = b"not an image at all".to_vec();
    let fx = fixture_for("img/garbage.png", ImageType::Png, &garbage);

    let outcome = ImageExtractor::new().extract(&fx.ctx(), &garbage);

    assert!(
        outcome.is_err(),
        "unrecognised format must propagate Err so caller skips"
    );
}
|
||||
|
||||
/// A clean extraction records exactly two provenance events, Discovered
/// then Parsed, with the expected agents and the fixture's discovery time.
#[test]
fn provenance_events_are_in_order() {
    let png = red_100x50_png();
    let fx = fixture_for("a/b.png", ImageType::Png, &png);
    let doc = ImageExtractor::new().extract(&fx.ctx(), &png).unwrap();

    let events = &doc.provenance.events;
    let kinds: Vec<_> = events.iter().map(|event| event.kind).collect();
    assert_eq!(
        kinds,
        vec![ProvenanceKind::Discovered, ProvenanceKind::Parsed]
    );

    // First event: discovery, stamped with the asset's own timestamp.
    assert_eq!(events[0].agent, "kb-source-fs");
    assert_eq!(events[0].at, fx.asset.discovered_at);
    // Second event: the parse performed by this extractor.
    assert_eq!(events[1].agent, "kb-parse-image");
}
|
||||
|
||||
/// Extracting the same bytes through two independently built fixtures
/// must produce the same document id and the same block id.
#[test]
fn determinism_identical_bytes_produce_identical_ids() {
    let png = red_100x50_png();
    let first_fx = fixture_for("a/b.png", ImageType::Png, &png);
    let second_fx = fixture_for("a/b.png", ImageType::Png, &png);

    let extractor = ImageExtractor::new();
    let first_doc = extractor.extract(&first_fx.ctx(), &png).unwrap();
    let second_doc = extractor.extract(&second_fx.ctx(), &png).unwrap();

    assert_eq!(first_doc.doc_id, second_doc.doc_id);

    let first_block_id = &extract_block(&first_doc).common.block_id;
    let second_block_id = &extract_block(&second_doc).common.block_id;
    assert_eq!(first_block_id, second_block_id);
}
|
||||
|
||||
/// Two extractions of the same PNG serialise to equal JSON once the
/// dynamic provenance timestamps are stripped, and a handful of fields
/// keep their pinned values.
#[test]
fn snapshot_red_100x50_canonical_document_stable() {
    let bytes = red_100x50_png();
    let fx = fixture_for("photos/red-100x50.png", ImageType::Png, &bytes);
    let extractor = ImageExtractor::new();

    // Extract, serialise, and blank out the wall-clock timestamps.
    let render = || {
        let doc = extractor.extract(&fx.ctx(), &bytes).unwrap();
        let mut json = serde_json::to_value(&doc).unwrap();
        strip_dynamic_at(&mut json);
        json
    };
    let j1 = render();
    let j2 = render();

    assert_eq!(
        j1, j2,
        "two extractions of identical bytes must serialise byte-for-byte equal (modulo dynamic timestamps)"
    );

    // Pin a few fields by exact value so a future regression in the
    // ID recipe / serialisation order surfaces here, not at the JSON
    // diff level only.
    for (key, want) in [("title", "red-100x50"), ("lang", "und")] {
        assert_eq!(j1[key], want);
    }
    assert_eq!(j1["parser_version"], kebab_parse_image::PARSER_VERSION);
    for key in ["schema_version", "doc_version"] {
        assert_eq!(j1[key], 1);
    }

    let blocks = &j1["blocks"];
    assert_eq!(blocks.as_array().unwrap().len(), 1);
    assert_eq!(blocks[0]["kind"], "imageref");

    let metadata = &j1["metadata"];
    assert_eq!(metadata["source_type"], "reference");
    assert_eq!(metadata["trust_level"], "primary");
}
|
||||
|
||||
/// `supports` accepts image media types (PNG, JPEG) and rejects Markdown
/// and PDF.
#[test]
fn supports_only_image_media_type() {
    let extractor = ImageExtractor::new();

    for image_type in [ImageType::Png, ImageType::Jpeg] {
        assert!(extractor.supports(&kebab_core::MediaType::Image(image_type)));
    }

    for non_image in [kebab_core::MediaType::Markdown, kebab_core::MediaType::Pdf] {
        assert!(!extractor.supports(&non_image));
    }
}
|
||||
|
||||
/// `extract` must return `Err` when the asset's declared media type is not
/// an image, even though the bytes themselves are a valid PNG.
#[test]
fn rejects_extract_when_media_type_mismatches() {
    let png = red_100x50_png();
    let mut fx = fixture_for("a/b.md", ImageType::Png, &png);
    // Override the media type the fixture was built with.
    fx.asset.media_type = kebab_core::MediaType::Markdown;

    let outcome = ImageExtractor::new().extract(&fx.ctx(), &png);

    assert!(outcome.is_err());
}
|
||||
@@ -3,7 +3,7 @@ phase: P6
|
||||
component: kebab-parse-image (image extractor + EXIF)
|
||||
task_id: p6-1
|
||||
title: "Image Extractor producing single-block CanonicalDocument + EXIF metadata"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p0-1, p1-6]
|
||||
unblocks: [p6-2, p6-3]
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
|
||||
Reference in New Issue
Block a user