- 새 crate kebab-parse-image 추가 (workspace 19개째). MediaType::Image(_)
자산을 단일-블록 CanonicalDocument 로 변환하는 ImageExtractor 구현.
- parser_version "image-meta-v1" (§9 versioning).
- 본문은 Block::ImageRef 1건만 포함 — OCR / caption 필드는 None 으로
남겨 두고 P6-2 / P6-3 에서 채운다.
- EXIF 화이트리스트 (§9.1, PII 표면 최소화):
Make / Model / Software / DateTimeOriginal / Orientation /
GPSLatitude(+Ref) / GPSLongitude(+Ref). MakerNote / Thumbnail / 기타
태그는 폐기. DateTimeOriginal 은 EXIF "YYYY:MM:DD HH:MM:SS" → ISO-8601
("YYYY-MM-DDTHH:MM:SS") 변환. GPS DMS triple + N/S/E/W ref → signed
decimal degree (metadata.user.exif 의 gps_lat / gps_lon 키로 기록).
- 차원: image::ImageReader 헤더만 읽어 (w, h, format) 획득. 16k×16k cap
초과 또는 디코드 실패 → metadata.user.dimensions = null + Provenance
Warning 이벤트 (Err 아님). 포맷 자체 인식 실패 → anyhow::Error
(caller skip).
- SourceSpan::Region { 0, 0, w, h } 으로 전체 이미지 영역 표기. 결정성:
동일 bytes + 동일 parser_version → 동일 doc_id + block_id (§4.2 ID
recipe 그대로 사용).
- metadata.source_type = Reference, trust_level = Primary, lang = "und".
title = 확장자 제외 파일명, alt = 파일명.
- 의존성 경계 (§8): kebab-core 만 + image 0.25 (default features off,
png/jpeg/webp/gif/tiff 만), kamadak-exif 0.6, anyhow / serde /
serde_json / time / tracing / thiserror. kebab-source-fs · parse-md ·
store-* · embed* · llm* · rag · UI crate 미참조.
- 테스트 14개 (4 unit + 10 integration):
• PNG 차원 추출, JPEG EXIF GPS 추출 (DMS → decimal 변환 정확도 1e-6),
EXIF 없는 PNG → 빈 map, 손상 PNG → warning + null dims (panic 없음),
인식 불가 bytes → Err, 결정성, 스냅샷, supports() 매칭, media_type
불일치 거부.
• 픽스처는 in-memory 생성 (PNG 는 image crate, EXIF JPEG 는 kamadak-exif 의
Writer 로 EXIF blob 을 만든 뒤 SOI 마커 직후에 APP1 세그먼트로 splice) —
바이너리 fixture 커밋 없음.
- HEIC / RAW 는 spec 상 v1 out of scope (image crate 미지원, Apple
Vision sidecar 가 추후 P+ 에서 채움).
- tasks/p6/p6-1-image-extractor-exif.md status: planned → completed.
contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
sections: §3.4 Block::ImageRef + ImageRefBlock, §3.7a OcrText /
ModelCaption stubs, §9.1 image extraction policy, §9 versioning.
250 lines
8.5 KiB
Rust
250 lines
8.5 KiB
Rust
//! Integration tests for `kebab_parse_image::ImageExtractor` (P6-1).
|
|
|
|
mod common;
|
|
|
|
use kebab_core::{Block, Extractor, ImageType, ProvenanceKind, SourceSpan};
|
|
use kebab_parse_image::ImageExtractor;
|
|
use serde_json::Value;
|
|
|
|
use crate::common::{
|
|
corrupt_png, exif_with_gps_jpg, fixture_for, no_exif_png, red_100x50_png, strip_dynamic_at,
|
|
};
|
|
|
|
fn extract_block(doc: &kebab_core::CanonicalDocument) -> &kebab_core::ImageRefBlock {
|
|
assert_eq!(doc.blocks.len(), 1, "exactly one block expected");
|
|
match &doc.blocks[0] {
|
|
Block::ImageRef(b) => b,
|
|
other => panic!("expected ImageRef, got {other:?}"),
|
|
}
|
|
}
|
|
|
|
/// Extracting a 100x50 PNG must populate the document header fields, the
/// `dimensions` metadata object, and a single `ImageRef` block whose span is
/// the full-image region.
#[test]
fn png_decode_produces_correct_dimensions() {
    let png = red_100x50_png();
    let fixture = fixture_for("photos/red-100x50.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fixture.ctx(), &png)
        .expect("PNG extraction must succeed");

    // Document-level fields.
    assert_eq!(doc.title, "red-100x50");
    assert_eq!(doc.lang.0, "und");
    assert_eq!(doc.parser_version.0, kebab_parse_image::PARSER_VERSION);

    // `metadata.user.dimensions` must be an object carrying w / h / format.
    let dimensions = doc
        .metadata
        .user
        .get("dimensions")
        .expect("dimensions key present")
        .as_object()
        .expect("dimensions is an object");
    let expected_dimensions = [
        ("w", Value::Number(100.into())),
        ("h", Value::Number(50.into())),
        ("format", Value::String("png".into())),
    ];
    for (key, want) in expected_dimensions {
        assert_eq!(dimensions.get(key), Some(&want));
    }

    // The sole block references the asset and has no OCR / caption yet.
    let image_ref = extract_block(&doc);
    assert_eq!(image_ref.alt, "red-100x50.png");
    assert_eq!(image_ref.src, "photos/red-100x50.png");
    assert_eq!(image_ref.asset_id.as_ref(), Some(&fixture.asset.asset_id));
    assert!(image_ref.ocr.is_none());
    assert!(image_ref.caption.is_none());

    // The span must be a region covering the whole 100x50 image.
    let span = &image_ref.common.source_span;
    if let SourceSpan::Region { x, y, w, h } = span {
        assert_eq!((*x, *y, *w, *h), (0, 0, 100, 50));
    } else {
        panic!("expected Region span, got {span:?}");
    }
}
|
|
|
|
/// A JPEG carrying EXIF + GPS data must surface exactly the whitelisted tags
/// under `metadata.user.exif`, with GPS coordinates as decimal degrees.
#[test]
fn jpeg_with_exif_gps_captures_whitelisted_tags() {
    let jpeg = exif_with_gps_jpg();
    let fixture = fixture_for("img/seoul.jpg", ImageType::Jpeg, &jpeg);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fixture.ctx(), &jpeg)
        .expect("JPEG extraction must succeed");

    let exif = doc
        .metadata
        .user
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");

    // String-valued tags, checked in the same order as the whitelist.
    let string_tags = [
        ("Make", "KebabCam"),
        ("Model", "X1"),
        ("Software", "kebab-test"),
        ("DateTimeOriginal", "2024-08-15T12:34:56"),
    ];
    for (tag, want) in string_tags {
        assert_eq!(exif.get(tag), Some(&Value::String(want.into())));
    }
    assert_eq!(exif.get("Orientation"), Some(&Value::Number(1.into())));

    // GPS values are floats; compare against the expected values within 1e-6.
    let coordinate = |key: &str| exif.get(key).and_then(Value::as_f64).expect(key);
    let (lat, lon) = (coordinate("gps_lat"), coordinate("gps_lon"));
    assert!((lat - 37.5).abs() < 1e-6, "lat={lat}");
    assert!((lon - 127.0).abs() < 1e-6, "lon={lon}");

    // Maker notes / thumbnails / unrelated tags must NOT have leaked in.
    let allowed: [&str; 7] = [
        "Make",
        "Model",
        "Software",
        "DateTimeOriginal",
        "Orientation",
        "gps_lat",
        "gps_lon",
    ];
    if let Some(leaked) = exif.keys().find(|key| !allowed.contains(&key.as_str())) {
        panic!("non-whitelisted EXIF key leaked: {leaked}");
    }
}
|
|
|
|
/// A PNG without EXIF data must still expose an `exif` object under
/// `metadata.user`, and that object must be empty.
#[test]
fn no_exif_image_yields_empty_exif_map() {
    let png = no_exif_png();
    let fixture = fixture_for("img/blank.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fixture.ctx(), &png)
        .expect("PNG extraction must succeed");

    let user_metadata = &doc.metadata.user;
    let exif = user_metadata
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");
    assert!(exif.is_empty(), "no-EXIF PNG must yield empty exif map: {exif:?}");
}
|
|
|
|
/// A corrupt PNG must still yield a document: `dimensions` is null, `exif` is
/// an empty object, the span is the zero region, and exactly one `Warning`
/// provenance event is recorded by the image parser.
#[test]
fn corrupt_image_emits_warning_no_panic() {
    let broken_png = corrupt_png();
    let fixture = fixture_for("img/corrupt.png", ImageType::Png, &broken_png);
    let extractor = ImageExtractor::new();
    let doc = extractor
        .extract(&fixture.ctx(), &broken_png)
        .expect("corrupt PNG must NOT cause an Err — warning provenance event instead");

    let user_metadata = &doc.metadata.user;

    // dimensions = null
    let dimensions = user_metadata.get("dimensions");
    assert_eq!(
        dimensions,
        Some(&Value::Null),
        "corrupt image must record dimensions = null"
    );

    // exif = {}
    let exif = user_metadata
        .get("exif")
        .and_then(Value::as_object)
        .expect("exif object present");
    assert!(exif.is_empty());

    // Span is Region(0,0,0,0).
    let span = &extract_block(&doc).common.source_span;
    assert!(matches!(
        span,
        SourceSpan::Region { x: 0, y: 0, w: 0, h: 0 }
    ));

    // Warning provenance event.
    let warning_events: Vec<_> = doc
        .provenance
        .events
        .iter()
        .filter(|event| event.kind == ProvenanceKind::Warning)
        .collect();
    assert_eq!(warning_events.len(), 1, "expected exactly one Warning event");
    assert_eq!(warning_events[0].agent, "kb-parse-image");
}
|
|
|
|
/// Bytes that are not any recognisable image format must make `extract`
/// return `Err` rather than a document.
#[test]
fn unsupported_bytes_return_err() {
    let garbage = "not an image at all".as_bytes().to_vec();
    let fixture = fixture_for("img/garbage.png", ImageType::Png, &garbage);
    let extractor = ImageExtractor::new();
    let outcome = extractor.extract(&fixture.ctx(), &garbage);
    assert!(
        outcome.is_err(),
        "unrecognised format must propagate Err so caller skips"
    );
}
|
|
|
|
/// A successful extraction records exactly two provenance events, in order:
/// `Discovered` (agent `kb-source-fs`, stamped with the asset's discovery
/// time) followed by `Parsed` (agent `kb-parse-image`).
#[test]
fn provenance_events_are_in_order() {
    let png = red_100x50_png();
    let fixture = fixture_for("a/b.png", ImageType::Png, &png);
    let doc = ImageExtractor::new()
        .extract(&fixture.ctx(), &png)
        .unwrap();

    let events = &doc.provenance.events;
    let kinds: Vec<_> = events.iter().map(|event| event.kind).collect();
    assert_eq!(
        kinds,
        [ProvenanceKind::Discovered, ProvenanceKind::Parsed]
    );

    // The kinds assertion above guarantees both indices exist.
    let (discovered, parsed) = (&events[0], &events[1]);
    assert_eq!(discovered.agent, "kb-source-fs");
    assert_eq!(discovered.at, fixture.asset.discovered_at);
    assert_eq!(parsed.agent, "kb-parse-image");
}
|
|
|
|
/// Two extractions of the same bytes, through two separately built but
/// identical fixtures, must agree on both the document id and the block id.
#[test]
fn determinism_identical_bytes_produce_identical_ids() {
    let png = red_100x50_png();
    let fixture_a = fixture_for("a/b.png", ImageType::Png, &png);
    let fixture_b = fixture_for("a/b.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();

    let first = extractor.extract(&fixture_a.ctx(), &png).unwrap();
    let second = extractor.extract(&fixture_b.ctx(), &png).unwrap();

    assert_eq!(first.doc_id, second.doc_id);
    assert_eq!(
        extract_block(&first).common.block_id,
        extract_block(&second).common.block_id
    );
}
|
|
|
|
/// Serialising two extractions of the same PNG must give equal JSON once
/// `strip_dynamic_at` has been applied, and a handful of fields are pinned to
/// exact values.
#[test]
fn snapshot_red_100x50_canonical_document_stable() {
    let png = red_100x50_png();
    let fixture = fixture_for("photos/red-100x50.png", ImageType::Png, &png);
    let extractor = ImageExtractor::new();
    let first = extractor.extract(&fixture.ctx(), &png).unwrap();
    let second = extractor.extract(&fixture.ctx(), &png).unwrap();

    // Serialise a document and run it through `strip_dynamic_at` before comparing.
    let normalise = |doc: &kebab_core::CanonicalDocument| {
        let mut json = serde_json::to_value(doc).unwrap();
        strip_dynamic_at(&mut json);
        json
    };
    let (j1, j2) = (normalise(&first), normalise(&second));
    assert_eq!(
        j1, j2,
        "two extractions of identical bytes must serialise byte-for-byte equal (modulo dynamic timestamps)"
    );

    // Pin a few fields by exact value so a future regression in the
    // ID recipe / serialisation order surfaces here, not at the JSON
    // diff level only.
    assert_eq!(j1["title"], "red-100x50");
    assert_eq!(j1["lang"], "und");
    assert_eq!(j1["parser_version"], kebab_parse_image::PARSER_VERSION);
    for version_field in ["schema_version", "doc_version"] {
        assert_eq!(j1[version_field], 1);
    }

    let blocks = j1["blocks"].as_array().unwrap();
    assert_eq!(blocks.len(), 1);
    assert_eq!(blocks[0]["kind"], "imageref");

    let metadata = &j1["metadata"];
    assert_eq!(metadata["source_type"], "reference");
    assert_eq!(metadata["trust_level"], "primary");
}
|
|
|
|
/// `supports` must accept image media types (PNG and JPEG are checked) and
/// reject Markdown and PDF.
#[test]
fn supports_only_image_media_type() {
    let extractor = ImageExtractor::new();

    for image_type in [ImageType::Png, ImageType::Jpeg] {
        assert!(extractor.supports(&kebab_core::MediaType::Image(image_type)));
    }

    for non_image in [kebab_core::MediaType::Markdown, kebab_core::MediaType::Pdf] {
        assert!(!extractor.supports(&non_image));
    }
}
|
|
|
|
/// `extract` must return `Err` when the asset's declared media type is not an
/// image, even though the bytes themselves are a valid PNG.
#[test]
fn rejects_extract_when_media_type_mismatches() {
    let png = red_100x50_png();
    let mut fixture = fixture_for("a/b.md", ImageType::Png, &png);
    // Override the declared media type so it no longer matches the bytes.
    fixture.asset.media_type = kebab_core::MediaType::Markdown;

    let extractor = ImageExtractor::new();
    assert!(extractor.extract(&fixture.ctx(), &png).is_err());
}
|