review(p6-4): 회차 1 지적 반영

- src/lib.rs:
  • `ingest_one_asset` 의 doc-comment 가 새 `ImagePipeline` struct 와
    합쳐지던 (rustdoc 가 두 doc 을 struct 의 것으로 합치던) 문제
    해소 — 두 doc-comment 위치 교환 + 빈 줄 분리.
  • `if let Some(Block::ImageRef(...)) = blocks.first_mut()` 의
    silent-skip 분기를 `match` 의 `other` arm 으로 명시 — 미래에
    P6-1 contract 가 깨지면 `tracing::warn!` + Provenance Warning +
    `IngestItem.warnings` 에 \"ImageDispatchAnomaly\" 노트로 즉시
    가시화. 운영 디버깅 단서 제공.
  • OCR 실패 분기 + caption 실패 분기의 ~25줄 boilerplate 를
    `record_image_analysis_failure` 헬퍼로 추출 — 두 호출이 한 줄로
    줄고 미래 ProvenanceEvent 필드 변경이 한 곳에서 끝남.
  • 분석 단계 Warning 이벤트가 fn 진입 시 캡처한 단일
    `OffsetDateTime::now_utc()` 를 공유 — spec Risks/notes 의
    \"Determinism stress: must not introduce a second `now()` call
    between extract and apply_ocr/caption\" 약속 회복.
  • 경고 라벨을 markdown 경로의 `WarningKind` 컨벤션 (`{kind}: {note}`)
    에 맞춤 — `\"ocr_failed: ...\"` → `\"OcrFailed: ...\"`,
    `\"caption_failed: ...\"` → `\"CaptionFailed: ...\"`. 같은 wire
    필드 (`IngestItem.warnings`) 가 두 갈래의 다른 형식을 갖던
    inconsistency 해소.
- tests/image_pipeline.rs:
  • 회귀 테스트의 \"ocr_failed\" assertion 을 \"OcrFailed\" 로 갱신.

cargo test -p kebab-app -p kebab-chunk — 전부 pass.
cargo clippy --workspace --all-targets -- -D warnings — pass.
This commit is contained in:
2026-05-02 07:42:44 +00:00
parent ca0567c72b
commit 469a1a34ec
2 changed files with 96 additions and 46 deletions

View File

@@ -467,10 +467,6 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
/// `<… as JobRepo>` to be explicit.
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
@@ -480,6 +476,10 @@ struct ImagePipeline<'a> {
caption_llm: Option<&'a dyn LanguageModel>,
}
/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
#[allow(clippy::too_many_arguments)]
fn ingest_one_asset(
app: &App,
@@ -742,62 +742,78 @@ fn ingest_one_image_asset(
// `block.ocr` / `block.caption` stay `None`. P6-4 spec
// explicitly: such partial failures do NOT increment the
// `errors` counter.
//
// Determinism stress (per spec Risks): the per-document
// Provenance timestamps for any analysis-stage Warning
// events share a single `now_utc()` reading taken once
// here, mirroring `kb-normalize::build_canonical_document`.
let lang_hint = lang_hint_from_doc(&canonical);
let now = time::OffsetDateTime::now_utc();
let mut warning_notes: Vec<String> = Vec::new();
if !canonical.blocks.is_empty() {
// P6-1 contract: image documents always have exactly one
// `Block::ImageRef`. Defensive match keeps us forward-compatible.
if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() {
if let Some(engine) = ocr_engine {
if let Err(e) = apply_ocr(
match canonical.blocks.first_mut() {
Some(Block::ImageRef(block)) => {
if let Some(engine) = ocr_engine
&& let Err(e) = apply_ocr(
engine,
&bytes,
block,
lang_hint.as_ref(),
&mut canonical.provenance.events,
) {
let note = format!("ocr_failed: {e:#}");
tracing::warn!(
target: "kebab-app",
path = %asset.workspace_path.0,
"{}",
note
);
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
at: time::OffsetDateTime::now_utc(),
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(note.clone()),
});
warning_notes.push(note);
}
)
{
record_image_analysis_failure(
asset,
&mut canonical.provenance.events,
&mut warning_notes,
"OcrFailed",
e,
now,
);
}
if let Some(llm) = caption_llm {
if let Err(e) = apply_caption(
if let Some(llm) = caption_llm
&& let Err(e) = apply_caption(
llm,
&bytes,
block,
lang_hint.as_ref(),
&app.config,
&mut canonical.provenance.events,
) {
let note = format!("caption_failed: {e:#}");
tracing::warn!(
target: "kebab-app",
path = %asset.workspace_path.0,
"{}",
note
);
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
at: time::OffsetDateTime::now_utc(),
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(note.clone()),
});
warning_notes.push(note);
}
)
{
record_image_analysis_failure(
asset,
&mut canonical.provenance.events,
&mut warning_notes,
"CaptionFailed",
e,
now,
);
}
}
// P6-1 contract: image documents always have exactly one
// `Block::ImageRef`. If a future task introduces multi-block
// image documents the silent-skip would mask a real bug, so
// this arm surfaces the divergence loudly.
other => {
tracing::warn!(
target: "kebab-app",
path = %asset.workspace_path.0,
blocks = canonical.blocks.len(),
"image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
other.map(|b| std::mem::discriminant(b))
);
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
at: now,
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(
"image document missing leading ImageRef block — OCR/caption skipped"
.to_string(),
),
});
warning_notes
.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
}
}
// 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
@@ -884,6 +900,39 @@ fn ingest_one_image_asset(
})
}
/// Centralised handling for image-analysis (OCR / caption) failures.
/// Emits a `tracing::warn!`, appends a `ProvenanceKind::Warning`
/// event sharing the caller's per-document `now`, and pushes a
/// `<WarningKind>: <err>` note onto the `IngestItem.warnings` slot
/// using the same shape the markdown path uses (so downstream wire
/// readers don't have to learn two formats — see kb-normalize's
/// `warning_agent`).
fn record_image_analysis_failure(
asset: &RawAsset,
events: &mut Vec<kebab_core::ProvenanceEvent>,
warning_notes: &mut Vec<String>,
kind_label: &str,
err: anyhow::Error,
now: time::OffsetDateTime,
) {
let detail = format!("{err:#}");
let note = format!("{kind_label}: {detail}");
tracing::warn!(
target: "kebab-app",
path = %asset.workspace_path.0,
"image analysis stage {} failed: {}",
kind_label,
detail
);
events.push(kebab_core::ProvenanceEvent {
at: now,
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(note.clone()),
});
warning_notes.push(note);
}
/// Pull the BCP-47 language hint from the canonical document. P6-1
/// stamps `Lang("und")` by default; image-pipeline OCR / caption
/// adapters special-case "und" so the hint is intentionally dropped

View File

@@ -244,8 +244,9 @@ async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
.expect("Provenance Warning attributed to kb-app");
let note = warning.note.as_deref().unwrap_or("");
assert!(
note.contains("ocr_failed"),
"warning note must describe OCR failure: {note}"
note.contains("OcrFailed"),
"warning note must describe OCR failure with OcrFailed prefix \
(markdown-style WarningKind format): {note}"
);
}