From 6e4884aff802a5fc2b9784614761b817ab7d4691 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 08:13:41 +0000 Subject: [PATCH] =?UTF-8?q?fix(kebab-app):=20IngestReport.errors=20double-?= =?UTF-8?q?count=20regression=20=E2=80=94=20increment=20only=20in=20`match?= =?UTF-8?q?=20item.kind=20{=20Error=20=3D>=20...=20}`=20arm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 수동 스모크 검증 (12 PNG + 손상 PNG) 중 발견. `IngestReport.errors` 가 자산 한 장당 2회 증가해서 `scanned = new + updated + skipped + errors` invariant 가 깨짐: - `garbage.png` (이미지 아닌 바이트, .png 확장자만) 1장 + 정상 자산 3장 → 기대 `scanned=4 errors=1`, 실제 `scanned=4 errors=2`. - 원인: `match item { Err(e) => { error_count += 1; IngestItem {...} } }` 에서 1회 증가 후, 직후 `match item.kind { Error => { error_count += 1 } }` arm 에서 또 1회 증가. - markdown 경로의 `ingest_one_asset` Err 가 거의 발생 안 해서 P6-4 머지 전까지 표면화 안 됐던 기존 결함. 이미지 dispatch 가 garbage bytes 를 Err 로 흘려보내며 처음으로 노출. 수정: `Err(e)` 분기의 `error_count.saturating_add(1)` 제거. 단일 증가 지점은 `match item.kind { Error => ... }` arm. 코멘트로 의도 명시. 회귀 테스트 추가 (`tests/image_pipeline.rs`): - `garbage_png_increments_errors_counter_exactly_once` — 정확히 1 증가 + `scanned == new + updated + skipped + errors` invariant 검증. 검증 — release binary + 실 Ollama (192.168.0.47 / gemma4:e4b): ``` $ kebab --json ingest scanned=4 new=3 updated=0 skipped=0 errors=1 error garbage.png (extract Err — unrecognised format) new intro.md new normal.png (OCR success) new truncated.png (OcrFailed warning, asset still indexed) ``` cargo test --workspace --no-fail-fast -j 1 — 전부 pass. cargo clippy --workspace --all-targets -- -D warnings — pass. cargo test -p kebab-app --test image_pipeline — 6 pass (5 기존 + 1 회귀). --- crates/kebab-app/src/lib.rs | 7 ++- crates/kebab-app/tests/image_pipeline.rs | 56 +++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 3cf31fc..d916f7e 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -274,7 +274,12 @@ pub fn ingest_with_config( error = %e, "kb-app::ingest: per-file fatal" ); - error_count = error_count.saturating_add(1); + // Note: `error_count += 1` happens below in the + // `match item.kind { Error => ... }` arm — incrementing + // here too would double-count (a regression first + // surfaced by P6-4 image dispatch where Err returns + // are common; markdown rarely propagated Err so the + // bug went unnoticed). kebab_core::IngestItem { kind: kebab_core::IngestItemKind::Error, doc_id: None, diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs index 2dfa557..4d12a8b 100644 --- a/crates/kebab-app/tests/image_pipeline.rs +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -307,7 +307,61 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() { ); } -// ── 5. Determinism: re-ingest produces identical doc_id / chunk_id ─────── +// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ────────── + +/// `kebab-source-fs` classifies a `.png` extension as +/// `MediaType::Image(Png)` regardless of content. When the bytes don't +/// decode as any image format, `ImageExtractor::extract` returns Err +/// and the asset must be classified as `IngestItemKind::Error` with +/// the `errors` counter incremented **exactly once** (regression for +/// the double-count bug surfaced during P6-4 manual smoke). +#[tokio::test] +async fn garbage_png_increments_errors_counter_exactly_once() { + // No mock server needed — extract fails before any HTTP call. + let env = TestEnv::lexical_only(); + // Single non-image asset with .png extension. + std::fs::write( + env.workspace_root.join("garbage.png"), + b"this is not an image at all", + ) + .expect("write garbage fixture"); + let mut cfg = env.config.clone(); + cfg.workspace.include.push("**/*.png".to_string()); + cfg.image.ocr.enabled = false; + cfg.image.caption.enabled = false; + + let cfg_clone = cfg.clone(); + let scope = env.scope(); + let report = spawn_blocking(move || { + kebab_app::ingest_with_config(cfg_clone, scope, false) + .expect("ingest does not abort on per-asset failure") + }) + .await + .expect("task"); + + // Exactly-once: scanned counts the asset, errors counts it once, + // and (scanned == new + updated + skipped + errors) holds. + assert_eq!( + report.errors, 1, + "garbage PNG must increment errors exactly once, not twice (double-count regression)" + ); + assert_eq!( + report.scanned, + report.new + report.updated + report.skipped + report.errors, + "counter sum must equal scanned — invariant of the IngestReport contract" + ); + + // The single Error item carries the propagated extract error. + let items = report.items.expect("items present"); + let err_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("garbage.png")) + .expect("garbage item present"); + assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error); + assert!(err_item.error.is_some(), "Error item carries error string"); +} + +// ── 6. Determinism: re-ingest produces identical doc_id / chunk_id ─────── /// Idempotency contract — running the same ingest twice should mark /// the asset Updated on the second run with byte-identical IDs.