From 4ad4ef271eae0e6ebf9e58be6bc307d3e3c29fdb Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 09:31:55 +0000 Subject: [PATCH] =?UTF-8?q?review(p7-3):=20=ED=9A=8C=EC=B0=A8=201=20?= =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `IngestItem.warnings` 가 PDF path 에서 빈 vec 였던 갭 해소. P7-1 의 Provenance Warning (scanned candidate / extract panic 흡수) 노트들을 `IngestItem.warnings` 로 surface — md path 의 `fm_warns + blk_warns` pattern 과 평행. 사용자가 ingest summary 에서 "이 PDF page 2 가 스캔 이라 검색 불가" 를 즉시 확인 가능. - `mixed_page_pdf_stores_asset_with_scanned_candidate_warning` 에 `IngestItem.warnings` 단정 추가 (정확히 1건 + 노트 내용 검증). - `encrypted_pdf` / `corrupt_pdf` 테스트의 `errors >= 1` → `errors == 1` strict 단정. 미래에 다른 source 가 errors 늘리면 즉시 빨개짐. - `re_ingest_identical_pdf` 에 `chunk_count` 동일성 단정 추가. P1 idempotency contract 의 chunk-단위 axis 검증 (chunk_id 전체 set 비교는 pdf-page-v1 의 `deterministic_chunk_ids_1000` 가 잠그고 있어 chunk_count 가 가벼운 proxy 로 충분). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 15 +++++++++++++- crates/kebab-app/tests/pdf_pipeline.rs | 28 ++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 026a5d5..7ee324e 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -1084,6 +1084,19 @@ fn ingest_one_pdf_asset( kebab_core::IngestItemKind::New }; + // Surface every `Provenance::Warning` note onto `IngestItem.warnings` + // so the ingest summary shows partial-success signals (e.g. "page 2 + // empty (scanned candidate)") without forcing the operator into + // `kebab inspect doc `. Mirrors how the markdown path threads + // frontmatter / block warnings up to the same field. 
+ let warnings: Vec = canonical + .provenance + .events + .iter() + .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning) + .filter_map(|e| e.note.clone()) + .collect(); + Ok(kebab_core::IngestItem { kind, doc_id: Some(canonical.doc_id.clone()), @@ -1094,7 +1107,7 @@ fn ingest_one_pdf_asset( chunk_count: u32::try_from(chunks.len()).ok(), parser_version: Some(canonical.parser_version.clone()), chunker_version: Some(chunker.chunker_version()), - warnings: Vec::new(), + warnings, error: None, }) } diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs index 12accf1..f5cbea2 100644 --- a/crates/kebab-app/tests/pdf_pipeline.rs +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -218,6 +218,15 @@ fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() { .unwrap(); assert_eq!(item2.kind, IngestItemKind::Updated); assert_eq!(item2.doc_id, item1.doc_id); + // P1 idempotency contract: identical bytes → identical chunk set. + // Comparing `chunk_count` as a proxy (full chunk_id set comparison + // would need direct sqlite access; the per-chunk #c{char_start} + // hash variant in pdf-page-v1 is already tested for stability in + // `kebab-chunk::pdf_page_v1::deterministic_chunk_ids_1000`). 
+ assert_eq!( + item1.chunk_count, item2.chunk_count, + "identical bytes must produce identical chunk count" + ); } /// Edit a PDF (replace bytes) → different blake3 → different asset_id @@ -285,7 +294,7 @@ fn encrypted_pdf_fails_with_qpdf_hint() { let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); - assert!(report.errors >= 1, "encrypted PDF must increment errors"); + assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once"); let items = report.items.as_ref().unwrap(); let pdf_item = items .iter() @@ -313,7 +322,7 @@ fn corrupt_pdf_fails_without_storing() { let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); - assert!(report.errors >= 1); + assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once"); let items = report.items.as_ref().unwrap(); let pdf_item = items .iter() @@ -390,6 +399,21 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() { note.contains("page2") && note.contains("scanned candidate"), "Warning note marks page 2 as scanned candidate: {note}" ); + + // R1: Warning notes also surface on `IngestItem.warnings` so + // operators can see the partial-success signal in the ingest + // summary without `kebab inspect doc`. + assert_eq!( + pdf_item.warnings.len(), + 1, + "exactly one warning surfaced on IngestItem" + ); + assert!( + pdf_item.warnings[0].contains("page2") + && pdf_item.warnings[0].contains("scanned candidate"), + "IngestItem.warnings preserves the Provenance Warning note: {:?}", + pdf_item.warnings + ); } /// IngestReport invariant `scanned == new + updated + skipped + errors`