review(p7-3): 회차 1 지적 반영
- `IngestItem.warnings` 가 PDF path 에서 빈 vec 였던 갭 해소. P7-1 의 Provenance Warning (scanned candidate / extract panic 흡수) 노트들을 `IngestItem.warnings` 로 surface — md path 의 `fm_warns + blk_warns` pattern 과 평행. 사용자가 ingest summary 에서 "이 PDF page 2 가 스캔 이라 검색 불가" 를 즉시 확인 가능.
- `mixed_page_pdf_stores_asset_with_scanned_candidate_warning` 에 `IngestItem.warnings` 단정 추가 (정확히 1건 + 노트 내용 검증).
- `encrypted_pdf` / `corrupt_pdf` 테스트의 `errors >= 1` → `errors == 1` strict 단정. 미래에 다른 source 가 errors 늘리면 즉시 빨개짐.
- `re_ingest_identical_pdf` 에 `chunk_count` 동일성 단정 추가. P1 idempotency contract 의 chunk-단위 axis 검증 (chunk_id 전체 set 비교는 pdf-page-v1 의 `deterministic_chunk_ids_1000` 가 잠그고 있어 chunk_count 가 가벼운 proxy 로 충분).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1084,6 +1084,19 @@ fn ingest_one_pdf_asset(
|
||||
kebab_core::IngestItemKind::New
|
||||
};
|
||||
|
||||
// Surface every `Provenance::Warning` note onto `IngestItem.warnings`
|
||||
// so the ingest summary shows partial-success signals (e.g. "page 2
|
||||
// empty (scanned candidate)") without forcing the operator into
|
||||
// `kebab inspect doc <id>`. Mirrors how the markdown path threads
|
||||
// frontmatter / block warnings up to the same field.
|
||||
let warnings: Vec<String> = canonical
|
||||
.provenance
|
||||
.events
|
||||
.iter()
|
||||
.filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
|
||||
.filter_map(|e| e.note.clone())
|
||||
.collect();
|
||||
|
||||
Ok(kebab_core::IngestItem {
|
||||
kind,
|
||||
doc_id: Some(canonical.doc_id.clone()),
|
||||
@@ -1094,7 +1107,7 @@ fn ingest_one_pdf_asset(
|
||||
chunk_count: u32::try_from(chunks.len()).ok(),
|
||||
parser_version: Some(canonical.parser_version.clone()),
|
||||
chunker_version: Some(chunker.chunker_version()),
|
||||
warnings: Vec::new(),
|
||||
warnings,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -218,6 +218,15 @@ fn re_ingest_identical_pdf_produces_updated_with_same_doc_id() {
|
||||
.unwrap();
|
||||
assert_eq!(item2.kind, IngestItemKind::Updated);
|
||||
assert_eq!(item2.doc_id, item1.doc_id);
|
||||
// P1 idempotency contract: identical bytes → identical chunk set.
|
||||
// Comparing `chunk_count` as a proxy (full chunk_id set comparison
|
||||
// would need direct sqlite access; the per-chunk #c{char_start}
|
||||
// hash variant in pdf-page-v1 is already tested for stability in
|
||||
// `kebab-chunk::pdf_page_v1::deterministic_chunk_ids_1000`).
|
||||
assert_eq!(
|
||||
item1.chunk_count, item2.chunk_count,
|
||||
"identical bytes must produce identical chunk count"
|
||||
);
|
||||
}
|
||||
|
||||
/// Edit a PDF (replace bytes) → different blake3 → different asset_id
|
||||
@@ -285,7 +294,7 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert!(report.errors >= 1, "encrypted PDF must increment errors");
|
||||
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -313,7 +322,7 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.errors >= 1);
|
||||
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -390,6 +399,21 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
note.contains("page2") && note.contains("scanned candidate"),
|
||||
"Warning note marks page 2 as scanned candidate: {note}"
|
||||
);
|
||||
|
||||
// R1: Warning notes also surface on `IngestItem.warnings` so
|
||||
// operators can see the partial-success signal in the ingest
|
||||
// summary without `kebab inspect doc`.
|
||||
assert_eq!(
|
||||
pdf_item.warnings.len(),
|
||||
1,
|
||||
"exactly one warning surfaced on IngestItem"
|
||||
);
|
||||
assert!(
|
||||
pdf_item.warnings[0].contains("page2")
|
||||
&& pdf_item.warnings[0].contains("scanned candidate"),
|
||||
"IngestItem.warnings preserves the Provenance Warning note: {:?}",
|
||||
pdf_item.warnings
|
||||
);
|
||||
}
|
||||
|
||||
/// IngestReport invariant `scanned == new + updated + skipped + errors`
|
||||
|
||||
Reference in New Issue
Block a user