From 4c5ccd54470a5ebb9b7cf5a519e7dfd02852c636 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 08:51:51 +0000 Subject: [PATCH] =?UTF-8?q?feat(wire):=20additive=20minor=20=E2=80=94=20In?= =?UTF-8?q?gestEvent=20kind=20=EC=9D=98=20pdf=5Focr=5F*=20+=20ingest=5Frep?= =?UTF-8?q?ort.items[]=20=EC=9D=98=20pdf=5Focr=5Fpages/ms=5Ftotal=20+=20sk?= =?UTF-8?q?ipped=20field=20carry=20(Step=206=20M-4/M-2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 7 (Group G) of v0.20.0 sub-item 1 (scanned PDF OCR) plan + Step 6 code reviewer Important M-4 (skipped field carry) + Minor M-2 (ordering invariant doc) fix. G3 — JSON Schema sync (additive minor — schema_version 보존): ingest_progress.schema.json: - kind enum 2 추가: pdf_ocr_started + pdf_ocr_finished. - 새 field: page (1-based PDF page), ocr_engine (engine_name), skipped (bool). - 기존 ms / chars field 의 description 갱신 (pdf_ocr_finished carry 추가). ingest_report.schema.json: - items.items.properties 신규 정의 (이전 stub ["array", "null"] 만). - pdf_ocr_pages + pdf_ocr_ms_total (nullable integer). - 모든 기존 IngestItem field 도 명시화 (kind, doc_path, byte_len, ...). Step 6 reviewer M-4 (Important) — skipped field carry: - IngestEvent::PdfOcrFinished 에 skipped: bool 추가. - ingest_one_pdf_asset 의 emit closure (lib.rs:~1864) 가 source PdfOcrProgress::Finished { skipped } 를 discard 않고 propagate. Step 6 reviewer M-2 (Minor) — ordering invariant doc: - crates/kebab-app/src/ingest_progress.rs 의 ordering text 갱신: ScanStarted < ScanCompleted < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)* < (Completed | Aborted). .md doc (docs/wire-schema/v1/*.md) 부재 — plan §3 Step 7 G3 의 .md deliverable retro N/A (해당 file 0). spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 7 G3) prior: b9ee09f (Step 6 wiring) + Step 6 reviewer M-4/M-2 권고 contract: §9 (additive minor wire bump — schema_version 보존) Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-app/src/ingest_progress.rs | 10 ++++++-- crates/kebab-app/src/lib.rs | 3 ++- .../v1/ingest_progress.schema.json | 8 ++++++- docs/wire-schema/v1/ingest_report.schema.json | 23 ++++++++++++++++++- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/crates/kebab-app/src/ingest_progress.rs b/crates/kebab-app/src/ingest_progress.rs index 19f1d6f..a65e9b9 100644 --- a/crates/kebab-app/src/ingest_progress.rs +++ b/crates/kebab-app/src/ingest_progress.rs @@ -46,10 +46,13 @@ pub struct AggregateCounts { /// Ordering invariant per design §2.4a: /// /// ```text -/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)* -/// < (Completed | Aborted) +/// ScanStarted < ScanCompleted +/// < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)* +/// < (Completed | Aborted) /// ``` /// +/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1). +/// /// Embed-batch events (`embed_batch_started` / `embed_batch_finished` /// in §2.4a) are reserved for a future iteration and are not emitted /// by this task; the spec calls them out as "임의 위치" (optional). @@ -88,11 +91,14 @@ pub enum IngestEvent { /// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1. PdfOcrStarted { page: u32 }, /// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1. + /// `skipped` = `true` 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). + /// `chars = 0` 만으로는 "skip" 과 "0-char OCR result" 구분 불가, `skipped` field 가 명시적. PdfOcrFinished { page: u32, ms: u64, chars: u32, ocr_engine: String, + skipped: bool, }, } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index e7839aa..791869c 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -1865,7 +1865,7 @@ fn ingest_one_pdf_asset( page, ms, chars, - skipped: _, + skipped, } => { if let Some(sender) = progress { let _ = sender.send( @@ -1874,6 +1874,7 @@ fn ingest_one_pdf_asset( ms, chars, ocr_engine: engine.engine_name().to_string(), + skipped, }, ); } diff --git a/docs/wire-schema/v1/ingest_progress.schema.json b/docs/wire-schema/v1/ingest_progress.schema.json index b81ea78..317071c 100644 --- a/docs/wire-schema/v1/ingest_progress.schema.json +++ b/docs/wire-schema/v1/ingest_progress.schema.json @@ -16,6 +16,8 @@ "asset_finished", "embed_batch_started", "embed_batch_finished", + "pdf_ocr_started", + "pdf_ocr_finished", "completed", "aborted" ] @@ -33,7 +35,11 @@ }, "chunks": { "type": "integer", "minimum": 0, "description": "asset_finished: chunk count produced for this asset." }, "n_chunks": { "type": "integer", "minimum": 0, "description": "embed_batch_started / embed_batch_finished: chunks in this embedding batch." }, - "ms": { "type": "integer", "minimum": 0, "description": "embed_batch_finished: wall-clock duration of the batch." }, + "ms": { "type": "integer", "minimum": 0, "description": "embed_batch_finished / pdf_ocr_finished: wall-clock duration (ms). pdf_ocr_finished skip path 의 의미는 mixed (DCTDecode 부재 시 0, engine 실패 시 latency-before-bail)." }, + "chars": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished: char count of OCR result. Skip 시 0." }, + "page": { "type": "integer", "minimum": 1, "description": "pdf_ocr_started / pdf_ocr_finished: 1-based PDF page number under OCR." }, + "ocr_engine": { "type": "string", "description": "pdf_ocr_finished: engine_name (e.g. 'ollama-vision')." }, + "skipped": { "type": "boolean", "description": "pdf_ocr_finished: true 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). chars=0 만으로는 skip 과 0-char result 구분 불가." }, "counts": { "type": "object", "description": "completed / aborted: aggregate counters at the moment the run ended (mirrors fields on `ingest_report.v1`).", diff --git a/docs/wire-schema/v1/ingest_report.schema.json b/docs/wire-schema/v1/ingest_report.schema.json index 92ed1f1..f2ee803 100644 --- a/docs/wire-schema/v1/ingest_report.schema.json +++ b/docs/wire-schema/v1/ingest_report.schema.json @@ -38,7 +38,28 @@ }, "description": "p9-fb-25: per-extension skip count. Key = lowercase extension without leading dot (e.g. 'docx'). Files without extension key under ''." }, - "items": { "type": ["array", "null"] }, + "items": { + "type": ["array", "null"], + "items": { + "type": "object", + "required": ["kind", "doc_path"], + "properties": { + "kind": { "type": "string", "enum": ["new", "updated", "skipped", "unchanged", "error"] }, + "doc_id": { "type": ["string", "null"] }, + "doc_path": { "type": "string" }, + "asset_id": { "type": ["string", "null"] }, + "byte_len": { "type": ["integer", "null"], "minimum": 0 }, + "block_count": { "type": ["integer", "null"], "minimum": 0 }, + "chunk_count": { "type": ["integer", "null"], "minimum": 0 }, + "parser_version": { "type": ["string", "null"] }, + "chunker_version": { "type": ["string", "null"] }, + "warnings": { "type": "array", "items": { "type": "string" } }, + "pdf_ocr_pages": { "type": ["integer", "null"], "minimum": 0, "description": "v0.20.0 sub-item 1: number of PDF pages 가 OCR pipeline 통과. null = OCR disabled or non-PDF asset." }, + "pdf_ocr_ms_total": { "type": ["integer", "null"], "minimum": 0, "description": "v0.20.0 sub-item 1: cumulative OCR engine wall-clock duration (ms). null = OCR disabled or non-PDF asset." }, + "error": { "type": ["string", "null"] } + } + } + }, "skipped_gitignore": { "type": "integer", "minimum": 0 }, "skipped_kebabignore": { "type": "integer", "minimum": 0 }, "skipped_builtin_blacklist": { "type": "integer", "minimum": 0 },