From bef0c988670218e45fe8d7eafa51cca80dd8e734 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 02:57:59 +0000 Subject: [PATCH] feat(wire): PdfOcrProgress.Finished + ingest_progress.v1 additive 4 fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.20.x ingest log feature 의 wire side. additive minor cascade: * PdfOcrProgress::Finished + IngestEvent::PdfOcrFinished 의 4 field: - image_byte_size: Option - image_width: Option - image_height: Option - failure_reason: Option * docs/wire-schema/v1/ingest_progress.schema.json — 4 추가 property (모두 optional, required 변경 없음 = additive minor) * integrations/claude-code/kebab/SKILL.md — wire schema description 동기 기존 ingest_progress.v1 consumer (CLI wire dump, integration test fixture, kebab-cli wire_search/wire_ask) 는 4 추가 field 의 Option::None 으로 backward-compat. version bump 0 (additive minor = binary-version cascade trigger 아님 per CLAUDE.md §Versioning cascade). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/ingest_progress.rs | 12 +++++++++++ crates/kebab-app/src/lib.rs | 8 +++++++ crates/kebab-app/src/pdf_ocr_apply.rs | 21 +++++++++++++++++++ crates/kebab-cli/src/progress.rs | 2 +- .../v1/ingest_progress.schema.json | 4 ++++ integrations/claude-code/kebab/SKILL.md | 2 +- 6 files changed, 47 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/src/ingest_progress.rs b/crates/kebab-app/src/ingest_progress.rs index a65e9b9..05284a0 100644 --- a/crates/kebab-app/src/ingest_progress.rs +++ b/crates/kebab-app/src/ingest_progress.rs @@ -99,6 +99,18 @@ pub enum IngestEvent { chars: u32, ocr_engine: String, skipped: bool, + /// v0.20.x ingest log: raster image byte size (additive minor, optional). + #[serde(skip_serializing_if = "Option::is_none")] + image_byte_size: Option, + /// v0.20.x ingest log: raster image width in pixels (additive minor, optional). 
+ #[serde(skip_serializing_if = "Option::is_none")] + image_width: Option, + /// v0.20.x ingest log: raster image height in pixels (additive minor, optional). + #[serde(skip_serializing_if = "Option::is_none")] + image_height: Option, + /// v0.20.x ingest log: OCR failure reason (additive minor, optional). + #[serde(skip_serializing_if = "Option::is_none")] + failure_reason: Option, }, } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 47cb9ec..7fbda13 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -1869,6 +1869,10 @@ fn ingest_one_pdf_asset( ms, chars, skipped, + image_byte_size, + image_width, + image_height, + failure_reason, } => { if let Some(sender) = progress { let _ = sender.send( @@ -1878,6 +1882,10 @@ fn ingest_one_pdf_asset( chars, ocr_engine: engine.engine_name().to_string(), skipped, + image_byte_size, + image_width, + image_height, + failure_reason: failure_reason.clone(), }, ); } diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs index f903507..129da6d 100644 --- a/crates/kebab-app/src/pdf_ocr_apply.rs +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -147,6 +147,10 @@ where ms: 0, chars: 0, skipped: true, + image_byte_size: None, + image_width: None, + image_height: None, + failure_reason: None, }); continue; }; @@ -175,6 +179,10 @@ where ms: start.elapsed().as_millis() as u64, chars: 0, skipped: true, + image_byte_size: Some(page_image_bytes.len() as u64), + image_width: None, + image_height: None, + failure_reason: Some("ocr_error".to_string()), }); continue; } @@ -249,6 +257,10 @@ where ms: elapsed_ms, chars: chars_ocr, skipped: false, + image_byte_size: Some(page_image_bytes.len() as u64), + image_width: None, + image_height: None, + failure_reason: None, }); } @@ -291,5 +303,14 @@ pub enum PdfOcrProgress { /// `true` = DCTDecode 부재 또는 OCR engine 실패 로 skip. /// `false` = 정상 OCR 완료. 
skipped: bool, + /// v0.20.x ingest log: raster image byte size (additive, optional). + image_byte_size: Option, + /// v0.20.x ingest log: raster image width in pixels (additive, optional). + image_width: Option, + /// v0.20.x ingest log: raster image height in pixels (additive, optional). + image_height: Option, + /// v0.20.x ingest log: failure reason string when OCR failed (additive, optional). + /// Values: "timeout" | "ocr_error" | "network_error" | "other"; None on success or when the page was skipped without an engine error. + failure_reason: Option, }, } diff --git a/crates/kebab-cli/src/progress.rs b/crates/kebab-cli/src/progress.rs index a8495a1..fc64b30 100644 --- a/crates/kebab-cli/src/progress.rs +++ b/crates/kebab-cli/src/progress.rs @@ -210,7 +210,7 @@ impl ProgressDisplay { let _ = writeln!(err, " 📷 OCR page {page}..."); } } - IngestEvent::PdfOcrFinished { page, ms, chars, ocr_engine, skipped } => { + IngestEvent::PdfOcrFinished { page, ms, chars, ocr_engine, skipped, .. } => { if !quiet { let mut err = std::io::stderr().lock(); if *skipped { diff --git a/docs/wire-schema/v1/ingest_progress.schema.json b/docs/wire-schema/v1/ingest_progress.schema.json index 317071c..833cfaa 100644 --- a/docs/wire-schema/v1/ingest_progress.schema.json +++ b/docs/wire-schema/v1/ingest_progress.schema.json @@ -40,6 +40,10 @@ "page": { "type": "integer", "minimum": 1, "description": "pdf_ocr_started / pdf_ocr_finished: 1-based PDF page number under OCR." }, "ocr_engine": { "type": "string", "description": "pdf_ocr_finished: engine_name (e.g. 'ollama-vision')." }, "skipped": { "type": "boolean", "description": "pdf_ocr_finished: true 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). chars=0 만으로는 skip 과 0-char result 구분 불가." }, + "image_byte_size": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image byte size." }, + "image_width": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image width px."
}, + "image_height": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image height px." }, + "failure_reason": { "type": "string", "description": "pdf_ocr_finished (optional, v0.20.x): OCR failure reason. Present iff skipped=true due to engine error. Values: timeout | ocr_error | network_error | other." }, "counts": { "type": "object", "description": "completed / aborted: aggregate counters at the moment the run ended (mirrors fields on `ingest_report.v1`).", diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md index 2af5c74..85472f2 100644 --- a/integrations/claude-code/kebab/SKILL.md +++ b/integrations/claude-code/kebab/SKILL.md @@ -145,7 +145,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across - Pagination: `search_response.v1.next_cursor` is opaque base64 — pass back as `--cursor` (CLI) or `cursor` (MCP) for the next page. `null` means no more hits. `corpus_revision` mismatch returns `error.v1.code = stale_cursor` — re-issue search to obtain a fresh cursor. - `search_response.v1.truncated = true` means budget forced snippet shortening or k reduction. Independent of `next_cursor`: widen `max_tokens` for fuller snippets, follow `next_cursor` for more hits, or both. - `ask`'s `citations[]` mirrors `search_hit.v1` minus retrieval internals — same `doc_path` / `citation` shape. -- Schema reference lives in the kebab repo at `docs/wire-schema/v1/*.schema.json` if a field is unclear. +- Schema reference lives in the kebab repo at `docs/wire-schema/v1/*.schema.json` if a field is unclear. v0.20.x additive minor: `ingest_progress.v1` `pdf_ocr_finished` events carry 4 optional new fields (`image_byte_size`, `image_width`, `image_height`, `failure_reason`) — absent on pre-v0.20 events (backward compat). - `search_hit.v1` and `answer.v1.citations[]` carry `indexed_at` (RFC3339) + `stale` (bool). 
When `stale == true`, the source doc hasn't been re-processed since `config.search.stale_threshold_days`. Surface this caveat to the user when summarizing — the cited snapshot may not reflect current reality. ## Capability discovery