diff --git a/crates/kebab-app/src/ingest_progress.rs b/crates/kebab-app/src/ingest_progress.rs index 19f1d6f..a65e9b9 100644 --- a/crates/kebab-app/src/ingest_progress.rs +++ b/crates/kebab-app/src/ingest_progress.rs @@ -46,10 +46,13 @@ pub struct AggregateCounts { /// Ordering invariant per design §2.4a: /// /// ```text -/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)* -/// < (Completed | Aborted) +/// ScanStarted < ScanCompleted +/// < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)* +/// < (Completed | Aborted) /// ``` /// +/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1). +/// /// Embed-batch events (`embed_batch_started` / `embed_batch_finished` /// in §2.4a) are reserved for a future iteration and are not emitted /// by this task; the spec calls them out as "임의 위치" (optional). @@ -88,11 +91,14 @@ pub enum IngestEvent { /// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1. PdfOcrStarted { page: u32 }, /// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1. + /// `skipped` = `true` 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). + /// `chars = 0` 만으로는 "skip" 과 "0-char OCR result" 구분 불가, `skipped` field 가 명시적. PdfOcrFinished { page: u32, ms: u64, chars: u32, ocr_engine: String, + skipped: bool, }, } diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index e7839aa..791869c 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -1865,7 +1865,7 @@ fn ingest_one_pdf_asset( page, ms, chars, - skipped: _, + skipped, } => { if let Some(sender) = progress { let _ = sender.send( @@ -1874,6 +1874,7 @@ fn ingest_one_pdf_asset( ms, chars, ocr_engine: engine.engine_name().to_string(), + skipped, }, ); } diff --git a/docs/wire-schema/v1/ingest_progress.schema.json b/docs/wire-schema/v1/ingest_progress.schema.json index b81ea78..317071c 100644 --- a/docs/wire-schema/v1/ingest_progress.schema.json +++ b/docs/wire-schema/v1/ingest_progress.schema.json @@ -16,6 +16,8 @@ "asset_finished", "embed_batch_started", "embed_batch_finished", + "pdf_ocr_started", + "pdf_ocr_finished", "completed", "aborted" ] @@ -33,7 +35,11 @@ }, "chunks": { "type": "integer", "minimum": 0, "description": "asset_finished: chunk count produced for this asset." }, "n_chunks": { "type": "integer", "minimum": 0, "description": "embed_batch_started / embed_batch_finished: chunks in this embedding batch." }, - "ms": { "type": "integer", "minimum": 0, "description": "embed_batch_finished: wall-clock duration of the batch." }, + "ms": { "type": "integer", "minimum": 0, "description": "embed_batch_finished / pdf_ocr_finished: wall-clock duration (ms). pdf_ocr_finished skip path 의 의미는 mixed (DCTDecode 부재 시 0, engine 실패 시 latency-before-bail)." }, + "chars": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished: char count of OCR result. Skip 시 0." }, + "page": { "type": "integer", "minimum": 1, "description": "pdf_ocr_started / pdf_ocr_finished: 1-based PDF page number under OCR." }, + "ocr_engine": { "type": "string", "description": "pdf_ocr_finished: engine_name (e.g. 'ollama-vision')." }, + "skipped": { "type": "boolean", "description": "pdf_ocr_finished: true 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). chars=0 만으로는 skip 과 0-char result 구분 불가." }, "counts": { "type": "object", "description": "completed / aborted: aggregate counters at the moment the run ended (mirrors fields on `ingest_report.v1`).", diff --git a/docs/wire-schema/v1/ingest_report.schema.json b/docs/wire-schema/v1/ingest_report.schema.json index 92ed1f1..f2ee803 100644 --- a/docs/wire-schema/v1/ingest_report.schema.json +++ b/docs/wire-schema/v1/ingest_report.schema.json @@ -38,7 +38,28 @@ }, "description": "p9-fb-25: per-extension skip count. Key = lowercase extension without leading dot (e.g. 'docx'). Files without extension key under ''." }, - "items": { "type": ["array", "null"] }, + "items": { + "type": ["array", "null"], + "items": { + "type": "object", + "required": ["kind", "doc_path"], + "properties": { + "kind": { "type": "string", "enum": ["new", "updated", "skipped", "unchanged", "error"] }, + "doc_id": { "type": ["string", "null"] }, + "doc_path": { "type": "string" }, + "asset_id": { "type": ["string", "null"] }, + "byte_len": { "type": ["integer", "null"], "minimum": 0 }, + "block_count": { "type": ["integer", "null"], "minimum": 0 }, + "chunk_count": { "type": ["integer", "null"], "minimum": 0 }, + "parser_version": { "type": ["string", "null"] }, + "chunker_version": { "type": ["string", "null"] }, + "warnings": { "type": "array", "items": { "type": "string" } }, + "pdf_ocr_pages": { "type": ["integer", "null"], "minimum": 0, "description": "v0.20.0 sub-item 1: number of PDF pages 가 OCR pipeline 통과. null = OCR disabled or non-PDF asset." }, + "pdf_ocr_ms_total": { "type": ["integer", "null"], "minimum": 0, "description": "v0.20.0 sub-item 1: cumulative OCR engine wall-clock duration (ms). null = OCR disabled or non-PDF asset." }, + "error": { "type": ["string", "null"] } + } + } + }, "skipped_gitignore": { "type": "integer", "minimum": 0 }, "skipped_kebabignore": { "type": "integer", "minimum": 0 }, "skipped_builtin_blacklist": { "type": "integer", "minimum": 0 },