Files
kebab/docs/wire-schema/v1/ingest_progress.schema.json
altair823 bef0c98867 feat(wire): PdfOcrProgress.Finished + ingest_progress.v1 additive 4 fields
v0.20.x ingest log feature 의 wire side. additive minor cascade:

  * PdfOcrProgress::Finished + IngestEvent::PdfOcrFinished 의 4 field:
      - image_byte_size: Option<u64>
      - image_width:     Option<u32>
      - image_height:    Option<u32>
      - failure_reason:  Option<String>
  * docs/wire-schema/v1/ingest_progress.schema.json — 4 추가 property
    (모두 optional, required 변경 없음 = additive minor)
  * integrations/claude-code/kebab/SKILL.md — wire schema description 동기

기존 ingest_progress.v1 consumer (CLI wire dump, integration test
fixture, kebab-cli wire_search/wire_ask) 는 4 추가 field 의
Option::None 으로 backward-compat. version bump 0 (additive minor =
binary-version cascade trigger 아님 per CLAUDE.md §Versioning cascade).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 02:57:59 +00:00

62 lines
4.4 KiB
JSON

{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://kb.local/wire/v1/ingest_progress.schema.json",
"title": "IngestProgressEvent v1",
"description": "Streaming progress event emitted by `kebab ingest --json`. One event per line (line-delimited JSON). Discriminated by `kind`. The terminal events are `completed` and `aborted` — every ingest run ends with exactly one of them. The final stdout line of a `--json` ingest is still the existing `ingest_report.v1` for backwards compatibility; progress events stream above it.",
"type": "object",
"required": ["schema_version", "kind", "ts"],
"properties": {
"schema_version": { "const": "ingest_progress.v1" },
"kind": {
"type": "string",
"enum": [
"scan_started",
"scan_completed",
"asset_started",
"asset_finished",
"embed_batch_started",
"embed_batch_finished",
"pdf_ocr_started",
"pdf_ocr_finished",
"completed",
"aborted"
]
},
"ts": { "type": "string", "format": "date-time", "description": "RFC 3339 timestamp at the moment the event was emitted." },
"root": { "type": "string", "description": "scan_started: workspace root being walked." },
"total": { "type": "integer", "minimum": 0, "description": "scan_completed / asset_started / asset_finished: total assets discovered." },
"idx": { "type": "integer", "minimum": 1, "description": "asset_started / asset_finished: 1-based index of the current asset within the scan." },
"path": { "type": "string", "description": "asset_started: workspace-relative path of the asset being processed." },
"media": { "type": "string", "description": "asset_started: media kind label (e.g. `markdown`, `pdf`, `image`)." },
"result": {
"type": "string",
"enum": ["new", "updated", "skipped", "error"],
"description": "asset_finished: per-asset outcome (mirrors `ingest_report.v1.items[].kind`)."
},
"chunks": { "type": "integer", "minimum": 0, "description": "asset_finished: chunk count produced for this asset." },
"n_chunks": { "type": "integer", "minimum": 0, "description": "embed_batch_started / embed_batch_finished: chunks in this embedding batch." },
"ms": { "type": "integer", "minimum": 0, "description": "embed_batch_finished / pdf_ocr_finished: wall-clock duration (ms). pdf_ocr_finished skip path 의 의미는 mixed (DCTDecode 부재 시 0, engine 실패 시 latency-before-bail)." },
"chars": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished: char count of OCR result. Skip 시 0." },
"page": { "type": "integer", "minimum": 1, "description": "pdf_ocr_started / pdf_ocr_finished: 1-based PDF page number under OCR." },
"ocr_engine": { "type": "string", "description": "pdf_ocr_finished: engine_name (e.g. 'ollama-vision')." },
"skipped": { "type": "boolean", "description": "pdf_ocr_finished: true 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패). chars=0 만으로는 skip 과 0-char result 구분 불가." },
"image_byte_size": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image byte size." },
"image_width": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image width px." },
"image_height": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional, v0.20.x): raster image height px." },
"failure_reason": { "type": "string", "description": "pdf_ocr_finished (optional, v0.20.x): OCR failure reason. Present iff skipped=true due to engine error. Values: timeout | ocr_error | network_error | other." },
"counts": {
"type": "object",
"description": "completed / aborted: aggregate counters at the moment the run ended (mirrors fields on `ingest_report.v1`).",
"properties": {
"scanned": { "type": "integer", "minimum": 0 },
"new": { "type": "integer", "minimum": 0 },
"updated": { "type": "integer", "minimum": 0 },
"skipped": { "type": "integer", "minimum": 0 },
"errors": { "type": "integer", "minimum": 0 },
"chunks_indexed": { "type": "integer", "minimum": 0 },
"embeddings_indexed": { "type": "integer", "minimum": 0 }
}
}
}
}