kebab/docs/wire-schema/v1/schema.schema.json

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://kebab.local/wire-schema/v1/schema.schema.json",
  "title": "schema.v1",
  "description": "kebab introspection report — wire schemas, capabilities, model versions, and index stats.",
  "type": "object",
  "required": ["schema_version", "kebab_version", "wire", "capabilities", "models", "stats"],
  "properties": {
    "schema_version": { "const": "schema.v1" },
    "kebab_version": { "type": "string" },
    "wire": {
      "type": "object",
      "required": ["schemas"],
      "properties": {
        "schemas": {
          "type": "array",
          "items": { "type": "string", "pattern": "^[a-z_]+\\.v[0-9]+$" }
        }
      }
    },
    "capabilities": {
      "type": "object",
      "additionalProperties": { "type": "boolean" },
      "required": [
        "json_mode", "ingest_progress", "ingest_cancellation",
        "rag_multi_turn", "search_cache", "incremental_ingest",
        "streaming_ask", "http_daemon", "mcp_server", "single_file_ingest", "bulk_search"
      ]
    },
    "models": {
      "type": "object",
      "required": [
        "parser_version", "chunker_version", "embedding_version",
        "prompt_template_version", "index_version", "corpus_revision"
      ],
      "properties": {
        "parser_version": { "type": "string" },
        "chunker_version": { "type": "string" },
        "active_parsers": {
          "type": "array",
          "items": { "type": "string" },
          "description": "v0.20.1+ (Bug #13). All active parser versions (DISTINCT, ORDER BY). Empty corpus → []. Backward-compat: optional; existing clients are unaffected."
        },
        "active_chunkers": {
          "type": "array",
          "items": { "type": "string" },
          "description": "v0.20.1+ (Bug #13). All active chunker versions (DISTINCT, ORDER BY). Empty corpus → []."
        },
        "embedding_version": { "type": "string" },
        "prompt_template_version": { "type": "string" },
        "index_version": { "type": "string" },
        "corpus_revision": { "type": "integer", "minimum": 0 }
      }
    },
    "stats": {
      "type": "object",
      "required": ["doc_count", "chunk_count", "asset_count", "last_ingest_at"],
      "properties": {
        "doc_count": { "type": "integer", "minimum": 0 },
        "chunk_count": { "type": "integer", "minimum": 0 },
        "asset_count": { "type": "integer", "minimum": 0 },
        "last_ingest_at": {
          "anyOf": [
            { "type": "string", "format": "date-time" },
            { "type": "null" }
          ]
        },
        "media_breakdown": {
          "type": "object",
          "description": "p9-fb-37: per-media-kind doc count. 5 keys (markdown/pdf/image/audio/other), zero-filled (a kind with no docs is reported as 0).",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        },
        "lang_breakdown": {
          "type": "object",
          "description": "p9-fb-37: per-language doc count. NULL lang keyed as the literal string 'null'. Map may be empty on empty corpus. v0.20.2 (Todo #4) note: `lang` is the lingua detection result for natural-language prose (Markdown etc.). Source-code documents skip natural-language detection, so they have `lang = \"und\"`, and their source language is counted separately in `code_lang_breakdown` — a high `und` count on a code-heavy corpus is therefore expected by design (not a detection failure).",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        },
        "index_bytes": {
          "type": "object",
          "description": "p9-fb-37: on-disk byte sums.",
          "required": ["sqlite", "lancedb"],
          "properties": {
            "sqlite":  { "type": "integer", "minimum": 0 },
            "lancedb": { "type": "integer", "minimum": 0 }
          }
        },
        "stale_doc_count": {
          "type": "integer",
          "minimum": 0,
          "description": "p9-fb-37: docs whose updated_at is more than config.search.stale_threshold_days days old. 0 when threshold=0."
        },
        "code_lang_breakdown": {
          "type": "object",
          "description": "p10-1A-1: per-language **doc** count (one entry per indexed code document). Key = lowercase language name (e.g. 'rust', 'python'). Empty on markdown-only corpora. Pair with `code_lang_chunk_breakdown` for chunk-level granularity (a file that yields 200 chunks still counts as one doc here).",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        },
        "repo_breakdown": {
          "type": "object",
          "description": "p10-1A-1: per-repo **doc** count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        },
        "code_lang_chunk_breakdown": {
          "type": "object",
          "description": "v0.17.0 PR-C: per-language **chunk** count (closes HOTFIXES 2026-05-22 'code_lang_breakdown chunk granularity'). Companion to `code_lang_breakdown` (doc count) — chunk-level granularity is the indexing-pressure metric (a 200-chunk Python file and a 5-chunk Rust file each count as `1` doc there but contribute `200` and `5` chunks here). Key = lowercase language name. Empty on markdown-only corpora.",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        }
      }
    }
  }
}