feat(schema): add active_parsers + active_chunkers arrays to schema.v1.models (Bug #13)

이전: schema.v1.models 가 parser_version / chunker_version 단일 값만 보고 → multi-medium corpus (md + pdf + code Rust/Python + dockerfile + k8s + manifest) 의 version cascade audit 누락 risk. 이후: additive minor — Models struct 에 active_parsers + active_chunkers Vec<String> 추가. backward compat: 기존 단일 field 보존 (markdown default), 신규 array 는 optional (#[serde(default)] + JSON schema required 미포함). source: - kebab_store_sqlite::fetch_distinct_parser_versions() 가 documents.parser_version DISTINCT + ORDER BY 반환. - fetch_distinct_chunker_versions() 가 chunks.chunker_version 동일 pattern. - collect_models 가 매 schema 호출마다 재계산 (cache 없음 — R-3 자동 해결). wire schema additive only — 메이저 bump 불필요. v0.20.1 minor 로 충분. integrations/claude-code/kebab/SKILL.md 동기 갱신. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 23:15:58 +00:00
parent 10b0e2f4f2
commit d9c7aabce1
5 changed files with 132 additions and 1 deletions
--- a/crates/kebab-app/src/schema.rs
+++ b/crates/kebab-app/src/schema.rs
@@ -39,6 +39,14 @@ pub struct Capabilities {
 pub struct Models {
    pub parser_version: String,
    pub chunker_version: String,
+    /// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
+    /// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
+    #[serde(default)]
+    pub active_parsers: Vec<String>,
+    /// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
+    /// 빈 corpus → empty Vec.
+    #[serde(default)]
+    pub active_chunkers: Vec<String>,
    pub embedding_version: String,
    pub prompt_template_version: String,
    pub index_version: String,
@@ -190,12 +198,16 @@ fn collect_stats(
 }

 fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
+    let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
+    let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
    Models {
        // markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
        // maintain their own versions; surface those when SchemaV1.models
        // becomes a multi-medium map (P+).
        parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
        chunker_version: cfg.chunking.chunker_version.clone(),
+        active_parsers,
+        active_chunkers,
        // EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
        embedding_version: cfg.models.embedding.model.clone(),
        prompt_template_version: cfg.rag.prompt_template_version.clone(),
--- a/crates/kebab-app/tests/schema_active_versions.rs
+++ b/crates/kebab-app/tests/schema_active_versions.rs
@@ -0,0 +1,64 @@
+//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
+
+use kebab_app::schema_with_config;
+use kebab_config::Config;
+use kebab_core::SourceScope;
+
+fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
+    let mut cfg = Config::defaults(); // test helper: start from defaults, then override below
+    cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
+    cfg.workspace.exclude.clear(); // drop any default exclude entries
+    cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
+    cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
+    cfg.models.embedding.provider = "none".to_string(); // NOTE(review): presumably disables embedding — confirm
+    cfg.models.embedding.dimensions = 0;
+    cfg.chunking.target_tokens = 80; // NOTE(review): looks like a small chunk budget for tiny fixtures — confirm
+    cfg.chunking.overlap_tokens = 20;
+    cfg
+}
+
+fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
+    SourceScope { // test helper: scope rooted at the given workspace directory
+        root: workspace_root.to_path_buf(),
+        include: vec![], // no include entries — NOTE(review): presumably means "no extra filter"; confirm
+        exclude: vec![], // no exclude entries
+    }
+}
+
+#[test]
+fn schema_models_active_arrays_empty_on_empty_corpus() {
+    let dir = tempfile::tempdir().unwrap();
+    let workspace = dir.path().join("kb");
+    std::fs::create_dir_all(&workspace).unwrap();
+    let cfg = minimal_config(dir.path(), &workspace);
+
+    let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap(); // open the store and migrate; nothing is ingested
+    store.run_migrations().unwrap();
+    drop(store); // release this handle before schema_with_config is called with the same config
+
+    let s = schema_with_config(&cfg).unwrap();
+    assert!(s.models.active_parsers.is_empty(), "empty corpus → no parsers");
+    assert!(s.models.active_chunkers.is_empty(), "empty corpus → no chunkers");
+    // Backward compat: the legacy single-value field still reports the markdown parser version.
+    assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
+}
+
+#[test]
+fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
+    let dir = tempfile::tempdir().unwrap();
+    let workspace = dir.path().join("kb");
+    std::fs::create_dir_all(&workspace).unwrap();
+    std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap(); // single markdown fixture
+    let cfg = minimal_config(dir.path(), &workspace);
+    let scope = minimal_scope(&workspace);
+
+    kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
+
+    let s = schema_with_config(&cfg).unwrap();
+    assert!(!s.models.active_parsers.is_empty(), "active_parsers populated after ingest");
+    assert!(!s.models.active_chunkers.is_empty(), "active_chunkers populated after ingest");
+    // active_parsers must come back sorted (ORDER BY in SQL); active_chunkers ordering is not asserted here.
+    let mut sorted = s.models.active_parsers.clone();
+    sorted.sort();
+    assert_eq!(s.models.active_parsers, sorted, "active_parsers must be sorted");
+}
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -961,6 +961,51 @@ impl SqliteStore {
        }
        Ok(out)
    }
+
+    /// p20-bugfix3 Bug #13: data source for schema.v1.models.active_parsers.
+    /// Returns the DISTINCT non-NULL, non-empty `documents.parser_version` values in `ORDER BY` order.
+    /// No matching rows (e.g. an empty corpus) → empty Vec.
+    pub fn fetch_distinct_parser_versions(&self) -> anyhow::Result<Vec<String>> {
+        use anyhow::Context;
+        let conn = self.read_conn();
+        let mut stmt = conn
+            .prepare(
+                "SELECT DISTINCT parser_version FROM documents \
+                  WHERE parser_version IS NOT NULL AND parser_version != '' \
+                  ORDER BY parser_version",
+            )
+            .context("prepare fetch_distinct_parser_versions")?;
+        let rows = stmt
+            .query_map([], |row| row.get::<_, String>(0))
+            .context("query fetch_distinct_parser_versions")?;
+        let mut out = Vec::new();
+        for r in rows { // each item is a Result; the first failed row read returns the error with context
+            out.push(r.context("read parser_version row")?);
+        }
+        Ok(out)
+    }
+
+    /// p20-bugfix3 Bug #13: data source for schema.v1.models.active_chunkers.
+    /// Returns the DISTINCT non-NULL, non-empty `chunks.chunker_version` values in `ORDER BY` order.
+    pub fn fetch_distinct_chunker_versions(&self) -> anyhow::Result<Vec<String>> {
+        use anyhow::Context;
+        let conn = self.read_conn();
+        let mut stmt = conn
+            .prepare(
+                "SELECT DISTINCT chunker_version FROM chunks \
+                  WHERE chunker_version IS NOT NULL AND chunker_version != '' \
+                  ORDER BY chunker_version",
+            )
+            .context("prepare fetch_distinct_chunker_versions")?;
+        let rows = stmt
+            .query_map([], |row| row.get::<_, String>(0))
+            .context("query fetch_distinct_chunker_versions")?;
+        let mut out = Vec::new();
+        for r in rows { // each item is a Result; the first failed row read returns the error with context
+            out.push(r.context("read chunker_version row")?);
+        }
+        Ok(out)
+    }
 }

 /// Apply the design §5 / task-spec pragmas. Called once per connection.
--- a/docs/wire-schema/v1/schema.schema.json
+++ b/docs/wire-schema/v1/schema.schema.json
@@ -36,6 +36,16 @@
      "properties": {
        "parser_version": { "type": "string" },
        "chunker_version": { "type": "string" },
+        "active_parsers": {
+          "type": "array",
+          "items": { "type": "string" },
+          "description": "v0.20.1+ (Bug #13). 활성 parser version 전체 (DISTINCT, ORDER BY). 빈 corpus → []. backward-compat: optional, 기존 client 무영향."
+        },
+        "active_chunkers": {
+          "type": "array",
+          "items": { "type": "string" },
+          "description": "v0.20.1+ (Bug #13). 활성 chunker version 전체 (DISTINCT, ORDER BY). 빈 corpus → []."
+        },
        "embedding_version": { "type": "string" },
        "prompt_template_version": { "type": "string" },
        "index_version": { "type": "string" },
--- a/integrations/claude-code/kebab/SKILL.md
+++ b/integrations/claude-code/kebab/SKILL.md
@@ -152,7 +152,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across

 Before using streaming or multi-turn features, probe what this binary supports — call `mcp__kebab__schema` (or CLI `kebab schema --json`):

-Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.
+Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis + v0.20.1 `active_parsers` / `active_chunkers` arrays for multi-version corpora), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.

 ## Quick health check