feat(schema): add active_parsers + active_chunkers arrays to schema.v1.models (Bug #13)

이전: schema.v1.models 가 parser_version / chunker_version 단일 값만 보고 →
multi-medium corpus (md + pdf + code Rust/Python + dockerfile + k8s + manifest)
의 version cascade audit 누락 risk.

이후: additive minor — Models struct 에 active_parsers + active_chunkers Vec<String>
추가. backward compat: 기존 단일 field 보존 (markdown default), 신규 array 는
optional (#[serde(default)] + JSON schema required 미포함).

source:
- kebab_store_sqlite::fetch_distinct_parser_versions() 가
  documents.parser_version DISTINCT + ORDER BY 반환.
- fetch_distinct_chunker_versions() 가 chunks.chunker_version 동일 pattern.
- collect_models 가 매 schema 호출마다 재계산 (cache 없음 — R-3 자동 해결).

wire schema additive only — 메이저 bump 불필요. v0.20.1 minor 로 충분.
integrations/claude-code/kebab/SKILL.md 동기 갱신.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 23:15:58 +00:00
parent 10b0e2f4f2
commit d9c7aabce1
5 changed files with 132 additions and 1 deletions

View File

@@ -39,6 +39,14 @@ pub struct Capabilities {
pub struct Models {
pub parser_version: String,
pub chunker_version: String,
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
#[serde(default)]
pub active_parsers: Vec<String>,
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
/// 빈 corpus → empty Vec.
#[serde(default)]
pub active_chunkers: Vec<String>,
pub embedding_version: String,
pub prompt_template_version: String,
pub index_version: String,
@@ -190,12 +198,16 @@ fn collect_stats(
}
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
Models {
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
// maintain their own versions; surface those when SchemaV1.models
// becomes a multi-medium map (P+).
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
chunker_version: cfg.chunking.chunker_version.clone(),
active_parsers,
active_chunkers,
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
embedding_version: cfg.models.embedding.model.clone(),
prompt_template_version: cfg.rag.prompt_template_version.clone(),

View File

@@ -0,0 +1,64 @@
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
use kebab_app::schema_with_config;
use kebab_config::Config;
use kebab_core::SourceScope;
/// Builds the smallest `Config` these tests need: start from `Config::defaults()`,
/// point the workspace and storage paths at the given temp directories, clear the
/// workspace exclude list, set the embedding provider to `"none"` with zero
/// dimensions, and use small chunking token sizes.
///
/// * `data_dir` - directory used for `storage.data_dir`; `storage.model_dir` is
///   its `models` subdirectory.
/// * `workspace_root` - directory used for `workspace.root`.
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
    // The config stores paths as owned strings; convert lossily in one place.
    let as_owned_string = |p: &std::path::Path| p.to_string_lossy().into_owned();

    let mut config = Config::defaults();

    // Workspace: rooted at the temp workspace, with nothing excluded.
    config.workspace.root = as_owned_string(workspace_root);
    config.workspace.exclude.clear();

    // Storage: everything lives under the temp data directory.
    config.storage.data_dir = as_owned_string(data_dir);
    config.storage.model_dir = as_owned_string(data_dir.join("models").as_path());

    // Embeddings: provider "none" with zero dimensions.
    config.models.embedding.provider = "none".to_string();
    config.models.embedding.dimensions = 0;

    // Chunking: small token targets for the tiny fixture documents.
    config.chunking.target_tokens = 80;
    config.chunking.overlap_tokens = 20;

    config
}
/// Builds a `SourceScope` rooted at `workspace_root` with empty include and
/// exclude lists.
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
    let root = workspace_root.to_path_buf();
    SourceScope {
        root,
        include: Vec::new(),
        exclude: Vec::new(),
    }
}
/// With a migrated but empty store, `schema.v1.models` must report empty
/// `active_parsers` / `active_chunkers` arrays while the legacy single-value
/// `parser_version` field still carries the markdown parser version.
#[test]
fn schema_models_active_arrays_empty_on_empty_corpus() {
    let tmp = tempfile::tempdir().unwrap();
    let kb_root = tmp.path().join("kb");
    std::fs::create_dir_all(&kb_root).unwrap();
    let cfg = minimal_config(tmp.path(), &kb_root);

    // Create the database and run migrations, then release the store handle
    // before the schema call opens its own.
    {
        let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
    }

    let schema = schema_with_config(&cfg).unwrap();
    let models = &schema.models;

    assert!(models.active_parsers.is_empty(), "empty corpus → no parsers");
    assert!(models.active_chunkers.is_empty(), "empty corpus → no chunkers");
    // Backward compatibility: the legacy single field keeps the markdown default.
    assert_eq!(models.parser_version, kebab_parse_md::PARSER_VERSION);
}
/// After ingesting one markdown document, `schema.v1.models` must report
/// non-empty `active_parsers` and `active_chunkers` arrays, and each array must
/// be sorted and free of duplicates.
///
/// Both arrays are produced by `SELECT DISTINCT … ORDER BY …` queries in the
/// store, so the ordering/uniqueness check is applied to both of them rather
/// than to `active_parsers` alone.
#[test]
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
    let dir = tempfile::tempdir().unwrap();
    let workspace = dir.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
    let cfg = minimal_config(dir.path(), &workspace);
    let scope = minimal_scope(&workspace);
    kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();

    let s = schema_with_config(&cfg).unwrap();
    assert!(!s.models.active_parsers.is_empty(), "active_parsers populated after ingest");
    assert!(!s.models.active_chunkers.is_empty(), "active_chunkers populated after ingest");

    // Verify the DISTINCT + ORDER BY contract on BOTH arrays: sorting and
    // de-duplicating a copy must leave it identical to what the schema returned.
    let arrays = [
        ("active_parsers", &s.models.active_parsers),
        ("active_chunkers", &s.models.active_chunkers),
    ];
    for (name, versions) in arrays {
        let mut expected = versions.clone();
        expected.sort();
        expected.dedup();
        assert_eq!(
            versions, &expected,
            "{} must be sorted and free of duplicates",
            name
        );
    }
}

View File

@@ -961,6 +961,51 @@ impl SqliteStore {
}
Ok(out)
}
/// p20-bugfix3 Bug #13: data source for `schema.v1.models.active_parsers`.
///
/// Returns the DISTINCT values of the `documents.parser_version` column,
/// skipping NULL and empty-string entries, in the order produced by the SQL
/// `ORDER BY parser_version`. An empty corpus yields an empty `Vec`.
///
/// # Errors
///
/// Returns an error (with context naming the failing step) if the statement
/// cannot be prepared, the query cannot be started, or a row cannot be read
/// as a `String`.
pub fn fetch_distinct_parser_versions(&self) -> anyhow::Result<Vec<String>> {
    use anyhow::Context;

    const SQL: &str = "SELECT DISTINCT parser_version FROM documents \
                       WHERE parser_version IS NOT NULL AND parser_version != '' \
                       ORDER BY parser_version";

    let conn = self.read_conn();
    let mut stmt = conn
        .prepare(SQL)
        .context("prepare fetch_distinct_parser_versions")?;

    // Collecting into `Result<Vec<_>>` stops at the first row that fails to
    // decode. The result is bound to a local so the row iterator's borrow of
    // `stmt` ends before `stmt` is dropped.
    let versions = stmt
        .query_map([], |row| row.get::<_, String>(0))
        .context("query fetch_distinct_parser_versions")?
        .map(|row| row.context("read parser_version row"))
        .collect::<anyhow::Result<Vec<String>>>();
    versions
}
/// p20-bugfix3 Bug #13: data source for `schema.v1.models.active_chunkers`.
///
/// Returns the DISTINCT values of the `chunks.chunker_version` column,
/// skipping NULL and empty-string entries, in the order produced by the SQL
/// `ORDER BY chunker_version`. When no rows match, the result is an empty `Vec`.
///
/// # Errors
///
/// Returns an error (with context naming the failing step) if the statement
/// cannot be prepared, the query cannot be started, or a row cannot be read
/// as a `String`.
pub fn fetch_distinct_chunker_versions(&self) -> anyhow::Result<Vec<String>> {
    use anyhow::Context;

    const SQL: &str = "SELECT DISTINCT chunker_version FROM chunks \
                       WHERE chunker_version IS NOT NULL AND chunker_version != '' \
                       ORDER BY chunker_version";

    let conn = self.read_conn();
    let mut stmt = conn
        .prepare(SQL)
        .context("prepare fetch_distinct_chunker_versions")?;

    // Collecting into `Result<Vec<_>>` stops at the first row that fails to
    // decode. The result is bound to a local so the row iterator's borrow of
    // `stmt` ends before `stmt` is dropped.
    let versions = stmt
        .query_map([], |row| row.get::<_, String>(0))
        .context("query fetch_distinct_chunker_versions")?
        .map(|row| row.context("read chunker_version row"))
        .collect::<anyhow::Result<Vec<String>>>();
    versions
}
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.

View File

@@ -36,6 +36,16 @@
"properties": {
"parser_version": { "type": "string" },
"chunker_version": { "type": "string" },
"active_parsers": {
"type": "array",
"items": { "type": "string" },
"description": "v0.20.1+ (Bug #13). 활성 parser version 전체 (DISTINCT, ORDER BY). 빈 corpus → []. backward-compat: optional, 기존 client 무영향."
},
"active_chunkers": {
"type": "array",
"items": { "type": "string" },
"description": "v0.20.1+ (Bug #13). 활성 chunker version 전체 (DISTINCT, ORDER BY). 빈 corpus → []."
},
"embedding_version": { "type": "string" },
"prompt_template_version": { "type": "string" },
"index_version": { "type": "string" },

View File

@@ -152,7 +152,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across
Before using streaming or multi-turn features, probe what this binary supports — call `mcp__kebab__schema` (or CLI `kebab schema --json`):
Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.
Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis + v0.20.1 `active_parsers` / `active_chunkers` arrays for multi-version corpora), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.
## Quick health check