feat(schema): add active_parsers + active_chunkers arrays to schema.v1.models (Bug #13)
이전: schema.v1.models 가 parser_version / chunker_version 단일 값만 보고 → multi-medium corpus (md + pdf + code Rust/Python + dockerfile + k8s + manifest) 의 version cascade audit 누락 risk. 이후: additive minor — Models struct 에 active_parsers + active_chunkers Vec<String> 추가. backward compat: 기존 단일 field 보존 (markdown default), 신규 array 는 optional (#[serde(default)] + JSON schema required 미포함). source: - kebab_store_sqlite::fetch_distinct_parser_versions() 가 documents.parser_version DISTINCT + ORDER BY 반환. - fetch_distinct_chunker_versions() 가 chunks.chunker_version 동일 pattern. - collect_models 가 매 schema 호출마다 재계산 (cache 없음 — R-3 자동 해결). wire schema additive only — 메이저 bump 불필요. v0.20.1 minor 로 충분. integrations/claude-code/kebab/SKILL.md 동기 갱신. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -39,6 +39,14 @@ pub struct Capabilities {
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
||||
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
||||
#[serde(default)]
|
||||
pub active_parsers: Vec<String>,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
||||
/// 빈 corpus → empty Vec.
|
||||
#[serde(default)]
|
||||
pub active_chunkers: Vec<String>,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
@@ -190,12 +198,16 @@ fn collect_stats(
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
||||
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
||||
Models {
|
||||
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
|
||||
64
crates/kebab-app/tests/schema_active_versions.rs
Normal file
64
crates/kebab-app/tests/schema_active_versions.rs
Normal file
@@ -0,0 +1,64 @@
|
||||
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
|
||||
|
||||
use kebab_app::schema_with_config;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
|
||||
SourceScope {
|
||||
root: workspace_root.to_path_buf(),
|
||||
include: vec![],
|
||||
exclude: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Bug #13: on a migrated but empty store, `schema.v1.models.active_parsers`
/// and `active_chunkers` must both be empty, while the legacy single-value
/// fields keep reporting their defaults (backward compatibility).
#[test]
fn schema_models_active_arrays_empty_on_empty_corpus() {
    let dir = tempfile::tempdir().unwrap();
    let workspace = dir.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let cfg = minimal_config(dir.path(), &workspace);

    // Create the database and run migrations without ingesting anything.
    // The store handle is dropped at the end of this scope, before the
    // schema call below.
    {
        let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
    }

    let s = schema_with_config(&cfg).unwrap();
    assert!(s.models.active_parsers.is_empty(), "empty corpus → no parsers");
    assert!(s.models.active_chunkers.is_empty(), "empty corpus → no chunkers");

    // Backward compat: the pre-existing single-value fields are still
    // populated — parser from the markdown parser constant, chunker from
    // the config value.
    assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
    assert_eq!(
        s.models.chunker_version, cfg.chunking.chunker_version,
        "legacy chunker_version must still mirror the config value"
    );
}
|
||||
|
||||
/// Bug #13: after ingesting one markdown file, both `active_parsers` and
/// `active_chunkers` must be populated, and each array must come back sorted
/// and free of duplicates (the store queries use `DISTINCT` + `ORDER BY`).
#[test]
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
    /// Asserts that `versions` is in ascending order with no repeated entries.
    fn assert_sorted_unique(label: &str, versions: &[String]) {
        let mut expected = versions.to_vec();
        expected.sort();
        expected.dedup();
        assert_eq!(
            versions,
            expected.as_slice(),
            "{} must be sorted and de-duplicated",
            label
        );
    }

    let dir = tempfile::tempdir().unwrap();
    let workspace = dir.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
    let cfg = minimal_config(dir.path(), &workspace);
    let scope = minimal_scope(&workspace);

    kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();

    let s = schema_with_config(&cfg).unwrap();
    assert!(!s.models.active_parsers.is_empty(), "active_parsers populated after ingest");
    assert!(!s.models.active_chunkers.is_empty(), "active_chunkers populated after ingest");

    // Both arrays are produced by the same DISTINCT + ORDER BY pattern, so
    // both get the same ordering / uniqueness check.
    assert_sorted_unique("active_parsers", &s.models.active_parsers);
    assert_sorted_unique("active_chunkers", &s.models.active_chunkers);
}
|
||||
@@ -961,6 +961,51 @@ impl SqliteStore {
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// p20-bugfix3 Bug #13: schema.v1.models.active_parsers 의 source.
|
||||
/// `documents.parser_version` 컬럼의 DISTINCT 값을 정렬해 반환.
|
||||
/// 빈 corpus → 빈 Vec.
|
||||
pub fn fetch_distinct_parser_versions(&self) -> anyhow::Result<Vec<String>> {
|
||||
use anyhow::Context;
|
||||
let conn = self.read_conn();
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT DISTINCT parser_version FROM documents \
|
||||
WHERE parser_version IS NOT NULL AND parser_version != '' \
|
||||
ORDER BY parser_version",
|
||||
)
|
||||
.context("prepare fetch_distinct_parser_versions")?;
|
||||
let rows = stmt
|
||||
.query_map([], |row| row.get::<_, String>(0))
|
||||
.context("query fetch_distinct_parser_versions")?;
|
||||
let mut out = Vec::new();
|
||||
for r in rows {
|
||||
out.push(r.context("read parser_version row")?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// p20-bugfix3 Bug #13: schema.v1.models.active_chunkers 의 source.
|
||||
/// `chunks.chunker_version` 컬럼의 DISTINCT 값을 정렬해 반환.
|
||||
pub fn fetch_distinct_chunker_versions(&self) -> anyhow::Result<Vec<String>> {
|
||||
use anyhow::Context;
|
||||
let conn = self.read_conn();
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT DISTINCT chunker_version FROM chunks \
|
||||
WHERE chunker_version IS NOT NULL AND chunker_version != '' \
|
||||
ORDER BY chunker_version",
|
||||
)
|
||||
.context("prepare fetch_distinct_chunker_versions")?;
|
||||
let rows = stmt
|
||||
.query_map([], |row| row.get::<_, String>(0))
|
||||
.context("query fetch_distinct_chunker_versions")?;
|
||||
let mut out = Vec::new();
|
||||
for r in rows {
|
||||
out.push(r.context("read chunker_version row")?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the design §5 / task-spec pragmas. Called once per connection.
|
||||
|
||||
@@ -36,6 +36,16 @@
|
||||
"properties": {
|
||||
"parser_version": { "type": "string" },
|
||||
"chunker_version": { "type": "string" },
|
||||
"active_parsers": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" },
|
||||
"description": "v0.20.1+ (Bug #13). 활성 parser version 전체 (DISTINCT, ORDER BY). 빈 corpus → []. backward-compat: optional, 기존 client 무영향."
|
||||
},
|
||||
"active_chunkers": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" },
|
||||
"description": "v0.20.1+ (Bug #13). 활성 chunker version 전체 (DISTINCT, ORDER BY). 빈 corpus → []."
|
||||
},
|
||||
"embedding_version": { "type": "string" },
|
||||
"prompt_template_version": { "type": "string" },
|
||||
"index_version": { "type": "string" },
|
||||
|
||||
@@ -152,7 +152,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across
|
||||
|
||||
Before using streaming or multi-turn features, probe what this binary supports — call `mcp__kebab__schema` (or CLI `kebab schema --json`):
|
||||
|
||||
Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.
|
||||
Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis + v0.20.1 `active_parsers` / `active_chunkers` arrays for multi-version corpora), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session.
|
||||
|
||||
## Quick health check
|
||||
|
||||
|
||||
Reference in New Issue
Block a user