From d9c7aabce17b2b10e6f35937ab8eb6b482888b49 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 23:15:58 +0000 Subject: [PATCH] feat(schema): add active_parsers + active_chunkers arrays to schema.v1.models (Bug #13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 이전: schema.v1.models 가 parser_version / chunker_version 단일 값만 보고 → multi-medium corpus (md + pdf + code Rust/Python + dockerfile + k8s + manifest) 의 version cascade audit 누락 risk. 이후: additive minor — Models struct 에 active_parsers + active_chunkers Vec 추가. backward compat: 기존 단일 field 보존 (markdown default), 신규 array 는 optional (#[serde(default)] + JSON schema required 미포함). source: - kebab_store_sqlite::fetch_distinct_parser_versions() 가 documents.parser_version DISTINCT + ORDER BY 반환. - fetch_distinct_chunker_versions() 가 chunks.chunker_version 동일 pattern. - collect_models 가 매 schema 호출마다 재계산 (cache 없음 — R-3 자동 해결). wire schema additive only — 메이저 bump 불필요. v0.20.1 minor 로 충분. integrations/claude-code/kebab/SKILL.md 동기 갱신. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/schema.rs | 12 ++++ .../kebab-app/tests/schema_active_versions.rs | 64 +++++++++++++++++++ crates/kebab-store-sqlite/src/store.rs | 45 +++++++++++++ docs/wire-schema/v1/schema.schema.json | 10 +++ integrations/claude-code/kebab/SKILL.md | 2 +- 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 crates/kebab-app/tests/schema_active_versions.rs diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index d99b1a6..aeca125 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -39,6 +39,14 @@ pub struct Capabilities { pub struct Models { pub parser_version: String, pub chunker_version: String, + /// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체. + /// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존. + #[serde(default)] + pub active_parsers: Vec<String>, + /// v0.20.1+ (Bug #13). 
Corpus 안 활성 chunker version 전체. + /// 빈 corpus → empty Vec. + #[serde(default)] + pub active_chunkers: Vec<String>, pub embedding_version: String, pub prompt_template_version: String, pub index_version: String, @@ -190,12 +198,16 @@ fn collect_stats( } fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models { + let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default(); + let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default(); Models { // markdown parser only — pdf-page-v1 (P7) / image extractors (P6) // maintain their own versions; surface those when SchemaV1.models // becomes a multi-medium map (P+). parser_version: kebab_parse_md::PARSER_VERSION.to_string(), chunker_version: cfg.chunking.chunker_version.clone(), + active_parsers, + active_chunkers, // EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan. embedding_version: cfg.models.embedding.model.clone(), prompt_template_version: cfg.rag.prompt_template_version.clone(), diff --git a/crates/kebab-app/tests/schema_active_versions.rs b/crates/kebab-app/tests/schema_active_versions.rs new file mode 100644 index 0000000..bd9d118 --- /dev/null +++ b/crates/kebab-app/tests/schema_active_versions.rs @@ -0,0 +1,64 @@ +//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers. 
+ +use kebab_app::schema_with_config; +use kebab_config::Config; +use kebab_core::SourceScope; + +fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { + let mut cfg = Config::defaults(); + cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.exclude.clear(); + cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); + cfg.models.embedding.provider = "none".to_string(); + cfg.models.embedding.dimensions = 0; + cfg.chunking.target_tokens = 80; + cfg.chunking.overlap_tokens = 20; + cfg +} + +fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope { + SourceScope { + root: workspace_root.to_path_buf(), + include: vec![], + exclude: vec![], + } +} + +#[test] +fn schema_models_active_arrays_empty_on_empty_corpus() { + let dir = tempfile::tempdir().unwrap(); + let workspace = dir.path().join("kb"); + std::fs::create_dir_all(&workspace).unwrap(); + let cfg = minimal_config(dir.path(), &workspace); + + let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + drop(store); + + let s = schema_with_config(&cfg).unwrap(); + assert!(s.models.active_parsers.is_empty(), "empty corpus → no parsers"); + assert!(s.models.active_chunkers.is_empty(), "empty corpus → no chunkers"); + // backward compat: 기존 단일 field 는 markdown default 보존. 
+ assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION); +} + +#[test] +fn schema_emits_active_parsers_and_chunkers_array_after_ingest() { + let dir = tempfile::tempdir().unwrap(); + let workspace = dir.path().join("kb"); + std::fs::create_dir_all(&workspace).unwrap(); + std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap(); + let cfg = minimal_config(dir.path(), &workspace); + let scope = minimal_scope(&workspace); + + kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap(); + + let s = schema_with_config(&cfg).unwrap(); + assert!(!s.models.active_parsers.is_empty(), "active_parsers populated after ingest"); + assert!(!s.models.active_chunkers.is_empty(), "active_chunkers populated after ingest"); + // active arrays must be sorted (ORDER BY in SQL). + let mut sorted = s.models.active_parsers.clone(); + sorted.sort(); + assert_eq!(s.models.active_parsers, sorted, "active_parsers must be sorted"); +} diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 23df426..df0fe7e 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -961,6 +961,51 @@ impl SqliteStore { } Ok(out) } + + /// p20-bugfix3 Bug #13: schema.v1.models.active_parsers 의 source. + /// `documents.parser_version` 컬럼의 DISTINCT 값을 정렬해 반환. + /// 빈 corpus → 빈 Vec. 
+ pub fn fetch_distinct_parser_versions(&self) -> anyhow::Result<Vec<String>> { + use anyhow::Context; + let conn = self.read_conn(); + let mut stmt = conn + .prepare( + "SELECT DISTINCT parser_version FROM documents \ + WHERE parser_version IS NOT NULL AND parser_version != '' \ + ORDER BY parser_version", + ) + .context("prepare fetch_distinct_parser_versions")?; + let rows = stmt + .query_map([], |row| row.get::<_, String>(0)) + .context("query fetch_distinct_parser_versions")?; + let mut out = Vec::new(); + for r in rows { + out.push(r.context("read parser_version row")?); + } + Ok(out) + } + + /// p20-bugfix3 Bug #13: schema.v1.models.active_chunkers 의 source. + /// `chunks.chunker_version` 컬럼의 DISTINCT 값을 정렬해 반환. + pub fn fetch_distinct_chunker_versions(&self) -> anyhow::Result<Vec<String>> { + use anyhow::Context; + let conn = self.read_conn(); + let mut stmt = conn + .prepare( + "SELECT DISTINCT chunker_version FROM chunks \ + WHERE chunker_version IS NOT NULL AND chunker_version != '' \ + ORDER BY chunker_version", + ) + .context("prepare fetch_distinct_chunker_versions")?; + let rows = stmt + .query_map([], |row| row.get::<_, String>(0)) + .context("query fetch_distinct_chunker_versions")?; + let mut out = Vec::new(); + for r in rows { + out.push(r.context("read chunker_version row")?); + } + Ok(out) + } } /// Apply the design §5 / task-spec pragmas. Called once per connection. diff --git a/docs/wire-schema/v1/schema.schema.json b/docs/wire-schema/v1/schema.schema.json index ace9371..dd23700 100644 --- a/docs/wire-schema/v1/schema.schema.json +++ b/docs/wire-schema/v1/schema.schema.json @@ -36,6 +36,16 @@ "properties": { "parser_version": { "type": "string" }, "chunker_version": { "type": "string" }, + "active_parsers": { + "type": "array", + "items": { "type": "string" }, + "description": "v0.20.1+ (Bug #13). 활성 parser version 전체 (DISTINCT, ORDER BY). 빈 corpus → []. backward-compat: optional, 기존 client 무영향." 
+ }, + "active_chunkers": { + "type": "array", + "items": { "type": "string" }, + "description": "v0.20.1+ (Bug #13). 활성 chunker version 전체 (DISTINCT, ORDER BY). 빈 corpus → []." + }, "embedding_version": { "type": "string" }, "prompt_template_version": { "type": "string" }, "index_version": { "type": "string" }, diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md index 497c398..2af5c74 100644 --- a/integrations/claude-code/kebab/SKILL.md +++ b/integrations/claude-code/kebab/SKILL.md @@ -152,7 +152,7 @@ Claude Code spawns `kebab mcp` at session start; the process stays alive across Before using streaming or multi-turn features, probe what this binary supports — call `mcp__kebab__schema` (or CLI `kebab schema --json`): -Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. `streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session. +Returns `schema.v1`: `wire.schemas` (supported wire ids), `capabilities` (bool flags — e.g. 
`streaming_ask`, `rag_multi_turn`), `models` (version cascade 6-axis + v0.20.1 `active_parsers` / `active_chunkers` arrays for multi-version corpora), `stats` (doc/chunk/asset count + last_ingest_at, plus p9-fb-37 health surface: `media_breakdown` per-kind doc counts (5 zero-padded keys: markdown / pdf / image / audio / other), `lang_breakdown` per BCP-47 lang (NULL keyed as the literal string `"null"`), `index_bytes.{sqlite,lancedb}` on-disk byte sums, `stale_doc_count` for docs older than `config.search.stale_threshold_days`). Gate streaming / session flows on `capabilities.streaming_ask` / `capabilities.rag_multi_turn` being `true`. Cheap call (no LLM), once per session. ## Quick health check