diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index 8e085ec..5026c95 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -71,6 +71,12 @@ pub struct Stats { /// Empty until 1A-2 produces code chunks. #[serde(default)] pub repo_breakdown: std::collections::BTreeMap, + /// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning + /// chunk counts instead of doc counts. Indexing-pressure metric — + /// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up + /// here in a way `code_lang_breakdown` (doc count) hides. + #[serde(default)] + pub code_lang_chunk_breakdown: std::collections::BTreeMap, } const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -171,6 +177,9 @@ fn collect_stats( // p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a // placeholder — mirror of code_lang_breakdown for the repo field. repo_breakdown: store.repo_breakdown()?, + // v0.17.0 PR-C: chunk-level companion (closes HOTFIXES + // 2026-05-22 "code_lang_breakdown chunk granularity" LOW). + code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?, }) } @@ -210,6 +219,11 @@ mod tests_stats_ext { v.get("repo_breakdown").is_some(), "Stats JSON must include repo_breakdown: {v}" ); + // v0.17.0 PR-C: chunk-level companion field. + assert!( + v.get("code_lang_chunk_breakdown").is_some(), + "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}" + ); // Empty BTreeMap serializes as `{}` — confirm it's an object, not null. 
assert!( v["code_lang_breakdown"].is_object(), @@ -219,6 +233,10 @@ mod tests_stats_ext { v["repo_breakdown"].is_object(), "repo_breakdown must be an object: {v}" ); + assert!( + v["code_lang_chunk_breakdown"].is_object(), + "code_lang_chunk_breakdown must be an object: {v}" + ); } #[test] diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index c1e3a9b..b1f54e1 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -892,6 +892,45 @@ impl SqliteStore { Ok(out) } + /// v0.17.0 PR-C: per-code-language **chunk** count for + /// `schema.v1.stats`. Companion to [`Self::code_lang_breakdown`] — + /// that one returns *document* counts. Stats observers wanting + /// indexing-pressure granularity (a single PDF spec → 200 chunks, + /// vs a single Rust file → 5 chunks) need the chunk-level view. + /// + /// SQL joins `chunks → documents`, reads + /// `json_extract(metadata_json, '$.code_lang')` on the doc side, groups + /// by the language, and skips rows where it is NULL. Returns + /// `BTreeMap<String, u32>` mirroring the doc-count helper above; a count + /// that does not fit in `u32` is reported as an error, not truncated. + pub fn code_lang_chunk_breakdown( + &self, + ) -> anyhow::Result<std::collections::BTreeMap<String, u32>> { + use anyhow::Context; + let conn = self.read_conn(); + let mut stmt = conn + .prepare( + "SELECT json_extract(d.metadata_json, '$.code_lang') AS cl, \ + COUNT(c.chunk_id) \ + FROM chunks c \ + INNER JOIN documents d ON c.doc_id = d.doc_id \ + WHERE cl IS NOT NULL \ + GROUP BY cl", + ) + .context("prepare code_lang_chunk_breakdown")?; + let rows = stmt + .query_map([], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, u32>(1)?))
+ }) + .context("query code_lang_chunk_breakdown")?; + let mut out = std::collections::BTreeMap::new(); + for row in rows { + let (k, v) = row.context("read code_lang_chunk_breakdown row")?; + out.insert(k, v); + } + Ok(out) + } + /// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for /// `schema.v1`. /// @@ -1041,6 +1080,108 @@ mod tests { assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}"); } + /// v0.17.0 PR-C: `code_lang_chunk_breakdown` counts *chunks* (not + /// docs) grouped by `documents.metadata_json.code_lang`. Differs + /// from `code_lang_breakdown` (doc count) by joining `chunks` and + /// summing chunk rows so one Rust file with 3 chunks reports + /// `rust=3` here vs `rust=1` in the doc-count helper. + /// + /// Uses a side rusqlite connection (FK enforcement off) so a single + /// doc + multiple chunks fixture can be inserted without standing + /// up `assets` companions. + #[test] + fn code_lang_chunk_breakdown_counts_chunks_not_docs() { + let (dir, store) = open_fresh_store(); + let db_path = dir.path().join("kebab.sqlite"); + let conn = rusqlite::Connection::open(&db_path).unwrap(); + conn.pragma_update(None, "foreign_keys", "OFF").unwrap(); + + // 1 Rust doc + 3 chunks → chunk_breakdown rust=3 / doc_breakdown rust=1. 
+ conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, + source_type, trust_level, parser_version, + doc_version, schema_version, + metadata_json, provenance_json, + created_at, updated_at + ) VALUES ( + 'doc-rust-1', 'asset-1', 'src/main.rs', + 'reference', 'primary', 'test-v1', + 1, 1, + '{\"code_lang\":\"rust\"}', '{}', + '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z' + )", + [], + ) + .unwrap(); + for i in 0..3u32 { + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, 'doc-rust-1', ?, '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')", + rusqlite::params![format!("rust-chunk-{i:0>26}"), format!("body {i}")], + ) + .unwrap(); + } + + // 1 markdown doc + 1 chunk; its JSON `null` code_lang comes back from json_extract as SQL NULL, so the `IS NOT NULL` filter must skip it. + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, + source_type, trust_level, parser_version, + doc_version, schema_version, + metadata_json, provenance_json, + created_at, updated_at + ) VALUES ( + 'doc-md-1', 'asset-2', 'notes/readme.md', + 'markdown', 'primary', 'test-v1', + 1, 1, + '{\"code_lang\":null}', '{}', + '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z' + )", + [], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES ('md-chunk-00000000000000000000000', 'doc-md-1', 'm', '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')", + [], + ) + .unwrap(); + + drop(conn); + + let chunk_bd = store.code_lang_chunk_breakdown().unwrap(); + assert_eq!( + chunk_bd.get("rust"), + Some(&3u32), + "expected rust=3 chunks (1 doc × 3 chunks): {chunk_bd:?}" + ); + assert!( + !chunk_bd.contains_key("null"), + "null code_lang must be skipped: {chunk_bd:?}" + ); + assert_eq!( 
chunk_bd.len(), + 1, + "expected exactly 1 language entry: {chunk_bd:?}" + ); + + // Sanity: the existing doc-count helper still returns 1 for rust, + // proving the two metrics differ as intended. + let doc_bd = store.code_lang_breakdown().unwrap(); + assert_eq!( + doc_bd.get("rust"), + Some(&1u32), + "doc-count helper unchanged: {doc_bd:?}" + ); + } + /// p10-1A-2 follow-up: `repo_breakdown` counts docs by /// `metadata_json.repo`. /// diff --git a/docs/wire-schema/v1/schema.schema.json b/docs/wire-schema/v1/schema.schema.json index 6e610b1..ace9371 100644 --- a/docs/wire-schema/v1/schema.schema.json +++ b/docs/wire-schema/v1/schema.schema.json @@ -81,12 +81,17 @@ }, "code_lang_breakdown": { "type": "object", - "description": "p10-1A-1: per-language code chunk count. Key = lowercase language name (e.g. 'rust', 'python'). Populated after 1A-2 lands; empty on markdown-only corpora.", + "description": "p10-1A-1: per-language **doc** count (one entry per indexed code document). Key = lowercase language name (e.g. 'rust', 'python'). Empty on markdown-only corpora. Pair with `code_lang_chunk_breakdown` for chunk-level granularity (one file's 200 chunks vs one doc).", "additionalProperties": { "type": "integer", "minimum": 0 } }, "repo_breakdown": { "type": "object", - "description": "p10-1A-1: per-repo code chunk count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.", + "description": "p10-1A-1: per-repo **doc** count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.", + "additionalProperties": { "type": "integer", "minimum": 0 } + }, + "code_lang_chunk_breakdown": { + "type": "object", + "description": "v0.17.0 PR-C: per-language **chunk** count (closes HOTFIXES 2026-05-22 'code_lang_breakdown chunk granularity'). 
Companion to `code_lang_breakdown` (doc count) — chunk-level granularity is the indexing-pressure metric (a 200-chunk PDF + a 5-chunk Rust file both appear as `1 doc` but `200` vs `5` chunks). Key = lowercase language name. Empty on markdown-only corpora.", "additionalProperties": { "type": "integer", "minimum": 0 } } } diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 598d0c7..944cb8d 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -78,6 +78,18 @@ Cross-link: `tasks/p10/INDEX.md`, `migrations/V002__fts.sql`, design §5.5 / §3 Cross-link: `crates/kebab-parse-code/src/c.rs::recover_typedef_alias`, `tasks/p10/p10-1d-c-cpp-ast-chunker.md` Risks/notes section. +## 2026-05-24 — v0.17.0 PR-C: `code_lang_chunk_breakdown` additive wire 필드 (closure of 2026-05-22 LOW) + +`schema.v1.stats` 에 `code_lang_chunk_breakdown: { <lang>: <chunk_count> }` additive 필드 추가. 기존 `code_lang_breakdown` (doc 수) 와 sister — chunk 수 집계로 indexing 압력 granularity 노출. 한 PDF spec → 200 chunks vs 한 Rust file → 5 chunks 가 동일한 `1 doc` 으로 보이던 한계 closure. + +**구현**: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown()` — `chunks INNER JOIN documents` 후 `json_extract(d.metadata_json, '$.code_lang')` GROUP BY, `COUNT(c.chunk_id)`. `BTreeMap<String, u32>` 반환 (기존 helper 와 동일 shape). `crates/kebab-app/src/schema.rs::Stats` 에 동일 이름 필드 추가 + `collect_stats` builder 에서 호출. `docs/wire-schema/v1/schema.schema.json` 에 additive 필드 명세. **additive 변경 — wire breaking 아님, `schema_version` bump 불필요.** + +**Gemini round 2 권고 반영**: 기존 `code_lang_breakdown` / `repo_breakdown` 의 JSON schema description 이 "code chunk count" 로 잘못 적혀 있던 (실제는 doc count) 부분을 "doc count" 로 정정. 신규 필드만 "chunk count" 로 명시. 사용자가 두 metric 의 의미 차이를 schema 만 보고도 구분 가능. + +**사용자 영향**: `kebab schema --json` 출력에 신규 키 등장. MCP `schema` tool 도 동일. 옛 v0.16.x 가 보낸 호출은 그대로 동작 (additive). + +Cross-link: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown`, `docs/wire-schema/v1/schema.schema.json`. 
+ ## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision **Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패.