Merge pull request 'feat: v0.17.0 PR-C — code_lang_chunk_breakdown additive wire field' (#161) from feat/code-lang-chunk-breakdown into main

Reviewed-on: #161
2026-05-24 20:35:28 +00:00
parent ff9d5f5f86 13a3361ba2
commit 9ee89c2a94
4 changed files with 188 additions and 6 deletions
--- a/crates/kebab-app/src/schema.rs
+++ b/crates/kebab-app/src/schema.rs
@@ -63,14 +63,26 @@ pub struct Stats {
    /// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
    #[serde(default)]
    pub stale_doc_count: u64,
-    /// p10-1A-1: code language breakdown (chunk counts by canonical lowercase
-    /// language identifier). Empty until 1A-2 produces code chunks.
+    /// p10-1A-1: code language breakdown (**doc** counts by canonical
+    /// lowercase language identifier). Empty until 1A-2 produces code
+    /// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
+    /// previous "chunk counts" wording was a longstanding mis-label —
+    /// implementation has always been `COUNT(*) FROM documents
+    /// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
+    /// chunk-level companion.
    #[serde(default)]
    pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
-    /// p10-1A-1: repo breakdown (chunk counts by `metadata.repo` value).
-    /// Empty until 1A-2 produces code chunks.
+    /// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
+    /// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
+    /// doc-count wording corrected (mirror of code_lang_breakdown).
    #[serde(default)]
    pub repo_breakdown: std::collections::BTreeMap<String, u32>,
+    /// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
+    /// chunk counts instead of doc counts. Indexing-pressure metric —
+    /// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
+    /// here in a way `code_lang_breakdown` (doc count) hides.
+    #[serde(default)]
+    pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
 }

 const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
@@ -171,6 +183,9 @@ fn collect_stats(
        // p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
        // placeholder — mirror of code_lang_breakdown for the repo field.
        repo_breakdown: store.repo_breakdown()?,
+        // v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
+        // 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
+        code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
    })
 }

@@ -210,6 +225,11 @@ mod tests_stats_ext {
            v.get("repo_breakdown").is_some(),
            "Stats JSON must include repo_breakdown: {v}"
        );
+        // v0.17.0 PR-C: chunk-level companion field.
+        assert!(
+            v.get("code_lang_chunk_breakdown").is_some(),
+            "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
+        );
        // Empty BTreeMap serializes as `{}` — confirm it's an object, not null.
        assert!(
            v["code_lang_breakdown"].is_object(),
@@ -219,6 +239,10 @@ mod tests_stats_ext {
            v["repo_breakdown"].is_object(),
            "repo_breakdown must be an object: {v}"
        );
+        assert!(
+            v["code_lang_chunk_breakdown"].is_object(),
+            "code_lang_chunk_breakdown must be an object: {v}"
+        );
    }

    #[test]
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -892,6 +892,45 @@ impl SqliteStore {
        Ok(out)
    }

+    /// v0.17.0 PR-C: per-code-language **chunk** count for
+    /// `schema.v1.stats`. Companion to [`Self::code_lang_breakdown`],
+    /// which returns *document* counts; this chunk-level view exposes
+    /// indexing pressure (one large source file → 200 chunks vs one
+    /// small file → 5 chunks, yet each is a single document).
+    ///
+    /// Joins `chunks → documents`, groups by the document's
+    /// `json_extract(metadata_json, '$.code_lang')`, and skips chunks
+    /// whose document has no `code_lang` (SQL NULL / JSON null).
+    /// Errors if the query fails, a language is not text, or a count
+    /// does not fit in `u32` (previously truncated silently by `as`).
+    pub fn code_lang_chunk_breakdown(
+        &self,
+    ) -> anyhow::Result<std::collections::BTreeMap<String, u32>> {
+        use anyhow::Context;
+        let conn = self.read_conn();
+        let mut stmt = conn
+            .prepare(
+                "SELECT json_extract(d.metadata_json, '$.code_lang') AS cl, \
+                        COUNT(c.chunk_id) \
+                 FROM chunks c \
+                 INNER JOIN documents d ON c.doc_id = d.doc_id \
+                 WHERE cl IS NOT NULL \
+                 GROUP BY cl",
+            )
+            .context("prepare code_lang_chunk_breakdown")?;
+        // Read the count as `u32` directly: rusqlite range-checks the
+        // conversion and reports an error instead of wrapping like `as`.
+        let rows = stmt
+            .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u32>(1)?)))
+            .context("query code_lang_chunk_breakdown")?;
+        let mut out = std::collections::BTreeMap::new();
+        for row in rows {
+            let (k, v) = row.context("read code_lang_chunk_breakdown row")?;
+            out.insert(k, v);
+        }
+        Ok(out)
+    }
+
    /// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for
    /// `schema.v1`.
    ///
@@ -1041,6 +1080,108 @@ mod tests {
        assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
    }

+    /// v0.17.0 PR-C: `code_lang_chunk_breakdown` counts *chunks* (not
+    /// docs) grouped by `documents.metadata_json.code_lang`. Fixture:
+    /// one Rust doc with 3 chunks plus one doc whose `code_lang` is
+    /// JSON null with 1 chunk. Expected: `rust=3` from the chunk helper,
+    /// `rust=1` from `code_lang_breakdown`, and no entry for the null doc.
+    ///
+    /// Uses a side rusqlite connection (FK enforcement off) so a single
+    /// doc + multiple chunks fixture can be inserted without standing
+    /// up `assets` companions.
+    #[test]
+    fn code_lang_chunk_breakdown_counts_chunks_not_docs() {
+        let (dir, store) = open_fresh_store();
+        let db_path = dir.path().join("kebab.sqlite");
+        let conn = rusqlite::Connection::open(&db_path).unwrap();
+        conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
+
+        // 1 Rust doc with 3 chunks → chunk breakdown rust=3, doc breakdown rust=1.
+        conn.execute(
+            "INSERT INTO documents (
+                doc_id, asset_id, workspace_path,
+                source_type, trust_level, parser_version,
+                doc_version, schema_version,
+                metadata_json, provenance_json,
+                created_at, updated_at
+            ) VALUES (
+                'doc-rust-1', 'asset-1', 'src/main.rs',
+                'reference', 'primary', 'test-v1',
+                1, 1,
+                '{\"code_lang\":\"rust\"}', '{}',
+                '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
+            )",
+            [],
+        )
+        .unwrap();
+        for i in 0..3u32 {
+            conn.execute(
+                "INSERT INTO chunks (
+                    chunk_id, doc_id, text, heading_path_json, section_label,
+                    source_spans_json, token_estimate, chunker_version,
+                    policy_hash, block_ids_json, created_at
+                ) VALUES (?, 'doc-rust-1', ?, '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')",
+                rusqlite::params![format!("rust-chunk-{i:0>26}"), format!("body {i}")],
+            )
+            .unwrap();
+        }
+
+        // 1 markdown doc with 1 chunk; its code_lang is JSON null, so it must not be counted.
+        conn.execute(
+            "INSERT INTO documents (
+                doc_id, asset_id, workspace_path,
+                source_type, trust_level, parser_version,
+                doc_version, schema_version,
+                metadata_json, provenance_json,
+                created_at, updated_at
+            ) VALUES (
+                'doc-md-1', 'asset-2', 'notes/readme.md',
+                'markdown', 'primary', 'test-v1',
+                1, 1,
+                '{\"code_lang\":null}', '{}',
+                '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
+            )",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+            ) VALUES ('md-chunk-00000000000000000000000', 'doc-md-1', 'm', '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')",
+            [],
+        )
+        .unwrap();
+
+        drop(conn);
+
+        let chunk_bd = store.code_lang_chunk_breakdown().unwrap();
+        assert_eq!(
+            chunk_bd.get("rust"),
+            Some(&3u32),
+            "expected rust=3 chunks (1 doc × 3 chunks): {chunk_bd:?}"
+        );
+        assert!(
+            !chunk_bd.contains_key("null"),
+            "null code_lang must be skipped: {chunk_bd:?}"
+        );
+        assert_eq!(
+            chunk_bd.len(),
+            1,
+            "expected exactly 1 language entry: {chunk_bd:?}"
+        );
+
+        // Sanity: the doc-count helper reports rust=1 for the same fixture,
+        // proving the two metrics differ as intended (3 chunks vs 1 doc).
+        let doc_bd = store.code_lang_breakdown().unwrap();
+        assert_eq!(
+            doc_bd.get("rust"),
+            Some(&1u32),
+            "doc-count helper unchanged: {doc_bd:?}"
+        );
+    }
+
    /// p10-1A-2 follow-up: `repo_breakdown` counts docs by
    /// `metadata_json.repo`.
    ///
--- a/docs/wire-schema/v1/schema.schema.json
+++ b/docs/wire-schema/v1/schema.schema.json
@@ -81,12 +81,17 @@
        },
        "code_lang_breakdown": {
          "type": "object",
-          "description": "p10-1A-1: per-language code chunk count. Key = lowercase language name (e.g. 'rust', 'python'). Populated after 1A-2 lands; empty on markdown-only corpora.",
+          "description": "p10-1A-1: per-language **doc** count (each indexed code document counts once, however many chunks it produced). Key = lowercase language name (e.g. 'rust', 'python'). Empty on markdown-only corpora. Pair with `code_lang_chunk_breakdown` for chunk-level granularity (a file split into 200 chunks adds 200 there but only 1 here).",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        },
        "repo_breakdown": {
          "type": "object",
-          "description": "p10-1A-1: per-repo code chunk count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.",
+          "description": "p10-1A-1: per-repo **doc** count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.",
+          "additionalProperties": { "type": "integer", "minimum": 0 }
+        },
+        "code_lang_chunk_breakdown": {
+          "type": "object",
+          "description": "v0.17.0 PR-C: per-language **chunk** count (closes HOTFIXES 2026-05-22 'code_lang_breakdown chunk granularity'). Companion to `code_lang_breakdown` (doc count) — chunk-level granularity is the indexing-pressure metric (a 200-chunk source file and a 5-chunk source file each count as 1 doc there, but contribute 200 and 5 here). Only chunks whose document has a non-null `code_lang` are counted. Key = lowercase language name. Empty on markdown-only corpora.",
          "additionalProperties": { "type": "integer", "minimum": 0 }
        }
      }
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -78,6 +78,18 @@ Cross-link: `tasks/p10/INDEX.md`, `migrations/V002__fts.sql`, design §5.5 / §3

 Cross-link: `crates/kebab-parse-code/src/c.rs::recover_typedef_alias`, `tasks/p10/p10-1d-c-cpp-ast-chunker.md` Risks/notes section.

+## 2026-05-24 — v0.17.0 PR-C: `code_lang_chunk_breakdown` additive wire 필드 (closure of 2026-05-22 LOW)
+
+`schema.v1.stats` 에 `code_lang_chunk_breakdown: { <lang>: <chunk_count> }` additive 필드 추가. 기존 `code_lang_breakdown` (doc 수) 와 sister — chunk 수 집계로 indexing 압력 granularity 노출. 한 PDF spec → 200 chunks vs 한 Rust file → 5 chunks 가 동일한 `1 doc` 으로 보이던 한계 closure.
+
+**구현**: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown()` — `chunks INNER JOIN documents` 후 `json_extract(d.metadata_json, '$.code_lang')` GROUP BY, `COUNT(c.chunk_id)`. `BTreeMap<String, u32>` 반환 (기존 helper 와 동일 shape). `crates/kebab-app/src/schema.rs::Stats` 에 동일 이름 필드 추가 + `collect_stats` builder 에서 호출. `docs/wire-schema/v1/schema.schema.json` 에 additive 필드 명세. **additive 변경 — wire breaking 아님, `schema_version` bump 불필요.**
+
+**Gemini round 2 권고 반영**: 기존 `code_lang_breakdown` / `repo_breakdown` 의 JSON schema description 이 "code chunk count" 로 잘못 적혀 있던 (실제는 doc count) 부분을 "doc count" 로 정정. 신규 필드만 "chunk count" 로 명시. 사용자가 두 metric 의 의미 차이를 schema 만 보고도 구분 가능.
+
+**사용자 영향**: `kebab schema --json` 출력에 신규 키 등장. MCP `schema` tool 도 동일. 옛 v0.16.x 가 보낸 호출은 그대로 동작 (additive).
+
+Cross-link: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown`, `docs/wire-schema/v1/schema.schema.json`.
+
 ## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision

 **Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패.