feat: v0.17.0 PR-C — code_lang_chunk_breakdown additive wire field #161

Merged
altair823 merged 2 commits from feat/code-lang-chunk-breakdown into main 2026-05-24 20:35:31 +00:00
4 changed files with 188 additions and 6 deletions

View File

@@ -63,14 +63,26 @@ pub struct Stats {
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
#[serde(default)]
pub stale_doc_count: u64,
/// p10-1A-1: code language breakdown (chunk counts by canonical lowercase
/// language identifier). Empty until 1A-2 produces code chunks.
/// p10-1A-1: code language breakdown (**doc** counts by canonical
/// lowercase language identifier). Empty until 1A-2 produces code
/// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
/// previous "chunk counts" wording was a longstanding mis-label —
/// implementation has always been `COUNT(*) FROM documents
/// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
/// chunk-level companion.
#[serde(default)]
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
/// p10-1A-1: repo breakdown (chunk counts by `metadata.repo` value).
/// Empty until 1A-2 produces code chunks.
/// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
/// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
/// doc-count wording corrected (mirror of code_lang_breakdown).
#[serde(default)]
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
/// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
/// chunk counts instead of doc counts. Indexing-pressure metric —
/// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
/// here in a way `code_lang_breakdown` (doc count) hides.
#[serde(default)]
pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
}
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
@@ -171,6 +183,9 @@ fn collect_stats(
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
// placeholder — mirror of code_lang_breakdown for the repo field.
repo_breakdown: store.repo_breakdown()?,
// v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
// 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
})
}
@@ -210,6 +225,11 @@ mod tests_stats_ext {
v.get("repo_breakdown").is_some(),
"Stats JSON must include repo_breakdown: {v}"
);
// v0.17.0 PR-C: chunk-level companion field.
assert!(
v.get("code_lang_chunk_breakdown").is_some(),
"Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
);
// Empty BTreeMap serializes as `{}` — confirm it's an object, not null.
assert!(
v["code_lang_breakdown"].is_object(),
@@ -219,6 +239,10 @@ mod tests_stats_ext {
v["repo_breakdown"].is_object(),
"repo_breakdown must be an object: {v}"
);
assert!(
v["code_lang_chunk_breakdown"].is_object(),
"code_lang_chunk_breakdown must be an object: {v}"
);
}
#[test]

View File

@@ -892,6 +892,45 @@ impl SqliteStore {
Ok(out)
}
/// v0.17.0 PR-C: per-code-language **chunk** count for
/// `schema.v1.stats`. Companion to [`Self::code_lang_breakdown`] —
/// that one returns *document* counts. Stats observers wanting
/// indexing-pressure granularity (a single PDF spec → 200 chunks,
/// vs a single Rust file → 5 chunks) need the chunk-level view.
///
/// SQL joins `chunks → documents`, reads
/// `metadata_json->'$.code_lang'` on the doc side, groups by the
/// language, and skips rows where `code_lang IS NULL`. Returns
/// `BTreeMap<String, u32>` mirroring the doc-count helper above
/// so callers can serialize both with the same shape.
pub fn code_lang_chunk_breakdown(
&self,
) -> anyhow::Result<std::collections::BTreeMap<String, u32>> {
use anyhow::Context;
let conn = self.read_conn();
let mut stmt = conn
.prepare(
"SELECT json_extract(d.metadata_json, '$.code_lang') AS cl, \
COUNT(c.chunk_id) \
FROM chunks c \
INNER JOIN documents d ON c.doc_id = d.doc_id \
WHERE cl IS NOT NULL \
GROUP BY cl",
)
.context("prepare code_lang_chunk_breakdown")?;
let rows = stmt
.query_map([], |r| {
Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)? as u32))
})
.context("query code_lang_chunk_breakdown")?;
let mut out = std::collections::BTreeMap::new();
for row in rows {
let (k, v) = row.context("read code_lang_chunk_breakdown row")?;
out.insert(k, v);
}
Ok(out)
}
/// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for
/// `schema.v1`.
///
@@ -1041,6 +1080,108 @@ mod tests {
assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
}
/// v0.17.0 PR-C: pins that `code_lang_chunk_breakdown` tallies *chunk*
/// rows per `documents.metadata_json.code_lang`, not documents. The
/// fixture is one Rust doc with three chunks plus one markdown doc
/// (`code_lang` = JSON null) with one chunk, so the expected result is
/// exactly `rust=3`, while the doc-count helper `code_lang_breakdown`
/// reports `rust=1` for the same data.
///
/// Rows are written through a separate rusqlite connection with
/// foreign-key enforcement switched off, so the doc + chunk fixture
/// can be inserted without standing up `assets` companions.
#[test]
fn code_lang_chunk_breakdown_counts_chunks_not_docs() {
    let (dir, store) = open_fresh_store();
    let sqlite_file = dir.path().join("kebab.sqlite");

    // Fixture statements, named up front so the insertion sequence below
    // reads as a short script.
    let insert_rust_doc = "INSERT INTO documents (
doc_id, asset_id, workspace_path,
source_type, trust_level, parser_version,
doc_version, schema_version,
metadata_json, provenance_json,
created_at, updated_at
) VALUES (
'doc-rust-1', 'asset-1', 'src/main.rs',
'reference', 'primary', 'test-v1',
1, 1,
'{\"code_lang\":\"rust\"}', '{}',
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
)";
    let insert_rust_chunk = "INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, 'doc-rust-1', ?, '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')";
    let insert_md_doc = "INSERT INTO documents (
doc_id, asset_id, workspace_path,
source_type, trust_level, parser_version,
doc_version, schema_version,
metadata_json, provenance_json,
created_at, updated_at
) VALUES (
'doc-md-1', 'asset-2', 'notes/readme.md',
'markdown', 'primary', 'test-v1',
1, 1,
'{\"code_lang\":null}', '{}',
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
)";
    let insert_md_chunk = "INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES ('md-chunk-00000000000000000000000', 'doc-md-1', 'm', '[]', NULL, '[]', 0, 'cv1', 'h', '[]', '2024-01-01T00:00:00Z')";

    {
        // The side connection lives only for this scope; it is closed
        // before the store is queried.
        let side = rusqlite::Connection::open(&sqlite_file).unwrap();
        side.pragma_update(None, "foreign_keys", "OFF").unwrap();

        // 1 Rust doc + 3 chunks → chunk breakdown rust=3 / doc breakdown rust=1.
        side.execute(insert_rust_doc, []).unwrap();
        for i in 0..3u32 {
            let chunk_id = format!("rust-chunk-{i:0>26}");
            let text = format!("body {i}");
            side.execute(insert_rust_chunk, rusqlite::params![chunk_id, text])
                .unwrap();
        }

        // 1 markdown doc + 1 chunk → code_lang is null → must be skipped.
        side.execute(insert_md_doc, []).unwrap();
        side.execute(insert_md_chunk, []).unwrap();
    }

    let chunk_bd = store.code_lang_chunk_breakdown().unwrap();
    assert_eq!(
        chunk_bd.get("rust").copied(),
        Some(3u32),
        "expected rust=3 chunks (1 doc × 3 chunks): {chunk_bd:?}"
    );
    assert!(
        !chunk_bd.contains_key("null"),
        "null code_lang must be skipped: {chunk_bd:?}"
    );
    assert_eq!(
        chunk_bd.len(),
        1,
        "expected exactly 1 language entry: {chunk_bd:?}"
    );

    // Cross-check against the doc-count helper: same fixture, rust=1,
    // which shows the two metrics really do diverge.
    let doc_bd = store.code_lang_breakdown().unwrap();
    assert_eq!(
        doc_bd.get("rust").copied(),
        Some(1u32),
        "doc-count helper unchanged: {doc_bd:?}"
    );
}
/// p10-1A-2 follow-up: `repo_breakdown` counts docs by
/// `metadata_json.repo`.
///

View File

@@ -81,12 +81,17 @@
},
"code_lang_breakdown": {
"type": "object",
"description": "p10-1A-1: per-language code chunk count. Key = lowercase language name (e.g. 'rust', 'python'). Populated after 1A-2 lands; empty on markdown-only corpora.",
"description": "p10-1A-1: per-language **doc** count (each indexed code document counts once toward its language). Key = lowercase language name (e.g. 'rust', 'python'). Empty on markdown-only corpora. Pair with `code_lang_chunk_breakdown` for chunk-level granularity (a file split into 200 chunks still counts as one doc here).",
"additionalProperties": { "type": "integer", "minimum": 0 }
},
"repo_breakdown": {
"type": "object",
"description": "p10-1A-1: per-repo code chunk count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.",
"description": "p10-1A-1: per-repo **doc** count. Key = repo name as detected by kebab-parse-code::repo. Empty on markdown-only corpora.",
"additionalProperties": { "type": "integer", "minimum": 0 }
},
"code_lang_chunk_breakdown": {
"type": "object",
"description": "v0.17.0 PR-C: per-language **chunk** count (closes HOTFIXES 2026-05-22 'code_lang_breakdown chunk granularity'). Companion to `code_lang_breakdown` (doc count) — chunk-level granularity is the indexing-pressure metric (a 200-chunk PDF + a 5-chunk Rust file both appear as `1 doc` but `200` vs `5` chunks). Key = lowercase language name. Empty on markdown-only corpora.",
"additionalProperties": { "type": "integer", "minimum": 0 }
}
}

View File

@@ -78,6 +78,18 @@ Cross-link: `tasks/p10/INDEX.md`, `migrations/V002__fts.sql`, design §5.5 / §3
Cross-link: `crates/kebab-parse-code/src/c.rs::recover_typedef_alias`, `tasks/p10/p10-1d-c-cpp-ast-chunker.md` Risks/notes section.
## 2026-05-24 — v0.17.0 PR-C: `code_lang_chunk_breakdown` additive wire 필드 (closure of 2026-05-22 LOW)
`schema.v1.stats` 에 `code_lang_chunk_breakdown: { <lang>: <chunk_count> }` additive 필드 추가. 기존 `code_lang_breakdown` (doc 수) 와 sister — chunk 수 집계로 indexing 압력 granularity 노출. 한 PDF spec → 200 chunks vs 한 Rust file → 5 chunks 가 동일한 `1 doc` 으로 보이던 한계 closure.
**구현**: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown()` — `chunks INNER JOIN documents` 후 `json_extract(d.metadata_json, '$.code_lang')` GROUP BY, `COUNT(c.chunk_id)`. `BTreeMap<String, u32>` 반환 (기존 helper 와 동일 shape). `crates/kebab-app/src/schema.rs::Stats` 에 동일 이름 필드 추가 + `collect_stats` builder 에서 호출. `docs/wire-schema/v1/schema.schema.json` 에 additive 필드 명세. **additive 변경 — wire breaking 아님, `schema_version` bump 불필요.**
**Gemini round 2 권고 반영**: 기존 `code_lang_breakdown` / `repo_breakdown` 의 JSON schema description 이 "code chunk count" 로 잘못 적혀 있던 (실제는 doc count) 부분을 "doc count" 로 정정. 신규 필드만 "chunk count" 로 명시. 사용자가 두 metric 의 의미 차이를 schema 만 보고도 구분 가능.
**사용자 영향**: `kebab schema --json` 출력에 신규 키 등장. MCP `schema` tool 도 동일. 옛 v0.16.x 가 보낸 호출은 그대로 동작 (additive).
Cross-link: `crates/kebab-store-sqlite/src/store.rs::code_lang_chunk_breakdown`, `docs/wire-schema/v1/schema.schema.json`.
## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision
**Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패.