fix(dogfood): k8s multi-resource YAML chunk_id collision (#158)

This commit was merged in pull request #158.
This commit is contained in:
2026-05-21 23:57:49 +00:00
10 changed files with 121 additions and 25 deletions

47
Cargo.lock generated
View File

@@ -4127,7 +4127,7 @@ dependencies = [
[[package]]
name = "kebab-app"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -4172,12 +4172,13 @@ dependencies = [
[[package]]
name = "kebab-chunk"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
"kebab-core",
"kebab-normalize",
"kebab-parse-code",
"kebab-parse-md",
"serde_json",
"serde_json_canonicalizer",
@@ -4188,7 +4189,7 @@ dependencies = [
[[package]]
name = "kebab-cli"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"clap",
@@ -4209,7 +4210,7 @@ dependencies = [
[[package]]
name = "kebab-config"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"dirs 5.0.1",
@@ -4224,7 +4225,7 @@ dependencies = [
[[package]]
name = "kebab-core"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4238,7 +4239,7 @@ dependencies = [
[[package]]
name = "kebab-embed"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4252,7 +4253,7 @@ dependencies = [
[[package]]
name = "kebab-embed-local"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"fastembed",
@@ -4265,7 +4266,7 @@ dependencies = [
[[package]]
name = "kebab-eval"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-app",
@@ -4284,7 +4285,7 @@ dependencies = [
[[package]]
name = "kebab-llm"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-core",
@@ -4293,7 +4294,7 @@ dependencies = [
[[package]]
name = "kebab-llm-local"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-config",
@@ -4310,7 +4311,7 @@ dependencies = [
[[package]]
name = "kebab-mcp"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-app",
@@ -4328,7 +4329,7 @@ dependencies = [
[[package]]
name = "kebab-normalize"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-core",
@@ -4343,7 +4344,7 @@ dependencies = [
[[package]]
name = "kebab-parse-code"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"gix",
@@ -4366,7 +4367,7 @@ dependencies = [
[[package]]
name = "kebab-parse-image"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"ab_glyph",
"anyhow",
@@ -4390,7 +4391,7 @@ dependencies = [
[[package]]
name = "kebab-parse-md"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"kebab-core",
@@ -4407,7 +4408,7 @@ dependencies = [
[[package]]
name = "kebab-parse-pdf"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4420,7 +4421,7 @@ dependencies = [
[[package]]
name = "kebab-parse-types"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"kebab-core",
"serde",
@@ -4428,7 +4429,7 @@ dependencies = [
[[package]]
name = "kebab-rag"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4449,7 +4450,7 @@ dependencies = [
[[package]]
name = "kebab-search"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"globset",
@@ -4468,7 +4469,7 @@ dependencies = [
[[package]]
name = "kebab-source-fs"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4487,7 +4488,7 @@ dependencies = [
[[package]]
name = "kebab-store-sqlite"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"blake3",
@@ -4508,7 +4509,7 @@ dependencies = [
[[package]]
name = "kebab-store-vector"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"arrow",
@@ -4532,7 +4533,7 @@ dependencies = [
[[package]]
name = "kebab-tui"
version = "0.16.0"
version = "0.16.1"
dependencies = [
"anyhow",
"crossterm",

View File

@@ -31,7 +31,7 @@ edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
version = "0.16.0"
version = "0.16.1"
[workspace.dependencies]
anyhow = "1"

View File

@@ -1286,6 +1286,64 @@ fn tier1_cpp_ingest_searchable() {
);
}
/// Regression guard for the P10 dogfood failure: one YAML file holding two
/// k8s documents (a Deployment and a Service separated by `---`) has to
/// ingest cleanly instead of tripping a UNIQUE constraint.
///
/// Prior to the fix, `push_chunks_with_oversize` used `split_key = None` for
/// every resource, so all resource chunks shared one `id_hash` and therefore
/// one `chunk_id`; SQLite rejected the second resource.
#[test]
fn tier2_k8s_multi_resource_yaml_ingests_without_collision() {
    let env = TestEnv::lexical_only();

    // Fixture: a two-document manifest under `<workspace>/k8s/`.
    let manifest_dir = env.workspace_root.join("k8s");
    std::fs::create_dir_all(&manifest_dir).unwrap();
    let manifest_path = manifest_dir.join("k8s-multi.yaml");
    std::fs::write(
        manifest_path,
        "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: api\n namespace: prod\nspec:\n replicas: 2\n---\napiVersion: v1\nkind: Service\nmetadata:\n name: api\n namespace: prod\nspec:\n selector:\n app: api\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");

    // With the bug present, this entry carried an error mentioning the
    // UNIQUE constraint.
    let items = report.items.as_ref().expect("items present");
    let item = items
        .iter()
        .find(|entry| entry.doc_path.0.ends_with("k8s-multi.yaml"))
        .expect("k8s-multi.yaml in report");

    let ingested_cleanly = item.error.is_none();
    assert!(
        ingested_cleanly,
        "multi-resource k8s yaml must ingest without error, got: {:?}",
        item.error
    );

    let is_new = matches!(item.kind, IngestItemKind::New);
    assert!(is_new, "expected New, got {:?}", item.kind);

    // Searching for the shared resource name must surface both resources
    // (at least two hits: Deployment/prod/api and Service/prod/api).
    let yaml_only = kebab_core::SearchFilters {
        code_lang: vec![String::from("yaml")],
        ..Default::default()
    };
    let query = kebab_core::SearchQuery {
        text: String::from("api"),
        mode: kebab_core::SearchMode::Lexical,
        k: 10,
        filters: yaml_only,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), query)
        .expect("search must succeed");

    let hit_count = hits.len();
    assert!(
        hit_count >= 2,
        "expected ≥2 hits (Deployment + Service), got {}",
        hit_count
    );
}
/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback)
/// must also report Unchanged on re-ingest. Shell goes straight to
/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false

View File

@@ -43,6 +43,7 @@ impl Chunker for DockerfileFileV1Chunker {
"<dockerfile>",
"dockerfile",
VERSION_LABEL,
None,
)?;
tracing::debug!(

View File

@@ -85,6 +85,7 @@ impl Chunker for K8sManifestResourceV1Chunker {
&symbol,
"yaml",
VERSION_LABEL,
Some(slice.line_start),
)?;
}

View File

@@ -44,6 +44,7 @@ impl Chunker for ManifestFileV1Chunker {
"<manifest>",
lang,
VERSION_LABEL,
None,
)?;
tracing::debug!(

View File

@@ -25,6 +25,13 @@ pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String {
/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting
/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize.
/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl.
///
/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk
/// case. Callers that emit multiple chunks from the same document (e.g.
/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass
/// `Some(line_start)` so that each call produces a distinct `chunk_id`.
/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to
/// keep chunk_ids stable (no sibling can collide when there's only one chunk).
#[allow(clippy::too_many_arguments)]
pub(crate) fn push_chunks_with_oversize(
out: &mut Vec<Chunk>,
@@ -36,6 +43,7 @@ pub(crate) fn push_chunks_with_oversize(
symbol: &str,
lang: &str,
chunker_version: &str,
base_split_key: Option<u32>,
) -> Result<()> {
let n_lines = (line_end - line_start + 1).max(1);
let cv = ChunkerVersion(chunker_version.to_string());
@@ -51,7 +59,7 @@ pub(crate) fn push_chunks_with_oversize(
line_end,
symbol,
lang,
None,
base_split_key,
));
return Ok(());
}

View File

@@ -140,6 +140,17 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() {
for chunk in &chunks {
assert_eq!(chunk.chunker_version.0, "k8s-manifest-resource-v1");
}
// Every chunk from a multi-resource file must have a distinct chunk_id.
// Without the fix, all non-oversize resources get split_key=None which
// collapses to the same id_hash (= base_policy_hash) → UNIQUE constraint
// violation on the second resource.
let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect();
assert_eq!(
ids.len(),
chunks.len(),
"every k8s resource chunk must have a distinct chunk_id (multi-resource collision regression)"
);
}
/// A YAML document with an indentation error (tab in a space-indented context)

View File

@@ -14,6 +14,20 @@ historical contract that was implemented; this file accumulates the
deltas so phase 5+ readers can find the live behavior without diffing
git history.
## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision
**Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패.
**Symptom**: `DocumentStore::put_chunks (code): UNIQUE constraint failed: chunks.chunk_id`. document row 는 생성되나 chunk 0개 → 검색 불가. p10-2 의 통합 테스트 `tier2_k8s_yaml_ingest_searchable` 가 single-Deployment fixture 만 써서 미발견.
**원인**: `tier2_shared::push_chunks_with_oversize` 의 non-oversize 분기가 `split_key = None` 하드코딩. `K8sManifestResourceV1Chunker` 가 resource 마다 호출 — 같은 document 의 모든 resource 가 `doc_id` + `chunker_version` + `base_policy_hash` 공유 + `split_key = None` → 동일 `id_hash` → 동일 `chunk_id`. p10-3 의 `code_text_paragraph_v1` 가 같은 버그였고 `df3c5b8` 에서 fix 됐지만 그건 `build_chunk_no_symbol` 직접 호출 경로, `push_chunks_with_oversize` 경로는 미수정.
**Fix** (PR #158, v0.16.1): `push_chunks_with_oversize` 에 `base_split_key: Option<u32>` 파라미터 추가. k8s chunker 가 `Some(resource.line_start)` 전달 → resource 별 distinct chunk_id. dockerfile / manifest 는 `None` (파일당 1 chunk, 충돌 없음, chunk_id 불변).
**Deviation note**: single-resource k8s YAML 의 chunk_id 도 `None → Some(1)` 으로 바뀜 (`id_hash`: `base_policy_hash` → `base_policy_hash#L1`). `chunker_version` (`k8s-manifest-resource-v1`) 은 의도적으로 bump 안 함 — p10-2 가 v0.14.0 (~1주 전) 머지된 dogfood 단계라 prod KB 없음. v0.14.0~v0.16.0 사이 single-resource k8s 를 색인한 KB 는 re-ingest 시 old chunk 가 orphan 될 수 있으나 (UNIQUE 충돌 아님 — 다른 id), `kebab reset` 또는 re-ingest sweep 으로 정리됨. dogfood-only 단계라 chunker_version bump (전체 re-process) 보다 가벼운 선택.
Cross-link: `tasks/p10/p10-2-tier2-resource-aware.md` Risks/notes section.
## 2026-05-21 — p10-1D: typedef-wrapped struct/enum in C falls into glue
**Origin**: PR #156 (p10-1d) code-reviewer review. Verified during dogfood.

View File

@@ -118,3 +118,4 @@ _ → skip (p10-3 fallback 의 자리)
- **`pom.xml` aggregate parent POM** — 매우 큼 (수백~수천 줄). oversize fallback 으로 split. 거대 fixture 로 한 번 검증.
- **`media.rs` 정리** — 1A-1 부터 누적된 inline `match extension` duplication 을 `code_lang_for_path` 호출로 교체. 기존 단위 테스트 동작 보존 (테스트는 결과 값만 보므로 통과해야 함).
- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` 에 one-line cross-link.
- **[HOTFIXES 2026-05-21]** multi-resource k8s YAML (2+ document) 이 `chunk_id` 충돌로 ingest 실패 — `push_chunks_with_oversize` 의 non-oversize 분기가 `split_key = None` 하드코딩. PR #158 (v0.16.1) 에서 `base_split_key` 파라미터로 fix. See `tasks/HOTFIXES.md` 2026-05-21 entry.