diff --git a/Cargo.lock b/Cargo.lock index 73f9b16..0c678c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,12 +4172,13 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", "kebab-core", "kebab-normalize", + "kebab-parse-code", "kebab-parse-md", "serde_json", "serde_json_canonicalizer", @@ -4188,7 +4189,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "clap", @@ -4209,7 +4210,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4224,7 +4225,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4238,7 +4239,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4252,7 +4253,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "fastembed", @@ -4265,7 +4266,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-app", @@ -4284,7 +4285,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-core", @@ -4293,7 +4294,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-config", @@ -4310,7 +4311,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-app", @@ -4328,7 +4329,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-core", @@ -4343,7 +4344,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "gix", @@ -4366,7 +4367,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.16.0" +version = "0.16.1" dependencies = [ "ab_glyph", "anyhow", @@ -4390,7 +4391,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "kebab-core", @@ -4407,7 +4408,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4420,7 +4421,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.16.0" +version = "0.16.1" dependencies = [ "kebab-core", "serde", @@ -4428,7 +4429,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4449,7 +4450,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "globset", @@ -4468,7 +4469,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4487,7 +4488,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "blake3", @@ -4508,7 +4509,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "arrow", @@ -4532,7 +4533,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.16.0" +version = "0.16.1" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index 3a5c1d5..3832a57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.16.0" +version = "0.16.1" [workspace.dependencies] anyhow = "1" diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index e5f2338..793ece1 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1286,6 +1286,64 @@ fn tier1_cpp_ingest_searchable() { ); } +/// P10 dogfood regression: a k8s YAML with 2 documents (Deployment + Service +/// separated by `---`) must ingest without a UNIQUE constraint violation. +/// Before the fix, push_chunks_with_oversize emitted split_key=None for each +/// resource, giving every resource chunk the same id_hash → identical chunk_id +/// → SQLite UNIQUE constraint failure on the second resource. +#[test] +fn tier2_k8s_multi_resource_yaml_ingests_without_collision() { + let env = TestEnv::lexical_only(); + + let k8s_dir = env.workspace_root.join("k8s"); + std::fs::create_dir_all(&k8s_dir).unwrap(); + std::fs::write( + k8s_dir.join("k8s-multi.yaml"), + "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: api\n namespace: prod\nspec:\n replicas: 2\n---\napiVersion: v1\nkind: Service\nmetadata:\n name: api\n namespace: prod\nspec:\n selector:\n app: api\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + + // The bug: this would land in report with an error + UNIQUE constraint message. + let item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("k8s-multi.yaml")) + .expect("k8s-multi.yaml in report"); + assert!( + item.error.is_none(), + "multi-resource k8s yaml must ingest without error, got: {:?}", + item.error + ); + assert!( + matches!(item.kind, IngestItemKind::New), + "expected New, got {:?}", + item.kind + ); + + // Both resources must be searchable (≥2 hits: Deployment/prod/api + Service/prod/api). + let query = kebab_core::SearchQuery { + text: "api".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["yaml".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + assert!( + hits.len() >= 2, + "expected ≥2 hits (Deployment + Service), got {}", + hits.len() + ); +} + /// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) /// must also report Unchanged on re-ingest. Shell goes straight to /// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false diff --git a/crates/kebab-chunk/src/dockerfile_file_v1.rs b/crates/kebab-chunk/src/dockerfile_file_v1.rs index 519d1ae..230d86c 100644 --- a/crates/kebab-chunk/src/dockerfile_file_v1.rs +++ b/crates/kebab-chunk/src/dockerfile_file_v1.rs @@ -43,6 +43,7 @@ impl Chunker for DockerfileFileV1Chunker { "", "dockerfile", VERSION_LABEL, + None, )?; tracing::debug!( diff --git a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs index 71a4104..5e2b384 100644 --- a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs +++ b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs @@ -85,6 +85,7 @@ impl Chunker for K8sManifestResourceV1Chunker { &symbol, "yaml", VERSION_LABEL, + Some(slice.line_start), )?; } diff --git a/crates/kebab-chunk/src/manifest_file_v1.rs b/crates/kebab-chunk/src/manifest_file_v1.rs index 1e859e0..9753fdc 100644 --- a/crates/kebab-chunk/src/manifest_file_v1.rs +++ b/crates/kebab-chunk/src/manifest_file_v1.rs @@ -44,6 +44,7 @@ impl Chunker for ManifestFileV1Chunker { "", lang, VERSION_LABEL, + None, )?; tracing::debug!( diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs index c80b863..7d2bf5c 100644 --- a/crates/kebab-chunk/src/tier2_shared.rs +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -25,6 +25,13 @@ pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String { /// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting /// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize. /// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl. +/// +/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk +/// case. Callers that emit multiple chunks from the same document (e.g. +/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass +/// `Some(line_start)` so that each call produces a distinct `chunk_id`. +/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to +/// keep chunk_ids stable (no sibling can collide when there's only one chunk). #[allow(clippy::too_many_arguments)] pub(crate) fn push_chunks_with_oversize( out: &mut Vec, @@ -36,6 +43,7 @@ pub(crate) fn push_chunks_with_oversize( symbol: &str, lang: &str, chunker_version: &str, + base_split_key: Option, ) -> Result<()> { let n_lines = (line_end - line_start + 1).max(1); let cv = ChunkerVersion(chunker_version.to_string()); @@ -51,7 +59,7 @@ pub(crate) fn push_chunks_with_oversize( line_end, symbol, lang, - None, + base_split_key, )); return Ok(()); } diff --git a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs index d3234d7..7c3e216 100644 --- a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs +++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs @@ -140,6 +140,17 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() { for chunk in &chunks { assert_eq!(chunk.chunker_version.0, "k8s-manifest-resource-v1"); } + + // Every chunk from a multi-resource file must have a distinct chunk_id. + // Without the fix, all non-oversize resources get split_key=None which + // collapses to the same id_hash (= base_policy_hash) → UNIQUE constraint + // violation on the second resource. + let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect(); + assert_eq!( + ids.len(), + chunks.len(), + "every k8s resource chunk must have a distinct chunk_id (multi-resource collision regression)" + ); } /// A YAML document with an indentation error (tab in a space-indented context) diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index b6803e7..4155ea4 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,20 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision + +**Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패. + +**Symptom**: `DocumentStore::put_chunks (code): UNIQUE constraint failed: chunks.chunk_id`. document row 는 생성되나 chunk 0개 → 검색 불가. p10-2 의 통합 테스트 `tier2_k8s_yaml_ingest_searchable` 가 single-Deployment fixture 만 써서 미발견. + +**원인**: `tier2_shared::push_chunks_with_oversize` 의 non-oversize 분기가 `split_key = None` 하드코딩. `K8sManifestResourceV1Chunker` 가 resource 마다 호출 — 같은 document 의 모든 resource 가 `doc_id` + `chunker_version` + `base_policy_hash` 공유 + `split_key = None` → 동일 `id_hash` → 동일 `chunk_id`. p10-3 의 `code_text_paragraph_v1` 가 같은 버그였고 `df3c5b8` 에서 fix 됐지만 그건 `build_chunk_no_symbol` 직접 호출 경로, `push_chunks_with_oversize` 경로는 미수정. + +**Fix** (PR #158, v0.16.1): `push_chunks_with_oversize` 에 `base_split_key: Option` 추가. k8s chunker 가 `Some(resource.line_start)` 전달 → resource 별 distinct chunk_id. dockerfile / manifest 는 `None` (파일당 1 chunk, 충돌 없음, chunk_id 불변). + +**Deviation note**: single-resource k8s YAML 의 chunk_id 도 `None → Some(1)` 으로 바뀜 (`id_hash` 가 `base_policy_hash` → `base_policy_hash#L1`). `chunker_version` (`k8s-manifest-resource-v1`) 은 의도적으로 bump 안 함 — p10-2 가 v0.14.0 (~1주 전) 머지된 dogfood 단계라 prod KB 없음. v0.14.0~v0.16.0 사이 single-resource k8s 를 색인한 KB 는 re-ingest 시 old chunk 가 orphan 될 수 있으나 (UNIQUE 충돌 아님 — 다른 id), `kebab reset` 또는 re-ingest sweep 으로 정리됨. dogfood-only 단계라 chunker_version bump (전체 re-process) 보다 가벼운 선택. + +Cross-link: `tasks/p10/p10-2-tier2-resource-aware.md` Risks/notes section. + ## 2026-05-21 — p10-1D: typedef-wrapped struct/enum in C falls into glue **Origin**: PR #156 (p10-1d) code-reviewer review. Verified during dogfood. diff --git a/tasks/p10/p10-2-tier2-resource-aware.md b/tasks/p10/p10-2-tier2-resource-aware.md index 8ae66ad..deb43ec 100644 --- a/tasks/p10/p10-2-tier2-resource-aware.md +++ b/tasks/p10/p10-2-tier2-resource-aware.md @@ -118,3 +118,4 @@ _ → skip (p10-3 fallback 의 자리) - **`pom.xml` aggregate parent POM** — 매우 큼 (수백~수천 줄). oversize fallback 으로 split. 거대 fixture 로 한 번 검증. - **`media.rs` 정리** — 1A-1 부터 누적된 inline `match extension` duplication 을 `code_lang_for_path` 호출로 교체. 기존 단위 테스트 동작 보존 (테스트는 결과 값만 보므로 통과해야 함). - **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` 에 one-line cross-link. +- **[HOTFIXES 2026-05-21]** multi-resource k8s YAML (2+ document) 이 `chunk_id` 충돌로 ingest 실패 — `push_chunks_with_oversize` 의 non-oversize 분기가 `split_key = None` 하드코딩. PR #158 (v0.16.1) 에서 `base_split_key` 파라미터로 fix. See `tasks/HOTFIXES.md` 2026-05-21 entry.