fix(dogfood): k8s multi-resource YAML chunk_id collision

P10 dogfooding found that a k8s manifest with 2+ documents (e.g.
Deployment + Service in one file) fails to ingest:
  UNIQUE constraint failed: chunks.chunk_id

Root cause: tier2_shared::push_chunks_with_oversize's non-oversize branch
hardcoded split_key = None. K8sManifestResourceV1Chunker calls it once per
resource; with split_key None every resource from the same document gets
the same id_hash (= base_policy_hash) → identical chunk_id. p10-3's
code_text_paragraph_v1 had the same bug (fixed in df3c5b8) but it calls
build_chunk_no_symbol directly — the push_chunks_with_oversize path was
never fixed.

Fix: push_chunks_with_oversize gains a base_split_key parameter for the
non-oversize single-chunk case. k8s chunker passes Some(resource.line_start)
so each resource gets a distinct chunk_id; dockerfile / manifest pass None
(1 chunk per file — no sibling collision, chunk_id stays stable).

Regression coverage: k8s_multi_doc_emits_one_chunk_per_resource now asserts
chunk_id distinctness; new integration test
tier2_k8s_multi_resource_yaml_ingests_without_collision ingests a real
2-document YAML end-to-end.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-21 23:49:37 +00:00
parent c6207d196e
commit 1969c8e3b5
6 changed files with 81 additions and 1 deletions

View File

@@ -1286,6 +1286,64 @@ fn tier1_cpp_ingest_searchable() {
);
}
/// Regression test for the P10 dogfood failure on multi-document k8s YAML.
///
/// A single manifest holding a Deployment and a Service (separated by `---`)
/// has to ingest cleanly. Prior to the fix, `push_chunks_with_oversize` used
/// `split_key = None` for every resource, so all resource chunks shared one
/// `id_hash` and therefore one `chunk_id`; storing the second resource then
/// tripped SQLite's UNIQUE constraint on `chunks.chunk_id`.
#[test]
fn tier2_k8s_multi_resource_yaml_ingests_without_collision() {
    let env = TestEnv::lexical_only();

    // Write a two-document manifest to <workspace_root>/k8s/k8s-multi.yaml.
    let manifest_dir = env.workspace_root.join("k8s");
    std::fs::create_dir_all(&manifest_dir).unwrap();
    let manifest_path = manifest_dir.join("k8s-multi.yaml");
    let manifest_yaml = "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: api\n namespace: prod\nspec:\n replicas: 2\n---\napiVersion: v1\nkind: Service\nmetadata:\n name: api\n namespace: prod\nspec:\n selector:\n app: api\n";
    std::fs::write(manifest_path, manifest_yaml).unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");

    // With the bug present, the report entry for this file carried an error
    // whose message mentioned the UNIQUE constraint.
    let items = report.items.as_ref().expect("items present");
    let entry = items
        .iter()
        .find(|candidate| candidate.doc_path.0.ends_with("k8s-multi.yaml"))
        .expect("k8s-multi.yaml in report");

    assert!(
        entry.error.is_none(),
        "multi-resource k8s yaml must ingest without error, got: {:?}",
        entry.error
    );
    let ingested_as_new = matches!(entry.kind, IngestItemKind::New);
    assert!(ingested_as_new, "expected New, got {:?}", entry.kind);

    // Both resources must be reachable through lexical search, i.e. at least
    // two hits: Deployment/prod/api and Service/prod/api.
    let yaml_only = kebab_core::SearchFilters {
        code_lang: vec![String::from("yaml")],
        ..Default::default()
    };
    let query = kebab_core::SearchQuery {
        text: String::from("api"),
        mode: kebab_core::SearchMode::Lexical,
        k: 10,
        filters: yaml_only,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), query)
        .expect("search must succeed");

    let hit_count = hits.len();
    assert!(
        hit_count >= 2,
        "expected ≥2 hits (Deployment + Service), got {}",
        hit_count
    );
}
/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback)
/// must also report Unchanged on re-ingest. Shell goes straight to
/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false

View File

@@ -43,6 +43,7 @@ impl Chunker for DockerfileFileV1Chunker {
"<dockerfile>",
"dockerfile",
VERSION_LABEL,
None,
)?;
tracing::debug!(

View File

@@ -85,6 +85,7 @@ impl Chunker for K8sManifestResourceV1Chunker {
&symbol,
"yaml",
VERSION_LABEL,
Some(slice.line_start),
)?;
}

View File

@@ -44,6 +44,7 @@ impl Chunker for ManifestFileV1Chunker {
"<manifest>",
lang,
VERSION_LABEL,
None,
)?;
tracing::debug!(

View File

@@ -25,6 +25,13 @@ pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String {
/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting
/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize.
/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl.
///
/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk
/// case. Callers that emit multiple chunks from the same document (e.g.
/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass
/// `Some(line_start)` so that each call produces a distinct `chunk_id`.
/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to
/// keep chunk_ids stable (no sibling can collide when there's only one chunk).
#[allow(clippy::too_many_arguments)]
pub(crate) fn push_chunks_with_oversize(
out: &mut Vec<Chunk>,
@@ -36,6 +43,7 @@ pub(crate) fn push_chunks_with_oversize(
symbol: &str,
lang: &str,
chunker_version: &str,
base_split_key: Option<u32>,
) -> Result<()> {
let n_lines = (line_end - line_start + 1).max(1);
let cv = ChunkerVersion(chunker_version.to_string());
@@ -51,7 +59,7 @@ pub(crate) fn push_chunks_with_oversize(
line_end,
symbol,
lang,
None,
base_split_key,
));
return Ok(());
}

View File

@@ -140,6 +140,17 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() {
for chunk in &chunks {
assert_eq!(chunk.chunker_version.0, "k8s-manifest-resource-v1");
}
// Every chunk from a multi-resource file must have a distinct chunk_id.
// Without the fix, all non-oversize resources get split_key=None which
// collapses to the same id_hash (= base_policy_hash) → UNIQUE constraint
// violation on the second resource.
let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect();
assert_eq!(
ids.len(),
chunks.len(),
"every k8s resource chunk must have a distinct chunk_id (multi-resource collision regression)"
);
}
/// A YAML document with an indentation error (tab in a space-indented context)