From df3c5b8caf4bf17ad293a19be17cbe0599c46d98 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 11:37:44 +0000 Subject: [PATCH] test(p10-3): integration smoke tests for Tier 3 (shell + yaml fallback) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new tests verify end-to-end Tier 3 wiring: - tier3_shell_ingest_searchable: .sh file → --code-lang shell search → Citation::Code { symbol: None, lang: "shell" }, chunker_version "code-text-paragraph-v1". - tier3_yaml_fallback_picks_up_non_k8s_yaml: docker-compose-shaped yaml (no apiVersion/kind) triggers k8s chunker's Ok(vec![]) result, fallback retries with Tier 3 → Citation::Code { symbol: None, lang: "yaml" } and chunker_version "code-text-paragraph-v1". Also fixes a bug in CodeTextParagraphV1Chunker (Task B): short paragraphs (≤80 lines) were emitted with split_key=None, causing all paragraphs from the same document to share the same chunk_id (UNIQUE constraint violation at put_chunks). Fix: always use para.line_start as split_key so every paragraph gets a distinct id regardless of size. Brings code_ingest_smoke to 14 tests (Tier 1: 9, Tier 2: 3, Tier 3: 2). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/tests/code_ingest_smoke.rs | 175 ++++++++++++++++++ .../kebab-chunk/src/code_text_paragraph_v1.rs | 7 +- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 69ac528..a462666 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -850,6 +850,181 @@ fn tier2_cargo_toml_ingest_searchable() { ); } +/// p10-3 Task E: a `.sh` file is ingested via the shell direct-Tier-3 path +/// and the resulting `Citation::Code` hit must carry `lang="shell"`, +/// `symbol=None`, `line_start >= 1`, and +/// `chunker_version = "code-text-paragraph-v1"`. +#[test] +fn tier3_shell_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n\nkebab ingest --json\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "shell file ingested: {report:?}"); + + let sh_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh item present"); + assert_eq!( + sh_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1 for shell (Tier 3 direct)" + ); + assert_eq!( + sh_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "chunker_version must be code-text-paragraph-v1 for shell" + ); + + let query = kebab_core::SearchQuery { + text: "kebab".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["shell".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'kebab'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("shell"), + "citation.lang must be 'shell'" + ); + assert_eq!(*symbol, None, "Tier 3 symbol must be None"); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("shell"), + "SearchHit.code_lang must be 'shell'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-text-paragraph-v1", + "shell chunks must be stamped with the Tier 3 chunker_version" + ); +} + +/// p10-3 Task E: a docker-compose-shaped YAML file (no `apiVersion`/`kind`) +/// is ingested; the k8s chunker returns `Ok(vec![])` and the Tier 3 fallback +/// wrapper retries with `CodeTextParagraphV1Chunker`. The resulting +/// `Citation::Code` hit must carry `lang="yaml"`, `symbol=None`, +/// `line_start >= 1`, and `chunker_version = "code-text-paragraph-v1"`. +#[test] +fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { + let env = TestEnv::lexical_only(); + + // docker-compose-shaped YAML — version + services but no apiVersion/kind. + // The k8s chunker returns Ok(vec![]); Tier 3 fallback should pick this up. + std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n ports:\n - 8080:80\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!( + report.new >= 1, + "expected non-k8s yaml ingested via Tier 3, got {} new docs", + report.new + ); + + let yaml_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml item present"); + assert_eq!( + yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1 after Tier 3 fallback" + ); + assert_eq!( + yaml_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "chunker_version must be code-text-paragraph-v1 after Tier 3 fallback" + ); + + let query = kebab_core::SearchQuery { + text: "nginx".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["yaml".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'nginx'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("yaml"), + "citation.lang must be 'yaml'" + ); + assert_eq!(*symbol, None, "Tier 3 fallback symbol must be None"); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("yaml"), + "SearchHit.code_lang must be 'yaml'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-text-paragraph-v1", + "non-k8s yaml fallback must be stamped code-text-paragraph-v1" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test] diff --git a/crates/kebab-chunk/src/code_text_paragraph_v1.rs b/crates/kebab-chunk/src/code_text_paragraph_v1.rs index aa41b52..cda5b99 100644 --- a/crates/kebab-chunk/src/code_text_paragraph_v1.rs +++ b/crates/kebab-chunk/src/code_text_paragraph_v1.rs @@ -124,7 +124,10 @@ fn push_paragraph( let n_lines = (para.line_end - para.line_start + 1) as usize; if n_lines <= FALLBACK_LINES_PER_CHUNK { - // Single chunk — no split_key needed. + // Use line_start as split_key so each paragraph gets a distinct + // chunk_id even when block_ids is empty (no symbol, no AST structure). + // Without this, all short paragraphs from the same doc share the same + // base_policy_hash and therefore the same id_for_chunk result. out.push(build_chunk_no_symbol( doc, policy, @@ -133,7 +136,7 @@ fn push_paragraph( para.line_end, lang, VERSION_LABEL, - None, + Some(para.line_start), )); return Ok(()); }