test(p10-3): integration smoke tests for Tier 3 (shell + yaml fallback)
Two new tests verify end-to-end Tier 3 wiring:
- tier3_shell_ingest_searchable: .sh file → --code-lang shell search →
Citation::Code { symbol: None, lang: "shell" }, chunker_version
"code-text-paragraph-v1".
- tier3_yaml_fallback_picks_up_non_k8s_yaml: docker-compose-shaped yaml
(no apiVersion/kind) triggers k8s chunker's Ok(vec![]) result, fallback
retries with Tier 3 → Citation::Code { symbol: None, lang: "yaml" } and
chunker_version "code-text-paragraph-v1".
Also fixes a bug in CodeTextParagraphV1Chunker (Task B): short paragraphs
(≤80 lines) were emitted with split_key=None, so every short paragraph in the
same document received the same chunk_id (UNIQUE constraint violation at
put_chunks). Fix: short paragraphs now pass para.line_start as split_key, so
every paragraph gets a distinct id regardless of size.
Brings code_ingest_smoke to 14 tests (Tier 1: 9, Tier 2: 3, Tier 3: 2).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -850,6 +850,181 @@ fn tier2_cargo_toml_ingest_searchable() {
|
||||
);
|
||||
}
|
||||
|
||||
/// p10-3 Task E — shell scripts take the direct Tier 3 path.
///
/// Writes a single `deploy.sh` into the workspace, ingests it, and checks that
/// the per-item report is stamped `parser_version = "none-v1"` and
/// `chunker_version = "code-text-paragraph-v1"`. It then runs a lexical search
/// for `kebab` filtered to `code_lang = ["shell"]` and checks that the first
/// `Citation::Code` hit reports `lang = "shell"`, `symbol = None`,
/// `line_start >= 1`, and the same Tier 3 chunker version.
#[test]
fn tier3_shell_ingest_searchable() {
    let env = TestEnv::lexical_only();

    // Fixture: a tiny bash script whose last line contains the search term.
    let script_path = env.workspace_root.join("deploy.sh");
    let script_body = "#!/usr/bin/env bash\nset -e\necho hello\n\nkebab ingest --json\n";
    std::fs::write(script_path, script_body).unwrap();

    // --- Ingest -----------------------------------------------------------
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
    assert!(report.new >= 1, "shell file ingested: {report:?}");

    let items = report.items.as_ref().expect("items present");
    let sh_item = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("deploy.sh"))
        .expect("deploy.sh item present");

    let parser_label = sh_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser_label,
        Some("none-v1"),
        "parser_version must be none-v1 for shell (Tier 3 direct)"
    );

    let chunker_label = sh_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker_label,
        Some("code-text-paragraph-v1"),
        "chunker_version must be code-text-paragraph-v1 for shell"
    );

    // --- Search -----------------------------------------------------------
    let filters = kebab_core::SearchFilters {
        code_lang: vec![String::from("shell")],
        ..Default::default()
    };
    let query = kebab_core::SearchQuery {
        text: String::from("kebab"),
        mode: kebab_core::SearchMode::Lexical,
        k: 10,
        filters,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), query)
        .expect("search must succeed");

    // Only the first code citation is inspected.
    let h = hits
        .iter()
        .find(|hit| matches!(&hit.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'kebab'");

    if let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &h.citation
    {
        assert_eq!(
            lang.as_deref(),
            Some("shell"),
            "citation.lang must be 'shell'"
        );
        assert_eq!(*symbol, None, "Tier 3 symbol must be None");
        assert!(*line_start >= 1, "line_start must be >=1");
    } else {
        // The `find` above only yields `Citation::Code` hits.
        unreachable!()
    }

    assert_eq!(
        h.code_lang.as_deref(),
        Some("shell"),
        "SearchHit.code_lang must be 'shell'"
    );
    assert_eq!(
        h.chunker_version.0.as_str(),
        "code-text-paragraph-v1",
        "shell chunks must be stamped with the Tier 3 chunker_version"
    );
}
|
||||
|
||||
/// p10-3 Task E — non-Kubernetes YAML falls back to Tier 3.
///
/// Writes a docker-compose-shaped `docker-compose.yml` (it has `version` and
/// `services` but no `apiVersion`/`kind`), ingests it, and checks that the
/// per-item report is stamped `parser_version = "none-v1"` and
/// `chunker_version = "code-text-paragraph-v1"`. Per the pipeline design this
/// is the case where the k8s chunker yields `Ok(vec![])` and the fallback
/// wrapper retries with `CodeTextParagraphV1Chunker`. A lexical search for
/// `nginx` filtered to `code_lang = ["yaml"]` must then return a
/// `Citation::Code` hit with `lang = "yaml"`, `symbol = None`,
/// `line_start >= 1`, and the same Tier 3 chunker version.
#[test]
fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
    let env = TestEnv::lexical_only();

    // Fixture: compose-style YAML, deliberately missing apiVersion/kind so the
    // k8s chunker has nothing to emit and the Tier 3 fallback must take over.
    let compose_path = env.workspace_root.join("docker-compose.yml");
    let compose_body =
        "version: '3'\nservices:\n api:\n image: nginx:latest\n ports:\n - 8080:80\n";
    std::fs::write(compose_path, compose_body).unwrap();

    // --- Ingest -----------------------------------------------------------
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
    assert!(
        report.new >= 1,
        "expected non-k8s yaml ingested via Tier 3, got {} new docs",
        report.new
    );

    let items = report.items.as_ref().expect("items present");
    let yaml_item = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("docker-compose.yml"))
        .expect("docker-compose.yml item present");

    let parser_label = yaml_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser_label,
        Some("none-v1"),
        "parser_version must be none-v1 after Tier 3 fallback"
    );

    let chunker_label = yaml_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker_label,
        Some("code-text-paragraph-v1"),
        "chunker_version must be code-text-paragraph-v1 after Tier 3 fallback"
    );

    // --- Search -----------------------------------------------------------
    let filters = kebab_core::SearchFilters {
        code_lang: vec![String::from("yaml")],
        ..Default::default()
    };
    let query = kebab_core::SearchQuery {
        text: String::from("nginx"),
        mode: kebab_core::SearchMode::Lexical,
        k: 10,
        filters,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), query)
        .expect("search must succeed");

    // Only the first code citation is inspected.
    let h = hits
        .iter()
        .find(|hit| matches!(&hit.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'nginx'");

    if let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &h.citation
    {
        assert_eq!(
            lang.as_deref(),
            Some("yaml"),
            "citation.lang must be 'yaml'"
        );
        assert_eq!(*symbol, None, "Tier 3 fallback symbol must be None");
        assert!(*line_start >= 1, "line_start must be >=1");
    } else {
        // The `find` above only yields `Citation::Code` hits.
        unreachable!()
    }

    assert_eq!(
        h.code_lang.as_deref(),
        Some("yaml"),
        "SearchHit.code_lang must be 'yaml'"
    );
    assert_eq!(
        h.chunker_version.0.as_str(),
        "code-text-paragraph-v1",
        "non-k8s yaml fallback must be stamped code-text-paragraph-v1"
    );
}
|
||||
|
||||
/// Re-ingesting the same `.rs` file without changes must report
|
||||
/// `Unchanged` (incremental-skip path exercised).
|
||||
#[test]
|
||||
|
||||
@@ -124,7 +124,10 @@ fn push_paragraph(
|
||||
let n_lines = (para.line_end - para.line_start + 1) as usize;
|
||||
|
||||
if n_lines <= FALLBACK_LINES_PER_CHUNK {
|
||||
// Single chunk — no split_key needed.
|
||||
// Use line_start as split_key so each paragraph gets a distinct
|
||||
// chunk_id even when block_ids is empty (no symbol, no AST structure).
|
||||
// Without this, all short paragraphs from the same doc share the same
|
||||
// base_policy_hash and therefore the same id_for_chunk result.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
@@ -133,7 +136,7 @@ fn push_paragraph(
|
||||
para.line_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
Some(para.line_start),
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user