test(ingest): config invalidation e2e + parser_version assert 갱신

- config_invalidation.rs(신규): 동일config=전skip / 청킹변경=md+code재색인 /
  [ingest.code]변경=코드만 / search변경=재색인0 (회귀가드) end-to-end.
- code_ingest_smoke / pdf_pipeline: 저장 parser_version 이 이제
  "{base}|{sig}" composite 라, exact assert 를 base 접두사(split('|').next()) 비교로 갱신.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 14:14:15 +00:00
parent e7cb20990a
commit 03b0745e9d
3 changed files with 197 additions and 16 deletions

View File

@@ -52,7 +52,9 @@ fn rust_file_ingests_and_searches_as_code_citation() {
"at least one chunk expected: {code_item:?}" "at least one chunk expected: {code_item:?}"
); );
assert_eq!( assert_eq!(
code_item.parser_version.as_ref().map(|p| p.0.as_str()), code_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-rust-v1"), Some("code-rust-v1"),
"parser_version must be code-rust-v1" "parser_version must be code-rust-v1"
); );
@@ -185,7 +187,9 @@ fn python_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("metrics.py")) .find(|i| i.doc_path.0.ends_with("metrics.py"))
.expect("metrics.py item"); .expect("metrics.py item");
assert_eq!( assert_eq!(
py_item.parser_version.as_ref().map(|p| p.0.as_str()), py_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-python-v1"), Some("code-python-v1"),
"parser_version must be code-python-v1" "parser_version must be code-python-v1"
); );
@@ -261,7 +265,9 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("Foo.ts")) .find(|i| i.doc_path.0.ends_with("Foo.ts"))
.expect("Foo.ts item"); .expect("Foo.ts item");
assert_eq!( assert_eq!(
ts_item.parser_version.as_ref().map(|p| p.0.as_str()), ts_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-ts-v1"), Some("code-ts-v1"),
"parser_version must be code-ts-v1" "parser_version must be code-ts-v1"
); );
@@ -337,7 +343,9 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("Bar.js")) .find(|i| i.doc_path.0.ends_with("Bar.js"))
.expect("Bar.js item"); .expect("Bar.js item");
assert_eq!( assert_eq!(
js_item.parser_version.as_ref().map(|p| p.0.as_str()), js_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-js-v1"), Some("code-js-v1"),
"parser_version must be code-js-v1" "parser_version must be code-js-v1"
); );
@@ -415,7 +423,9 @@ fn go_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("ast.go")) .find(|i| i.doc_path.0.ends_with("ast.go"))
.expect("ast.go item present"); .expect("ast.go item present");
assert_eq!( assert_eq!(
go_item.parser_version.as_ref().map(|p| p.0.as_str()), go_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-go-v1"), Some("code-go-v1"),
"parser_version must be code-go-v1" "parser_version must be code-go-v1"
); );
@@ -486,7 +496,9 @@ fn java_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("Foo.java")) .find(|i| i.doc_path.0.ends_with("Foo.java"))
.expect("Foo.java item present"); .expect("Foo.java item present");
assert_eq!( assert_eq!(
java_item.parser_version.as_ref().map(|p| p.0.as_str()), java_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-java-v1"), Some("code-java-v1"),
"parser_version must be code-java-v1" "parser_version must be code-java-v1"
); );
@@ -561,7 +573,9 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
.find(|i| i.doc_path.0.ends_with("Foo.kt")) .find(|i| i.doc_path.0.ends_with("Foo.kt"))
.expect("Foo.kt item present"); .expect("Foo.kt item present");
assert_eq!( assert_eq!(
kt_item.parser_version.as_ref().map(|p| p.0.as_str()), kt_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-kotlin-v1"), Some("code-kotlin-v1"),
"parser_version must be code-kotlin-v1" "parser_version must be code-kotlin-v1"
); );
@@ -634,7 +648,9 @@ fn tier2_k8s_yaml_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("deploy.yaml")) .find(|i| i.doc_path.0.ends_with("deploy.yaml"))
.expect("deploy.yaml item present"); .expect("deploy.yaml item present");
assert_eq!( assert_eq!(
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), yaml_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("none-v1"), Some("none-v1"),
"parser_version must be none-v1" "parser_version must be none-v1"
); );
@@ -717,7 +733,9 @@ fn tier2_dockerfile_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("Dockerfile")) .find(|i| i.doc_path.0.ends_with("Dockerfile"))
.expect("Dockerfile item present"); .expect("Dockerfile item present");
assert_eq!( assert_eq!(
df_item.parser_version.as_ref().map(|p| p.0.as_str()), df_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("none-v1"), Some("none-v1"),
"parser_version must be none-v1" "parser_version must be none-v1"
); );
@@ -800,7 +818,9 @@ fn tier2_cargo_toml_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("Cargo.toml")) .find(|i| i.doc_path.0.ends_with("Cargo.toml"))
.expect("Cargo.toml item present"); .expect("Cargo.toml item present");
assert_eq!( assert_eq!(
toml_item.parser_version.as_ref().map(|p| p.0.as_str()), toml_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("none-v1"), Some("none-v1"),
"parser_version must be none-v1" "parser_version must be none-v1"
); );
@@ -883,7 +903,9 @@ fn tier3_shell_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("deploy.sh")) .find(|i| i.doc_path.0.ends_with("deploy.sh"))
.expect("deploy.sh item present"); .expect("deploy.sh item present");
assert_eq!( assert_eq!(
sh_item.parser_version.as_ref().map(|p| p.0.as_str()), sh_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("none-v1"), Some("none-v1"),
"parser_version must be none-v1 for shell (Tier 3 direct)" "parser_version must be none-v1 for shell (Tier 3 direct)"
); );
@@ -974,7 +996,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
.find(|i| i.doc_path.0.ends_with("docker-compose.yml")) .find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
.expect("docker-compose.yml item present"); .expect("docker-compose.yml item present");
assert_eq!( assert_eq!(
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), yaml_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("none-v1"), Some("none-v1"),
"parser_version must be none-v1 after Tier 3 fallback" "parser_version must be none-v1 after Tier 3 fallback"
); );
@@ -1144,7 +1168,9 @@ fn tier1_c_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("parser.c")) .find(|i| i.doc_path.0.ends_with("parser.c"))
.expect("parser.c item present"); .expect("parser.c item present");
assert_eq!( assert_eq!(
c_item.parser_version.as_ref().map(|p| p.0.as_str()), c_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-c-v2"), Some("code-c-v2"),
"parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)" "parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
); );
@@ -1228,7 +1254,9 @@ fn tier1_cpp_ingest_searchable() {
.find(|i| i.doc_path.0.ends_with("chunker.cpp")) .find(|i| i.doc_path.0.ends_with("chunker.cpp"))
.expect("chunker.cpp item present"); .expect("chunker.cpp item present");
assert_eq!( assert_eq!(
cpp_item.parser_version.as_ref().map(|p| p.0.as_str()), cpp_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("code-cpp-v1"), Some("code-cpp-v1"),
"parser_version must be code-cpp-v1" "parser_version must be code-cpp-v1"
); );

View File

@@ -0,0 +1,148 @@
//! v0.26.2: ingest-config invalidation — changing a setting that affects
//! ingest output auto-re-indexes the affected assets on the next ingest
//! (no `--force-reingest`), while changing an unrelated setting does not.
//!
//! These end-to-end tests exercise the model-free signals (chunking +
//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting
//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui
//! invariance) is unit-tested in
//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles
//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is
//! verified here via the signature-driven chunking path that shares the same
//! `effective_parser_version` plumbing.
mod common;
use common::TestEnv;
use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts};
use kebab_core::IngestItemKind;
/// Writes a small Rust source file (`demo.rs`) into the workspace root and
/// runs the initial ingest, returning its report.
///
/// The report is sanity-checked before being handed back: no errors, at
/// least one new document, and nothing reported as unchanged.
///
/// NOTE(review): only the Rust file is written here; the markdown assets the
/// callers rely on presumably come from the `TestEnv` fixture itself —
/// confirm against `common::TestEnv`.
fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport {
    let rust_path = env.workspace_root.join("demo.rs");
    let rust_source = "/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n";
    std::fs::write(rust_path, rust_source).unwrap();

    let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest");

    // A fresh workspace: everything is new, nothing can be unchanged yet.
    assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
    assert!(first.new >= 1, "first ingest creates docs: {first:?}");
    assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}");

    first
}
/// Runs a follow-up ingest using the environment's current config and
/// default `IngestOpts`, panicking if the ingest call fails.
///
/// NOTE(review): the `false` argument is presumably the force-reingest flag
/// (off) — confirm against `ingest_with_config_opts`.
fn reingest(env: &TestEnv) -> kebab_core::IngestReport {
    let config = env.config.clone();
    let scope = env.scope();
    let opts = IngestOpts::default();
    ingest_with_config_opts(config, scope, false, opts).expect("re-ingest")
}
/// Over-invalidation guard: a second ingest with an untouched config must
/// report every scanned asset as unchanged — nothing new, nothing updated.
#[test]
fn identical_config_skips_all_assets() {
    let env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env).scanned;

    let second = reingest(&env);

    assert_eq!(second.scanned, baseline);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.unchanged, baseline, "every doc Unchanged: {second:?}");
    assert_eq!(second.errors, 0);
}
/// A change to a shared chunking parameter must cause every scanned asset
/// to be reported as Updated on the next plain ingest (no forced re-ingest).
#[test]
fn chunking_change_reindexes_all_types() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env).scanned;

    // `target_tokens` is a chunking-wide knob, so the test expects it to
    // invalidate all assets regardless of media type.
    let chunking = &mut env.config.chunking;
    chunking.target_tokens += 100;

    let second = reingest(&env);

    assert_eq!(second.scanned, baseline);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.unchanged, 0, "chunking change must re-index all: {second:?}");
    assert_eq!(second.updated, baseline, "every doc re-indexed as Updated: {second:?}");
    assert_eq!(second.errors, 0);
}
/// Editing an `[ingest.code]` option must re-index exactly one asset — the
/// code file `demo.rs` — while every `.md` item is reported as Unchanged.
#[test]
fn code_option_change_reindexes_code_only() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env).scanned;

    // Raising the line limit presumably keeps the small demo.rs eligible for
    // ingest, so only the code-side signature is expected to differ.
    env.config.ingest.code.max_file_lines += 1000;

    let second = reingest(&env);

    // Aggregate counters: one update (the code asset), the rest unchanged.
    assert_eq!(second.scanned, baseline);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.errors, 0);
    assert_eq!(second.updated, 1, "exactly the code asset re-indexed: {second:?}");
    assert_eq!(
        second.unchanged,
        baseline - 1,
        "all markdown assets stay Unchanged: {second:?}"
    );

    // Per-item view: demo.rs is the updated one.
    let items = second.items.as_ref().expect("items present");
    let code = items
        .iter()
        .find(|i| i.doc_path.0.ends_with("demo.rs"))
        .expect("demo.rs item");
    assert_eq!(code.kind, IngestItemKind::Updated, "demo.rs must be re-indexed: {code:?}");

    // ...and every markdown item was skipped.
    items
        .iter()
        .filter(|i| i.doc_path.0.ends_with(".md"))
        .for_each(|i| {
            assert_eq!(i.kind, IngestItemKind::Unchanged, "markdown must be Unchanged: {i:?}");
        });
}
/// Regression guard: tweaking settings outside the ingest path
/// (`search.default_k`, `search.snippet_chars`, `rag.score_gate`) must leave
/// every asset Unchanged on the next ingest.
#[test]
fn search_setting_change_reindexes_nothing() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env).scanned;

    // Mutate query-time settings only.
    {
        let cfg = &mut env.config;
        cfg.search.default_k += 5;
        cfg.search.snippet_chars += 50;
        cfg.rag.score_gate = 0.5;
    }

    let second = reingest(&env);

    assert_eq!(second.scanned, baseline);
    assert_eq!(second.unchanged, baseline, "search/rag changes must not re-index: {second:?}");
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.new, 0);
    assert_eq!(second.errors, 0);
}

View File

@@ -162,7 +162,9 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
"one chunk per non-empty page" "one chunk per non-empty page"
); );
assert_eq!( assert_eq!(
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()), pdf_item.parser_version
.as_ref()
.map(|p| p.0.split('|').next().unwrap()),
Some("pdf-text-v1") Some("pdf-text-v1")
); );
assert_eq!( assert_eq!(
@@ -477,7 +479,10 @@ fn inspect_doc_surfaces_page_spans() {
.find(|i| i.doc_path.0.ends_with("inspect.pdf")) .find(|i| i.doc_path.0.ends_with("inspect.pdf"))
.unwrap(); .unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap(); let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
assert_eq!(doc.parser_version.0, "pdf-text-v1"); // v0.26.2: stored parser_version is now `pdf-text-v1|<ingest-config-sig>`
// (the signature folds chunking / pdf.ocr settings for skip detection).
// Assert the base identity by taking the prefix before the first '|'.
assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1");
assert_eq!(doc.blocks.len(), 3); assert_eq!(doc.blocks.len(), 3);
for block in &doc.blocks { for block in &doc.blocks {
match block { match block {