test(ingest): config invalidation e2e + parser_version assert 갱신
- config_invalidation.rs(신규): 동일config=전skip / 청킹변경=md+code재색인 /
[ingest.code]변경=코드만 / search변경=재색인0 (회귀가드) end-to-end.
- code_ingest_smoke / pdf_pipeline: 저장 parser_version 이 이제
"{base}|{sig}" composite 라, exact assert 를 base 접두사(split('|').next()) 비교로 갱신.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,9 @@ fn rust_file_ingests_and_searches_as_code_citation() {
|
|||||||
"at least one chunk expected: {code_item:?}"
|
"at least one chunk expected: {code_item:?}"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
code_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
code_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-rust-v1"),
|
Some("code-rust-v1"),
|
||||||
"parser_version must be code-rust-v1"
|
"parser_version must be code-rust-v1"
|
||||||
);
|
);
|
||||||
@@ -185,7 +187,9 @@ fn python_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("metrics.py"))
|
.find(|i| i.doc_path.0.ends_with("metrics.py"))
|
||||||
.expect("metrics.py item");
|
.expect("metrics.py item");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
py_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
py_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-python-v1"),
|
Some("code-python-v1"),
|
||||||
"parser_version must be code-python-v1"
|
"parser_version must be code-python-v1"
|
||||||
);
|
);
|
||||||
@@ -261,7 +265,9 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Foo.ts"))
|
.find(|i| i.doc_path.0.ends_with("Foo.ts"))
|
||||||
.expect("Foo.ts item");
|
.expect("Foo.ts item");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
ts_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
ts_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-ts-v1"),
|
Some("code-ts-v1"),
|
||||||
"parser_version must be code-ts-v1"
|
"parser_version must be code-ts-v1"
|
||||||
);
|
);
|
||||||
@@ -337,7 +343,9 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Bar.js"))
|
.find(|i| i.doc_path.0.ends_with("Bar.js"))
|
||||||
.expect("Bar.js item");
|
.expect("Bar.js item");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
js_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
js_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-js-v1"),
|
Some("code-js-v1"),
|
||||||
"parser_version must be code-js-v1"
|
"parser_version must be code-js-v1"
|
||||||
);
|
);
|
||||||
@@ -415,7 +423,9 @@ fn go_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("ast.go"))
|
.find(|i| i.doc_path.0.ends_with("ast.go"))
|
||||||
.expect("ast.go item present");
|
.expect("ast.go item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
go_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
go_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-go-v1"),
|
Some("code-go-v1"),
|
||||||
"parser_version must be code-go-v1"
|
"parser_version must be code-go-v1"
|
||||||
);
|
);
|
||||||
@@ -486,7 +496,9 @@ fn java_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Foo.java"))
|
.find(|i| i.doc_path.0.ends_with("Foo.java"))
|
||||||
.expect("Foo.java item present");
|
.expect("Foo.java item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
java_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
java_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-java-v1"),
|
Some("code-java-v1"),
|
||||||
"parser_version must be code-java-v1"
|
"parser_version must be code-java-v1"
|
||||||
);
|
);
|
||||||
@@ -561,7 +573,9 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Foo.kt"))
|
.find(|i| i.doc_path.0.ends_with("Foo.kt"))
|
||||||
.expect("Foo.kt item present");
|
.expect("Foo.kt item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
kt_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
kt_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-kotlin-v1"),
|
Some("code-kotlin-v1"),
|
||||||
"parser_version must be code-kotlin-v1"
|
"parser_version must be code-kotlin-v1"
|
||||||
);
|
);
|
||||||
@@ -634,7 +648,9 @@ fn tier2_k8s_yaml_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("deploy.yaml"))
|
.find(|i| i.doc_path.0.ends_with("deploy.yaml"))
|
||||||
.expect("deploy.yaml item present");
|
.expect("deploy.yaml item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
yaml_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("none-v1"),
|
Some("none-v1"),
|
||||||
"parser_version must be none-v1"
|
"parser_version must be none-v1"
|
||||||
);
|
);
|
||||||
@@ -717,7 +733,9 @@ fn tier2_dockerfile_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Dockerfile"))
|
.find(|i| i.doc_path.0.ends_with("Dockerfile"))
|
||||||
.expect("Dockerfile item present");
|
.expect("Dockerfile item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
df_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
df_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("none-v1"),
|
Some("none-v1"),
|
||||||
"parser_version must be none-v1"
|
"parser_version must be none-v1"
|
||||||
);
|
);
|
||||||
@@ -800,7 +818,9 @@ fn tier2_cargo_toml_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("Cargo.toml"))
|
.find(|i| i.doc_path.0.ends_with("Cargo.toml"))
|
||||||
.expect("Cargo.toml item present");
|
.expect("Cargo.toml item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
toml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
toml_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("none-v1"),
|
Some("none-v1"),
|
||||||
"parser_version must be none-v1"
|
"parser_version must be none-v1"
|
||||||
);
|
);
|
||||||
@@ -883,7 +903,9 @@ fn tier3_shell_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
|
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
|
||||||
.expect("deploy.sh item present");
|
.expect("deploy.sh item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
sh_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
sh_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("none-v1"),
|
Some("none-v1"),
|
||||||
"parser_version must be none-v1 for shell (Tier 3 direct)"
|
"parser_version must be none-v1 for shell (Tier 3 direct)"
|
||||||
);
|
);
|
||||||
@@ -974,7 +996,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
|
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
|
||||||
.expect("docker-compose.yml item present");
|
.expect("docker-compose.yml item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
yaml_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("none-v1"),
|
Some("none-v1"),
|
||||||
"parser_version must be none-v1 after Tier 3 fallback"
|
"parser_version must be none-v1 after Tier 3 fallback"
|
||||||
);
|
);
|
||||||
@@ -1144,7 +1168,9 @@ fn tier1_c_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("parser.c"))
|
.find(|i| i.doc_path.0.ends_with("parser.c"))
|
||||||
.expect("parser.c item present");
|
.expect("parser.c item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
c_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
c_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-c-v2"),
|
Some("code-c-v2"),
|
||||||
"parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
|
"parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
|
||||||
);
|
);
|
||||||
@@ -1228,7 +1254,9 @@ fn tier1_cpp_ingest_searchable() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("chunker.cpp"))
|
.find(|i| i.doc_path.0.ends_with("chunker.cpp"))
|
||||||
.expect("chunker.cpp item present");
|
.expect("chunker.cpp item present");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
cpp_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
cpp_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("code-cpp-v1"),
|
Some("code-cpp-v1"),
|
||||||
"parser_version must be code-cpp-v1"
|
"parser_version must be code-cpp-v1"
|
||||||
);
|
);
|
||||||
|
|||||||
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
//! v0.26.2: ingest-config invalidation — changing a setting that affects
|
||||||
|
//! ingest output auto-re-indexes the affected assets on the next ingest
|
||||||
|
//! (no `--force-reingest`), while changing an unrelated setting does not.
|
||||||
|
//!
|
||||||
|
//! These end-to-end tests exercise the model-free signals (chunking +
|
||||||
|
//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting
|
||||||
|
//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui
|
||||||
|
//! invariance) is unit-tested in
|
||||||
|
//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles
|
||||||
|
//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is
|
||||||
|
//! verified here via the signature-driven chunking path that shares the same
|
||||||
|
//! `effective_parser_version` plumbing.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use common::TestEnv;
|
||||||
|
|
||||||
|
use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts};
|
||||||
|
use kebab_core::IngestItemKind;
|
||||||
|
|
||||||
|
/// Seed a workspace with a markdown + a rust file so both the markdown and
|
||||||
|
/// the code ingest paths are exercised. Returns the first-ingest report.
|
||||||
|
fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||||
|
std::fs::write(
|
||||||
|
env.workspace_root.join("demo.rs"),
|
||||||
|
"/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest");
|
||||||
|
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||||
|
assert!(first.new >= 1, "first ingest creates docs: {first:?}");
|
||||||
|
assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}");
|
||||||
|
first
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||||
|
ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default())
|
||||||
|
.expect("re-ingest")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Guards against over-invalidation: a second ingest pass with an untouched
/// config must report every scanned document as unchanged, with nothing new,
/// updated, or failed.
#[test]
fn identical_config_skips_all_assets() {
    let env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env);
    let expected_total = baseline.scanned;

    // No config mutation between the two passes.
    let second = reingest(&env);

    assert_eq!(second.scanned, expected_total);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(
        second.unchanged, expected_total,
        "every doc Unchanged: {second:?}"
    );
    assert_eq!(second.errors, 0);
}
|
||||||
|
|
||||||
|
/// A change to the shared `chunking.target_tokens` setting must invalidate
/// every ingested asset: the second pass is expected to report all scanned
/// documents as updated, and none as unchanged, new, or failed.
///
/// NOTE(review): the `false` flag passed by `reingest` is presumably the
/// force-reingest switch, i.e. re-indexing happens without forcing — confirm.
#[test]
fn chunking_change_reindexes_all_types() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env);
    let expected_total = baseline.scanned;

    // The only difference between the two passes is a larger target_tokens.
    env.config.chunking.target_tokens += 100;

    let second = reingest(&env);

    assert_eq!(second.scanned, expected_total);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(
        second.unchanged, 0,
        "chunking change must re-index all: {second:?}"
    );
    assert_eq!(
        second.updated, expected_total,
        "every doc re-indexed as Updated: {second:?}"
    );
    assert_eq!(second.errors, 0);
}
|
||||||
|
|
||||||
|
/// A change to `ingest.code.max_file_lines` must re-index exactly one asset
/// (`demo.rs`) while every other scanned document stays unchanged.
///
/// Checks both the aggregate counters and the per-item kinds: the `demo.rs`
/// item must be `Updated`, and every item whose path ends in `.md` must be
/// `Unchanged`.
#[test]
fn code_option_change_reindexes_code_only() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env);
    let expected_total = baseline.scanned;

    // The limit is only raised, so the few-line demo.rs seeded above is
    // expected to remain within it.
    env.config.ingest.code.max_file_lines += 1000;

    let second = reingest(&env);

    // Aggregate counters.
    assert_eq!(second.scanned, expected_total);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.errors, 0);
    assert_eq!(
        second.updated, 1,
        "exactly the code asset re-indexed: {second:?}"
    );
    assert_eq!(
        second.unchanged,
        expected_total - 1,
        "all markdown assets stay Unchanged: {second:?}"
    );

    // Per-item kinds.
    let items = second.items.as_ref().expect("items present");
    let code = items
        .iter()
        .find(|i| i.doc_path.0.ends_with("demo.rs"))
        .expect("demo.rs item");
    assert_eq!(
        code.kind,
        IngestItemKind::Updated,
        "demo.rs must be re-indexed: {code:?}"
    );

    items
        .iter()
        .filter(|i| i.doc_path.0.ends_with(".md"))
        .for_each(|i| {
            assert_eq!(
                i.kind,
                IngestItemKind::Unchanged,
                "markdown must be Unchanged: {i:?}"
            );
        });
}
|
||||||
|
|
||||||
|
/// Regression guard: settings outside the ingest path must not invalidate
/// anything. After changing `search.default_k`, `search.snippet_chars`, and
/// `rag.score_gate`, the second pass must report every scanned document as
/// unchanged, with nothing updated, new, or failed.
#[test]
fn search_setting_change_reindexes_nothing() {
    let mut env = TestEnv::lexical_only();
    let baseline = seed_and_first_ingest(&env);
    let expected_total = baseline.scanned;

    // Touch only search / rag settings.
    {
        let cfg = &mut env.config;
        cfg.search.default_k += 5;
        cfg.search.snippet_chars += 50;
        cfg.rag.score_gate = 0.5;
    }

    let second = reingest(&env);

    assert_eq!(second.scanned, expected_total);
    assert_eq!(
        second.unchanged, expected_total,
        "search/rag changes must not re-index: {second:?}"
    );
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.new, 0);
    assert_eq!(second.errors, 0);
}
|
||||||
@@ -162,7 +162,9 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
|||||||
"one chunk per non-empty page"
|
"one chunk per non-empty page"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
pdf_item.parser_version
|
||||||
|
.as_ref()
|
||||||
|
.map(|p| p.0.split('|').next().unwrap()),
|
||||||
Some("pdf-text-v1")
|
Some("pdf-text-v1")
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -477,7 +479,10 @@ fn inspect_doc_surfaces_page_spans() {
|
|||||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
// v0.26.2: stored parser_version is now `pdf-text-v1|<ingest-config-sig>`
|
||||||
|
// (the signature folds chunking / pdf.ocr settings for skip detection).
|
||||||
|
// Assert the base identity by taking the prefix before the first '|'.
|
||||||
|
assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1");
|
||||||
assert_eq!(doc.blocks.len(), 3);
|
assert_eq!(doc.blocks.len(), 3);
|
||||||
for block in &doc.blocks {
|
for block in &doc.blocks {
|
||||||
match block {
|
match block {
|
||||||
|
|||||||
Reference in New Issue
Block a user