From 03b0745e9d19567a8b8e703e78e53ff3f16667e7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 3 Jun 2026 14:14:15 +0000 Subject: [PATCH] =?UTF-8?q?test(ingest):=20config=20invalidation=20e2e=20+?= =?UTF-8?q?=20parser=5Fversion=20assert=20=EA=B0=B1=EC=8B=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config_invalidation.rs(신규): 동일config=전skip / 청킹변경=md+code재색인 / [ingest.code]변경=코드만 / search변경=재색인0 (회귀가드) end-to-end. - code_ingest_smoke / pdf_pipeline: 저장 parser_version 이 이제 "{base}|{sig}" composite 라, exact assert 를 base 접두사(split('|').next()) 비교로 갱신. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/kebab-app/tests/code_ingest_smoke.rs | 56 +++++-- crates/kebab-app/tests/config_invalidation.rs | 148 ++++++++++++++++++ crates/kebab-app/tests/pdf_pipeline.rs | 9 +- 3 files changed, 197 insertions(+), 16 deletions(-) create mode 100644 crates/kebab-app/tests/config_invalidation.rs diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 534e3fd..84c2315 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -52,7 +52,9 @@ fn rust_file_ingests_and_searches_as_code_citation() { "at least one chunk expected: {code_item:?}" ); assert_eq!( - code_item.parser_version.as_ref().map(|p| p.0.as_str()), + code_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-rust-v1"), "parser_version must be code-rust-v1" ); @@ -185,7 +187,9 @@ fn python_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("metrics.py")) .expect("metrics.py item"); assert_eq!( - py_item.parser_version.as_ref().map(|p| p.0.as_str()), + py_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-python-v1"), "parser_version must be code-python-v1" ); @@ -261,7 +265,9 @@ fn typescript_file_ingests_and_searches_as_code_citation() { .find(|i| 
i.doc_path.0.ends_with("Foo.ts")) .expect("Foo.ts item"); assert_eq!( - ts_item.parser_version.as_ref().map(|p| p.0.as_str()), + ts_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-ts-v1"), "parser_version must be code-ts-v1" ); @@ -337,7 +343,9 @@ fn javascript_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Bar.js")) .expect("Bar.js item"); assert_eq!( - js_item.parser_version.as_ref().map(|p| p.0.as_str()), + js_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-js-v1"), "parser_version must be code-js-v1" ); @@ -415,7 +423,9 @@ fn go_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("ast.go")) .expect("ast.go item present"); assert_eq!( - go_item.parser_version.as_ref().map(|p| p.0.as_str()), + go_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-go-v1"), "parser_version must be code-go-v1" ); @@ -486,7 +496,9 @@ fn java_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Foo.java")) .expect("Foo.java item present"); assert_eq!( - java_item.parser_version.as_ref().map(|p| p.0.as_str()), + java_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-java-v1"), "parser_version must be code-java-v1" ); @@ -561,7 +573,9 @@ fn kotlin_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Foo.kt")) .expect("Foo.kt item present"); assert_eq!( - kt_item.parser_version.as_ref().map(|p| p.0.as_str()), + kt_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-kotlin-v1"), "parser_version must be code-kotlin-v1" ); @@ -634,7 +648,9 @@ fn tier2_k8s_yaml_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("deploy.yaml")) .expect("deploy.yaml item present"); assert_eq!( - yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + yaml_item.parser_version + .as_ref() + .map(|p| 
p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -717,7 +733,9 @@ fn tier2_dockerfile_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("Dockerfile")) .expect("Dockerfile item present"); assert_eq!( - df_item.parser_version.as_ref().map(|p| p.0.as_str()), + df_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -800,7 +818,9 @@ fn tier2_cargo_toml_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("Cargo.toml")) .expect("Cargo.toml item present"); assert_eq!( - toml_item.parser_version.as_ref().map(|p| p.0.as_str()), + toml_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -883,7 +903,9 @@ fn tier3_shell_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("deploy.sh")) .expect("deploy.sh item present"); assert_eq!( - sh_item.parser_version.as_ref().map(|p| p.0.as_str()), + sh_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1 for shell (Tier 3 direct)" ); @@ -974,7 +996,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) .expect("docker-compose.yml item present"); assert_eq!( - yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + yaml_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1 after Tier 3 fallback" ); @@ -1144,7 +1168,9 @@ fn tier1_c_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("parser.c")) .expect("parser.c item present"); assert_eq!( - c_item.parser_version.as_ref().map(|p| p.0.as_str()), + c_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-c-v2"), "parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)" ); @@ -1228,7 +1254,9 @@ fn 
tier1_cpp_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("chunker.cpp")) .expect("chunker.cpp item present"); assert_eq!( - cpp_item.parser_version.as_ref().map(|p| p.0.as_str()), + cpp_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-cpp-v1"), "parser_version must be code-cpp-v1" ); diff --git a/crates/kebab-app/tests/config_invalidation.rs b/crates/kebab-app/tests/config_invalidation.rs new file mode 100644 index 0000000..89ac21e --- /dev/null +++ b/crates/kebab-app/tests/config_invalidation.rs @@ -0,0 +1,148 @@ +//! v0.26.2: ingest-config invalidation — changing a setting that affects +//! ingest output auto-re-indexes the affected assets on the next ingest +//! (no `--force-reingest`), while changing an unrelated setting does not. +//! +//! These end-to-end tests exercise the model-free signals (chunking + +//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting +//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui +//! invariance) is unit-tested in +//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles +//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is +//! verified here via the signature-driven chunking path that shares the same +//! `effective_parser_version` plumbing. + +mod common; + +use common::TestEnv; + +use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts}; +use kebab_core::IngestItemKind; + +/// Seed a workspace with a markdown + a rust file so both the markdown and +/// the code ingest paths are exercised. Returns the first-ingest report. 
+fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport { + std::fs::write( + env.workspace_root.join("demo.rs"), + "/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n", + ) + .unwrap(); + let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest"); + assert_eq!(first.errors, 0, "first ingest must not error: {first:?}"); + assert!(first.new >= 1, "first ingest creates docs: {first:?}"); + assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}"); + first +} + +fn reingest(env: &TestEnv) -> kebab_core::IngestReport { + ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default()) + .expect("re-ingest") +} + +/// Re-running with the identical config skips every asset (no spurious +/// re-index). Regression guard for over-invalidation. +#[test] +fn identical_config_skips_all_assets() { + let env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}"); + assert_eq!(second.unchanged, scanned, "every doc Unchanged: {second:?}"); + assert_eq!(second.errors, 0); +} + +/// Changing a common chunking parameter re-indexes EVERY media type +/// (markdown + code here) without `--force-reingest`. +#[test] +fn chunking_change_reindexes_all_types() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + // Bump target_tokens — folds into every type's signature. 
+ env.config.chunking.target_tokens += 100; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!( + second.unchanged, 0, + "chunking change must re-index all: {second:?}" + ); + assert_eq!( + second.updated, scanned, + "every doc re-indexed as Updated: {second:?}" + ); + assert_eq!(second.errors, 0); +} + +/// Changing an `[ingest.code]` option re-indexes only the code asset; the +/// markdown assets stay Unchanged. +#[test] +fn code_option_change_reindexes_code_only() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + // Raise max_file_lines (keeps the tiny demo.rs in-scope; only the code + // signature changes). + env.config.ingest.code.max_file_lines += 1000; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!(second.errors, 0); + assert_eq!( + second.updated, 1, + "exactly the code asset re-indexed: {second:?}" + ); + assert_eq!( + second.unchanged, + scanned - 1, + "all markdown assets stay Unchanged: {second:?}" + ); + + let items = second.items.as_ref().expect("items present"); + let code = items + .iter() + .find(|i| i.doc_path.0.ends_with("demo.rs")) + .expect("demo.rs item"); + assert_eq!( + code.kind, + IngestItemKind::Updated, + "demo.rs must be re-indexed: {code:?}" + ); + for i in items.iter().filter(|i| i.doc_path.0.ends_with(".md")) { + assert_eq!( + i.kind, + IngestItemKind::Unchanged, + "markdown must be Unchanged: {i:?}" + ); + } +} + +/// Regression guard: changing a non-ingest setting (`search.default_k`) does +/// NOT re-index anything. 
+#[test] +fn search_setting_change_reindexes_nothing() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + env.config.search.default_k += 5; + env.config.search.snippet_chars += 50; + env.config.rag.score_gate = 0.5; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!( + second.unchanged, scanned, + "search/rag changes must not re-index: {second:?}" + ); + assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}"); + assert_eq!(second.new, 0); + assert_eq!(second.errors, 0); +} diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs index 07fb1f7..8c92c71 100644 --- a/crates/kebab-app/tests/pdf_pipeline.rs +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -162,7 +162,9 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { "one chunk per non-empty page" ); assert_eq!( - pdf_item.parser_version.as_ref().map(|p| p.0.as_str()), + pdf_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("pdf-text-v1") ); assert_eq!( @@ -477,7 +479,10 @@ fn inspect_doc_surfaces_page_spans() { .find(|i| i.doc_path.0.ends_with("inspect.pdf")) .unwrap(); let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap(); - assert_eq!(doc.parser_version.0, "pdf-text-v1"); + // v0.26.2: stored parser_version is now `pdf-text-v1|{sig}` + // (the signature folds chunking / pdf.ocr settings for skip detection). + // Assert the base identity by taking the prefix before the first '|'. + assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1"); assert_eq!(doc.blocks.len(), 3); for block in &doc.blocks { match block {