From e7cb20990a37850b7f0ef8e7d7c7bbd63ccc639f Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 3 Jun 2026 14:14:06 +0000 Subject: [PATCH 1/3] =?UTF-8?q?feat(ingest):=20ingest=20=EC=84=A4=EC=A0=95?= =?UTF-8?q?=20=EB=B3=80=EA=B2=BD=20=EC=8B=9C=20=EC=98=81=ED=96=A5=20?= =?UTF-8?q?=EC=9E=90=EC=82=B0=20=EC=9E=90=EB=8F=99=20=EC=9E=AC=EC=83=89?= =?UTF-8?q?=EC=9D=B8=20(signature=20=ED=8F=B4=EB=94=A9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/[ingest.code])의 결정적 서명을 effective parser_version 에 폴딩 → 변경 시 --force-reingest 없이 영향 자산만 자동 재색인. - ingest_config_signature(config, media_type): per-type 산출-영향 설정만 직렬화. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외. - effective_parser_version(config, asset, base) = "{base}|{signature}". - md/image/pdf/code 경로: composite 를 (a) try_skip_unchanged 비교값, (b) persist 전 canonical.parser_version override 에 사용. - doc_id 는 base parser_version 으로 계속 파생 → 설정 변경에도 안정(orphan churn 회피). - code Tier-3 fallback 은 bare "none-v1" sentinel 유지(skip bypass 의존). - 단위테스트 8: 결정성/청킹=전타입/이미지·pdf·code 토글/무관설정 회귀가드. spec: docs/superpowers/specs/2026-06-03-ocr-toggle-invalidation-spec.md Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/kebab-app/src/lib.rs | 396 +++++++++++++++++++++++++++++++++++- 1 file changed, 392 insertions(+), 4 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 94ede3a..6891b62 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -1242,6 +1242,12 @@ fn ingest_one_asset( } }; + // v0.26.2: fold the ingest-config signature into the effective + // parser_version for the skip compare + the stored doc field, so a + // change to any markdown-affecting setting (chunking params) re-indexes. + // `doc_id` keeps deriving from the base version below (stability). + let eff_parser_version = effective_parser_version(&app.config, asset, parser_version); + // p9-fb-23 task 7: incremental-ingest early-skip. When force_reingest // is false AND the on-disk asset's checksum + parser_version + // last_chunker_version + last_embedding_version all match the existing @@ -1251,7 +1257,7 @@ fn ingest_one_asset( if let Some(item) = try_skip_unchanged( app, asset, - parser_version, + &eff_parser_version, &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, @@ -1297,6 +1303,10 @@ fn ingest_one_asset( let mut canonical = build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings) .context("kb-parse-md::build_canonical_document")?; + // v0.26.2: persist the composite parser_version (base|signature) so the + // next run's skip compare matches what was computed above. doc_id was + // already derived from the base version inside build_canonical_document. + canonical.parser_version = eff_parser_version.clone(); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); @@ -1529,11 +1539,15 @@ fn ingest_one_image_asset( // embedding-version check matches the markdown path: when the // active embedder's model_version equals what was stamped on the // existing doc, the asset is Unchanged. + // v0.26.2: composite parser_version folds image OCR / caption + chunking + // settings, so toggling `[image.ocr]` / `[image.caption]` (or changing + // their model / prompt version) auto-re-indexes the affected images. let image_parser_version = ParserVersion(kebab_parse_image::PARSER_VERSION.to_string()); + let eff_parser_version = effective_parser_version(&app.config, asset, &image_parser_version); if let Some(item) = try_skip_unchanged( app, asset, - &image_parser_version, + &eff_parser_version, &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, @@ -1563,6 +1577,10 @@ fn ingest_one_image_asset( let mut canonical = app .extract_for(&asset.media_type, &ctx, &bytes) .context("kb-app::extract_for (image)")?; + // v0.26.2: store the composite parser_version (extractor baked the base + // `image-meta-v1`, which already fixed doc_id). Skip compare + stored + // field must agree for next-run detection. + canonical.parser_version = eff_parser_version.clone(); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); // 2 + 3. Apply OCR / caption when their adapters exist. Both are @@ -2106,11 +2124,14 @@ fn ingest_one_pdf_asset( // p9-fb-23 task 7: incremental-ingest early-skip for the PDF flow. // PDF docs use `pdf-text-v1` as the parser_version and `PdfPageV1Chunker` // as the chunker — both pinned per-medium today (no config knob). + // v0.26.2: composite parser_version folds pdf.ocr (enabled/always_on/ + // model) + chunking, so enabling scanned-PDF OCR auto-re-indexes PDFs. let pdf_parser_version = ParserVersion(kebab_parse_pdf::PARSER_VERSION.to_string()); + let eff_parser_version = effective_parser_version(&app.config, asset, &pdf_parser_version); if let Some(item) = try_skip_unchanged( app, asset, - &pdf_parser_version, + &eff_parser_version, &PdfPageV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, @@ -2135,6 +2156,9 @@ fn ingest_one_pdf_asset( let mut canonical = app .extract_for(&asset.media_type, &ctx, &bytes) .context("kb-app::extract_for (pdf)")?; + // v0.26.2: store the composite parser_version (base `pdf-text-v1` already + // fixed doc_id) so the next run's skip compare matches. + canonical.parser_version = eff_parser_version.clone(); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); // v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry @@ -2510,10 +2534,19 @@ fn ingest_one_code_asset( _ => None, }; + // v0.26.2: composite parser_version folds [ingest.code] options + common + // chunking so editing any code-ingest setting auto-re-indexes code assets. + // The base per-lang version still derives doc_id (synthesize_tier2_document + // / extract_for keep using `parser_version`). A Tier-3 fallback document + // intentionally keeps the bare "none-v1" parser_version (the + // `stored_is_tier3_fallback` bypass in try_skip_unchanged depends on the + // exact "none-v1" sentinel), so the composite is only stamped on the + // normal (non-fallback) outcome below. + let eff_parser_version = effective_parser_version(&app.config, asset, &parser_version); if let Some(item) = try_skip_unchanged( app, asset, - &parser_version, + &eff_parser_version, &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, @@ -2678,6 +2711,20 @@ fn ingest_one_code_asset( } }; + // v0.26.2: stamp the composite parser_version for the normal outcome so + // editing any [ingest.code] / chunking setting re-indexes this asset next + // run. A Tier-3 fallback (an AST / manifest lang whose extractor or + // chunker degraded to CodeTextParagraphV1Chunker) must keep the bare + // "none-v1" sentinel, because `try_skip_unchanged`'s + // `stored_is_tier3_fallback` bypass keys off that exact string. `shell` + // is native Tier 3 (no bypass — `tier3_fallback_cv` is None for it), so it + // still gets the composite. + let is_tier3_fallback_outcome = + code_lang != "shell" && chunker_version == CodeTextParagraphV1Chunker.chunker_version(); + if !is_tier3_fallback_outcome { + canonical.parser_version = eff_parser_version.clone(); + } + // Stamp chunker + embedding versions so incremental skip detection has // data on the second run. canonical.last_chunker_version = Some(chunker_version.clone()); @@ -2951,6 +2998,102 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy { } } +/// v0.26.2: deterministic signature of the **ingest-output-affecting** +/// config for an asset's media type, folded into the effective +/// `parser_version` (both the `try_skip_unchanged` compare field AND the +/// persisted `documents.parser_version`). When any setting that changes the +/// produced chunks / embeddings is edited, the next ingest's signature no +/// longer matches the stored one → the affected assets (only) are +/// automatically re-indexed without `--force-reingest`. +/// +/// Inclusion rule: "does changing this value alter the chunk / embedding +/// content that gets indexed?" Settings that do NOT (search / rag / nli / +/// ui / logging / storage / workspace, plus runtime-only knobs like +/// `max_pixels` / `languages` / `*_timeout_secs`) are deliberately excluded +/// to avoid over-invalidation. Embedding model/dim is already covered by the +/// separate `embedding_version` cascade in [`try_skip_unchanged`], so it is +/// not duplicated here. +/// +/// The output is purely a comparison token — it is never parsed back, so the +/// exact format is internal. Field order is fixed and `Vec`s are joined so +/// the same `Config` always yields the same string. +fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String { + // Common (every media type): chunking parameters that move chunk + // boundaries. `target_tokens` / `overlap_tokens` change re-chunking for + // markdown / image / pdf / code alike, so a change re-indexes all types. + let c = &config.chunking; + let mut sig = format!( + "chunk:{}:{}:{}:{}", + c.target_tokens, c.overlap_tokens, c.respect_markdown_headings, c.chunker_version + ); + match media { + MediaType::Image(_) => { + // OCR / caption only affect output when their `enabled` flag is + // on; the model / prompt version matters only then. Off ↔ off is + // a stable empty token so re-running the same config skips. + let ocr = &config.image.ocr; + if ocr.enabled { + sig.push_str(&format!("|ocr:1:{}", ocr.model)); + } else { + sig.push_str("|ocr:0"); + } + let cap = &config.image.caption; + if cap.enabled { + sig.push_str(&format!("|cap:1:{}", cap.prompt_template_version)); + } else { + sig.push_str("|cap:0"); + } + } + MediaType::Pdf => { + // PDF OCR is active when EITHER `enabled` or `always_on` is set + // (mirrors the ingest gate). `model` only matters when active. + let ocr = &config.pdf.ocr; + if ocr.enabled || ocr.always_on { + sig.push_str(&format!( + "|pdfocr:{}:{}:{}", + ocr.enabled, ocr.always_on, ocr.model + )); + } else { + sig.push_str("|pdfocr:0"); + } + } + MediaType::Code(_) => { + let cc = &config.ingest.code; + sig.push_str(&format!( + "|code:{}:{}:{}:{}:{}:{}:{}", + cc.skip_generated_header, + cc.max_file_bytes, + cc.max_file_lines, + cc.extra_skip_globs.join(","), + cc.ast_chunk_max_lines, + cc.fallback_lines_per_chunk, + cc.fallback_lines_overlap + )); + } + // Markdown carries common-only; Audio / Other are not ingested yet. + MediaType::Markdown | MediaType::Audio(_) | MediaType::Other(_) => {} + } + sig +} + +/// Compose an extractor's base `parser_version` with the ingest-config +/// signature for `asset`'s media type. The result is used as the +/// `try_skip_unchanged` compare value and stored on the persisted document, +/// while the **base** version is what derives `doc_id` (kept stable to avoid +/// orphan churn — see the spec at +/// `docs/superpowers/specs/2026-06-03-ocr-toggle-invalidation-spec.md`). +fn effective_parser_version( + config: &kebab_config::Config, + asset: &RawAsset, + base: &ParserVersion, +) -> ParserVersion { + ParserVersion(format!( + "{}|{}", + base.0, + ingest_config_signature(config, &asset.media_type) + )) +} + // ── list_docs / inspect_doc / inspect_chunk ─────────────────────────────── pub fn list_docs(filter: DocFilter) -> anyhow::Result> { @@ -3429,3 +3572,248 @@ fn check_kebabignore_match( .is_ignore() } + +#[cfg(test)] +mod ingest_config_signature_tests { + //! v0.26.2: unit tests for [`ingest_config_signature`] — the + //! ingest-output-affecting config fingerprint that is folded into the + //! effective `parser_version` so that changing any setting that alters + //! the produced chunks/embeddings auto-re-indexes the affected assets, + //! while changes to unrelated settings (search/rag/ui/…) do not. + + use kebab_config::Config; + use kebab_core::{ImageType, MediaType}; + + use super::ingest_config_signature; + + fn img() -> MediaType { + MediaType::Image(ImageType::Png) + } + fn pdf() -> MediaType { + MediaType::Pdf + } + fn code() -> MediaType { + MediaType::Code("rust".to_string()) + } + fn md() -> MediaType { + MediaType::Markdown + } + + /// The signature is deterministic: same config + same media → same string. + #[test] + fn deterministic_for_unchanged_config() { + let c = Config::defaults(); + for m in [md(), img(), pdf(), code()] { + assert_eq!( + ingest_config_signature(&c, &m), + ingest_config_signature(&c, &m), + "signature must be stable for {m:?}" + ); + } + } + + /// Changing a common chunking parameter changes the signature for EVERY + /// media type (re-chunk cascade). + #[test] + fn chunking_change_invalidates_all_types() { + let base = Config::defaults(); + let mut bumped = base.clone(); + bumped.chunking.target_tokens += 100; + for m in [md(), img(), pdf(), code()] { + assert_ne!( + ingest_config_signature(&base, &m), + ingest_config_signature(&bumped, &m), + "target_tokens change must invalidate {m:?}" + ); + } + + let mut overlap = base.clone(); + overlap.chunking.overlap_tokens += 10; + assert_ne!( + ingest_config_signature(&base, &md()), + ingest_config_signature(&overlap, &md()) + ); + + let mut headings = base.clone(); + headings.chunking.respect_markdown_headings = !base.chunking.respect_markdown_headings; + assert_ne!( + ingest_config_signature(&base, &md()), + ingest_config_signature(&headings, &md()) + ); + } + + /// Image OCR toggle (off→on) changes only the image signature; pdf / code + /// / markdown are unaffected. + #[test] + fn image_ocr_toggle_invalidates_image_only() { + let base = Config::defaults(); + assert!(!base.image.ocr.enabled, "default OCR is off"); + let mut on = base.clone(); + on.image.ocr.enabled = true; + + assert_ne!( + ingest_config_signature(&base, &img()), + ingest_config_signature(&on, &img()), + "image OCR toggle must invalidate images" + ); + for m in [md(), pdf(), code()] { + assert_eq!( + ingest_config_signature(&base, &m), + ingest_config_signature(&on, &m), + "image OCR toggle must NOT touch {m:?}" + ); + } + } + + /// When OCR is enabled, changing the OCR model changes the image + /// signature; when OCR is off, the model field is irrelevant. + #[test] + fn image_ocr_model_matters_only_when_enabled() { + let mut off_a = Config::defaults(); + let mut off_b = off_a.clone(); + off_b.image.ocr.model = "some-other-model".to_string(); + assert_eq!( + ingest_config_signature(&off_a, &img()), + ingest_config_signature(&off_b, &img()), + "OCR model is irrelevant while OCR is off" + ); + + off_a.image.ocr.enabled = true; + let mut on_b = off_a.clone(); + on_b.image.ocr.model = "some-other-model".to_string(); + assert_ne!( + ingest_config_signature(&off_a, &img()), + ingest_config_signature(&on_b, &img()), + "OCR model change matters while OCR is on" + ); + } + + /// Image caption toggle + prompt-template-version change invalidate images. + #[test] + fn image_caption_toggle_and_prompt_invalidate_image() { + let base = Config::defaults(); + let mut on = base.clone(); + on.image.caption.enabled = true; + assert_ne!( + ingest_config_signature(&base, &img()), + ingest_config_signature(&on, &img()) + ); + + let mut prompt = on.clone(); + prompt.image.caption.prompt_template_version = "caption-v9".to_string(); + assert_ne!( + ingest_config_signature(&on, &img()), + ingest_config_signature(&prompt, &img()), + "caption prompt version change matters while caption is on" + ); + } + + /// PDF OCR `enabled` and `always_on` both invalidate PDFs (either turns + /// OCR on); they do not touch other media types. + #[test] + fn pdf_ocr_toggle_invalidates_pdf_only() { + let base = Config::defaults(); + let mut enabled = base.clone(); + enabled.pdf.ocr.enabled = true; + assert_ne!( + ingest_config_signature(&base, &pdf()), + ingest_config_signature(&enabled, &pdf()), + "pdf.ocr.enabled toggle must invalidate PDFs" + ); + + let mut always = base.clone(); + always.pdf.ocr.always_on = true; + assert_ne!( + ingest_config_signature(&base, &pdf()), + ingest_config_signature(&always, &pdf()), + "pdf.ocr.always_on toggle must invalidate PDFs" + ); + + for m in [md(), img(), code()] { + assert_eq!( + ingest_config_signature(&base, &m), + ingest_config_signature(&enabled, &m), + "pdf OCR toggle must NOT touch {m:?}" + ); + } + } + + /// Each `[ingest.code]` option change invalidates code assets only. + #[test] + fn code_options_invalidate_code_only() { + let base = Config::defaults(); + + let mut variants = Vec::new(); + let mut v = base.clone(); + v.ingest.code.skip_generated_header = !base.ingest.code.skip_generated_header; + variants.push(v); + let mut v = base.clone(); + v.ingest.code.max_file_bytes += 1; + variants.push(v); + let mut v = base.clone(); + v.ingest.code.max_file_lines += 1; + variants.push(v); + let mut v = base.clone(); + v.ingest.code.extra_skip_globs.push("**/vendor/**".to_string()); + variants.push(v); + let mut v = base.clone(); + v.ingest.code.ast_chunk_max_lines += 1; + variants.push(v); + let mut v = base.clone(); + v.ingest.code.fallback_lines_per_chunk += 1; + variants.push(v); + let mut v = base.clone(); + v.ingest.code.fallback_lines_overlap += 1; + variants.push(v); + + for v in &variants { + assert_ne!( + ingest_config_signature(&base, &code()), + ingest_config_signature(v, &code()), + "code option change must invalidate code assets" + ); + // ...but must NOT touch md / image / pdf. + for m in [md(), img(), pdf()] { + assert_eq!( + ingest_config_signature(&base, &m), + ingest_config_signature(v, &m), + "code option change must NOT touch {m:?}" + ); + } + } + } + + /// Regression guard: search / rag / nli / ui / logging / storage / + /// workspace settings — and ingest runtime-only knobs that do NOT change + /// indexed output — never change the signature for ANY media type. + #[test] + fn unrelated_settings_never_invalidate() { + let base = Config::defaults(); + let mut other = base.clone(); + // search + other.search.default_k += 5; + other.search.rrf_k += 1; + other.search.snippet_chars += 10; + // rag + other.rag.score_gate += 0.1; + other.rag.prompt_template_version = "rag-v99".to_string(); + // ui + other.ui.theme = "light".to_string(); + // image runtime-only (non-output) knobs + other.image.ocr.max_pixels += 100; + other.image.ocr.languages.push("jpn".to_string()); + other.image.ocr.request_timeout_secs += 10; + // pdf runtime-only knobs + other.pdf.ocr.max_pixels += 100; + other.pdf.ocr.request_timeout_secs += 10; + other.pdf.ocr.languages.push("jpn".to_string()); + + for m in [md(), img(), pdf(), code()] { + assert_eq!( + ingest_config_signature(&base, &m), + ingest_config_signature(&other, &m), + "unrelated/runtime-only settings must NOT invalidate {m:?}" + ); + } + } +} From 03b0745e9d19567a8b8e703e78e53ff3f16667e7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 3 Jun 2026 14:14:15 +0000 Subject: [PATCH 2/3] =?UTF-8?q?test(ingest):=20config=20invalidation=20e2e?= =?UTF-8?q?=20+=20parser=5Fversion=20assert=20=EA=B0=B1=EC=8B=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config_invalidation.rs(신규): 동일config=전skip / 청킹변경=md+code재색인 / [ingest.code]변경=코드만 / search변경=재색인0 (회귀가드) end-to-end. - code_ingest_smoke / pdf_pipeline: 저장 parser_version 이 이제 "{base}|{sig}" composite 라, exact assert 를 base 접두사(split('|').next()) 비교로 갱신. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/kebab-app/tests/code_ingest_smoke.rs | 56 +++++-- crates/kebab-app/tests/config_invalidation.rs | 148 ++++++++++++++++++ crates/kebab-app/tests/pdf_pipeline.rs | 9 +- 3 files changed, 197 insertions(+), 16 deletions(-) create mode 100644 crates/kebab-app/tests/config_invalidation.rs diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 534e3fd..84c2315 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -52,7 +52,9 @@ fn rust_file_ingests_and_searches_as_code_citation() { "at least one chunk expected: {code_item:?}" ); assert_eq!( - code_item.parser_version.as_ref().map(|p| p.0.as_str()), + code_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-rust-v1"), "parser_version must be code-rust-v1" ); @@ -185,7 +187,9 @@ fn python_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("metrics.py")) .expect("metrics.py item"); assert_eq!( - py_item.parser_version.as_ref().map(|p| p.0.as_str()), + py_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-python-v1"), "parser_version must be code-python-v1" ); @@ -261,7 +265,9 @@ fn typescript_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Foo.ts")) .expect("Foo.ts item"); assert_eq!( - ts_item.parser_version.as_ref().map(|p| p.0.as_str()), + ts_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-ts-v1"), "parser_version must be code-ts-v1" ); @@ -337,7 +343,9 @@ fn javascript_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Bar.js")) .expect("Bar.js item"); assert_eq!( - js_item.parser_version.as_ref().map(|p| p.0.as_str()), + js_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-js-v1"), "parser_version must be code-js-v1" ); @@ -415,7 +423,9 @@ fn go_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("ast.go")) .expect("ast.go item present"); assert_eq!( - go_item.parser_version.as_ref().map(|p| p.0.as_str()), + go_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-go-v1"), "parser_version must be code-go-v1" ); @@ -486,7 +496,9 @@ fn java_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Foo.java")) .expect("Foo.java item present"); assert_eq!( - java_item.parser_version.as_ref().map(|p| p.0.as_str()), + java_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-java-v1"), "parser_version must be code-java-v1" ); @@ -561,7 +573,9 @@ fn kotlin_file_ingests_and_searches_as_code_citation() { .find(|i| i.doc_path.0.ends_with("Foo.kt")) .expect("Foo.kt item present"); assert_eq!( - kt_item.parser_version.as_ref().map(|p| p.0.as_str()), + kt_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-kotlin-v1"), "parser_version must be code-kotlin-v1" ); @@ -634,7 +648,9 @@ fn tier2_k8s_yaml_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("deploy.yaml")) .expect("deploy.yaml item present"); assert_eq!( - yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + yaml_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -717,7 +733,9 @@ fn tier2_dockerfile_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("Dockerfile")) .expect("Dockerfile item present"); assert_eq!( - df_item.parser_version.as_ref().map(|p| p.0.as_str()), + df_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -800,7 +818,9 @@ fn tier2_cargo_toml_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("Cargo.toml")) .expect("Cargo.toml item present"); assert_eq!( - toml_item.parser_version.as_ref().map(|p| p.0.as_str()), + toml_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1" ); @@ -883,7 +903,9 @@ fn tier3_shell_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("deploy.sh")) .expect("deploy.sh item present"); assert_eq!( - sh_item.parser_version.as_ref().map(|p| p.0.as_str()), + sh_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1 for shell (Tier 3 direct)" ); @@ -974,7 +996,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) .expect("docker-compose.yml item present"); assert_eq!( - yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + yaml_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("none-v1"), "parser_version must be none-v1 after Tier 3 fallback" ); @@ -1144,7 +1168,9 @@ fn tier1_c_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("parser.c")) .expect("parser.c item present"); assert_eq!( - c_item.parser_version.as_ref().map(|p| p.0.as_str()), + c_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-c-v2"), "parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)" ); @@ -1228,7 +1254,9 @@ fn tier1_cpp_ingest_searchable() { .find(|i| i.doc_path.0.ends_with("chunker.cpp")) .expect("chunker.cpp item present"); assert_eq!( - cpp_item.parser_version.as_ref().map(|p| p.0.as_str()), + cpp_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("code-cpp-v1"), "parser_version must be code-cpp-v1" ); diff --git a/crates/kebab-app/tests/config_invalidation.rs b/crates/kebab-app/tests/config_invalidation.rs new file mode 100644 index 0000000..89ac21e --- /dev/null +++ b/crates/kebab-app/tests/config_invalidation.rs @@ -0,0 +1,148 @@ +//! v0.26.2: ingest-config invalidation — changing a setting that affects +//! ingest output auto-re-indexes the affected assets on the next ingest +//! (no `--force-reingest`), while changing an unrelated setting does not. +//! +//! These end-to-end tests exercise the model-free signals (chunking + +//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting +//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui +//! invariance) is unit-tested in +//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles +//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is +//! verified here via the signature-driven chunking path that shares the same +//! `effective_parser_version` plumbing. + +mod common; + +use common::TestEnv; + +use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts}; +use kebab_core::IngestItemKind; + +/// Seed a workspace with a markdown + a rust file so both the markdown and +/// the code ingest paths are exercised. Returns the first-ingest report. +fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport { + std::fs::write( + env.workspace_root.join("demo.rs"), + "/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n", + ) + .unwrap(); + let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest"); + assert_eq!(first.errors, 0, "first ingest must not error: {first:?}"); + assert!(first.new >= 1, "first ingest creates docs: {first:?}"); + assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}"); + first +} + +fn reingest(env: &TestEnv) -> kebab_core::IngestReport { + ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default()) + .expect("re-ingest") +} + +/// Re-running with the identical config skips every asset (no spurious +/// re-index). Regression guard for over-invalidation. +#[test] +fn identical_config_skips_all_assets() { + let env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}"); + assert_eq!(second.unchanged, scanned, "every doc Unchanged: {second:?}"); + assert_eq!(second.errors, 0); +} + +/// Changing a common chunking parameter re-indexes EVERY media type +/// (markdown + code here) without `--force-reingest`. +#[test] +fn chunking_change_reindexes_all_types() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + // Bump target_tokens — folds into every type's signature. + env.config.chunking.target_tokens += 100; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!( + second.unchanged, 0, + "chunking change must re-index all: {second:?}" + ); + assert_eq!( + second.updated, scanned, + "every doc re-indexed as Updated: {second:?}" + ); + assert_eq!(second.errors, 0); +} + +/// Changing an `[ingest.code]` option re-indexes only the code asset; the +/// markdown assets stay Unchanged. +#[test] +fn code_option_change_reindexes_code_only() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + // Raise max_file_lines (keeps the tiny demo.rs in-scope; only the code + // signature changes). + env.config.ingest.code.max_file_lines += 1000; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!(second.new, 0, "no new docs: {second:?}"); + assert_eq!(second.errors, 0); + assert_eq!( + second.updated, 1, + "exactly the code asset re-indexed: {second:?}" + ); + assert_eq!( + second.unchanged, + scanned - 1, + "all markdown assets stay Unchanged: {second:?}" + ); + + let items = second.items.as_ref().expect("items present"); + let code = items + .iter() + .find(|i| i.doc_path.0.ends_with("demo.rs")) + .expect("demo.rs item"); + assert_eq!( + code.kind, + IngestItemKind::Updated, + "demo.rs must be re-indexed: {code:?}" + ); + for i in items.iter().filter(|i| i.doc_path.0.ends_with(".md")) { + assert_eq!( + i.kind, + IngestItemKind::Unchanged, + "markdown must be Unchanged: {i:?}" + ); + } +} + +/// Regression guard: changing a non-ingest setting (`search.default_k`) does +/// NOT re-index anything. +#[test] +fn search_setting_change_reindexes_nothing() { + let mut env = TestEnv::lexical_only(); + let first = seed_and_first_ingest(&env); + let scanned = first.scanned; + + env.config.search.default_k += 5; + env.config.search.snippet_chars += 50; + env.config.rag.score_gate = 0.5; + + let second = reingest(&env); + assert_eq!(second.scanned, scanned); + assert_eq!( + second.unchanged, scanned, + "search/rag changes must not re-index: {second:?}" + ); + assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}"); + assert_eq!(second.new, 0); + assert_eq!(second.errors, 0); +} diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs index 07fb1f7..8c92c71 100644 --- a/crates/kebab-app/tests/pdf_pipeline.rs +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -162,7 +162,9 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { "one chunk per non-empty page" ); assert_eq!( - pdf_item.parser_version.as_ref().map(|p| p.0.as_str()), + pdf_item.parser_version + .as_ref() + .map(|p| p.0.split('|').next().unwrap()), Some("pdf-text-v1") ); assert_eq!( @@ -477,7 +479,10 @@ fn inspect_doc_surfaces_page_spans() { .find(|i| i.doc_path.0.ends_with("inspect.pdf")) .unwrap(); let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap(); - assert_eq!(doc.parser_version.0, "pdf-text-v1"); + // v0.26.2: stored parser_version is now `pdf-text-v1|` + // (the signature folds chunking / pdf.ocr settings for skip detection). + // Assert the base identity by taking the prefix before the first '|'. + assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1"); assert_eq!(doc.blocks.len(), 3); for block in &doc.blocks { match block { From 47ef6532f7cd1ad173dfbeb38ec75bab12b024a1 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 3 Jun 2026 14:14:23 +0000 Subject: [PATCH 3/3] =?UTF-8?q?chore(release):=20v0.26.2=20=E2=80=94=20ing?= =?UTF-8?q?est=20=EC=84=A4=EC=A0=95=20=EB=B3=80=EA=B2=BD=20=EC=9E=90?= =?UTF-8?q?=EB=8F=99=20=EC=9E=AC=EC=83=89=EC=9D=B8=20+=20=EB=AC=B8?= =?UTF-8?q?=EC=84=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Cargo.toml workspace version 0.26.1 → 0.26.2 (+Cargo.lock cascade). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정) → patch (CLAUDE.md §Versioning). - tasks/HOTFIXES.md dated entry: 일반화 + 업그레이드 1회 재색인 안내 + 도그푸딩 evidence. - HANDOFF.md 1줄. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 48 +++++++++++++++++++++++----------------------- Cargo.toml | 2 +- HANDOFF.md | 1 + tasks/HOTFIXES.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb18b21..ad7c8d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4724,7 +4724,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "base64 0.22.1", @@ -4772,7 +4772,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -4790,7 +4790,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "clap", @@ -4811,7 +4811,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4827,7 +4827,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -4841,7 +4841,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -4855,7 +4855,7 @@ dependencies = [ [[package]] name = "kebab-embed-candle" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "candle-core", @@ -4875,7 +4875,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "fastembed", @@ -4888,7 +4888,7 @@ dependencies = [ [[package]] name = "kebab-embed-ollama" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-config", @@ -4903,7 +4903,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-app", @@ -4922,7 +4922,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-core", @@ -4931,7 +4931,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-config", @@ -4948,7 +4948,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-app", @@ -4966,7 +4966,7 @@ dependencies = [ [[package]] name = "kebab-nli" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "hf-hub", @@ -4981,7 +4981,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "gix", @@ -5004,7 +5004,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.26.1" +version = "0.26.2" dependencies = [ "ab_glyph", "anyhow", @@ -5028,7 +5028,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "kebab-core", @@ -5045,7 +5045,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -5060,7 +5060,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -5082,7 +5082,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "globset", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -5119,7 +5119,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "blake3", @@ -5139,7 +5139,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "arrow", @@ -5163,7 +5163,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.26.1" +version = "0.26.2" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index f56aae6..249b8fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.26.1" # v0.26.1 — ingest 진행 로그 개선: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 실시간 + 경과초 heartbeat `(Ns)`, 종료 시 최장 소요 파일 top-5 요약. 신규 wire 이벤트 `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms` 추가(additive, ingest_progress.v1 유지, serde default 0). 기본 동작 불변. — CLAUDE.md §Release +version = "0.26.2" # v0.26.2 — ingest 설정 변경 시 영향 자산 자동 재색인: ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/[ingest.code])의 결정적 서명을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 해당 설정 변경 시 `--force-reingest` 없이 영향 자산만 자동 재색인. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout 등)은 제외(과도 무효화 회피). doc_id 는 base parser_version 으로 안정 유지(orphan churn 회피). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정) → patch. — CLAUDE.md §Release # pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with # intentional allow-list. The allowed lints are either cosmetic (doc style), diff --git a/HANDOFF.md b/HANDOFF.md index d4fc47b..478464a 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만: +- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`. - **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`. - **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`. - **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`. diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 90e00d2..e52583d 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,55 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-06-03 — ingest 출력 영향 설정 변경 시 영향 자산 자동 재색인 (v0.26.2) + +**무엇이 깨졌나.** `[image.ocr]` / `[image.caption]` 를 off→색인→on 으로 바꿔도 증분 +skip(`try_skip_unchanged`, `kebab-app/src/lib.rs`)이 그 이미지를 "Unchanged" 로 건너뛰어 +재색인이 안 됐다. 더 일반적으로, skip 판정은 자산 내용(blake3) + `parser_version` + +`chunker_version` + `embedding_version` 만 비교하는데, **ingest 산출물을 바꾸는 다른 설정들** +(청킹 파라미터, OCR/caption, pdf.ocr, `[ingest.code]` 옵션)이 이 셋 중 어디에도 반영되지 +않아, 변경해도 재색인이 트리거되지 않았다. 사용자 요구: OCR/caption 뿐 아니라 **ingest 출력에 +영향 주는 모든 설정**이 변경되면 영향 자산이 자동 재색인. + +**무엇이 바뀌었나 (내부 skip 판정 정정 — 결과 포맷·CLI·wire 불변, patch).** + +- 신규 헬퍼 `ingest_config_signature(config, media_type) -> String` — 그 자산 타입의 + **ingest 산출물에 영향 주는 설정만** 결정적으로 직렬화. 공통(전 타입): `[chunking]` + target_tokens/overlap_tokens/respect_markdown_headings/chunker_version. image: + ocr(enabled, + +model) + caption(enabled, +prompt_template_version). pdf: + pdf.ocr(enabled||always_on 이면 + enabled/always_on/model). code: + `[ingest.code]` 7개 필드. markdown: 공통만. +- 각 ingest 경로(md/image/pdf/code)의 effective parser_version 을 + `format!("{base}|{signature}")` composite 로 만들어 (a) `try_skip_unchanged` 비교값, + (b) **persist 전 `canonical.parser_version` override** — 두 값이 같은 함수에서 나오므로 + 설정 변경 시 다음 run 비교가 mismatch → 영향 자산만 자동 재색인. +- **doc_id 는 손대지 않음**: base parser_version(extractor 상수)으로 계속 파생 → + 설정 변경에도 doc_id 안정(orphan churn 회피). composite 는 비교/저장 필드에만. +- **제외(재색인 트리거 X)**: search/rag/nli/ui/logging/storage/workspace + 산출 무관 + 런타임 파라미터(max_pixels/languages/*_timeout_secs). "그 값이 바뀌면 색인되는 + chunk/embedding 내용이 달라지는가" 기준. 과도 무효화 회피. +- code 의 Tier-3 fallback 문서는 의도적으로 bare `"none-v1"` sentinel 유지(skip 의 + `stored_is_tier3_fallback` bypass 가 정확히 그 문자열에 의존) — composite 는 정상 outcome 에만. + +**업그레이드 1회 효과.** 기존 doc 의 저장 parser_version(상수)이 새 composite 와 달라, +업그레이드 후 첫 `kebab ingest` 에서 **전 자산이 현재 설정대로 1회 재색인**된다(force 불필요). +마크다운/코드도 1회 재청킹되나 embedding 은 V012 derived-cache 히트라 재임베딩 비용은 작다. +`--force-reingest` 는 전체 강제용으로 그대로. + +**도그푸딩 evidence (release 바이너리, Ollama down — OCR 호출은 Lenient 실패).** +이미지 1장, `[image.ocr] enabled=false` 색인 → New=1. config 에서 `enabled=true` 로 변경 후 +`kebab ingest`(force 없이) → **Updated=1**(재색인, errors=0). 동일 config 재실행 → **Unchanged=1** +(불필요 재색인 0). 저장된 parser_version = +`image-meta-v1|chunk:500:80:true:md-heading-v1|ocr:1:gemma4:e4b|cap:0`(base 보존 + OCR on 반영). + +**테스트.** `kebab-app/src/lib.rs::ingest_config_signature_tests`(8 단위: 결정성, 청킹=전타입, +이미지 ocr/caption 토글=이미지만, pdf.ocr=pdf만, code 옵션=코드만, search/rag/ui·런타임 파라미터 +불변 회귀가드) + `kebab-app/tests/config_invalidation.rs`(4 end-to-end: 동일 config=전 skip, +청킹 변경=md+code 재색인, `[ingest.code]` 변경=코드만, search 변경=재색인 0). 기존 skip 테스트 +회귀 0(parser_version exact assert 는 base 접두사 비교로 갱신 — code_ingest_smoke/pdf_pipeline). + +spec/plan: `docs/superpowers/specs/2026-06-03-ocr-toggle-invalidation-spec.md` / +`…/plans/2026-06-03-config-invalidation-plan.md`. + ## 2026-06-03 — ingest 진행 로그 개선: 파일명·phase·heartbeat·slowest 요약 (v0.26.1) **무엇을 왜 추가했나.** arctic 도그푸딩 중 이미지/PDF 혼재 + OCR/caption on 볼트에서