Compare commits
4 Commits
bebf6e4ac7
...
a283e56c5c
| Author | SHA1 | Date | |
|---|---|---|---|
| a283e56c5c | |||
| 47ef6532f7 | |||
| 03b0745e9d | |||
| e7cb20990a |
48
Cargo.lock
generated
48
Cargo.lock
generated
@@ -4724,7 +4724,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -4772,7 +4772,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-chunk"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4790,7 +4790,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-cli"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -4811,7 +4811,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-config"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs 5.0.1",
|
||||
@@ -4827,7 +4827,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-core"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4841,7 +4841,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4855,7 +4855,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-candle"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"candle-core",
|
||||
@@ -4875,7 +4875,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-local"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fastembed",
|
||||
@@ -4888,7 +4888,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-ollama"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4903,7 +4903,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-eval"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4922,7 +4922,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4931,7 +4931,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm-local"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4948,7 +4948,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-mcp"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4966,7 +4966,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-nli"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"hf-hub",
|
||||
@@ -4981,7 +4981,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-code"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"gix",
|
||||
@@ -5004,7 +5004,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
@@ -5028,7 +5028,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -5045,7 +5045,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5060,7 +5060,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-rag"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5082,7 +5082,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-search"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
@@ -5101,7 +5101,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-source-fs"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5119,7 +5119,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-sqlite"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5139,7 +5139,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-vector"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrow",
|
||||
@@ -5163,7 +5163,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-tui"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossterm",
|
||||
|
||||
@@ -32,7 +32,7 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.26.1" # v0.26.1 — ingest 진행 로그 개선: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 실시간 + 경과초 heartbeat `(Ns)`, 종료 시 최장 소요 파일 top-5 요약. 신규 wire 이벤트 `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms` 추가(additive, ingest_progress.v1 유지, serde default 0). 기본 동작 불변. — CLAUDE.md §Release
|
||||
version = "0.26.2" # v0.26.2 — ingest 설정 변경 시 영향 자산 자동 재색인: ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/[ingest.code])의 결정적 서명을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 해당 설정 변경 시 `--force-reingest` 없이 영향 자산만 자동 재색인. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout 등)은 제외(과도 무효화 회피). doc_id 는 base parser_version 으로 안정 유지(orphan churn 회피). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정) → patch. — CLAUDE.md §Release
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
|
||||
@@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 "누군가가 인수받을 때 알아두면 시간을 많이 절약하는" 항목만:
|
||||
|
||||
- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`.
|
||||
- **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`.
|
||||
- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
|
||||
- **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`.
|
||||
|
||||
@@ -1242,6 +1242,12 @@ fn ingest_one_asset(
|
||||
}
|
||||
};
|
||||
|
||||
// v0.26.2: fold the ingest-config signature into the effective
|
||||
// parser_version for the skip compare + the stored doc field, so a
|
||||
// change to any markdown-affecting setting (chunking params) re-indexes.
|
||||
// `doc_id` keeps deriving from the base version below (stability).
|
||||
let eff_parser_version = effective_parser_version(&app.config, asset, parser_version);
|
||||
|
||||
// p9-fb-23 task 7: incremental-ingest early-skip. When force_reingest
|
||||
// is false AND the on-disk asset's checksum + parser_version +
|
||||
// last_chunker_version + last_embedding_version all match the existing
|
||||
@@ -1251,7 +1257,7 @@ fn ingest_one_asset(
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
parser_version,
|
||||
&eff_parser_version,
|
||||
&MdHeadingV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
@@ -1297,6 +1303,10 @@ fn ingest_one_asset(
|
||||
let mut canonical =
|
||||
build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings)
|
||||
.context("kb-parse-md::build_canonical_document")?;
|
||||
// v0.26.2: persist the composite parser_version (base|signature) so the
|
||||
// next run's skip compare matches what was computed above. doc_id was
|
||||
// already derived from the base version inside build_canonical_document.
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
|
||||
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
|
||||
@@ -1529,11 +1539,15 @@ fn ingest_one_image_asset(
|
||||
// embedding-version check matches the markdown path: when the
|
||||
// active embedder's model_version equals what was stamped on the
|
||||
// existing doc, the asset is Unchanged.
|
||||
// v0.26.2: composite parser_version folds image OCR / caption + chunking
|
||||
// settings, so toggling `[image.ocr]` / `[image.caption]` (or changing
|
||||
// their model / prompt version) auto-re-indexes the affected images.
|
||||
let image_parser_version = ParserVersion(kebab_parse_image::PARSER_VERSION.to_string());
|
||||
let eff_parser_version = effective_parser_version(&app.config, asset, &image_parser_version);
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
&image_parser_version,
|
||||
&eff_parser_version,
|
||||
&MdHeadingV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
@@ -1563,6 +1577,10 @@ fn ingest_one_image_asset(
|
||||
let mut canonical = app
|
||||
.extract_for(&asset.media_type, &ctx, &bytes)
|
||||
.context("kb-app::extract_for (image)")?;
|
||||
// v0.26.2: store the composite parser_version (extractor baked the base
|
||||
// `image-meta-v1`, which already fixed doc_id). Skip compare + stored
|
||||
// field must agree for next-run detection.
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
|
||||
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
|
||||
@@ -2106,11 +2124,14 @@ fn ingest_one_pdf_asset(
|
||||
// p9-fb-23 task 7: incremental-ingest early-skip for the PDF flow.
|
||||
// PDF docs use `pdf-text-v1` as the parser_version and `PdfPageV1Chunker`
|
||||
// as the chunker — both pinned per-medium today (no config knob).
|
||||
// v0.26.2: composite parser_version folds pdf.ocr (enabled/always_on/
|
||||
// model) + chunking, so enabling scanned-PDF OCR auto-re-indexes PDFs.
|
||||
let pdf_parser_version = ParserVersion(kebab_parse_pdf::PARSER_VERSION.to_string());
|
||||
let eff_parser_version = effective_parser_version(&app.config, asset, &pdf_parser_version);
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
&pdf_parser_version,
|
||||
&eff_parser_version,
|
||||
&PdfPageV1Chunker.chunker_version(),
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
@@ -2135,6 +2156,9 @@ fn ingest_one_pdf_asset(
|
||||
let mut canonical = app
|
||||
.extract_for(&asset.media_type, &ctx, &bytes)
|
||||
.context("kb-app::extract_for (pdf)")?;
|
||||
// v0.26.2: store the composite parser_version (base `pdf-text-v1` already
|
||||
// fixed doc_id) so the next run's skip compare matches.
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
|
||||
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
|
||||
@@ -2510,10 +2534,19 @@ fn ingest_one_code_asset(
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// v0.26.2: composite parser_version folds [ingest.code] options + common
|
||||
// chunking so editing any code-ingest setting auto-re-indexes code assets.
|
||||
// The base per-lang version still derives doc_id (synthesize_tier2_document
|
||||
// / extract_for keep using `parser_version`). A Tier-3 fallback document
|
||||
// intentionally keeps the bare "none-v1" parser_version (the
|
||||
// `stored_is_tier3_fallback` bypass in try_skip_unchanged depends on the
|
||||
// exact "none-v1" sentinel), so the composite is only stamped on the
|
||||
// normal (non-fallback) outcome below.
|
||||
let eff_parser_version = effective_parser_version(&app.config, asset, &parser_version);
|
||||
if let Some(item) = try_skip_unchanged(
|
||||
app,
|
||||
asset,
|
||||
&parser_version,
|
||||
&eff_parser_version,
|
||||
&chunker_version,
|
||||
embedder.map(|e| e.model_version()).as_ref(),
|
||||
force_reingest,
|
||||
@@ -2678,6 +2711,20 @@ fn ingest_one_code_asset(
|
||||
}
|
||||
};
|
||||
|
||||
// v0.26.2: stamp the composite parser_version for the normal outcome so
|
||||
// editing any [ingest.code] / chunking setting re-indexes this asset next
|
||||
// run. A Tier-3 fallback (an AST / manifest lang whose extractor or
|
||||
// chunker degraded to CodeTextParagraphV1Chunker) must keep the bare
|
||||
// "none-v1" sentinel, because `try_skip_unchanged`'s
|
||||
// `stored_is_tier3_fallback` bypass keys off that exact string. `shell`
|
||||
// is native Tier 3 (no bypass — `tier3_fallback_cv` is None for it), so it
|
||||
// still gets the composite.
|
||||
let is_tier3_fallback_outcome =
|
||||
code_lang != "shell" && chunker_version == CodeTextParagraphV1Chunker.chunker_version();
|
||||
if !is_tier3_fallback_outcome {
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
}
|
||||
|
||||
// Stamp chunker + embedding versions so incremental skip detection has
|
||||
// data on the second run.
|
||||
canonical.last_chunker_version = Some(chunker_version.clone());
|
||||
@@ -2951,6 +2998,102 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
|
||||
}
|
||||
}
|
||||
|
||||
/// v0.26.2: deterministic signature of the **ingest-output-affecting**
|
||||
/// config for an asset's media type, folded into the effective
|
||||
/// `parser_version` (both the `try_skip_unchanged` compare field AND the
|
||||
/// persisted `documents.parser_version`). When any setting that changes the
|
||||
/// produced chunks / embeddings is edited, the next ingest's signature no
|
||||
/// longer matches the stored one → the affected assets (only) are
|
||||
/// automatically re-indexed without `--force-reingest`.
|
||||
///
|
||||
/// Inclusion rule: "does changing this value alter the chunk / embedding
|
||||
/// content that gets indexed?" Settings that do NOT (search / rag / nli /
|
||||
/// ui / logging / storage / workspace, plus runtime-only knobs like
|
||||
/// `max_pixels` / `languages` / `*_timeout_secs`) are deliberately excluded
|
||||
/// to avoid over-invalidation. Embedding model/dim is already covered by the
|
||||
/// separate `embedding_version` cascade in [`try_skip_unchanged`], so it is
|
||||
/// not duplicated here.
|
||||
///
|
||||
/// The output is purely a comparison token — it is never parsed back, so the
|
||||
/// exact format is internal. Field order is fixed and `Vec`s are joined so
|
||||
/// the same `Config` always yields the same string.
|
||||
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
|
||||
// Common (every media type): chunking parameters that move chunk
|
||||
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
|
||||
// markdown / image / pdf / code alike, so a change re-indexes all types.
|
||||
let c = &config.chunking;
|
||||
let mut sig = format!(
|
||||
"chunk:{}:{}:{}:{}",
|
||||
c.target_tokens, c.overlap_tokens, c.respect_markdown_headings, c.chunker_version
|
||||
);
|
||||
match media {
|
||||
MediaType::Image(_) => {
|
||||
// OCR / caption only affect output when their `enabled` flag is
|
||||
// on; the model / prompt version matters only then. Off ↔ off is
|
||||
// a stable empty token so re-running the same config skips.
|
||||
let ocr = &config.image.ocr;
|
||||
if ocr.enabled {
|
||||
sig.push_str(&format!("|ocr:1:{}", ocr.model));
|
||||
} else {
|
||||
sig.push_str("|ocr:0");
|
||||
}
|
||||
let cap = &config.image.caption;
|
||||
if cap.enabled {
|
||||
sig.push_str(&format!("|cap:1:{}", cap.prompt_template_version));
|
||||
} else {
|
||||
sig.push_str("|cap:0");
|
||||
}
|
||||
}
|
||||
MediaType::Pdf => {
|
||||
// PDF OCR is active when EITHER `enabled` or `always_on` is set
|
||||
// (mirrors the ingest gate). `model` only matters when active.
|
||||
let ocr = &config.pdf.ocr;
|
||||
if ocr.enabled || ocr.always_on {
|
||||
sig.push_str(&format!(
|
||||
"|pdfocr:{}:{}:{}",
|
||||
ocr.enabled, ocr.always_on, ocr.model
|
||||
));
|
||||
} else {
|
||||
sig.push_str("|pdfocr:0");
|
||||
}
|
||||
}
|
||||
MediaType::Code(_) => {
|
||||
let cc = &config.ingest.code;
|
||||
sig.push_str(&format!(
|
||||
"|code:{}:{}:{}:{}:{}:{}:{}",
|
||||
cc.skip_generated_header,
|
||||
cc.max_file_bytes,
|
||||
cc.max_file_lines,
|
||||
cc.extra_skip_globs.join(","),
|
||||
cc.ast_chunk_max_lines,
|
||||
cc.fallback_lines_per_chunk,
|
||||
cc.fallback_lines_overlap
|
||||
));
|
||||
}
|
||||
// Markdown carries common-only; Audio / Other are not ingested yet.
|
||||
MediaType::Markdown | MediaType::Audio(_) | MediaType::Other(_) => {}
|
||||
}
|
||||
sig
|
||||
}
|
||||
|
||||
/// Compose an extractor's base `parser_version` with the ingest-config
|
||||
/// signature for `asset`'s media type. The result is used as the
|
||||
/// `try_skip_unchanged` compare value and stored on the persisted document,
|
||||
/// while the **base** version is what derives `doc_id` (kept stable to avoid
|
||||
/// orphan churn — see the spec at
|
||||
/// `docs/superpowers/specs/2026-06-03-ocr-toggle-invalidation-spec.md`).
|
||||
fn effective_parser_version(
|
||||
config: &kebab_config::Config,
|
||||
asset: &RawAsset,
|
||||
base: &ParserVersion,
|
||||
) -> ParserVersion {
|
||||
ParserVersion(format!(
|
||||
"{}|{}",
|
||||
base.0,
|
||||
ingest_config_signature(config, &asset.media_type)
|
||||
))
|
||||
}
|
||||
|
||||
// ── list_docs / inspect_doc / inspect_chunk ───────────────────────────────
|
||||
|
||||
pub fn list_docs(filter: DocFilter) -> anyhow::Result<Vec<DocSummary>> {
|
||||
@@ -3429,3 +3572,248 @@ fn check_kebabignore_match(
|
||||
.is_ignore()
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod ingest_config_signature_tests {
    //! v0.26.2: unit tests for [`ingest_config_signature`], the config
    //! fingerprint folded into the effective `parser_version`. They pin two
    //! properties: editing a setting that alters the produced
    //! chunks/embeddings changes the signature of the affected media types,
    //! and editing unrelated settings (search/rag/ui/…) changes nothing.

    use kebab_config::Config;
    use kebab_core::{ImageType, MediaType};

    use super::ingest_config_signature;

    fn img() -> MediaType {
        MediaType::Image(ImageType::Png)
    }

    fn pdf() -> MediaType {
        MediaType::Pdf
    }

    fn code() -> MediaType {
        MediaType::Code("rust".to_string())
    }

    fn md() -> MediaType {
        MediaType::Markdown
    }

    /// One representative of every media type exercised by these tests.
    fn every_media() -> [MediaType; 4] {
        [md(), img(), pdf(), code()]
    }

    /// Short alias for the function under test.
    fn sig(cfg: &Config, media: &MediaType) -> String {
        ingest_config_signature(cfg, media)
    }

    /// Clone `origin` and apply `edit` to the copy.
    fn tweak(origin: &Config, edit: impl FnOnce(&mut Config)) -> Config {
        let mut copy = origin.clone();
        edit(&mut copy);
        copy
    }

    /// The signature is deterministic: same config + same media → same string.
    #[test]
    fn deterministic_for_unchanged_config() {
        let cfg = Config::defaults();
        for m in every_media() {
            let first = sig(&cfg, &m);
            let second = sig(&cfg, &m);
            assert_eq!(first, second, "signature must be stable for {m:?}");
        }
    }

    /// Changing a common chunking parameter changes the signature for EVERY
    /// media type (re-chunk cascade).
    #[test]
    fn chunking_change_invalidates_all_types() {
        let base = Config::defaults();

        let bumped = tweak(&base, |c| c.chunking.target_tokens += 100);
        for m in every_media() {
            assert_ne!(
                sig(&base, &m),
                sig(&bumped, &m),
                "target_tokens change must invalidate {m:?}"
            );
        }

        let overlap = tweak(&base, |c| c.chunking.overlap_tokens += 10);
        assert_ne!(sig(&base, &md()), sig(&overlap, &md()));

        let headings = tweak(&base, |c| {
            c.chunking.respect_markdown_headings = !c.chunking.respect_markdown_headings;
        });
        assert_ne!(sig(&base, &md()), sig(&headings, &md()));
    }

    /// Image OCR toggle (off→on) changes only the image signature; pdf / code
    /// / markdown are unaffected.
    #[test]
    fn image_ocr_toggle_invalidates_image_only() {
        let base = Config::defaults();
        assert!(!base.image.ocr.enabled, "default OCR is off");
        let on = tweak(&base, |c| c.image.ocr.enabled = true);

        assert_ne!(
            sig(&base, &img()),
            sig(&on, &img()),
            "image OCR toggle must invalidate images"
        );
        for m in [md(), pdf(), code()] {
            assert_eq!(
                sig(&base, &m),
                sig(&on, &m),
                "image OCR toggle must NOT touch {m:?}"
            );
        }
    }

    /// When OCR is enabled, changing the OCR model changes the image
    /// signature; when OCR is off, the model field is irrelevant.
    #[test]
    fn image_ocr_model_matters_only_when_enabled() {
        let rename_model = |c: &mut Config| c.image.ocr.model = "some-other-model".to_string();

        // OCR off: the model name must not leak into the signature.
        let off = Config::defaults();
        let off_renamed = tweak(&off, rename_model);
        assert_eq!(
            sig(&off, &img()),
            sig(&off_renamed, &img()),
            "OCR model is irrelevant while OCR is off"
        );

        // OCR on: the same model edit must now be visible.
        let on = tweak(&off, |c| c.image.ocr.enabled = true);
        let on_renamed = tweak(&on, rename_model);
        assert_ne!(
            sig(&on, &img()),
            sig(&on_renamed, &img()),
            "OCR model change matters while OCR is on"
        );
    }

    /// Image caption toggle + prompt-template-version change invalidate images.
    #[test]
    fn image_caption_toggle_and_prompt_invalidate_image() {
        let base = Config::defaults();
        let on = tweak(&base, |c| c.image.caption.enabled = true);
        assert_ne!(sig(&base, &img()), sig(&on, &img()));

        let prompt = tweak(&on, |c| {
            c.image.caption.prompt_template_version = "caption-v9".to_string();
        });
        assert_ne!(
            sig(&on, &img()),
            sig(&prompt, &img()),
            "caption prompt version change matters while caption is on"
        );
    }

    /// PDF OCR `enabled` and `always_on` both invalidate PDFs (either turns
    /// OCR on); they do not touch other media types.
    #[test]
    fn pdf_ocr_toggle_invalidates_pdf_only() {
        let base = Config::defaults();

        let enabled = tweak(&base, |c| c.pdf.ocr.enabled = true);
        assert_ne!(
            sig(&base, &pdf()),
            sig(&enabled, &pdf()),
            "pdf.ocr.enabled toggle must invalidate PDFs"
        );

        let always = tweak(&base, |c| c.pdf.ocr.always_on = true);
        assert_ne!(
            sig(&base, &pdf()),
            sig(&always, &pdf()),
            "pdf.ocr.always_on toggle must invalidate PDFs"
        );

        for m in [md(), img(), code()] {
            assert_eq!(
                sig(&base, &m),
                sig(&enabled, &m),
                "pdf OCR toggle must NOT touch {m:?}"
            );
        }
    }

    /// Each `[ingest.code]` option change invalidates code assets only.
    #[test]
    fn code_options_invalidate_code_only() {
        let base = Config::defaults();

        // One single-field edit per `[ingest.code]` option, each applied to a
        // fresh clone of the defaults.
        let edits: [fn(&mut Config); 7] = [
            |c: &mut Config| {
                c.ingest.code.skip_generated_header = !c.ingest.code.skip_generated_header;
            },
            |c: &mut Config| c.ingest.code.max_file_bytes += 1,
            |c: &mut Config| c.ingest.code.max_file_lines += 1,
            |c: &mut Config| c.ingest.code.extra_skip_globs.push("**/vendor/**".to_string()),
            |c: &mut Config| c.ingest.code.ast_chunk_max_lines += 1,
            |c: &mut Config| c.ingest.code.fallback_lines_per_chunk += 1,
            |c: &mut Config| c.ingest.code.fallback_lines_overlap += 1,
        ];

        for edit in edits {
            let variant = tweak(&base, edit);
            assert_ne!(
                sig(&base, &code()),
                sig(&variant, &code()),
                "code option change must invalidate code assets"
            );
            // ...but must NOT touch md / image / pdf.
            for m in [md(), img(), pdf()] {
                assert_eq!(
                    sig(&base, &m),
                    sig(&variant, &m),
                    "code option change must NOT touch {m:?}"
                );
            }
        }
    }

    /// Regression guard: search / rag / nli / ui / logging / storage /
    /// workspace settings — and ingest runtime-only knobs that do NOT change
    /// indexed output — never change the signature for ANY media type.
    #[test]
    fn unrelated_settings_never_invalidate() {
        let base = Config::defaults();
        let other = tweak(&base, |c| {
            // search
            c.search.default_k += 5;
            c.search.rrf_k += 1;
            c.search.snippet_chars += 10;
            // rag
            c.rag.score_gate += 0.1;
            c.rag.prompt_template_version = "rag-v99".to_string();
            // ui
            c.ui.theme = "light".to_string();
            // image runtime-only (non-output) knobs
            c.image.ocr.max_pixels += 100;
            c.image.ocr.languages.push("jpn".to_string());
            c.image.ocr.request_timeout_secs += 10;
            // pdf runtime-only knobs
            c.pdf.ocr.max_pixels += 100;
            c.pdf.ocr.request_timeout_secs += 10;
            c.pdf.ocr.languages.push("jpn".to_string());
        });

        for m in every_media() {
            assert_eq!(
                sig(&base, &m),
                sig(&other, &m),
                "unrelated/runtime-only settings must NOT invalidate {m:?}"
            );
        }
    }
}
|
||||
|
||||
@@ -52,7 +52,9 @@ fn rust_file_ingests_and_searches_as_code_citation() {
|
||||
"at least one chunk expected: {code_item:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
code_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
code_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-rust-v1"),
|
||||
"parser_version must be code-rust-v1"
|
||||
);
|
||||
@@ -185,7 +187,9 @@ fn python_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("metrics.py"))
|
||||
.expect("metrics.py item");
|
||||
assert_eq!(
|
||||
py_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
py_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-python-v1"),
|
||||
"parser_version must be code-python-v1"
|
||||
);
|
||||
@@ -261,7 +265,9 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("Foo.ts"))
|
||||
.expect("Foo.ts item");
|
||||
assert_eq!(
|
||||
ts_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
ts_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-ts-v1"),
|
||||
"parser_version must be code-ts-v1"
|
||||
);
|
||||
@@ -337,7 +343,9 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("Bar.js"))
|
||||
.expect("Bar.js item");
|
||||
assert_eq!(
|
||||
js_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
js_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-js-v1"),
|
||||
"parser_version must be code-js-v1"
|
||||
);
|
||||
@@ -415,7 +423,9 @@ fn go_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("ast.go"))
|
||||
.expect("ast.go item present");
|
||||
assert_eq!(
|
||||
go_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
go_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-go-v1"),
|
||||
"parser_version must be code-go-v1"
|
||||
);
|
||||
@@ -486,7 +496,9 @@ fn java_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("Foo.java"))
|
||||
.expect("Foo.java item present");
|
||||
assert_eq!(
|
||||
java_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
java_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-java-v1"),
|
||||
"parser_version must be code-java-v1"
|
||||
);
|
||||
@@ -561,7 +573,9 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
|
||||
.find(|i| i.doc_path.0.ends_with("Foo.kt"))
|
||||
.expect("Foo.kt item present");
|
||||
assert_eq!(
|
||||
kt_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
kt_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-kotlin-v1"),
|
||||
"parser_version must be code-kotlin-v1"
|
||||
);
|
||||
@@ -634,7 +648,9 @@ fn tier2_k8s_yaml_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("deploy.yaml"))
|
||||
.expect("deploy.yaml item present");
|
||||
assert_eq!(
|
||||
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
yaml_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("none-v1"),
|
||||
"parser_version must be none-v1"
|
||||
);
|
||||
@@ -717,7 +733,9 @@ fn tier2_dockerfile_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("Dockerfile"))
|
||||
.expect("Dockerfile item present");
|
||||
assert_eq!(
|
||||
df_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
df_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("none-v1"),
|
||||
"parser_version must be none-v1"
|
||||
);
|
||||
@@ -800,7 +818,9 @@ fn tier2_cargo_toml_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("Cargo.toml"))
|
||||
.expect("Cargo.toml item present");
|
||||
assert_eq!(
|
||||
toml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
toml_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("none-v1"),
|
||||
"parser_version must be none-v1"
|
||||
);
|
||||
@@ -883,7 +903,9 @@ fn tier3_shell_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("deploy.sh"))
|
||||
.expect("deploy.sh item present");
|
||||
assert_eq!(
|
||||
sh_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
sh_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("none-v1"),
|
||||
"parser_version must be none-v1 for shell (Tier 3 direct)"
|
||||
);
|
||||
@@ -974,7 +996,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
|
||||
.find(|i| i.doc_path.0.ends_with("docker-compose.yml"))
|
||||
.expect("docker-compose.yml item present");
|
||||
assert_eq!(
|
||||
yaml_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
yaml_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("none-v1"),
|
||||
"parser_version must be none-v1 after Tier 3 fallback"
|
||||
);
|
||||
@@ -1144,7 +1168,9 @@ fn tier1_c_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("parser.c"))
|
||||
.expect("parser.c item present");
|
||||
assert_eq!(
|
||||
c_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
c_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-c-v2"),
|
||||
"parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
|
||||
);
|
||||
@@ -1228,7 +1254,9 @@ fn tier1_cpp_ingest_searchable() {
|
||||
.find(|i| i.doc_path.0.ends_with("chunker.cpp"))
|
||||
.expect("chunker.cpp item present");
|
||||
assert_eq!(
|
||||
cpp_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
cpp_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("code-cpp-v1"),
|
||||
"parser_version must be code-cpp-v1"
|
||||
);
|
||||
|
||||
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
//! v0.26.2: ingest-config invalidation — changing a setting that affects
|
||||
//! ingest output auto-re-indexes the affected assets on the next ingest
|
||||
//! (no `--force-reingest`), while changing an unrelated setting does not.
|
||||
//!
|
||||
//! These end-to-end tests exercise the model-free signals (chunking +
|
||||
//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting
|
||||
//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui
|
||||
//! invariance) is unit-tested in
|
||||
//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles
|
||||
//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is
|
||||
//! verified here via the signature-driven chunking path that shares the same
|
||||
//! `effective_parser_version` plumbing.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts};
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
/// Seed a workspace with a markdown + a rust file so both the markdown and
|
||||
/// the code ingest paths are exercised. Returns the first-ingest report.
|
||||
fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
std::fs::write(
|
||||
env.workspace_root.join("demo.rs"),
|
||||
"/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
|
||||
)
|
||||
.unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest");
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest creates docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}");
|
||||
first
|
||||
}
|
||||
|
||||
fn reingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default())
|
||||
.expect("re-ingest")
|
||||
}
|
||||
|
||||
/// With an untouched config, a second ingest must report every scanned asset
/// as unchanged: nothing new, nothing updated, no errors. Guards against
/// over-invalidation.
#[test]
fn identical_config_skips_all_assets() {
    let env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.unchanged, scanned, "every doc Unchanged: {second:?}");
    assert_eq!(second.errors, 0);
}
|
||||
|
||||
/// Raising `chunking.target_tokens` between two ingests must cause every
/// scanned asset to be reported as updated on the second run, without
/// `--force-reingest`.
#[test]
fn chunking_change_reindexes_all_types() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    // The chunking parameters are part of every media type's signature.
    env.config.chunking.target_tokens += 100;
    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(
        second.unchanged, 0,
        "chunking change must re-index all: {second:?}"
    );
    assert_eq!(
        second.updated, scanned,
        "every doc re-indexed as Updated: {second:?}"
    );
    assert_eq!(second.errors, 0);
}
|
||||
|
||||
/// Raising `ingest.code.max_file_lines` between two ingests must re-index
/// exactly one asset (`demo.rs`) and leave every `.md` item unchanged.
#[test]
fn code_option_change_reindexes_code_only() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    // A higher line limit still admits the tiny demo.rs, so only the code
    // signature is expected to differ.
    env.config.ingest.code.max_file_lines += 1000;
    let second = reingest(&env);

    // Aggregate counters first.
    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.errors, 0);
    assert_eq!(
        second.updated, 1,
        "exactly the code asset re-indexed: {second:?}"
    );
    assert_eq!(
        second.unchanged,
        scanned - 1,
        "all markdown assets stay Unchanged: {second:?}"
    );

    // Then the per-item classification.
    let items = second.items.as_ref().expect("items present");
    let code = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("demo.rs"))
        .expect("demo.rs item");
    assert_eq!(
        code.kind,
        IngestItemKind::Updated,
        "demo.rs must be re-indexed: {code:?}"
    );

    items
        .iter()
        .filter(|item| item.doc_path.0.ends_with(".md"))
        .for_each(|i| {
            assert_eq!(
                i.kind,
                IngestItemKind::Unchanged,
                "markdown must be Unchanged: {i:?}"
            );
        });
}
|
||||
|
||||
/// Changing settings outside the ingest path (`search.default_k`,
/// `search.snippet_chars`, `rag.score_gate`) must leave every asset
/// unchanged on the next ingest. Regression guard against over-invalidation.
#[test]
fn search_setting_change_reindexes_nothing() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    // Touch only search- and rag-side settings.
    {
        let cfg = &mut env.config;
        cfg.search.default_k += 5;
        cfg.search.snippet_chars += 50;
        cfg.rag.score_gate = 0.5;
    }

    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(
        second.unchanged, scanned,
        "search/rag changes must not re-index: {second:?}"
    );
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.new, 0);
    assert_eq!(second.errors, 0);
}
|
||||
@@ -162,7 +162,9 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
"one chunk per non-empty page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
pdf_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("pdf-text-v1")
|
||||
);
|
||||
assert_eq!(
|
||||
@@ -477,7 +479,10 @@ fn inspect_doc_surfaces_page_spans() {
|
||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
||||
// v0.26.2: stored parser_version is now `pdf-text-v1|<ingest-config-sig>`
|
||||
// (the signature folds chunking / pdf.ocr settings for skip detection).
|
||||
// Assert the base identity by taking the prefix before the first '|'.
|
||||
assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for block in &doc.blocks {
|
||||
match block {
|
||||
|
||||
@@ -14,6 +14,55 @@ historical contract that was implemented; this file accumulates the
|
||||
deltas so phase 5+ readers can find the live behavior without diffing
|
||||
git history.
|
||||
|
||||
## 2026-06-03 — ingest 출력 영향 설정 변경 시 영향 자산 자동 재색인 (v0.26.2)
|
||||
|
||||
**무엇이 깨졌나.** `[image.ocr]` / `[image.caption]` 를 off→색인→on 으로 바꿔도 증분
|
||||
skip(`try_skip_unchanged`, `kebab-app/src/lib.rs`)이 그 이미지를 "Unchanged" 로 건너뛰어
|
||||
재색인이 안 됐다. 더 일반적으로, skip 판정은 자산 내용(blake3) + `parser_version` +
|
||||
`chunker_version` + `embedding_version` 만 비교하는데, **ingest 산출물을 바꾸는 다른 설정들**
|
||||
(청킹 파라미터, OCR/caption, pdf.ocr, `[ingest.code]` 옵션)이 이 셋 중 어디에도 반영되지
|
||||
않아, 변경해도 재색인이 트리거되지 않았다. 사용자 요구: OCR/caption 뿐 아니라 **ingest 출력에
|
||||
영향 주는 모든 설정**이 변경되면 영향 자산이 자동 재색인.
|
||||
|
||||
**무엇이 바뀌었나 (내부 skip 판정 정정 — 결과 포맷·CLI·wire 불변, patch).**
|
||||
|
||||
- 신규 헬퍼 `ingest_config_signature(config, media_type) -> String` — 그 자산 타입의
|
||||
**ingest 산출물에 영향 주는 설정만** 결정적으로 직렬화. 공통(전 타입): `[chunking]`
|
||||
target_tokens/overlap_tokens/respect_markdown_headings/chunker_version. image: + ocr(enabled,
|
||||
+model) + caption(enabled, +prompt_template_version). pdf: + pdf.ocr(enabled||always_on 이면
|
||||
enabled/always_on/model). code: + `[ingest.code]` 7개 필드. markdown: 공통만.
|
||||
- 각 ingest 경로(md/image/pdf/code)의 effective parser_version 을
|
||||
`format!("{base}|{signature}")` composite 로 만들어 (a) `try_skip_unchanged` 비교값,
|
||||
(b) **persist 전 `canonical.parser_version` override** — 두 값이 같은 함수에서 나오므로
|
||||
설정 변경 시 다음 run 비교가 mismatch → 영향 자산만 자동 재색인.
|
||||
- **doc_id 는 손대지 않음**: base parser_version(extractor 상수)으로 계속 파생 →
|
||||
설정 변경에도 doc_id 안정(orphan churn 회피). composite 는 비교/저장 필드에만.
|
||||
- **제외(재색인 트리거 X)**: search/rag/nli/ui/logging/storage/workspace + 산출 무관
|
||||
런타임 파라미터(max_pixels/languages/*_timeout_secs). "그 값이 바뀌면 색인되는
|
||||
chunk/embedding 내용이 달라지는가" 기준. 과도 무효화 회피.
|
||||
- code 의 Tier-3 fallback 문서는 의도적으로 bare `"none-v1"` sentinel 유지(skip 의
|
||||
`stored_is_tier3_fallback` bypass 가 정확히 그 문자열에 의존) — composite 는 정상 outcome 에만.
|
||||
|
||||
**업그레이드 1회 효과.** 기존 doc 의 저장 parser_version(상수)이 새 composite 와 달라,
|
||||
업그레이드 후 첫 `kebab ingest` 에서 **전 자산이 현재 설정대로 1회 재색인**된다(force 불필요).
|
||||
마크다운/코드도 1회 재청킹되나 embedding 은 V012 derived-cache 히트라 재임베딩 비용은 작다.
|
||||
`--force-reingest` 는 전체 강제용으로 그대로.
|
||||
|
||||
**도그푸딩 evidence (release 바이너리, Ollama down — OCR 호출은 Lenient 실패).**
|
||||
이미지 1장, `[image.ocr] enabled=false` 색인 → New=1. config 에서 `enabled=true` 로 변경 후
|
||||
`kebab ingest`(force 없이) → **Updated=1**(재색인, errors=0). 동일 config 재실행 → **Unchanged=1**
|
||||
(불필요 재색인 0). 저장된 parser_version =
|
||||
`image-meta-v1|chunk:500:80:true:md-heading-v1|ocr:1:gemma4:e4b|cap:0`(base 보존 + OCR on 반영).
|
||||
|
||||
**테스트.** `kebab-app/src/lib.rs::ingest_config_signature_tests`(8 단위: 결정성, 청킹=전타입,
|
||||
이미지 ocr/caption 토글=이미지만, pdf.ocr=pdf만, code 옵션=코드만, search/rag/ui·런타임 파라미터
|
||||
불변 회귀가드) + `kebab-app/tests/config_invalidation.rs`(4 end-to-end: 동일 config=전 skip,
|
||||
청킹 변경=md+code 재색인, `[ingest.code]` 변경=코드만, search 변경=재색인 0). 기존 skip 테스트
|
||||
회귀 0(parser_version exact assert 는 base 접두사 비교로 갱신 — code_ingest_smoke/pdf_pipeline).
|
||||
|
||||
spec/plan: `docs/superpowers/specs/2026-06-03-ocr-toggle-invalidation-spec.md` /
|
||||
`…/plans/2026-06-03-config-invalidation-plan.md`.
|
||||
|
||||
## 2026-06-03 — ingest 진행 로그 개선: 파일명·phase·heartbeat·slowest 요약 (v0.26.1)
|
||||
|
||||
**무엇을 왜 추가했나.** arctic 도그푸딩 중 이미지/PDF 혼재 + OCR/caption on 볼트에서
|
||||
|
||||
Reference in New Issue
Block a user