From 21e02d8a93738acb17d1149a382fa3da1eff81e2 Mon Sep 17 00:00:00 2001 From: altair823 Date: Tue, 2 Jun 2026 21:37:43 +0000 Subject: [PATCH] =?UTF-8?q?refactor(app):=20ingest=20=EB=B3=84=EC=B9=AD=20?= =?UTF-8?q?=EC=83=9D=EC=84=B1=C2=B7=EC=BA=90=EC=8B=9C=C2=B7sentinel=20?= =?UTF-8?q?=EB=B2=A1=ED=84=B0=20=EB=A3=A8=ED=94=84=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ingest_one_asset 의 청크당 별칭 LLM 생성·derivation_cache 조회/저장· embed_aliases sentinel 벡터(`{orig}#alias#N`) upsert 루프 제거. expansion_ms 는 wire 호환 위해 0 고정. alias_sentinel_ids_to_delete 와 orphan purge 3개 호출부를 본문 chunk_id 직접 삭제로 단순화. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/kebab-app/src/lib.rs | 314 ++---------------------------------- 1 file changed, 11 insertions(+), 303 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 3939ddf..1419a0b 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -63,7 +63,6 @@ pub mod derivation_payload; pub mod doctor_signal; pub mod error_signal; pub mod error_wire; -pub mod expansion; pub mod external; pub mod fetch; pub mod ingest_log; @@ -1302,7 +1301,7 @@ fn ingest_one_asset( let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); let t_chunk = std::time::Instant::now(); - let mut chunks = MdHeadingV1Chunker + let chunks = MdHeadingV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::MdHeadingV1Chunker::chunk")?; let chunk_ms = u64::try_from(t_chunk.elapsed().as_millis()).unwrap_or(u64::MAX); @@ -1320,113 +1319,9 @@ fn ingest_one_asset( }, ); - // Phase 2 doc-side expansion: flag on 이면 청크당 별칭 생성 (fail-soft). - // derivation cache(§3.4): 같은 청크 text + 같은 alias version_key 면 LLM - // 호출 없이 캐시된 별칭 재사용. version_key = {prompt_version}|{max}|{model}. - let mut alias_cache_hit = 0_usize; - let mut alias_cache_miss = 0_usize; - let mut alias_touch_keys: Vec = Vec::new(); - let t_expansion = std::time::Instant::now(); - if app.config.ingest.expansion.enabled { - let exp = &app.config.ingest.expansion; - let alias_version_key = format!( - "{}|{}|{}", - crate::expansion::PROMPT_VERSION, - exp.max_aliases_per_chunk, - exp.model - ); - let llm_built = if exp.model.is_empty() { - OllamaLanguageModel::new(&app.config) - } else { - OllamaLanguageModel::with_model(&app.config, &exp.model) - }; - match llm_built { - Ok(llm) => { - let generator = - crate::expansion::ExpansionGenerator::new(&llm, exp.max_aliases_per_chunk); - // v0.24.0: throttled live counter through the per-chunk - // expansion loop. Emit at most every 25 chunks or once per - // second — never per chunk (would flood the mpsc channel). - let mut done: u32 = 0; - let mut last_emit = std::time::Instant::now(); - let mut last_done: u32 = 0; - for chunk in &mut chunks { - let key = kebab_core::derivation_cache_key( - "alias", - &chunk.text, - &alias_version_key, - ); - // 히트 = 캐시에 있고 payload 가 정상 UTF-8 로 디코드되는 - // 경우만. 손상(비-UTF8) payload 는 미스로 강등해 재생성 - // 분기로 보낸다(embedding 경로의 decode-실패→미스 강등과 - // 동작 일치, 정확성 우선 §3.5). - let cached_aliases = app - .sqlite - .derivation_cache_get(&key)? - .and_then(|payload| String::from_utf8(payload).ok()); - if let Some(aliases) = cached_aliases { - // 히트: 저장된 별칭(UTF-8) 재사용. LLM 호출 없음. - chunk.aliases = Some(aliases); - alias_cache_hit += 1; - alias_touch_keys.push(key); - } else if crate::expansion::is_nav_boilerplate(chunk) { - // 미스지만 nav boilerplate → 생성 가치 없음(기존 skip 규칙). - // 캐시에 넣지 않음(None 은 payload 로 표현 불가, 다음 run 도 동일 판정). - chunk.aliases = None; - } else { - // 미스 → LLM 생성 후 캐시 저장. - chunk.aliases = generator.generate(chunk); - alias_cache_miss += 1; - if let Some(a) = &chunk.aliases { - app.sqlite - .derivation_cache_put(&key, "alias", a.as_bytes())?; - } - } - // Cache hits count toward `done` too (the brief: show the - // warm-run fast-forward). Throttle: every 25 chunks or - // ≥1s since the last emit. - done += 1; - if done % 25 == 0 - || last_emit.elapsed() >= std::time::Duration::from_secs(1) - { - crate::ingest_progress::emit( - progress, - crate::ingest_progress::IngestEvent::ExpansionProgress { - idx, - total, - done, - chunks: total_chunks, - }, - ); - last_emit = std::time::Instant::now(); - last_done = done; - } - } - // Final frame so the counter lands on done == total — but only - // if the last in-loop emit didn't already report this `done` - // (avoids a duplicate frame when chunks is a multiple of the - // throttle, and skips a 0/0 frame when there are no chunks). - if done != last_done { - crate::ingest_progress::emit( - progress, - crate::ingest_progress::IngestEvent::ExpansionProgress { - idx, - total, - done, - chunks: total_chunks, - }, - ); - } - } - Err(e) => { - tracing::warn!( - target: "kebab-app", error = %e, - "kb-app::ingest: expansion LLM 빌드 실패 — 별칭 없이 진행" - ); - } - } - } - let expansion_ms = u64::try_from(t_expansion.elapsed().as_millis()).unwrap_or(u64::MAX); + // doc-side expansion(별칭) 제거됨 (HOTFIXES 2026-06-03). `expansion_ms` + // 는 wire 호환을 위해 AssetTimings 에 남기되 항상 0. + let expansion_ms = 0_u64; // Stamp chunker + embedding versions so Task 7's skip detection has // data on the second run. @@ -1511,81 +1406,7 @@ fn ingest_one_asset( dimensions, }) .collect(); - // dense 별칭(별도 벡터, sentinel chunk_id). embed_aliases on + - // 별칭 있는 청크만. 본문 records 는 위에서 이미 생성됨(불변). - let mut all_records = records; - if app.config.ingest.expansion.embed_aliases { - let alias_chunks: Vec<&kebab_core::Chunk> = chunks - .iter() - .filter(|c| c.aliases.as_deref().is_some_and(|a| !a.is_empty())) - .collect(); - if !alias_chunks.is_empty() { - // 각 별칭을 줄 단위로 분리해 개별 sentinel 벡터로 임베딩한다. - // 묶음 1벡터는 벡터를 희석시켜 효과가 없으므로(측정), 별칭 i - // 마다 chunk_id `{orig}#alias#{i}` 의 VectorRecord 를 만든다. - // `(청크 참조, 별칭 문자열)` 쌍을 평탄화한 뒤 한 번에 임베딩. - let alias_lines: Vec<(&kebab_core::Chunk, &str)> = alias_chunks - .iter() - .flat_map(|c| { - c.aliases - .as_deref() - .unwrap() - .split('\n') - .map(str::trim) - .filter(|line| !line.is_empty()) - .map(move |line| (*c, line)) - }) - .collect(); - if !alias_lines.is_empty() { - // 별칭 dense 벡터도 본문과 동일한 embedding 캐시 재사용: - // 같은 별칭 문자열이면 본문 embedding 캐시와 같은 키로 적중(§3.4). - let alias_texts: Vec<&str> = - alias_lines.iter().map(|(_, line)| *line).collect(); - let alias_vectors = embed_with_cache( - &**emb, - &app.sqlite, - &alias_texts, - &emb_version_key, - &mut emb_cache_hit, - &mut emb_cache_miss, - &mut emb_touch_keys, - ) - .context("Embedder::embed (alias vectors)")?; - // 같은 청크 안에서 별칭 인덱스를 0부터 매긴다. - let mut per_chunk_idx: std::collections::HashMap = - std::collections::HashMap::new(); - for ((c, line), v) in alias_lines.iter().zip(alias_vectors) { - let i = per_chunk_idx.entry(c.chunk_id.0.clone()).or_insert(0); - let alias_chunk_id = kebab_core::ChunkId(format!( - "{}{}#{}", - c.chunk_id.0, - kebab_core::ALIAS_SUFFIX, - *i - )); - *i += 1; - all_records.push(VectorRecord { - embedding_id: kebab_core::id_for_embedding( - &alias_chunk_id, - &model_id, - &model_version, - dimensions, - ), - chunk_id: alias_chunk_id, - vector: v, - doc_id: canonical.doc_id.clone(), - text: (*line).to_string(), - heading_path: c.heading_path.clone(), - model_id: model_id.clone(), - model_version: model_version.clone(), - dimensions, - }); - } - } - } - } - vec_store - .upsert(&all_records) - .context("VectorStore::upsert")?; + vec_store.upsert(&records).context("VectorStore::upsert")?; // 히트한 embedding 키들의 last_used_at 갱신(LRU 보존, §3.5). app.sqlite.derivation_cache_touch(&emb_touch_keys)?; } @@ -1607,17 +1428,13 @@ fn ingest_one_asset( }, ); - // 히트한 alias 키들의 last_used_at 갱신(LRU 보존, §3.5). - app.sqlite.derivation_cache_touch(&alias_touch_keys)?; - - // 검증용 hit/miss 카운트 노출(§3.4 / §6): warm 재색인이 LLM·embed 0회임을 + // 검증용 hit/miss 카운트 노출(§3.4 / §6): warm 재색인이 embed 0회임을 // 로그로 확인. tracing target 은 stderr 로 흐른다. - if alias_cache_hit + alias_cache_miss + emb_cache_hit + emb_cache_miss > 0 { + if emb_cache_hit + emb_cache_miss > 0 { tracing::info!( target: "kebab-app", doc = %canonical.doc_id.0, - "derivation cache: embedding hit={emb_cache_hit} miss={emb_cache_miss}, \ - alias hit={alias_cache_hit} miss={alias_cache_miss}" + "derivation cache: embedding hit={emb_cache_hit} miss={emb_cache_miss}" ); } @@ -1950,49 +1767,6 @@ fn record_image_analysis_failure( warning_notes.push(note); } -/// Expand a set of body `chunk_id`s into every per-alias sentinel -/// `chunk_id` that orphan cleanup must also delete. -/// -/// PR #195 review (MAJOR): alias dense vectors moved from a single -/// legacy sentinel `{orig}#alias` to per-line sentinels -/// `{orig}#alias#0`, `{orig}#alias#1`, … (one VectorRecord per alias -/// line). These sentinel chunk_ids never appear in SQLite `chunks`, so -/// they are absent from the stale-set the cleanup paths SELECT. Because -/// `delete_by_chunk_ids` matches on exact `chunk_id IN (...)` (not a -/// prefix), deleting only `{orig}#alias` leaked `{orig}#alias#N` rows -/// into LanceDB — stale aliases could still hit search. -/// -/// We reuse the existing exact-match delete infra (approach A): for each -/// body id emit `{id}#alias` (legacy, backward-compat) plus -/// `{id}#alias#0` .. `{id}#alias#{max-1}`. `max` is -/// `expansion.max_aliases_per_chunk`, which is the hard cap -/// `parse_aliases` enforces (it `break`s once `out.len() >= max`), so no -/// index ≥ max is ever produced at ingest time. Indices that were never -/// written are harmless no-ops in an `IN (...)` delete. -fn alias_sentinel_ids_to_delete( - body_ids: &[kebab_core::ChunkId], - max_aliases_per_chunk: usize, -) -> Vec { - let mut out = body_ids.to_vec(); - for id in body_ids { - // Legacy single sentinel (docs ingested before per-line split). - out.push(kebab_core::ChunkId(format!( - "{}{}", - id.0, - kebab_core::ALIAS_SUFFIX - ))); - for i in 0..max_aliases_per_chunk { - out.push(kebab_core::ChunkId(format!( - "{}{}#{}", - id.0, - kebab_core::ALIAS_SUFFIX, - i - ))); - } - } - out -} - /// v0.17.0 PR-B: parser-bump cascade. When a code extractor ships a /// new `PARSER_VERSION` (e.g. `code-c-v1` → `code-c-v2`), the same /// (workspace_path, asset_id) pair re-emerges with a fresh `doc_id`. @@ -2020,15 +1794,8 @@ fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow:: if !stale.is_empty() { if let Some(vec_store) = app.vector().context("App::vector")? { use kebab_core::VectorStore as _; - // per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어 - // stale 에 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로 함께 - // 삭제(orphan 누적 방지, PR #195 MAJOR). - let to_delete = alias_sentinel_ids_to_delete( - &stale, - app.config.ingest.expansion.max_aliases_per_chunk, - ); vec_store - .delete_by_chunk_ids(&to_delete) + .delete_by_chunk_ids(&stale) .context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?; } } @@ -2072,15 +1839,8 @@ fn purge_vector_orphans_for_workspace_path( return Ok(()); } use kebab_core::VectorStore as _; - // per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어 stale 에 - // 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로 함께 삭제(orphan - // 누적 방지, PR #195 MAJOR). - let to_delete = alias_sentinel_ids_to_delete( - &stale, - app.config.ingest.expansion.max_aliases_per_chunk, - ); vec_store - .delete_by_chunk_ids(&to_delete) + .delete_by_chunk_ids(&stale) .context("VectorStore::delete_by_chunk_ids (orphan vector cleanup)")?; tracing::debug!( target: "kebab-app", @@ -2180,14 +1940,7 @@ fn sweep_deleted_files( if let Some(vec) = vector_store { if !chunk_ids.is_empty() { use kebab_core::VectorStore as _; - // per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어 - // chunk_ids 에 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로 - // 함께 삭제(orphan 누적 방지, PR #195 MAJOR). - let to_delete = alias_sentinel_ids_to_delete( - &chunk_ids, - app.config.ingest.expansion.max_aliases_per_chunk, - ); - if let Err(e) = vec.delete_by_chunk_ids(&to_delete) { + if let Err(e) = vec.delete_by_chunk_ids(&chunk_ids) { tracing::warn!( target: "kebab-app", path = %stored_path.0, @@ -3563,48 +3316,3 @@ fn check_kebabignore_match( .is_ignore() } -#[cfg(test)] -mod orphan_cleanup_tests { - use super::alias_sentinel_ids_to_delete; - use kebab_core::ChunkId; - - /// PR #195 MAJOR: alias dense 벡터가 줄별 `{id}#alias#N` sentinel 로 색인되므로 - /// orphan cleanup 의 LanceDB delete-set 은 본문 + legacy `{id}#alias` + - /// `{id}#alias#0` .. `{id}#alias#{max-1}` 를 모두 포함해야 한다. 이전 코드는 - /// 단일 `{id}#alias` 만 넣어 per-line sentinel 을 LanceDB 에 누수시켰다. - #[test] - fn expands_body_legacy_and_per_alias_sentinels() { - let body = ChunkId("aabbccddeeff00112233445566778899".to_string()); - let max = 3; - let out = alias_sentinel_ids_to_delete(std::slice::from_ref(&body), max); - let ids: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); - - assert!(ids.contains(&body.0.as_str()), "본문 chunk_id 포함"); - assert!( - ids.contains(&"aabbccddeeff00112233445566778899#alias"), - "하위호환 legacy 단일 sentinel 포함" - ); - for i in 0..max { - let expected = format!("aabbccddeeff00112233445566778899#alias#{i}"); - assert!( - ids.contains(&expected.as_str()), - "per-alias sentinel #{i} 포함 (max={max})" - ); - } - // body(1) + legacy(1) + per-alias(max) = max + 2. - assert_eq!(out.len(), max + 2, "정확히 max+2 개 id"); - // max 상한과 일치: #alias#{max} 는 절대 생성 안 함(parse_aliases 가 cap). - assert!( - !ids.contains(&"aabbccddeeff00112233445566778899#alias#3"), - "상한(max) 이상 인덱스는 생성하지 않음" - ); - } - - /// max=0 (확장 비활성 동등) 이면 per-alias sentinel 없이 본문 + legacy 만. - #[test] - fn zero_max_emits_body_and_legacy_only() { - let body = ChunkId("00000000000000000000000000000000".to_string()); - let out = alias_sentinel_ids_to_delete(std::slice::from_ref(&body), 0); - assert_eq!(out.len(), 2, "본문 + legacy sentinel 만"); - } -}