refactor(app): ingest 별칭 생성·캐시·sentinel 벡터 루프 제거

ingest_one_asset 의 청크당 별칭 LLM 생성·derivation_cache 조회/저장·
embed_aliases sentinel 벡터(`{orig}#alias#N`) upsert 루프 제거.
expansion_ms 는 wire 호환 위해 0 고정. alias_sentinel_ids_to_delete 와
orphan purge 3개 호출부를 본문 chunk_id 직접 삭제로 단순화.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 21:37:43 +00:00
parent a64c31ee94
commit 21e02d8a93

View File

@@ -63,7 +63,6 @@ pub mod derivation_payload;
pub mod doctor_signal; pub mod doctor_signal;
pub mod error_signal; pub mod error_signal;
pub mod error_wire; pub mod error_wire;
pub mod expansion;
pub mod external; pub mod external;
pub mod fetch; pub mod fetch;
pub mod ingest_log; pub mod ingest_log;
@@ -1302,7 +1301,7 @@ fn ingest_one_asset(
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
let t_chunk = std::time::Instant::now(); let t_chunk = std::time::Instant::now();
let mut chunks = MdHeadingV1Chunker let chunks = MdHeadingV1Chunker
.chunk(&canonical, chunk_policy) .chunk(&canonical, chunk_policy)
.context("kb-chunk::MdHeadingV1Chunker::chunk")?; .context("kb-chunk::MdHeadingV1Chunker::chunk")?;
let chunk_ms = u64::try_from(t_chunk.elapsed().as_millis()).unwrap_or(u64::MAX); let chunk_ms = u64::try_from(t_chunk.elapsed().as_millis()).unwrap_or(u64::MAX);
@@ -1320,113 +1319,9 @@ fn ingest_one_asset(
}, },
); );
// Phase 2 doc-side expansion: flag on 이면 청크당 별칭 생성 (fail-soft). // doc-side expansion(별칭) 제거됨 (HOTFIXES 2026-06-03). `expansion_ms`
// derivation cache(§3.4): 같은 청크 text + 같은 alias version_key 면 LLM // 는 wire 호환을 위해 AssetTimings 에 남기되 항상 0.
// 호출 없이 캐시된 별칭 재사용. version_key = {prompt_version}|{max}|{model}. let expansion_ms = 0_u64;
let mut alias_cache_hit = 0_usize;
let mut alias_cache_miss = 0_usize;
let mut alias_touch_keys: Vec<String> = Vec::new();
let t_expansion = std::time::Instant::now();
if app.config.ingest.expansion.enabled {
let exp = &app.config.ingest.expansion;
let alias_version_key = format!(
"{}|{}|{}",
crate::expansion::PROMPT_VERSION,
exp.max_aliases_per_chunk,
exp.model
);
let llm_built = if exp.model.is_empty() {
OllamaLanguageModel::new(&app.config)
} else {
OllamaLanguageModel::with_model(&app.config, &exp.model)
};
match llm_built {
Ok(llm) => {
let generator =
crate::expansion::ExpansionGenerator::new(&llm, exp.max_aliases_per_chunk);
// v0.24.0: throttled live counter through the per-chunk
// expansion loop. Emit at most every 25 chunks or once per
// second — never per chunk (would flood the mpsc channel).
let mut done: u32 = 0;
let mut last_emit = std::time::Instant::now();
let mut last_done: u32 = 0;
for chunk in &mut chunks {
let key = kebab_core::derivation_cache_key(
"alias",
&chunk.text,
&alias_version_key,
);
// 히트 = 캐시에 있고 payload 가 정상 UTF-8 로 디코드되는
// 경우만. 손상(비-UTF8) payload 는 미스로 강등해 재생성
// 분기로 보낸다(embedding 경로의 decode-실패→미스 강등과
// 동작 일치, 정확성 우선 §3.5).
let cached_aliases = app
.sqlite
.derivation_cache_get(&key)?
.and_then(|payload| String::from_utf8(payload).ok());
if let Some(aliases) = cached_aliases {
// 히트: 저장된 별칭(UTF-8) 재사용. LLM 호출 없음.
chunk.aliases = Some(aliases);
alias_cache_hit += 1;
alias_touch_keys.push(key);
} else if crate::expansion::is_nav_boilerplate(chunk) {
// 미스지만 nav boilerplate → 생성 가치 없음(기존 skip 규칙).
// 캐시에 넣지 않음(None 은 payload 로 표현 불가, 다음 run 도 동일 판정).
chunk.aliases = None;
} else {
// 미스 → LLM 생성 후 캐시 저장.
chunk.aliases = generator.generate(chunk);
alias_cache_miss += 1;
if let Some(a) = &chunk.aliases {
app.sqlite
.derivation_cache_put(&key, "alias", a.as_bytes())?;
}
}
// Cache hits count toward `done` too (the brief: show the
// warm-run fast-forward). Throttle: every 25 chunks or
// ≥1s since the last emit.
done += 1;
if done % 25 == 0
|| last_emit.elapsed() >= std::time::Duration::from_secs(1)
{
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ExpansionProgress {
idx,
total,
done,
chunks: total_chunks,
},
);
last_emit = std::time::Instant::now();
last_done = done;
}
}
// Final frame so the counter lands on done == total — but only
// if the last in-loop emit didn't already report this `done`
// (avoids a duplicate frame when chunks is a multiple of the
// throttle, and skips a 0/0 frame when there are no chunks).
if done != last_done {
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ExpansionProgress {
idx,
total,
done,
chunks: total_chunks,
},
);
}
}
Err(e) => {
tracing::warn!(
target: "kebab-app", error = %e,
"kb-app::ingest: expansion LLM 빌드 실패 — 별칭 없이 진행"
);
}
}
}
let expansion_ms = u64::try_from(t_expansion.elapsed().as_millis()).unwrap_or(u64::MAX);
// Stamp chunker + embedding versions so Task 7's skip detection has // Stamp chunker + embedding versions so Task 7's skip detection has
// data on the second run. // data on the second run.
@@ -1511,81 +1406,7 @@ fn ingest_one_asset(
dimensions, dimensions,
}) })
.collect(); .collect();
// dense 별칭(별도 벡터, sentinel chunk_id). embed_aliases on + vec_store.upsert(&records).context("VectorStore::upsert")?;
// 별칭 있는 청크만. 본문 records 는 위에서 이미 생성됨(불변).
let mut all_records = records;
if app.config.ingest.expansion.embed_aliases {
let alias_chunks: Vec<&kebab_core::Chunk> = chunks
.iter()
.filter(|c| c.aliases.as_deref().is_some_and(|a| !a.is_empty()))
.collect();
if !alias_chunks.is_empty() {
// 각 별칭을 줄 단위로 분리해 개별 sentinel 벡터로 임베딩한다.
// 묶음 1벡터는 벡터를 희석시켜 효과가 없으므로(측정), 별칭 i
// 마다 chunk_id `{orig}#alias#{i}` 의 VectorRecord 를 만든다.
// `(청크 참조, 별칭 문자열)` 쌍을 평탄화한 뒤 한 번에 임베딩.
let alias_lines: Vec<(&kebab_core::Chunk, &str)> = alias_chunks
.iter()
.flat_map(|c| {
c.aliases
.as_deref()
.unwrap()
.split('\n')
.map(str::trim)
.filter(|line| !line.is_empty())
.map(move |line| (*c, line))
})
.collect();
if !alias_lines.is_empty() {
// 별칭 dense 벡터도 본문과 동일한 embedding 캐시 재사용:
// 같은 별칭 문자열이면 본문 embedding 캐시와 같은 키로 적중(§3.4).
let alias_texts: Vec<&str> =
alias_lines.iter().map(|(_, line)| *line).collect();
let alias_vectors = embed_with_cache(
&**emb,
&app.sqlite,
&alias_texts,
&emb_version_key,
&mut emb_cache_hit,
&mut emb_cache_miss,
&mut emb_touch_keys,
)
.context("Embedder::embed (alias vectors)")?;
// 같은 청크 안에서 별칭 인덱스를 0부터 매긴다.
let mut per_chunk_idx: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for ((c, line), v) in alias_lines.iter().zip(alias_vectors) {
let i = per_chunk_idx.entry(c.chunk_id.0.clone()).or_insert(0);
let alias_chunk_id = kebab_core::ChunkId(format!(
"{}{}#{}",
c.chunk_id.0,
kebab_core::ALIAS_SUFFIX,
*i
));
*i += 1;
all_records.push(VectorRecord {
embedding_id: kebab_core::id_for_embedding(
&alias_chunk_id,
&model_id,
&model_version,
dimensions,
),
chunk_id: alias_chunk_id,
vector: v,
doc_id: canonical.doc_id.clone(),
text: (*line).to_string(),
heading_path: c.heading_path.clone(),
model_id: model_id.clone(),
model_version: model_version.clone(),
dimensions,
});
}
}
}
}
vec_store
.upsert(&all_records)
.context("VectorStore::upsert")?;
// 히트한 embedding 키들의 last_used_at 갱신(LRU 보존, §3.5). // 히트한 embedding 키들의 last_used_at 갱신(LRU 보존, §3.5).
app.sqlite.derivation_cache_touch(&emb_touch_keys)?; app.sqlite.derivation_cache_touch(&emb_touch_keys)?;
} }
@@ -1607,17 +1428,13 @@ fn ingest_one_asset(
}, },
); );
// 히트한 alias 키들의 last_used_at 갱신(LRU 보존, §3.5). // 검증용 hit/miss 카운트 노출(§3.4 / §6): warm 재색인이 embed 0회임을
app.sqlite.derivation_cache_touch(&alias_touch_keys)?;
// 검증용 hit/miss 카운트 노출(§3.4 / §6): warm 재색인이 LLM·embed 0회임을
// 로그로 확인. tracing target 은 stderr 로 흐른다. // 로그로 확인. tracing target 은 stderr 로 흐른다.
if alias_cache_hit + alias_cache_miss + emb_cache_hit + emb_cache_miss > 0 { if emb_cache_hit + emb_cache_miss > 0 {
tracing::info!( tracing::info!(
target: "kebab-app", target: "kebab-app",
doc = %canonical.doc_id.0, doc = %canonical.doc_id.0,
"derivation cache: embedding hit={emb_cache_hit} miss={emb_cache_miss}, \ "derivation cache: embedding hit={emb_cache_hit} miss={emb_cache_miss}"
alias hit={alias_cache_hit} miss={alias_cache_miss}"
); );
} }
@@ -1950,49 +1767,6 @@ fn record_image_analysis_failure(
warning_notes.push(note); warning_notes.push(note);
} }
/// Expand a set of body `chunk_id`s into every per-alias sentinel
/// `chunk_id` that orphan cleanup must also delete.
///
/// PR #195 review (MAJOR): alias dense vectors moved from a single
/// legacy sentinel `{orig}#alias` to per-line sentinels
/// `{orig}#alias#0`, `{orig}#alias#1`, … (one VectorRecord per alias
/// line). These sentinel chunk_ids never appear in SQLite `chunks`, so
/// they are absent from the stale-set the cleanup paths SELECT. Because
/// `delete_by_chunk_ids` matches on exact `chunk_id IN (...)` (not a
/// prefix), deleting only `{orig}#alias` leaked `{orig}#alias#N` rows
/// into LanceDB — stale aliases could still hit search.
///
/// We reuse the existing exact-match delete infra (approach A): for each
/// body id emit `{id}#alias` (legacy, backward-compat) plus
/// `{id}#alias#0` .. `{id}#alias#{max-1}`. `max` is
/// `expansion.max_aliases_per_chunk`, which is the hard cap
/// `parse_aliases` enforces (it `break`s once `out.len() >= max`), so no
/// index ≥ max is ever produced at ingest time. Indices that were never
/// written are harmless no-ops in an `IN (...)` delete.
fn alias_sentinel_ids_to_delete(
body_ids: &[kebab_core::ChunkId],
max_aliases_per_chunk: usize,
) -> Vec<kebab_core::ChunkId> {
let mut out = body_ids.to_vec();
for id in body_ids {
// Legacy single sentinel (docs ingested before per-line split).
out.push(kebab_core::ChunkId(format!(
"{}{}",
id.0,
kebab_core::ALIAS_SUFFIX
)));
for i in 0..max_aliases_per_chunk {
out.push(kebab_core::ChunkId(format!(
"{}{}#{}",
id.0,
kebab_core::ALIAS_SUFFIX,
i
)));
}
}
out
}
/// v0.17.0 PR-B: parser-bump cascade. When a code extractor ships a /// v0.17.0 PR-B: parser-bump cascade. When a code extractor ships a
/// new `PARSER_VERSION` (e.g. `code-c-v1` → `code-c-v2`), the same /// new `PARSER_VERSION` (e.g. `code-c-v1` → `code-c-v2`), the same
/// (workspace_path, asset_id) pair re-emerges with a fresh `doc_id`. /// (workspace_path, asset_id) pair re-emerges with a fresh `doc_id`.
@@ -2020,15 +1794,8 @@ fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow::
if !stale.is_empty() { if !stale.is_empty() {
if let Some(vec_store) = app.vector().context("App::vector")? { if let Some(vec_store) = app.vector().context("App::vector")? {
use kebab_core::VectorStore as _; use kebab_core::VectorStore as _;
// per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어
// stale 에 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로 함께
// 삭제(orphan 누적 방지, PR #195 MAJOR).
let to_delete = alias_sentinel_ids_to_delete(
&stale,
app.config.ingest.expansion.max_aliases_per_chunk,
);
vec_store vec_store
.delete_by_chunk_ids(&to_delete) .delete_by_chunk_ids(&stale)
.context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?; .context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?;
} }
} }
@@ -2072,15 +1839,8 @@ fn purge_vector_orphans_for_workspace_path(
return Ok(()); return Ok(());
} }
use kebab_core::VectorStore as _; use kebab_core::VectorStore as _;
// per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어 stale 에
// 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로 함께 삭제(orphan
// 누적 방지, PR #195 MAJOR).
let to_delete = alias_sentinel_ids_to_delete(
&stale,
app.config.ingest.expansion.max_aliases_per_chunk,
);
vec_store vec_store
.delete_by_chunk_ids(&to_delete) .delete_by_chunk_ids(&stale)
.context("VectorStore::delete_by_chunk_ids (orphan vector cleanup)")?; .context("VectorStore::delete_by_chunk_ids (orphan vector cleanup)")?;
tracing::debug!( tracing::debug!(
target: "kebab-app", target: "kebab-app",
@@ -2180,14 +1940,7 @@ fn sweep_deleted_files(
if let Some(vec) = vector_store { if let Some(vec) = vector_store {
if !chunk_ids.is_empty() { if !chunk_ids.is_empty() {
use kebab_core::VectorStore as _; use kebab_core::VectorStore as _;
// per-alias sentinel 벡터(`{id}#alias#N`)는 SQLite chunks 에 없어 if let Err(e) = vec.delete_by_chunk_ids(&chunk_ids) {
// chunk_ids 에 안 잡힌다 → 본문 + 모든 별칭 sentinel 을 명시적으로
// 함께 삭제(orphan 누적 방지, PR #195 MAJOR).
let to_delete = alias_sentinel_ids_to_delete(
&chunk_ids,
app.config.ingest.expansion.max_aliases_per_chunk,
);
if let Err(e) = vec.delete_by_chunk_ids(&to_delete) {
tracing::warn!( tracing::warn!(
target: "kebab-app", target: "kebab-app",
path = %stored_path.0, path = %stored_path.0,
@@ -3563,48 +3316,3 @@ fn check_kebabignore_match(
.is_ignore() .is_ignore()
} }
#[cfg(test)]
mod orphan_cleanup_tests {
use super::alias_sentinel_ids_to_delete;
use kebab_core::ChunkId;
/// PR #195 MAJOR: alias dense 벡터가 줄별 `{id}#alias#N` sentinel 로 색인되므로
/// orphan cleanup 의 LanceDB delete-set 은 본문 + legacy `{id}#alias` +
/// `{id}#alias#0` .. `{id}#alias#{max-1}` 를 모두 포함해야 한다. 이전 코드는
/// 단일 `{id}#alias` 만 넣어 per-line sentinel 을 LanceDB 에 누수시켰다.
#[test]
fn expands_body_legacy_and_per_alias_sentinels() {
let body = ChunkId("aabbccddeeff00112233445566778899".to_string());
let max = 3;
let out = alias_sentinel_ids_to_delete(std::slice::from_ref(&body), max);
let ids: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect();
assert!(ids.contains(&body.0.as_str()), "본문 chunk_id 포함");
assert!(
ids.contains(&"aabbccddeeff00112233445566778899#alias"),
"하위호환 legacy 단일 sentinel 포함"
);
for i in 0..max {
let expected = format!("aabbccddeeff00112233445566778899#alias#{i}");
assert!(
ids.contains(&expected.as_str()),
"per-alias sentinel #{i} 포함 (max={max})"
);
}
// body(1) + legacy(1) + per-alias(max) = max + 2.
assert_eq!(out.len(), max + 2, "정확히 max+2 개 id");
// max 상한과 일치: #alias#{max} 는 절대 생성 안 함(parse_aliases 가 cap).
assert!(
!ids.contains(&"aabbccddeeff00112233445566778899#alias#3"),
"상한(max) 이상 인덱스는 생성하지 않음"
);
}
/// max=0 (확장 비활성 동등) 이면 per-alias sentinel 없이 본문 + legacy 만.
#[test]
fn zero_max_emits_body_and_legacy_only() {
let body = ChunkId("00000000000000000000000000000000".to_string());
let out = alias_sentinel_ids_to_delete(std::slice::from_ref(&body), 0);
assert_eq!(out.len(), 2, "본문 + legacy sentinel 만");
}
}