feat(expansion): doc-side expansion 별칭 개별 dense 벡터 + 파생물 캐시(V012)
별칭을 줄별 개별 dense 벡터(sentinel `{chunk}#alias#N`)로 색인하고
boilerplate 청크는 별칭 생성을 skip. 묶음 1벡터 방식은 평균화로 특정
표현이 희석돼 오히려 회귀(13/18)했던 것을 폐기. 변형 일관성 14/18 →
16/18, mean_spread@10 0.222 → 0.111 (나무위키 ~1000 문서 CS corpus).
`kebab-core::strip_alias_suffix` 가 suffix 형과 per-alias 형 둘 다 처리.
파생물 캐시(V012): embedding 벡터 + 별칭 LLM 결과를 청크 내용 해시
키로 캐싱해 재색인 시 내용 불변 청크의 재계산을 skip. cache_key =
blake3(kind ‖ text_blake3 ‖ version_key)[:32], version_key 에
model/prompt/dimensions 포함 → §9 cascade 와 정합(버전 bump 시 자동
miss). 측정: 정답 3개 cold 1879s → warm 13s ≈ 145배. 순수 가산이라
corpus_revision bump 없음. search/ask 는 kebab.sqlite+lancedb 만으로
동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능.
V012 schema migration + 신규 surface 로 workspace version 0.20.2 →
0.21.0 (minor) bump. README/HANDOFF/ARCHITECTURE/HOTFIXES sync.
known limitation: stack·svm 설명형 2개 잔존 + grounded 판정이 부분
인용을 grounded 로 오분류(후속 후보).
측정 상세: docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
110
crates/kebab-core/src/derivation.rs
Normal file
110
crates/kebab-core/src/derivation.rs
Normal file
@@ -0,0 +1,110 @@
|
||||
//! Content-hash derivation cache key (design 2026-05-31 §3.1).
|
||||
//!
|
||||
//! Expensive ingest derivations (embedding vectors, LLM aliases, optional
|
||||
//! Korean morphological tokens) are cached by the *content hash* of the chunk
|
||||
//! text so that re-indexing an updated document skips recomputation for any
|
||||
//! chunk whose text is unchanged — independent of position / `chunk_id`
|
||||
//! (which is position-based, see `ids::id_for_block`).
|
||||
//!
|
||||
//! ```text
|
||||
//! cache_key = blake3_hex( kind || 0x00 || text_blake3 || 0x00 || version_key )[:32]
|
||||
//! ```
|
||||
//! - `text_blake3` = hex-encoded blake3 digest of the NFC-normalized UTF-8 bytes of the chunk text.
|
||||
//! - `kind` ∈ { "embedding", "alias", "korean_tokens" }.
|
||||
//! - `version_key` folds every §9 version-cascade input for that kind
|
||||
//! (model / prompt / tokenizer version). A version bump changes the key →
|
||||
//! automatic cache miss → recompute, keeping the cache consistent with the
|
||||
//! cascade contract (§3.5 / §3.6).
|
||||
//!
|
||||
//! Pure: depends only on `blake3` + `unicode-normalization`. No other
|
||||
//! `kebab-*` crate is referenced (deps boundary §5).
|
||||
|
||||
use crate::normalize::nfc;
|
||||
|
||||
/// Derivation-cache key per design §3.1.
|
||||
///
|
||||
/// `text` is NFC-normalized before hashing so the same logical content always
|
||||
/// maps to the same key regardless of Unicode encoding form. `kind` and
|
||||
/// `version_key` are folded in with `0x00` separators (which cannot occur in
|
||||
/// hex digests) so distinct kinds / versions never collide.
|
||||
pub fn derivation_cache_key(kind: &str, text: &str, version_key: &str) -> String {
|
||||
let text_blake3 = blake3::hash(nfc(text).as_bytes()).to_hex().to_string();
|
||||
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
hasher.update(kind.as_bytes());
|
||||
hasher.update(&[0x00]);
|
||||
hasher.update(text_blake3.as_bytes());
|
||||
hasher.update(&[0x00]);
|
||||
hasher.update(version_key.as_bytes());
|
||||
|
||||
hasher.finalize().to_hex().to_string()[..32].to_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a key of kind `"embedding"`.
    fn embedding_key(text: &str, version_key: &str) -> String {
        derivation_cache_key("embedding", text, version_key)
    }

    #[test]
    fn key_is_32_hex_chars() {
        let key = embedding_key("hello world", "v1");
        assert_eq!(key.len(), 32);
        assert!(key.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn same_inputs_same_key() {
        // Determinism: identical inputs must always yield the identical key.
        let first = embedding_key("러스트 소유권", "model|1|1024");
        let second = embedding_key("러스트 소유권", "model|1|1024");
        assert_eq!(first, second);
    }

    #[test]
    fn nfc_normalization_collapses_encoding_forms() {
        // The syllable "가" written precomposed (NFC) and as decomposed jamo
        // (NFD) differs byte-wise, yet both must map to one key because the
        // text is NFC-normalized before hashing.
        let precomposed = "\u{AC00}"; // 가
        let decomposed = "\u{1100}\u{1161}"; // ᄀ + ᅡ
        assert_ne!(precomposed, decomposed);

        assert_eq!(
            embedding_key(precomposed, "v1"),
            embedding_key(decomposed, "v1"),
        );
    }

    #[test]
    fn different_kind_different_key() {
        let embedding = derivation_cache_key("embedding", "same text", "v1");
        let alias = derivation_cache_key("alias", "same text", "v1");
        assert_ne!(embedding, alias);
    }

    #[test]
    fn different_version_key_different_key_miss() {
        // §3.6 correctness guard: changing only the version_key MUST change
        // the cache key, otherwise a stale derivation could be reused after a
        // cascade bump. Each case is (kind, text, old version, new version);
        // the second case is an alias prompt-version bump.
        let cases = [
            ("embedding", "same text", "modelA|1|1024", "modelA|2|1024"),
            ("alias", "문단", "expansion-v1|8|", "expansion-v2|8|"),
        ];
        for (kind, text, before, after) in cases.iter() {
            assert_ne!(
                derivation_cache_key(kind, text, before),
                derivation_cache_key(kind, text, after),
            );
        }
    }

    #[test]
    fn different_text_different_key() {
        assert_ne!(
            embedding_key("text one", "v1"),
            embedding_key("text two", "v1"),
        );
    }

    #[test]
    fn separator_prevents_field_smearing() {
        // Moving a byte across the kind/version boundary ("ab" + "c" versus
        // "a" + "bc") with the same text must yield a different key: the
        // fields are delimited, not merely concatenated.
        let wide_kind = derivation_cache_key("ab", "x", "c");
        let wide_version = derivation_cache_key("a", "x", "bc");
        assert_ne!(wide_kind, wide_version);
    }
}
|
||||
@@ -61,10 +61,18 @@ fn validate_hex32(s: &str) -> Result<(), CoreError> {
|
||||
/// Suffix appended to a chunk's vector ID to mark an alias embedding row.
pub const ALIAS_SUFFIX: &str = "#alias";

/// Strip the alias marker from `id`, returning the bare chunk ID.
///
/// Returns everything before the first occurrence of [`ALIAS_SUFFIX`]. This
/// handles both the suffix form `{orig}#alias` and the per-alias form
/// `{orig}#alias#N`. A bare chunk ID is blake3 hex (32 chars, no `#`), so the
/// first `#alias` always marks the boundary. If `id` contains no
/// [`ALIAS_SUFFIX`], `id` is returned unchanged.
///
/// The result borrows from `id`; nothing is allocated.
pub fn strip_alias_suffix(id: &str) -> &str {
    // `find` yields the byte offset of the first match, which is always a
    // char boundary, so the slice below cannot panic.
    id.find(ALIAS_SUFFIX)
        .map_or(id, |marker_at| &id[..marker_at])
}
|
||||
|
||||
/// Canonical-JSON + blake3 + hex prefix 32. Per design §4.2.
|
||||
@@ -447,6 +455,10 @@ mod tests {
|
||||
assert_eq!(strip_alias_suffix(bare), bare);
|
||||
assert_eq!(strip_alias_suffix(""), "");
|
||||
assert_eq!(strip_alias_suffix("#alias"), "");
|
||||
// Per-alias form `{orig}#alias#N` strips to the bare chunk ID.
|
||||
assert_eq!(strip_alias_suffix(&format!("{bare}{ALIAS_SUFFIX}#3")), bare);
|
||||
assert_eq!(strip_alias_suffix(&format!("{bare}{ALIAS_SUFFIX}#0")), bare);
|
||||
assert_eq!(strip_alias_suffix("#alias#3"), "");
|
||||
}
|
||||
|
||||
/// Independent pin for id_for_index.
|
||||
|
||||
@@ -11,6 +11,7 @@ pub mod answer;
|
||||
pub mod asset;
|
||||
pub mod chunk;
|
||||
pub mod citation;
|
||||
pub mod derivation;
|
||||
pub mod document;
|
||||
pub mod errors;
|
||||
pub mod fetch;
|
||||
@@ -35,6 +36,7 @@ pub use answer::{
|
||||
pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath};
|
||||
pub use chunk::Chunk;
|
||||
pub use citation::Citation;
|
||||
pub use derivation::derivation_cache_key;
|
||||
pub use document::{
|
||||
AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock,
|
||||
Inline, ListBlock, ModelCaption, OcrRegion, OcrText, SourceSpan, TableBlock, TextBlock,
|
||||
|
||||
Reference in New Issue
Block a user