Files
kebab/crates/kebab-core/src/ids.rs
altair823 a8fd76499c feat(expansion): doc-side expansion 별칭 개별 dense 벡터 + 파생물 캐시(V012)
별칭을 줄별 개별 dense 벡터(sentinel `{chunk}#alias#N`)로 색인하고
boilerplate 청크는 별칭 생성을 skip. 묶음 1벡터 방식은 평균화로 특정
표현이 희석돼 오히려 회귀(13/18)했던 것을 폐기. 변형 일관성 14/18 →
16/18, mean_spread@10 0.222 → 0.111 (나무위키 ~1000 문서 CS corpus).
`kebab-core::strip_alias_suffix` 가 suffix 형과 per-alias 형 둘 다 처리.

파생물 캐시(V012): embedding 벡터 + 별칭 LLM 결과를 청크 내용 해시
키로 캐싱해 재색인 시 내용 불변 청크의 재계산을 skip. cache_key =
blake3(kind ‖ text_blake3 ‖ version_key)[:32], version_key 에
model/prompt/dimensions 포함 → §9 cascade 와 정합(버전 bump 시 자동
miss). 측정: 정답 3개 cold 1879s → warm 13s ≈ 145배. 순수 가산이라
corpus_revision bump 없음. search/ask 는 kebab.sqlite+lancedb 만으로
동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능.

V012 schema migration + 신규 surface 로 workspace version 0.20.2 →
0.21.0 (minor) bump. README/HANDOFF/ARCHITECTURE/HOTFIXES sync.
known limitation: stack·svm 설명형 2개 잔존 + grounded 판정이 부분
인용을 grounded 로 오분류(후속 후보).

측정 상세: docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 08:24:04 +00:00

506 lines
17 KiB
Rust

//! Newtype IDs (§3.1) + ID generation recipe (§4.2).
//!
//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the
//! inner hex string; `FromStr` accepts 32 hex characters (mixed case) and
//! normalizes the stored representation to lowercase so equality and hashing
//! are canonical.
use std::fmt;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
use crate::asset::WorkspacePath;
use crate::document::SourceSpan;
use crate::errors::CoreError;
use crate::versions::{
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
};
/// Declares a public tuple-struct ID type that wraps a `String`.
///
/// The generated type derives `Clone`, `Debug`, `Eq`, `Hash`, `PartialEq`,
/// `Serialize` and `Deserialize`. Its `Display` impl writes the inner string
/// verbatim, and its `FromStr` impl runs `validate_hex32` on the input and
/// stores the ASCII-lowercased copy, returning `CoreError` on rejection.
macro_rules! newtype_id {
    ($ty:ident) => {
        #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
        pub struct $ty(pub String);

        impl fmt::Display for $ty {
            fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
                // Written raw: width/precision flags on the formatter are not applied.
                out.write_str(self.0.as_str())
            }
        }

        impl FromStr for $ty {
            type Err = CoreError;

            fn from_str(raw: &str) -> Result<Self, Self::Err> {
                // Validate first; only a well-formed input is normalized and wrapped.
                validate_hex32(raw).map(|()| Self(raw.to_ascii_lowercase()))
            }
        }
    };
}
// Concrete ID newtypes. Each invocation expands to a `pub struct` wrapping a
// `String`, with `Display` and a validating, lowercasing `FromStr`.
// Returned by `id_for_asset`.
newtype_id!(AssetId);
// Returned by `id_for_doc`.
newtype_id!(DocumentId);
// Returned by `id_for_block`.
newtype_id!(BlockId);
// Returned by `id_for_chunk`.
newtype_id!(ChunkId);
// Returned by `id_for_embedding`.
newtype_id!(EmbeddingId);
// Returned by `id_for_index`.
newtype_id!(IndexId);
/// Checks that `s` is exactly 32 bytes long and made up only of ASCII hex
/// digits (either case).
///
/// The length check runs first, so a wrong-length input always reports the
/// length error even if it also contains non-hex characters.
///
/// # Errors
///
/// Returns `CoreError::InvalidId` with a message describing which of the two
/// checks failed.
fn validate_hex32(s: &str) -> Result<(), CoreError> {
    let byte_len = s.len();
    if byte_len != 32 {
        let msg = format!("expected 32 hex chars, got {}", byte_len);
        return Err(CoreError::InvalidId(msg));
    }

    let has_non_hex = s.bytes().any(|byte| !byte.is_ascii_hexdigit());
    if has_non_hex {
        Err(CoreError::InvalidId(format!("non-hex character in {s:?}")))
    } else {
        Ok(())
    }
}
/// Marker appended to a chunk's vector ID to flag an alias embedding row.
pub const ALIAS_SUFFIX: &str = "#alias";

/// Returns the bare chunk ID by cutting `id` at its alias marker.
///
/// The result is the portion of `id` that precedes the first occurrence of
/// [`ALIAS_SUFFIX`], which covers both the suffix form `{orig}#alias` and the
/// per-alias form `{orig}#alias#N`. A bare chunk ID is blake3 hex (32 chars,
/// no `#`), so the first `#alias` is always the boundary. When `id` carries
/// no marker at all it is returned unchanged.
pub fn strip_alias_suffix(id: &str) -> &str {
    // `split_once` splits at the first match; only the head is of interest.
    id.split_once(ALIAS_SUFFIX).map_or(id, |(bare, _rest)| bare)
}
/// Builds an ID string from any serializable value, per design §4.2.
///
/// The value is serialized to canonical JSON, the resulting bytes are hashed
/// with blake3, and the first 32 characters of the hex digest are returned.
///
/// # Panics
///
/// Panics if canonical JSON serialization of `tuple` fails.
pub fn id_from<T: Serialize>(tuple: T) -> String {
    // `to_vec` accepts any `T: Serialize` and yields the canonical bytes as `Vec<u8>`.
    let canonical = serde_json_canonicalizer::to_vec(&tuple)
        .expect("canonical JSON serialization must not fail for kb-core inputs");
    let mut digest_hex = blake3::hash(&canonical).to_hex().to_string();
    // Keep only the 32-character prefix of the hex digest.
    digest_hex.truncate(32);
    digest_hex
}
/// Hash input for `id_for_asset`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct AssetTuple<'a> {
// Discriminator; `id_for_asset` always passes "asset".
kind: &'static str,
// Caller-supplied asset digest string, hashed as given.
asset_blake3: &'a str,
}
/// Hash input for `id_for_doc`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct DocTuple<'a> {
// Discriminator; `id_for_doc` always passes "doc".
kind: &'static str,
// Inner string of the `WorkspacePath`.
workspace_path: &'a str,
// Inner string of the owning `AssetId`.
asset_id: &'a str,
// Inner string of the `ParserVersion`.
parser_version: &'a str,
}
/// Hash input for `id_for_block`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct BlockTuple<'a> {
// Discriminator; `id_for_block` always passes "block".
kind: &'static str,
// Inner string of the owning `DocumentId`.
doc_id: &'a str,
// Block kind string, passed through from the caller unchanged.
block_kind: &'a str,
// Heading path, passed through from the caller unchanged.
heading_path: &'a [String],
// Ordinal, passed through from the caller unchanged.
ordinal: u32,
// Source span, serialized with `SourceSpan`'s own `Serialize` impl.
source_span: &'a SourceSpan,
}
/// Hash input for `id_for_chunk`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct ChunkTuple<'a> {
// Discriminator; `id_for_chunk` always passes "chunk".
kind: &'static str,
// Inner string of the owning `DocumentId`.
doc_id: &'a str,
// Inner string of the `ChunkerVersion`.
chunker_version: &'a str,
// Inner strings of the member `BlockId`s, in the order the caller supplied them.
block_ids: Vec<&'a str>,
// Policy hash string, passed through from the caller unchanged.
policy_hash: &'a str,
}
/// Hash input for `id_for_embedding`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct EmbeddingTuple<'a> {
// Discriminator; `id_for_embedding` always passes "embedding".
kind: &'static str,
// Inner string of the embedded `ChunkId`.
chunk_id: &'a str,
// Inner string of the `EmbeddingModelId`.
model_id: &'a str,
// Inner string of the `EmbeddingVersion`.
model_version: &'a str,
// Dimension count, passed through from the caller unchanged.
dimensions: usize,
}
/// Hash input for `id_for_index`, serialized to canonical JSON by `id_from`.
///
/// The field names become the JSON keys that are hashed, so renaming a field
/// changes the resulting IDs.
#[derive(Serialize)]
struct IndexTuple<'a> {
// Discriminator; `id_for_index` always passes "index".
kind: &'static str,
// Collection name, passed through from the caller unchanged.
collection: &'a str,
// Inner string of the `EmbeddingModelId`.
embedding_model: &'a str,
// Dimension count, passed through from the caller unchanged.
dimensions: usize,
// Inner string of the `IndexVersion`.
index_version: &'a str,
// Index kind string, passed through from the caller unchanged.
index_kind: &'a str,
// Index parameter hash string, passed through from the caller unchanged.
index_params_hash: &'a str,
}
/// Derives the [`AssetId`] for an asset.
///
/// Hashes the tuple `{ kind: "asset", asset_blake3 }` through [`id_from`].
/// `asset_blake3_full_hex` is hashed exactly as given; it is not validated
/// here.
pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId {
    let tuple = AssetTuple {
        asset_blake3: asset_blake3_full_hex,
        kind: "asset",
    };
    let hex = id_from(tuple);
    AssetId(hex)
}
/// Derives the [`DocumentId`] for a document.
///
/// Hashes the tuple `{ kind: "doc", workspace_path, asset_id, parser_version }`
/// through [`id_from`], using the inner string of each argument.
pub fn id_for_doc(
    workspace_path: &WorkspacePath,
    asset: &AssetId,
    parser_version: &ParserVersion,
) -> DocumentId {
    let tuple = DocTuple {
        asset_id: asset.0.as_str(),
        parser_version: &parser_version.0,
        workspace_path: &workspace_path.0,
        kind: "doc",
    };
    let hex = id_from(tuple);
    DocumentId(hex)
}
/// Derives the [`BlockId`] for a block within a document.
///
/// Hashes the tuple
/// `{ kind: "block", doc_id, block_kind, heading_path, ordinal, source_span }`
/// through [`id_from`]. All arguments are passed into the tuple unchanged.
pub fn id_for_block(
    doc: &DocumentId,
    block_kind: &str,
    heading_path: &[String],
    ordinal: u32,
    span: &SourceSpan,
) -> BlockId {
    let tuple = BlockTuple {
        doc_id: doc.0.as_str(),
        source_span: span,
        heading_path,
        block_kind,
        ordinal,
        kind: "block",
    };
    let hex = id_from(tuple);
    BlockId(hex)
}
/// Derives the [`ChunkId`] for a chunk built from a sequence of blocks.
///
/// Hashes the tuple
/// `{ kind: "chunk", doc_id, chunker_version, block_ids, policy_hash }`
/// through [`id_from`]. Block IDs enter the tuple in the order given, so
/// the same blocks in a different order produce a different ID.
pub fn id_for_chunk(
    doc: &DocumentId,
    chunker_version: &ChunkerVersion,
    block_ids: &[BlockId],
    policy_hash: &str,
) -> ChunkId {
    // Borrow each block's inner string, preserving the caller's ordering.
    let mut member_ids: Vec<&str> = Vec::with_capacity(block_ids.len());
    for block in block_ids {
        member_ids.push(block.0.as_str());
    }

    let tuple = ChunkTuple {
        doc_id: doc.0.as_str(),
        chunker_version: &chunker_version.0,
        block_ids: member_ids,
        policy_hash,
        kind: "chunk",
    };
    let hex = id_from(tuple);
    ChunkId(hex)
}
/// Derives the [`EmbeddingId`] for one chunk embedded by one model version.
///
/// Hashes the tuple
/// `{ kind: "embedding", chunk_id, model_id, model_version, dimensions }`
/// through [`id_from`].
pub fn id_for_embedding(
    chunk: &ChunkId,
    model: &EmbeddingModelId,
    version: &EmbeddingVersion,
    dims: usize,
) -> EmbeddingId {
    let tuple = EmbeddingTuple {
        chunk_id: chunk.0.as_str(),
        model_id: &model.0,
        model_version: &version.0,
        dimensions: dims,
        kind: "embedding",
    };
    let hex = id_from(tuple);
    EmbeddingId(hex)
}
/// Derives the [`IndexId`] for an index configuration.
///
/// Hashes the tuple
/// `{ kind: "index", collection, embedding_model, dimensions, index_version,
/// index_kind, index_params_hash }` through [`id_from`].
pub fn id_for_index(
    collection: &str,
    model: &EmbeddingModelId,
    dims: usize,
    version: &IndexVersion,
    kind: &str,
    params_hash: &str,
) -> IndexId {
    let tuple = IndexTuple {
        // The tuple's own `kind` is the fixed discriminator; the caller's
        // `kind` argument is the index kind.
        kind: "index",
        index_kind: kind,
        index_params_hash: params_hash,
        index_version: &version.0,
        embedding_model: &model.0,
        dimensions: dims,
        collection,
    };
    let hex = id_from(tuple);
    IndexId(hex)
}
// Unit tests: newtype parsing/normalization, `id_from` determinism and key-order
// invariance, externally pinned hashes for each `id_for_*`, and alias stripping.
#[cfg(test)]
mod tests {
use super::*;
/// Parsing a lowercase 32-hex string and printing it back yields the same
/// string.
#[test]
fn newtype_display_roundtrip() {
let s = "0123456789abcdef0123456789abcdef";
let id: AssetId = s.parse().unwrap();
assert_eq!(id.to_string(), s);
}
/// Input shorter than 32 characters is rejected.
#[test]
fn newtype_rejects_short() {
let r: Result<AssetId, _> = "abc".parse();
assert!(r.is_err());
}
/// A 32-character input containing non-hex characters is rejected.
#[test]
fn newtype_rejects_non_hex() {
let r: Result<AssetId, _> = "ZZZ456789abcdef0123456789abcdef0".parse();
assert!(r.is_err());
}
/// Uppercase hex is accepted, and both the stored string and the `Display`
/// output are lowercase.
#[test]
fn newtype_accepts_uppercase_normalizes_to_lowercase() {
let r: Result<AssetId, _> = "0123456789ABCDEF0123456789ABCDEF".parse();
let id = r.expect("uppercase hex must be accepted");
assert_eq!(id.0, "0123456789abcdef0123456789abcdef");
assert_eq!(id.to_string(), "0123456789abcdef0123456789abcdef");
}
/// Accepting uppercase hex must not let non-hex letters through.
#[test]
fn newtype_rejects_invalid_chars_after_uppercase_pass() {
// Mix of upper-hex (would pass) and non-hex `XYZ` (must reject).
let r: Result<AssetId, _> = "DEADBEEFCAFEBAB1XYZ23456789ABCD0".parse();
assert!(r.is_err());
}
/// Determinism: 1000 runs of `id_from` over the same input yield the same
/// hex.
#[test]
fn id_from_deterministic_1000() {
#[derive(Serialize)]
struct T<'a> {
a: u32,
b: &'a str,
}
let input = T { a: 7, b: "hello" };
let first = id_from(&input);
for _ in 0..1000 {
assert_eq!(id_from(&input), first);
}
assert_eq!(first.len(), 32);
}
/// Key order in the source struct does not affect hash (canonical JSON
/// sorts keys alphabetically).
#[test]
fn id_from_key_order_invariant() {
#[derive(Serialize)]
struct A {
a: u32,
b: u32,
}
#[derive(Serialize)]
struct B {
b: u32,
a: u32,
}
assert_eq!(id_from(A { a: 1, b: 2 }), id_from(B { b: 2, a: 1 }));
}
/// The expected hex below is hand-computed via design §4.2:
/// tuple = { "kind": "asset", "asset_blake3": "deadbeef" }
/// canonical JSON (key-sorted, no whitespace, both keys are pure ASCII):
/// {"asset_blake3":"deadbeef","kind":"asset"}
/// blake3 of those bytes → hex → first 32 chars.
/// Pinned via an independent tool (b3sum, computed once outside the code
/// under test) so a regression in our JCS or hash pipeline is caught.
#[test]
fn id_for_asset_pinned() {
// printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum
// → cec9353553efb238a7919d38d3e148f1...
let id = id_for_asset("deadbeef");
assert_eq!(id.0, "cec9353553efb238a7919d38d3e148f1");
}
/// Independent pin for id_for_doc.
/// canonical JSON:
/// {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d",
/// "kind":"doc",
/// "parser_version":"pulldown-cmark-0.x",
/// "workspace_path":"notes/test.md"}
/// (concatenated, no whitespace).
#[test]
fn id_for_doc_pinned() {
let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
let pv = ParserVersion("pulldown-cmark-0.x".to_string());
let id = id_for_doc(&path, &asset, &pv);
assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");
}
/// Independent pin for id_for_block.
/// inputs:
/// doc=DocumentId("aabbccdd00112233445566778899aabb"),
/// block_kind="paragraph", heading_path=["Intro"], ordinal=3,
/// span=SourceSpan::Line { start: 10, end: 20 }
/// canonical JSON (key-sorted, compact, no whitespace):
/// {"block_kind":"paragraph",
/// "doc_id":"aabbccdd00112233445566778899aabb",
/// "heading_path":["Intro"],
/// "kind":"block",
/// "ordinal":3,
/// "source_span":{"end":20,"kind":"line","start":10}}
/// computed via:
/// printf '{"block_kind":"paragraph","doc_id":"aabbccdd00112233445566778899aabb","heading_path":["Intro"],"kind":"block","ordinal":3,"source_span":{"end":20,"kind":"line","start":10}}' \
/// | ~/.cargo/bin/b3sum --no-names | cut -c1-32
/// → 8a7bf22de7ec3293a792028c829b3812
#[test]
fn id_for_block_pinned() {
let doc = DocumentId("aabbccdd00112233445566778899aabb".to_string());
let heading = vec!["Intro".to_string()];
let span = SourceSpan::Line { start: 10, end: 20 };
// Sanity check: confirm that the canonical JSON our code produces
// matches the literal we hashed externally. If a future field-order
// change (or rename) silently shifts the hash, this assertion fails
// before the hex comparison and points at the JSON layer directly.
let expected_json = b"{\"block_kind\":\"paragraph\",\"doc_id\":\"aabbccdd00112233445566778899aabb\",\"heading_path\":[\"Intro\"],\"kind\":\"block\",\"ordinal\":3,\"source_span\":{\"end\":20,\"kind\":\"line\",\"start\":10}}";
let tuple = BlockTuple {
kind: "block",
doc_id: &doc.0,
block_kind: "paragraph",
heading_path: &heading,
ordinal: 3,
source_span: &span,
};
assert_eq!(
serde_json_canonicalizer::to_vec(&tuple).unwrap(),
expected_json
);
let id = id_for_block(&doc, "paragraph", &heading, 3, &span);
assert_eq!(id.0, "8a7bf22de7ec3293a792028c829b3812");
}
/// Independent pin for id_for_chunk.
/// inputs:
/// doc=DocumentId("aabbccdd00112233445566778899aabb"),
/// chunker_version=ChunkerVersion("greedy-1.0"),
/// block_ids=[BlockId("a1b2c3d4e5f6789012345678abcdef00")],
/// policy_hash="abc123"
/// canonical JSON (key-sorted, compact, no whitespace):
/// {"block_ids":["a1b2c3d4e5f6789012345678abcdef00"],
/// "chunker_version":"greedy-1.0",
/// "doc_id":"aabbccdd00112233445566778899aabb",
/// "kind":"chunk",
/// "policy_hash":"abc123"}
/// computed via:
/// printf '{"block_ids":["a1b2c3d4e5f6789012345678abcdef00"],"chunker_version":"greedy-1.0","doc_id":"aabbccdd00112233445566778899aabb","kind":"chunk","policy_hash":"abc123"}' \
/// | ~/.cargo/bin/b3sum --no-names | cut -c1-32
/// → 8809f627777fe7ca5c4433b97dd88ce9
#[test]
fn id_for_chunk_pinned() {
let doc = DocumentId("aabbccdd00112233445566778899aabb".to_string());
let cv = ChunkerVersion("greedy-1.0".to_string());
let blocks = vec![BlockId("a1b2c3d4e5f6789012345678abcdef00".to_string())];
// Same JSON-layer sanity check as in `id_for_block_pinned`.
let expected_json = b"{\"block_ids\":[\"a1b2c3d4e5f6789012345678abcdef00\"],\"chunker_version\":\"greedy-1.0\",\"doc_id\":\"aabbccdd00112233445566778899aabb\",\"kind\":\"chunk\",\"policy_hash\":\"abc123\"}";
let tuple = ChunkTuple {
kind: "chunk",
doc_id: &doc.0,
chunker_version: &cv.0,
block_ids: blocks.iter().map(|b| b.0.as_str()).collect(),
policy_hash: "abc123",
};
assert_eq!(
serde_json_canonicalizer::to_vec(&tuple).unwrap(),
expected_json
);
let id = id_for_chunk(&doc, &cv, &blocks, "abc123");
assert_eq!(id.0, "8809f627777fe7ca5c4433b97dd88ce9");
}
/// Independent pin for id_for_embedding.
/// inputs:
/// chunk=ChunkId("d1e2f3a4b5c6789012345678aabbccdd"),
/// model_id=EmbeddingModelId("BAAI/bge-small-en"),
/// model_version=EmbeddingVersion("v1"), dimensions=384
/// canonical JSON (key-sorted, compact, no whitespace):
/// {"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd",
/// "dimensions":384,
/// "kind":"embedding",
/// "model_id":"BAAI/bge-small-en",
/// "model_version":"v1"}
/// computed via:
/// printf '{"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd","dimensions":384,"kind":"embedding","model_id":"BAAI/bge-small-en","model_version":"v1"}' \
/// | ~/.cargo/bin/b3sum --no-names | cut -c1-32
/// → 71992c457a5da39880a6d17d646ed0fd
#[test]
fn id_for_embedding_pinned() {
let chunk = ChunkId("d1e2f3a4b5c6789012345678aabbccdd".to_string());
let model = EmbeddingModelId("BAAI/bge-small-en".to_string());
let version = EmbeddingVersion("v1".to_string());
// Same JSON-layer sanity check as in `id_for_block_pinned`.
let expected_json = b"{\"chunk_id\":\"d1e2f3a4b5c6789012345678aabbccdd\",\"dimensions\":384,\"kind\":\"embedding\",\"model_id\":\"BAAI/bge-small-en\",\"model_version\":\"v1\"}";
let tuple = EmbeddingTuple {
kind: "embedding",
chunk_id: &chunk.0,
model_id: &model.0,
model_version: &version.0,
dimensions: 384,
};
assert_eq!(
serde_json_canonicalizer::to_vec(&tuple).unwrap(),
expected_json
);
let id = id_for_embedding(&chunk, &model, &version, 384);
assert_eq!(id.0, "71992c457a5da39880a6d17d646ed0fd");
}
/// `strip_alias_suffix` returns the text before the first `#alias` for both
/// marker forms and leaves marker-free input (including the empty string)
/// unchanged.
#[test]
fn strip_alias_suffix_roundtrip() {
let bare = "0123456789abcdef0123456789abcdef";
let with_suffix = format!("{bare}{ALIAS_SUFFIX}");
assert_eq!(strip_alias_suffix(&with_suffix), bare);
assert_eq!(strip_alias_suffix(bare), bare);
assert_eq!(strip_alias_suffix(""), "");
assert_eq!(strip_alias_suffix("#alias"), "");
// Per-alias form `{orig}#alias#N` strips to the bare chunk ID.
assert_eq!(strip_alias_suffix(&format!("{bare}{ALIAS_SUFFIX}#3")), bare);
assert_eq!(strip_alias_suffix(&format!("{bare}{ALIAS_SUFFIX}#0")), bare);
assert_eq!(strip_alias_suffix("#alias#3"), "");
}
/// Independent pin for id_for_index.
/// inputs:
/// collection="default",
/// embedding_model=EmbeddingModelId("BAAI/bge-small-en"),
/// dimensions=384, version=IndexVersion("v1"),
/// kind="hnsw", params_hash="xyz"
/// canonical JSON (key-sorted, compact, no whitespace):
/// {"collection":"default",
/// "dimensions":384,
/// "embedding_model":"BAAI/bge-small-en",
/// "index_kind":"hnsw",
/// "index_params_hash":"xyz",
/// "index_version":"v1",
/// "kind":"index"}
/// computed via:
/// printf '{"collection":"default","dimensions":384,"embedding_model":"BAAI/bge-small-en","index_kind":"hnsw","index_params_hash":"xyz","index_version":"v1","kind":"index"}' \
/// | ~/.cargo/bin/b3sum --no-names | cut -c1-32
/// → e733ee2f9936f0e1ac5143cdbf0f2b54
#[test]
fn id_for_index_pinned() {
let model = EmbeddingModelId("BAAI/bge-small-en".to_string());
let version = IndexVersion("v1".to_string());
// Same JSON-layer sanity check as in `id_for_block_pinned`.
let expected_json = b"{\"collection\":\"default\",\"dimensions\":384,\"embedding_model\":\"BAAI/bge-small-en\",\"index_kind\":\"hnsw\",\"index_params_hash\":\"xyz\",\"index_version\":\"v1\",\"kind\":\"index\"}";
let tuple = IndexTuple {
kind: "index",
collection: "default",
embedding_model: &model.0,
dimensions: 384,
index_version: &version.0,
index_kind: "hnsw",
index_params_hash: "xyz",
};
assert_eq!(
serde_json_canonicalizer::to_vec(&tuple).unwrap(),
expected_json
);
let id = id_for_index("default", &model, 384, &version, "hnsw", "xyz");
assert_eq!(id.0, "e733ee2f9936f0e1ac5143cdbf0f2b54");
}
}