두 번째 commit. 사용자 facing surface (CLI binary, env vars, XDG paths) + 코드 안 single-letter token (`KB_`, `kb.sqlite`, `/kb/`, tracing target) 일괄 rename. 그리고 3 개 file rename: - 디자인 doc `2026-04-27-kb-final-form-design.md` → `2026-04-27-kebab-final-form-design.md` - 최초 보고서 `kb_local_rust_report.md` → `kebab_local_rust_report.md` - workspace ignore `.kbignore` → `.kebabignore` ## 변경 - `crates/kebab-cli/Cargo.toml`: `[[bin]] name = "kb"` → `"kebab"`. - `crates/kebab-cli/src/main.rs`: `#[command(name = "kb", …)]` → `name = "kebab"`. - 모든 `KB_*` env var (코드 + doc + 테스트) → `KEBAB_*`. apply_env prefix 매칭 + 30+ 개 setting 키 모두. - XDG paths: `~/.config/kb` / `~/.local/share/kb` / `~/.cache/kb` / `~/.local/state/kb` → `~/.config/kebab` 등. config defaults + expand_path tests + paths.rs 의 hardcode 모두. - SQLite filename: `kb.sqlite` → `kebab.sqlite` (`SQLITE_FILE` const + 테스트 hardcode 모두). - tracing target: `target: "kb-*"` → `"kebab-*"` (10+ 곳). - snapshot fixture: `.kbignore` → `.kebabignore` (`fixtures/source-fs/ tree-1.snapshot.json` 갱신). ## 검증 - `cargo test --workspace -j 1` clean (linker OOM 회피 위해 직렬). - `cargo clippy --workspace --all-targets -- -D warnings` clean. 다음 commit 에서 docs sweep. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
307 lines
12 KiB
Rust
307 lines
12 KiB
Rust
//! `kebab-embed-local` — `FastembedEmbedder`, a local ONNX-backed
|
|
//! [`Embedder`](kebab_embed::Embedder) implementation.
|
|
//!
|
|
//! Wraps [`fastembed::TextEmbedding`] for the default `multilingual-e5-small`
|
|
//! (384-dim) model. Honors `config.models.embedding.batch_size` and applies
|
|
//! the e5 prefix convention (§11.3 of the design report):
|
|
//!
|
|
//! * `EmbeddingKind::Document` → `"passage: "` prefix
|
|
//! * `EmbeddingKind::Query` → `"query: "` prefix
|
|
//!
|
|
//! The underlying fastembed `TextEmbedding::embed` already L2-normalizes each
|
|
//! row (see `fastembed::text_embedding::output::transformer_with_precedence`),
|
|
//! so we do not re-normalize; the unit-norm test in `tests/` keeps that
|
|
//! invariant pinned in case fastembed changes its default.
|
|
//!
|
|
//! Model files are cached under
|
|
//! `config.storage.model_dir/fastembed/`. The `model_dir` template
|
|
//! (default `"{data_dir}/models"`) is resolved with the same expansion
|
|
//! rules `kebab-store-sqlite` applies to `data_dir` (`${XDG_DATA_HOME:-…}`,
|
|
//! leading `~`, `{data_dir}` substitution).
|
|
//!
|
|
//! See `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
|
|
//! §7.2 (Embedder), §6.4 ([models.embedding]), §9 (versioning).
|
|
|
|
use std::sync::Mutex;
|
|
|
|
use anyhow::{Context, Result};
|
|
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
|
|
use kebab_config::expand_path;
|
|
use kebab_embed::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
|
|
|
|
/// Name of the directory, nested inside `config.storage.model_dir`, in
/// which the fastembed adapter keeps its ONNX and tokenizer files.
///
/// The value is fixed by the task spec ("Model files cached under
/// `config.storage.model_dir/fastembed/`") and is therefore a constant
/// rather than a config knob.
const FASTEMBED_CACHE_SUBDIR: &str = "fastembed";
|
|
|
|
/// Local fastembed-rs adapter.
|
|
///
|
|
/// Construct via [`FastembedEmbedder::new`]. The constructor performs the
|
|
/// (potentially network-bound) model download on first use, so prefer to
|
|
/// share an instance across calls.
|
|
pub struct FastembedEmbedder {
|
|
// Mutex serializes calls into TextEmbedding's underlying ONNX session.
|
|
// fastembed::TextEmbedding::embed is `&self` in 4.9 and ORT Session is
|
|
// Send + Sync, so this Mutex is conservative — it serializes inference
|
|
// where parallel ORT calls would in principle work. Acceptable here
|
|
// because callers (kb-app indexer) batch sequentially anyway. Revisit
|
|
// in P3-3+ if profiling shows contention.
|
|
inner: Mutex<TextEmbedding>,
|
|
model_id: EmbeddingModelId,
|
|
version: EmbeddingVersion,
|
|
dimensions: usize,
|
|
batch_size: usize,
|
|
}
|
|
|
|
impl FastembedEmbedder {
|
|
/// Build an embedder from `Config`. Validates that
|
|
/// `config.models.embedding.dimensions` matches the model's actual
|
|
/// dim BEFORE returning, so a mismatch fails at construction (not on
|
|
/// first `embed`).
|
|
pub fn new(config: &kebab_config::Config) -> Result<Self> {
|
|
// 1. Resolve `{data_dir}/models/fastembed/` from the config
|
|
// templates. Goes through the shared `kebab_config::expand_path`
|
|
// so every crate resolves storage paths identically.
|
|
let data_dir = expand_path(&config.storage.data_dir, "");
|
|
let model_dir = expand_path(&config.storage.model_dir, &data_dir.to_string_lossy());
|
|
let cache_dir = model_dir.join(FASTEMBED_CACHE_SUBDIR);
|
|
std::fs::create_dir_all(&cache_dir)
|
|
.with_context(|| format!("create fastembed cache dir {}", cache_dir.display()))?;
|
|
|
|
// 2. Resolve the fastembed enum variant from
|
|
// `config.models.embedding.model`. Currently only the default
|
|
// `multilingual-e5-small` is wired; other model names error
|
|
// out with a clear message rather than silently misconfiguring.
|
|
let model_name = resolve_model(&config.models.embedding.model)?;
|
|
|
|
// 3. Verify dim match BEFORE loading the model — if the config
|
|
// is wrong we want to fail without paying the ONNX
|
|
// initialization cost.
|
|
let model_info = TextEmbedding::get_model_info(&model_name)
|
|
.context("fastembed: get_model_info")?;
|
|
check_dim(model_info.dim, config.models.embedding.dimensions)?;
|
|
|
|
tracing::info!(
|
|
target: "kebab-embed-local",
|
|
cache_dir = %cache_dir.display(),
|
|
model = %config.models.embedding.model,
|
|
dims = model_info.dim,
|
|
"initializing FastembedEmbedder"
|
|
);
|
|
|
|
// 4. Build the underlying TextEmbedding. `show_download_progress`
|
|
// is forced to `false` so test output stays clean; first-run
|
|
// download progress is surfaced via the `tracing::info!`
|
|
// pair around `TextEmbedding::try_new` instead.
|
|
let opts = InitOptions::new(model_name.clone())
|
|
.with_cache_dir(cache_dir.clone())
|
|
.with_show_download_progress(false);
|
|
tracing::info!(
|
|
target: "kebab-embed-local",
|
|
model = %config.models.embedding.model,
|
|
cache_dir = %cache_dir.display(),
|
|
"loading embedding model (first run will download ~470MB)"
|
|
);
|
|
let inner = TextEmbedding::try_new(opts)
|
|
.context("fastembed: TextEmbedding::try_new")?;
|
|
let dimensions = model_info.dim;
|
|
tracing::info!(
|
|
target: "kebab-embed-local",
|
|
model = %config.models.embedding.model,
|
|
dimensions,
|
|
"embedding model loaded"
|
|
);
|
|
|
|
Ok(Self {
|
|
inner: Mutex::new(inner),
|
|
model_id: EmbeddingModelId(config.models.embedding.model.clone()),
|
|
version: EmbeddingVersion(config.models.embedding.version.clone()),
|
|
dimensions,
|
|
batch_size: config.models.embedding.batch_size,
|
|
})
|
|
}
|
|
}
|
|
|
|
impl Embedder for FastembedEmbedder {
|
|
fn model_id(&self) -> EmbeddingModelId {
|
|
self.model_id.clone()
|
|
}
|
|
|
|
fn model_version(&self) -> EmbeddingVersion {
|
|
self.version.clone()
|
|
}
|
|
|
|
fn dimensions(&self) -> usize {
|
|
self.dimensions
|
|
}
|
|
|
|
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> Result<Vec<Vec<f32>>> {
|
|
if inputs.is_empty() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
// Apply e5 prefix per §11.3 BEFORE tokenization. The fastembed
|
|
// model is unaware of the document/query distinction; the prefix
|
|
// is the only signal that lets it produce different embeddings
|
|
// for the same surface text in different roles.
|
|
let prefixed: Vec<String> = inputs.iter().map(prefix_input).collect();
|
|
|
|
// We run our own batch loop on top of fastembed's internal one
|
|
// so that `config.models.embedding.batch_size` is honored
|
|
// exactly. fastembed's `embed(_, Some(batch_size))` does the
|
|
// same internally; calling once with our batch size matches
|
|
// intent and avoids an extra per-batch allocation.
|
|
let mut out: Vec<Vec<f32>> = Vec::with_capacity(prefixed.len());
|
|
for chunk in prefixed.chunks(self.batch_size) {
|
|
let chunk_vec: Vec<&str> = chunk.iter().map(String::as_str).collect();
|
|
let guard = self
|
|
.inner
|
|
.lock()
|
|
.unwrap_or_else(|p| p.into_inner());
|
|
let batch: Vec<Vec<f32>> = guard
|
|
.embed(chunk_vec, Some(self.batch_size))
|
|
.context("fastembed: embed")?;
|
|
drop(guard);
|
|
// Defensive shape check — every returned vector must match
|
|
// the configured `dimensions`. Mismatch here means fastembed
|
|
// and our config drifted at runtime (extremely unlikely;
|
|
// would have been caught at construction).
|
|
for v in &batch {
|
|
if v.len() != self.dimensions {
|
|
anyhow::bail!(
|
|
"fastembed returned vector of length {} but adapter expects {}",
|
|
v.len(),
|
|
self.dimensions
|
|
);
|
|
}
|
|
}
|
|
out.extend(batch);
|
|
}
|
|
|
|
debug_assert_eq!(out.len(), inputs.len());
|
|
Ok(out)
|
|
}
|
|
}
|
|
|
|
/// Build the prefixed string for one [`EmbeddingInput`]. Free function so
|
|
/// the unit test can pin the exact format without going through `embed`.
|
|
fn prefix_input(input: &EmbeddingInput<'_>) -> String {
|
|
match input.kind {
|
|
EmbeddingKind::Document => format!("passage: {}", input.text),
|
|
EmbeddingKind::Query => format!("query: {}", input.text),
|
|
}
|
|
}
|
|
|
|
/// Resolve a `config.models.embedding.model` string to a fastembed
|
|
/// `EmbeddingModel` enum variant. Only `multilingual-e5-small` is wired
|
|
/// for p3-2; additional model names should be added (and their dims
|
|
/// pinned in tests) as needed.
|
|
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
|
|
match name {
|
|
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
|
|
other => anyhow::bail!(
|
|
"kb-embed-local: unsupported embedding model {other:?}; \
|
|
this adapter currently only ships `multilingual-e5-small`. \
|
|
Add a new arm to `resolve_model` (and a fastembed feature \
|
|
flag if needed) to support more."
|
|
),
|
|
}
|
|
}
|
|
|
|
/// Compare model dim against the configured dim. Extracted so a unit
|
|
/// test can exercise the error branch without loading ONNX.
|
|
pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
|
|
if model_dim != cfg_dim {
|
|
anyhow::bail!(
|
|
"dimension mismatch: model={model_dim}, config={cfg_dim}; \
|
|
update `config.models.embedding.dimensions` to match the model \
|
|
(or pick a different model)."
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_embed::EmbeddingInput;

    /// Shorthand for building an [`EmbeddingInput`] in the prefix tests.
    fn input(text: &str, kind: EmbeddingKind) -> EmbeddingInput<'_> {
        EmbeddingInput { text, kind }
    }

    // ── check_dim ────────────────────────────────────────────────────
    //
    // Covers the construction-time dimension check without loading the
    // real model; the integration test that builds a full
    // FastembedEmbedder is `#[ignore]`d (loads ~470 MB of weights).

    #[test]
    fn check_dim_match_ok() {
        assert!(check_dim(384, 384).is_ok(), "matching dims must pass");
    }

    #[test]
    fn check_dim_mismatch_errors() {
        let msg = check_dim(384, 512)
            .expect_err("mismatch must error")
            .to_string();
        for needle in ["dimension mismatch", "384", "512"] {
            assert!(msg.contains(needle), "msg={msg}");
        }
    }

    // ── prefix_input ─────────────────────────────────────────────────
    //
    // The exact e5 prefix strings are pinned here: a silent change would
    // degrade retrieval quality without tripping any of the
    // dim/norm/snapshot tests.

    #[test]
    fn prefix_document_uses_passage() {
        let doc = input("hello world", EmbeddingKind::Document);
        assert_eq!(prefix_input(&doc), "passage: hello world");
    }

    #[test]
    fn prefix_query_uses_query() {
        let qry = input("hello world", EmbeddingKind::Query);
        assert_eq!(prefix_input(&qry), "query: hello world");
    }

    #[test]
    fn prefix_handles_empty_text() {
        let cases = [
            (EmbeddingKind::Document, "passage: "),
            (EmbeddingKind::Query, "query: "),
        ];
        for (kind, expected) in cases {
            assert_eq!(prefix_input(&input("", kind)), expected);
        }
    }

    // ── resolve_model ────────────────────────────────────────────────

    #[test]
    fn resolve_default_model_ok() {
        // Only checks that the default name resolves; which variant it
        // maps to is not asserted here.
        assert!(
            resolve_model("multilingual-e5-small").is_ok(),
            "default model resolves"
        );
    }

    #[test]
    fn resolve_unknown_model_errors() {
        let msg = resolve_model("not-a-real-model")
            .expect_err("unknown model errors")
            .to_string();
        assert!(msg.contains("unsupported embedding model"), "msg={msg}");
    }

    // `expand_path` is imported from `kebab_config`; this adapter relies
    // on that crate's own tests instead of duplicating them here.
}
|