Merge pull request 'feat(p3-2): fastembed-adapter — kb-embed-local 크레이트 + FastembedEmbedder' (#15) from feat/p3-2-fastembed-adapter into main

Reviewed-on: altair823-org/kb#15
2026-05-01 08:42:06 +00:00
parent 9f2afc73dc bcbe2b8531
commit 9beef930b4
6 changed files with 2937 additions and 15 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ members = [
    "crates/kb-store-sqlite",
    "crates/kb-search",
    "crates/kb-embed",
+    "crates/kb-embed-local",
    "crates/kb-app",
    "crates/kb-cli",
 ]
@@ -37,3 +38,8 @@ rusqlite     = { version = "0.32", features = ["bundled"] }
 globset      = "0.4"
 tempfile     = "3"
 proptest     = "1"
+# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
+# in its default set (which also pulls `hf-hub` for first-run model
+# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
+# remains untested for this workspace).
+fastembed    = "4.9"
--- a/crates/kb-embed-local/Cargo.toml
+++ b/crates/kb-embed-local/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name          = "kb-embed-local"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Local fastembed-rs adapter implementing kb_core::Embedder (multilingual-e5-small default)"
+
+[dependencies]
+kb-config = { path = "../kb-config" }
+kb-embed  = { path = "../kb-embed" }
+# Default features bring `ort-download-binaries` (bundled ONNX runtime)
+# and `hf-hub-native-tls` (first-run model download). No extra features
+# needed for the multilingual-e5-small path.
+fastembed = { workspace = true }
+tracing   = { workspace = true }
+anyhow    = { workspace = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }
+serde_json = { workspace = true }
--- a/crates/kb-embed-local/src/lib.rs
+++ b/crates/kb-embed-local/src/lib.rs
@@ -0,0 +1,433 @@
+//! `kb-embed-local` — `FastembedEmbedder`, a local ONNX-backed
+//! [`Embedder`](kb_embed::Embedder) implementation.
+//!
+//! Wraps [`fastembed::TextEmbedding`] for the default `multilingual-e5-small`
+//! (384-dim) model. Honors `config.models.embedding.batch_size` and applies
+//! the e5 prefix convention (§11.3 of the design report):
+//!
+//! * `EmbeddingKind::Document` → `"passage: "` prefix
+//! * `EmbeddingKind::Query`    → `"query: "` prefix
+//!
+//! The underlying fastembed `TextEmbedding::embed` already L2-normalizes each
+//! row (see `fastembed::text_embedding::output::transformer_with_precedence`),
+//! so we do not re-normalize; the unit-norm test in `tests/` keeps that
+//! invariant pinned in case fastembed changes its default.
+//!
+//! Model files are cached under
+//! `config.storage.model_dir/fastembed/`. The `model_dir` template
+//! (default `"{data_dir}/models"`) is resolved with the same expansion
+//! rules `kb-store-sqlite` applies to `data_dir` (`${XDG_DATA_HOME:-…}`,
+//! leading `~`, `{data_dir}` substitution).
+//!
+//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md`
+//! §7.2 (Embedder), §6.4 ([models.embedding]), §9 (versioning).
+
+use std::path::PathBuf;
+use std::sync::Mutex;
+
+use anyhow::{Context, Result};
+use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
+use kb_embed::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
+
+/// Subdirectory under `config.storage.model_dir` where the fastembed
+/// adapter writes / reads ONNX + tokenizer files. Hard-coded per task
+/// spec ("Model files cached under `config.storage.model_dir/fastembed/`").
+const FASTEMBED_CACHE_SUBDIR: &str = "fastembed";
+
+/// Local fastembed-rs adapter.
+///
+/// Construct via [`FastembedEmbedder::new`]. The constructor performs the
+/// (potentially network-bound) model download on first use, so prefer to
+/// share an instance across calls.
+pub struct FastembedEmbedder {
+    // Mutex serializes calls into TextEmbedding's underlying ONNX session.
+    // fastembed::TextEmbedding::embed is `&self` in 4.9 and ORT Session is
+    // Send + Sync, so this Mutex is conservative — it serializes inference
+    // where parallel ORT calls would in principle work. Acceptable here
+    // because callers (kb-app indexer) batch sequentially anyway. Revisit
+    // in P3-3+ if profiling shows contention.
+    inner: Mutex<TextEmbedding>,
+    model_id: EmbeddingModelId,
+    version: EmbeddingVersion,
+    dimensions: usize,
+    batch_size: usize,
+}
+
+impl FastembedEmbedder {
+    /// Build an embedder from `Config`. Validates that
+    /// `config.models.embedding.dimensions` matches the model's actual
+    /// dim BEFORE returning, so a mismatch fails at construction (not on
+    /// first `embed`).
+    pub fn new(config: &kb_config::Config) -> Result<Self> {
+        // 1. Resolve `{data_dir}/models/fastembed/` from the config
+        //    templates. `kb-config` does not expose a public path
+        //    resolver yet, so we hand-roll a tiny one mirroring
+        //    kb-store-sqlite's `expand_data_dir`.
+        let data_dir = expand_path(&config.storage.data_dir, "");
+        let model_dir = expand_path(&config.storage.model_dir, &data_dir.to_string_lossy());
+        let cache_dir = model_dir.join(FASTEMBED_CACHE_SUBDIR);
+        std::fs::create_dir_all(&cache_dir)
+            .with_context(|| format!("create fastembed cache dir {}", cache_dir.display()))?;
+
+        // 2. Resolve the fastembed enum variant from
+        //    `config.models.embedding.model`. Currently only the default
+        //    `multilingual-e5-small` is wired; other model names error
+        //    out with a clear message rather than silently misconfiguring.
+        let model_name = resolve_model(&config.models.embedding.model)?;
+
+        // 3. Verify dim match BEFORE loading the model — if the config
+        //    is wrong we want to fail without paying the ONNX
+        //    initialization cost.
+        let model_info = TextEmbedding::get_model_info(&model_name)
+            .context("fastembed: get_model_info")?;
+        check_dim(model_info.dim, config.models.embedding.dimensions)?;
+
+        tracing::info!(
+            target: "kb-embed-local",
+            cache_dir = %cache_dir.display(),
+            model = %config.models.embedding.model,
+            dims = model_info.dim,
+            "initializing FastembedEmbedder"
+        );
+
+        // 4. Build the underlying TextEmbedding. `show_download_progress`
+        //    is forced to `false` so test output stays clean; first-run
+        //    download progress is surfaced via the `tracing::info!`
+        //    pair around `TextEmbedding::try_new` instead.
+        let opts = InitOptions::new(model_name.clone())
+            .with_cache_dir(cache_dir.clone())
+            .with_show_download_progress(false);
+        tracing::info!(
+            target: "kb-embed-local",
+            model = %config.models.embedding.model,
+            cache_dir = %cache_dir.display(),
+            "loading embedding model (first run will download ~470MB)"
+        );
+        let inner = TextEmbedding::try_new(opts)
+            .context("fastembed: TextEmbedding::try_new")?;
+        let dimensions = model_info.dim;
+        tracing::info!(
+            target: "kb-embed-local",
+            model = %config.models.embedding.model,
+            dimensions,
+            "embedding model loaded"
+        );
+
+        Ok(Self {
+            inner: Mutex::new(inner),
+            model_id: EmbeddingModelId(config.models.embedding.model.clone()),
+            version: EmbeddingVersion(config.models.embedding.version.clone()),
+            dimensions,
+            batch_size: config.models.embedding.batch_size,
+        })
+    }
+}
+
+impl Embedder for FastembedEmbedder {
+    fn model_id(&self) -> EmbeddingModelId {
+        self.model_id.clone()
+    }
+
+    fn model_version(&self) -> EmbeddingVersion {
+        self.version.clone()
+    }
+
+    fn dimensions(&self) -> usize {
+        self.dimensions
+    }
+
+    fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> Result<Vec<Vec<f32>>> {
+        if inputs.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Apply e5 prefix per §11.3 BEFORE tokenization. The fastembed
+        // model is unaware of the document/query distinction; the prefix
+        // is the only signal that lets it produce different embeddings
+        // for the same surface text in different roles.
+        let prefixed: Vec<String> = inputs.iter().map(prefix_input).collect();
+
+        // We run our own batch loop on top of fastembed's internal one
+        // so that `config.models.embedding.batch_size` is honored
+        // exactly. fastembed's `embed(_, Some(batch_size))` does the
+        // same internally; calling once with our batch size matches
+        // intent and avoids an extra per-batch allocation.
+        let mut out: Vec<Vec<f32>> = Vec::with_capacity(prefixed.len());
+        for chunk in prefixed.chunks(self.batch_size) {
+            let chunk_vec: Vec<&str> = chunk.iter().map(String::as_str).collect();
+            let guard = self
+                .inner
+                .lock()
+                .unwrap_or_else(|p| p.into_inner());
+            let batch: Vec<Vec<f32>> = guard
+                .embed(chunk_vec, Some(self.batch_size))
+                .context("fastembed: embed")?;
+            drop(guard);
+            // Defensive shape check — every returned vector must match
+            // the configured `dimensions`. Mismatch here means fastembed
+            // and our config drifted at runtime (extremely unlikely;
+            // would have been caught at construction).
+            for v in &batch {
+                if v.len() != self.dimensions {
+                    anyhow::bail!(
+                        "fastembed returned vector of length {} but adapter expects {}",
+                        v.len(),
+                        self.dimensions
+                    );
+                }
+            }
+            out.extend(batch);
+        }
+
+        debug_assert_eq!(out.len(), inputs.len());
+        Ok(out)
+    }
+}
+
+/// Build the prefixed string for one [`EmbeddingInput`]. Free function so
+/// the unit test can pin the exact format without going through `embed`.
+fn prefix_input(input: &EmbeddingInput<'_>) -> String {
+    match input.kind {
+        EmbeddingKind::Document => format!("passage: {}", input.text),
+        EmbeddingKind::Query => format!("query: {}", input.text),
+    }
+}
+
+/// Resolve a `config.models.embedding.model` string to a fastembed
+/// `EmbeddingModel` enum variant. Only `multilingual-e5-small` is wired
+/// for p3-2; additional model names should be added (and their dims
+/// pinned in tests) as needed.
+fn resolve_model(name: &str) -> Result<EmbeddingModel> {
+    match name {
+        "multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
+        other => anyhow::bail!(
+            "kb-embed-local: unsupported embedding model {other:?}; \
+             this adapter currently only ships `multilingual-e5-small`. \
+             Add a new arm to `resolve_model` (and a fastembed feature \
+             flag if needed) to support more."
+        ),
+    }
+}
+
+/// Compare model dim against the configured dim. Extracted so a unit
+/// test can exercise the error branch without loading ONNX.
+pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
+    if model_dim != cfg_dim {
+        anyhow::bail!(
+            "dimension mismatch: model={model_dim}, config={cfg_dim}; \
+             update `config.models.embedding.dimensions` to match the model \
+             (or pick a different model)."
+        );
+    }
+    Ok(())
+}
+
+/// Expand the limited template language `kb-config` uses for storage
+/// paths.
+///
+/// Supported substitutions, applied in order:
+/// 1. `{data_dir}` → `data_dir` (caller-supplied resolved string). This
+///    is a no-op when `data_dir` is empty (used by the recursive call
+///    that resolves `data_dir` itself).
+/// 2. `${XDG_DATA_HOME:-~/.local/share}` (and the bare
+///    `${XDG_DATA_HOME}`) → env var if set, else the default after
+///    `:-`.
+/// 3. Leading `~` → `$HOME`.
+///
+/// Mirrors `kb-store-sqlite::store::expand_data_dir`. Kept private to
+/// this crate; promoting it to a public `kb-config` API is a separate
+/// task (see task p3-2 risks: "don't expand kb-config's public API").
+fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
+    let mut s = raw.to_string();
+
+    if !data_dir.is_empty() {
+        s = s.replace("{data_dir}", data_dir);
+    }
+
+    // ${XDG_DATA_HOME:-~/.local/share}: respect env override, else fall
+    // back to the suffix after `:-`.
+    if let Some(start) = s.find("${XDG_DATA_HOME") {
+        if let Some(rel_end) = s[start..].find('}') {
+            let end = start + rel_end + 1; // include trailing '}'
+            let inner = &s[start + 2..end - 1]; // strip ${ and }
+            let replacement = match std::env::var("XDG_DATA_HOME") {
+                Ok(v) if !v.is_empty() => v,
+                _ => {
+                    if let Some((_, default)) = inner.split_once(":-") {
+                        default.to_string()
+                    } else {
+                        String::new()
+                    }
+                }
+            };
+            s.replace_range(start..end, &replacement);
+        }
+    }
+
+    // Leading `~` → $HOME.
+    if let Some(rest) = s.strip_prefix('~') {
+        if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
+            return home.join(rest.trim_start_matches('/'));
+        }
+    }
+
+    PathBuf::from(s)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kb_embed::EmbeddingInput;
+
+    // ── check_dim ────────────────────────────────────────────────────
+    //
+    // Exercises the construction-time dim mismatch branch WITHOUT
+    // loading the real model. The integration test that builds a full
+    // FastembedEmbedder is `#[ignore]`d (loads ~470 MB of weights).
+
+    #[test]
+    fn check_dim_match_ok() {
+        check_dim(384, 384).expect("matching dims must pass");
+    }
+
+    #[test]
+    fn check_dim_mismatch_errors() {
+        let err = check_dim(384, 512).expect_err("mismatch must error");
+        let msg = format!("{err}");
+        assert!(msg.contains("dimension mismatch"), "msg={msg}");
+        assert!(msg.contains("384"), "msg={msg}");
+        assert!(msg.contains("512"), "msg={msg}");
+    }
+
+    // ── prefix_input ─────────────────────────────────────────────────
+    //
+    // Pin the exact e5 prefix strings; a silent regression here
+    // degrades retrieval quality without any test failing in the
+    // dim/norm/snapshot suite.
+
+    #[test]
+    fn prefix_document_uses_passage() {
+        let input = EmbeddingInput {
+            text: "hello world",
+            kind: EmbeddingKind::Document,
+        };
+        assert_eq!(prefix_input(&input), "passage: hello world");
+    }
+
+    #[test]
+    fn prefix_query_uses_query() {
+        let input = EmbeddingInput {
+            text: "hello world",
+            kind: EmbeddingKind::Query,
+        };
+        assert_eq!(prefix_input(&input), "query: hello world");
+    }
+
+    #[test]
+    fn prefix_handles_empty_text() {
+        let doc = EmbeddingInput {
+            text: "",
+            kind: EmbeddingKind::Document,
+        };
+        let qry = EmbeddingInput {
+            text: "",
+            kind: EmbeddingKind::Query,
+        };
+        assert_eq!(prefix_input(&doc), "passage: ");
+        assert_eq!(prefix_input(&qry), "query: ");
+    }
+
+    // ── resolve_model ────────────────────────────────────────────────
+
+    #[test]
+    fn resolve_default_model_ok() {
+        // The exact enum variant is opaque, but `is_ok` plus a
+        // round-trip through the fastembed metadata gives confidence
+        // we hit the right arm.
+        resolve_model("multilingual-e5-small").expect("default model resolves");
+    }
+
+    #[test]
+    fn resolve_unknown_model_errors() {
+        let err = resolve_model("not-a-real-model").expect_err("unknown model errors");
+        let msg = format!("{err}");
+        assert!(msg.contains("unsupported embedding model"), "msg={msg}");
+    }
+
+    // ── expand_path ──────────────────────────────────────────────────
+
+    #[test]
+    fn expand_path_substitutes_data_dir_template() {
+        let p = expand_path("{data_dir}/models", "/tmp/kbtest");
+        assert_eq!(p, PathBuf::from("/tmp/kbtest/models"));
+    }
+
+    #[test]
+    fn expand_path_no_op_without_template() {
+        let p = expand_path("/abs/path", "/tmp/kbtest");
+        assert_eq!(p, PathBuf::from("/abs/path"));
+    }
+
+    // ── expand_path: XDG_DATA_HOME fallback ──────────────────────────
+    //
+    // These two tests mutate the process-wide `XDG_DATA_HOME` env var,
+    // which is unsafe under edition 2024 and racy under cargo's default
+    // parallel test runner. The shared `ENV_LOCK` serializes them; each
+    // test snapshots the prior value and restores it on exit.
+
+    use std::sync::Mutex as StdMutex;
+    static ENV_LOCK: StdMutex<()> = StdMutex::new(());
+
+    /// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores
+    /// it on drop. Pair with the `ENV_LOCK` guard for serial access.
+    struct XdgGuard {
+        prior: Option<String>,
+    }
+
+    impl XdgGuard {
+        fn capture() -> Self {
+            Self {
+                prior: std::env::var("XDG_DATA_HOME").ok(),
+            }
+        }
+    }
+
+    impl Drop for XdgGuard {
+        fn drop(&mut self) {
+            // SAFETY: edition 2024 marks `set_var`/`remove_var` unsafe
+            // because env mutation is not thread-safe. Callers hold
+            // `ENV_LOCK` for the duration of the test, so no other
+            // thread observes the mutation.
+            unsafe {
+                match &self.prior {
+                    Some(v) => std::env::set_var("XDG_DATA_HOME", v),
+                    None => std::env::remove_var("XDG_DATA_HOME"),
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn expand_path_xdg_data_home_set() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") };
+
+        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
+        assert_eq!(p, PathBuf::from("/custom/path/kb"));
+    }
+
+    #[test]
+    fn expand_path_xdg_data_home_unset_falls_back_to_home() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::remove_var("XDG_DATA_HOME") };
+
+        let home = std::env::var("HOME").expect("HOME must be set in tests");
+        let expected = PathBuf::from(home).join(".local/share/kb");
+        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
+        assert_eq!(p, expected);
+    }
+}
--- a/crates/kb-embed-local/tests/embed_model.rs
+++ b/crates/kb-embed-local/tests/embed_model.rs
@@ -0,0 +1,284 @@
+//! Integration tests for [`FastembedEmbedder`] that load the real ONNX
+//! model.
+//!
+//! ## Why every test in this file is `#[ignore]`
+//!
+//! The first call to `FastembedEmbedder::new` downloads ~470 MB of
+//! weights from Hugging Face into `data_dir/models/fastembed/`. Doing
+//! that on every `cargo test` invocation is wasteful, so the bare
+//! invocation skips this file entirely.
+//!
+//! Run the full suite with:
+//! ```text
+//! cargo test -p kb-embed-local -- --ignored
+//! ```
+//!
+//! All tests share a `OnceLock<FastembedEmbedder>` so the model loads
+//! exactly once per process invocation (ONNX runtime first-load latency
+//! is 1-2 s on M-series Macs per design risks list).
+
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+use std::sync::OnceLock;
+use std::time::Instant;
+
+use kb_embed::{Embedder, EmbeddingInput, EmbeddingKind};
+use kb_embed_local::FastembedEmbedder;
+
+/// Build a `Config` whose `data_dir` lives in a per-process temp dir so
+/// the test never writes into the developer's real `~/.local/share/kb`.
+/// Returns the `Config` and the `TempDir` guard (caller keeps the guard
+/// alive for the test duration).
+fn test_config() -> (kb_config::Config, tempfile::TempDir) {
+    let tmp = tempfile::tempdir().expect("create tempdir");
+    let mut cfg = kb_config::Config::defaults();
+    cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();
+    // model_dir keeps its default `{data_dir}/models` template; the
+    // adapter resolves it itself.
+    (cfg, tmp)
+}
+
+/// Single shared embedder for the `--ignored` lane. Held behind a
+/// `OnceLock` so we pay the ~1-2 s ONNX init + (first run only) the
+/// network download just once.
+fn shared_embedder() -> &'static FastembedEmbedder {
+    static EMBEDDER: OnceLock<FastembedEmbedder> = OnceLock::new();
+    EMBEDDER.get_or_init(|| {
+        let (cfg, _tmp) = test_config();
+        // We deliberately leak `_tmp` here: the OnceLock outlives the
+        // function scope so the cache directory must persist for the
+        // process. (`tempfile::TempDir`'s `Drop` would erase the cache
+        // and wreck subsequent calls.) The OS will reclaim the leaked
+        // path when the test process exits.
+        let _ = std::mem::ManuallyDrop::new(_tmp);
+        FastembedEmbedder::new(&cfg).expect("init FastembedEmbedder")
+    })
+}
+
+// ─── construction ─────────────────────────────────────────────────────
+
+#[test]
+#[ignore = "downloads ~470MB ONNX model on first run; CI-only"]
+fn default_config_constructs_with_dims_384() {
+    let emb = shared_embedder();
+    assert_eq!(emb.dimensions(), 384);
+    assert_eq!(emb.model_id().0, "multilingual-e5-small");
+    assert_eq!(emb.model_version().0, "v1");
+}
+
+#[test]
+#[ignore = "downloads ~470MB ONNX model on first run; CI-only"]
+fn mismatched_dims_in_config_errors_at_construction() {
+    let (mut cfg, _tmp) = test_config();
+    cfg.models.embedding.dimensions = 512; // model is 384
+    // `FastembedEmbedder` deliberately does not implement `Debug`
+    // (its inner ONNX session has no useful debug shape), so we
+    // can't use `expect_err`; match the Result manually.
+    let err = match FastembedEmbedder::new(&cfg) {
+        Ok(_) => panic!("dim mismatch must error"),
+        Err(e) => e,
+    };
+    let msg = format!("{err}");
+    assert!(msg.contains("dimension mismatch"), "msg={msg}");
+    assert!(msg.contains("384"), "msg={msg}");
+    assert!(msg.contains("512"), "msg={msg}");
+}
+
+// ─── e5 prefix differentiation ────────────────────────────────────────
+
+#[test]
+#[ignore = "loads ONNX model; CI-only"]
+fn document_and_query_yield_different_vectors() {
+    let emb = shared_embedder();
+    let text = "The quick brown fox jumps over the lazy dog.";
+    let out = emb
+        .embed(&[
+            EmbeddingInput {
+                text,
+                kind: EmbeddingKind::Document,
+            },
+            EmbeddingInput {
+                text,
+                kind: EmbeddingKind::Query,
+            },
+        ])
+        .expect("embed two inputs");
+    assert_eq!(out.len(), 2);
+    assert_eq!(out[0].len(), 384);
+    assert_eq!(out[1].len(), 384);
+
+    // Both vectors are L2-normalized → cosine similarity == dot product.
+    let cos: f32 = out[0]
+        .iter()
+        .zip(out[1].iter())
+        .map(|(a, b)| a * b)
+        .sum();
+    // Same text, different prefix → vectors must NOT be identical.
+    assert!(
+        cos < 0.9999,
+        "expected distinct vectors for Document vs Query, got cos={cos}"
+    );
+}
+
+// ─── L2 normalization ─────────────────────────────────────────────────
+
+#[test]
+#[ignore = "loads ONNX model; CI-only"]
+fn output_vectors_are_l2_normalized() {
+    let emb = shared_embedder();
+    let inputs = [
+        EmbeddingInput {
+            text: "hello world",
+            kind: EmbeddingKind::Document,
+        },
+        EmbeddingInput {
+            text: "vector search",
+            kind: EmbeddingKind::Document,
+        },
+        EmbeddingInput {
+            text: "embedding model",
+            kind: EmbeddingKind::Query,
+        },
+    ];
+    let out = emb.embed(&inputs).expect("embed");
+    // Per `kb_embed::assert_unit_norm` docs: `5e-4` is the safe bound at
+    // 384 dims (f32::EPSILON × √384 ≈ 2.3e-6, but ONNX kernels add
+    // their own per-component noise; 1e-3 is very generous and matches
+    // the spec's `± 1e-3`).
+    kb_embed::assert_unit_norm(&out, 1e-3);
+    kb_embed::assert_vector_shape(&out, 384);
+}
+
+// ─── determinism ──────────────────────────────────────────────────────
+
+#[test]
+#[ignore = "loads ONNX model; CI-only"]
+fn identical_input_yields_identical_output() {
+    let emb = shared_embedder();
+    let inputs = [
+        EmbeddingInput {
+            text: "deterministic embedding test",
+            kind: EmbeddingKind::Document,
+        },
+        EmbeddingInput {
+            text: "second sentence for variety",
+            kind: EmbeddingKind::Document,
+        },
+    ];
+    let a = emb.embed(&inputs).expect("first embed");
+    let b = emb.embed(&inputs).expect("second embed");
+    assert_eq!(a, b, "two calls with the same inputs must be byte-equal");
+}
+
+// ─── performance ──────────────────────────────────────────────────────
+
+#[test]
+#[ignore = "performance test; downloads model and runs 64-vec batch"]
+fn batch_of_64_short_inputs_under_5s() {
+    let emb = shared_embedder();
+    // 64 distinct short strings → forces the full default batch_size
+    // through one fastembed call.
+    let texts: Vec<String> = (0..64)
+        .map(|i| format!("perf-test sentence number {i}"))
+        .collect();
+    let inputs: Vec<EmbeddingInput<'_>> = texts
+        .iter()
+        .map(|t| EmbeddingInput {
+            text: t.as_str(),
+            kind: EmbeddingKind::Document,
+        })
+        .collect();
+    let t0 = Instant::now();
+    let out = emb.embed(&inputs).expect("embed batch of 64");
+    let elapsed = t0.elapsed();
+    assert_eq!(out.len(), 64);
+    assert!(
+        elapsed.as_secs_f32() < 5.0,
+        "batch-64 took {elapsed:?}, expected < 5s"
+    );
+}
+
+// ─── snapshot ─────────────────────────────────────────────────────────
+
+/// Aggregate hash of vectors for the 5 fixture sentences.
+///
+/// Computed by:
+/// 1. embed each sentence as `EmbeddingKind::Document`,
+/// 2. round each `f32` component to 4 decimal places (multiply by 1e4,
+///    round, store as `i32`),
+/// 3. write the rounded i32 components into a `DefaultHasher` in row-
+///    major order,
+/// 4. read out the `u64` finish value.
+///
+/// The 4-decimal tolerance is intentional float-tolerance per task spec:
+/// exact f32 equality is too strict given ONNX kernel + hardware
+/// variation.
+///
+/// **Pinning workflow** (a snapshot test must FAIL UNTIL PINNED):
+/// 1. With `SNAPSHOT_HASH_BASELINE = 0`, run
+///    `cargo test -p kb-embed-local -- --ignored snapshot`. The test
+///    panics with a message containing the captured hash.
+/// 2. Paste the printed hex value into `SNAPSHOT_HASH_BASELINE` below.
+/// 3. Re-run the same command — the test now asserts equality and
+///    passes, confirming the pin.
+///
+/// On a genuine model upgrade, reset to `0`, re-pin, and bump
+/// `EmbeddingVersion` per design §9 in the same PR.
+const SNAPSHOT_HASH_BASELINE: u64 = 0;
+
+#[test]
+#[ignore = "loads ONNX model; CI-only"]
+fn snapshot_aggregate_hash_is_stable() {
+    let emb = shared_embedder();
+    let fixture_path =
+        std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/embed/known-sentences.json");
+    let raw = std::fs::read_to_string(&fixture_path).expect("read fixture");
+    let json: serde_json::Value = serde_json::from_str(&raw).expect("parse fixture json");
+    let sentences: Vec<String> = json["sentences"]
+        .as_array()
+        .expect("`sentences` array")
+        .iter()
+        .map(|v| v.as_str().expect("sentence is str").to_string())
+        .collect();
+    assert_eq!(sentences.len(), 5, "fixture must have exactly 5 sentences");
+
+    let inputs: Vec<EmbeddingInput<'_>> = sentences
+        .iter()
+        .map(|s| EmbeddingInput {
+            text: s.as_str(),
+            kind: EmbeddingKind::Document,
+        })
+        .collect();
+    let out = emb.embed(&inputs).expect("embed snapshot fixture");
+
+    // Round every component to 4 decimal places, hash deterministically.
+    let mut hasher = DefaultHasher::new();
+    for (i, v) in out.iter().enumerate() {
+        assert_eq!(v.len(), 384, "row {i} dim mismatch");
+        for x in v {
+            let rounded: i32 = (*x * 1.0e4).round() as i32;
+            rounded.hash(&mut hasher);
+        }
+    }
+    let observed = hasher.finish();
+    if SNAPSHOT_HASH_BASELINE == 0 {
+        // Unpinned baseline: panic with the captured hash. A snapshot
+        // test that silently passes on first run defeats its purpose,
+        // so we hard-fail until a maintainer commits the pin. Both
+        // hex (paste-friendly) and decimal forms are printed.
+        eprintln!(
+            "kb-embed-local snapshot baseline (paste into SNAPSHOT_HASH_BASELINE): \
+             {observed:#x} ({observed})"
+        );
+        panic!(
+            "snapshot baseline unpinned — paste {observed:#x} into \
+             SNAPSHOT_HASH_BASELINE then re-run"
+        );
+    }
+    assert_eq!(
+        observed, SNAPSHOT_HASH_BASELINE,
+        "snapshot drift: model output for the fixture sentences changed; \
+         either fastembed weights changed (bump EmbeddingVersion per §9) \
+         or there's an ONNX kernel diff."
+    );
+}
--- a/crates/kb-embed-local/tests/fixtures/embed/known-sentences.json
+++ b/crates/kb-embed-local/tests/fixtures/embed/known-sentences.json
@@ -0,0 +1,10 @@
+{
+  "_comment": "Snapshot fixture for FastembedEmbedder. Five short sentences in mixed languages chosen so the multilingual-e5-small model exercises both Latin and CJK token paths. The kb-embed-local snapshot test embeds these as `EmbeddingKind::Document`, rounds each f32 component to 4 decimal places, and checks an aggregate hash against a constant baked into the test source. The 4-decimal tolerance hides the per-platform ONNX kernel jitter we observed during P3 bring-up; tighter tolerances would make the test flaky across hardware.",
+  "sentences": [
+    "The quick brown fox jumps over the lazy dog.",
+    "Vector search is fast on small models.",
+    "Knowledge bases benefit from hybrid retrieval.",
+    "한국어 문장도 임베딩이 잘 됩니다.",
+    "Embeddings should be deterministic given the same input."
+  ]
+}