feat(p3-1): kb-embed crate — Embedder trait re-export + MockEmbedder
Establishes the kb-embed trait crate so concrete embedding adapters
(p3-2 fastembed, future ollama-embed/candle) target a stable surface.
Pure re-export of kb_core::{Embedder, EmbeddingInput, EmbeddingKind,
EmbeddingModelId, EmbeddingVersion} plus a feature-gated deterministic
mock for downstream tests.
MockEmbedder (cfg(feature = "mock"), default OFF):
- Per-component hash recipe: blake3(seed_le8 || kind_byte ||
text_len_le8 || text || i_le8). Length-prefixed text avoids the
domain-separation ambiguity where two (text, i) pairs could shift
bytes between text tail and the i field.
- Document = 0u8, Query = 1u8 — same text different kind yields
different vectors (mirrors e5 prefix behaviour).
- Per component: blake3 first 8 bytes → i64 (little-endian) →
f64 / (i64::MAX as f64) → f32. `i64::MAX as f64` rounds up to 2^63,
so i64::MIN maps to exactly -1.0; range [-1, 1] holds.
- L2 unit-normalised. Norm sums in f64 (avoid catastrophic precision
loss) before f32 cast. Zero-norm guard skips the divide.
- with_seed(...) constructor lets two embedders share identity but
produce different vectors — useful for downstream parametric tests.
Helpers:
- assert_vector_shape(vecs, dims) — len + finite check.
- assert_unit_norm(vecs, tolerance) — caller-supplied tolerance;
5e-4 documented as safe for dims=384 under f32 epsilon × √dims.
Tests:
- cargo test -p kb-embed (no features): 2 reexport/dyn-dispatch tests.
- cargo test -p kb-embed --features mock: 7 tests including 100-case
proptest asserting len == dims, all finite, ‖v‖ ≈ 1.0 within
tolerance, Doc(text) byte-equal Doc(text), Doc(text) ≠ Query(text),
Doc(text1) ≠ Doc(text2).
- All 220 workspace tests pass; clippy clean for both default and
mock-on feature configurations.
Symbol gating: nm on the release rlib confirms zero MockEmbedder
symbols under default features; three trait impl symbols under
--features mock. Spec invariant "release builds MUST NOT include
MockEmbedder" verified at the symbol level.
Allowed deps respected: kb-core, kb-config, serde, thiserror, tracing,
plus anyhow (forced by trait return type) and blake3 (justified by
the determinism contract; already in workspace lockfile via kb-core).
No fastembed/ort/tokenizers anywhere.
Out of scope: real adapter (p3-2), reranker traits (P+).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
134
Cargo.lock
generated
134
Cargo.lock
generated
@@ -108,6 +108,21 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
||||
dependencies = [
|
||||
"bit-vec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-vec"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.1"
|
||||
@@ -369,6 +384,12 @@ version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.1.5"
|
||||
@@ -761,6 +782,20 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-embed"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"proptest",
|
||||
"serde",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-normalize"
|
||||
version = "0.1.0"
|
||||
@@ -1090,6 +1125,15 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.2.37"
|
||||
@@ -1109,6 +1153,25 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proptest"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"bit-vec",
|
||||
"bitflags",
|
||||
"num-traits",
|
||||
"rand",
|
||||
"rand_chacha",
|
||||
"rand_xorshift",
|
||||
"regex-syntax",
|
||||
"rusty-fork",
|
||||
"tempfile",
|
||||
"unarray",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.13.3"
|
||||
@@ -1120,6 +1183,12 @@ dependencies = [
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
@@ -1141,6 +1210,44 @@ version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
||||
dependencies = [
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
|
||||
dependencies = [
|
||||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_xorshift"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.12.0"
|
||||
@@ -1287,6 +1394,18 @@ version = "1.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||
|
||||
[[package]]
|
||||
name = "rusty-fork"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"quick-error",
|
||||
"tempfile",
|
||||
"wait-timeout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.23"
|
||||
@@ -1744,6 +1863,12 @@ dependencies = [
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unarray"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94"
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
@@ -1819,6 +1944,15 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "wait-timeout"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.5.0"
|
||||
|
||||
@@ -10,6 +10,7 @@ members = [
|
||||
"crates/kb-chunk",
|
||||
"crates/kb-store-sqlite",
|
||||
"crates/kb-search",
|
||||
"crates/kb-embed",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
@@ -35,3 +36,4 @@ tracing = "0.1"
|
||||
rusqlite = { version = "0.32", features = ["bundled"] }
|
||||
globset = "0.4"
|
||||
tempfile = "3"
|
||||
proptest = "1"
|
||||
|
||||
30
crates/kb-embed/Cargo.toml
Normal file
30
crates/kb-embed/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kb-embed"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Embedder trait re-exports + opt-in deterministic MockEmbedder for downstream tests"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
serde = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
# Used only by `MockEmbedder` (feature = "mock") for deterministic per-component
|
||||
# hashing. Kept as an unconditional dep because `blake3` is already in the
|
||||
# workspace lockfile (transitively via kb-core); pulling it in here adds zero
|
||||
# build cost and keeps Cargo.toml simple.
|
||||
blake3 = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Opt-in `MockEmbedder`. Default OFF so release builds (no `--features mock`)
|
||||
# compile the symbol out entirely (verifiable via `nm`/`cargo bloat`).
|
||||
mock = []
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = { workspace = true }
|
||||
78
crates/kb-embed/src/lib.rs
Normal file
78
crates/kb-embed/src/lib.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
//! `kb-embed` — thin re-export crate for the [`Embedder`] trait surface.
|
||||
//!
|
||||
//! This crate exists so downstream code (`kb-store-vector`, `kb-search`,
|
||||
//! adapters in p3-2) can `use kb_embed::Embedder` and stay stable across
|
||||
//! kb-core reorganizations. It defines **no new types**; everything is a
|
||||
//! re-export of [`kb_core`].
|
||||
//!
|
||||
//! ## Mock implementation
|
||||
//!
|
||||
//! [`MockEmbedder`] (gated behind the `mock` feature, default **OFF**) is a
|
||||
//! deterministic test double. Real adapters (fastembed, candle, ollama-embed)
|
||||
//! live in p3-2 and MUST NOT be implemented here.
|
||||
//!
|
||||
//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` §7.1, §7.2,
|
||||
//! §11 for the contract.
|
||||
|
||||
// ── Trait re-exports ──────────────────────────────────────────────────────
|
||||
//
|
||||
// Per spec §7.2 — these are the only public-surface types this crate offers.
|
||||
// Adding new types is forbidden by the task contract.
|
||||
|
||||
pub use kb_core::{
|
||||
Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion,
|
||||
};
|
||||
|
||||
// ── Test helper ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Check the shape of a batch of embedding vectors.
///
/// Every vector in `vecs` must contain exactly `expected_dims` components,
/// and every component must be finite (neither NaN nor ±infinity). An empty
/// batch passes trivially. Provided so downstream test crates do not each
/// reimplement the same check.
///
/// # Panics
///
/// Panics at the first vector whose length differs from `expected_dims`, or
/// at the first non-finite component. This is a test helper, so panicking is
/// the intended failure mode.
pub fn assert_vector_shape(vecs: &[Vec<f32>], expected_dims: usize) {
    for (row, vector) in vecs.iter().enumerate() {
        let actual_dims = vector.len();
        assert_eq!(
            actual_dims, expected_dims,
            "vector {row}: dims {actual_dims} != expected {expected_dims}",
        );
        vector.iter().enumerate().for_each(|(col, value)| {
            assert!(
                value.is_finite(),
                "vector {row}[{col}] = {value} is not finite"
            );
        });
    }
}
|
||||
|
||||
/// Check that every vector in `vecs` has an L2 norm within `tolerance` of `1.0`.
///
/// The squared components are accumulated in `f64` and the square root is
/// taken in `f64`; only the final norm is narrowed to `f32` for the
/// comparison. That keeps the check itself from adding meaningful
/// accumulation error on top of whatever rounding the input already carries.
///
/// The tolerance is caller-supplied. `5e-4` is the documented safe bound for
/// `dims = 384` (per-component f32 rounding accumulating roughly as
/// `f32::EPSILON × √dims`); smaller dimensions tolerate tighter bounds and
/// larger ones need looser bounds.
///
/// An empty batch passes. An empty or all-zero vector has norm `0` and so
/// fails for any tolerance below `1.0`.
///
/// # Panics
///
/// Panics at the first vector whose norm deviates from `1.0` by more than
/// `tolerance` (including a NaN norm). This is a test helper, so panicking is
/// the intended failure mode.
pub fn assert_unit_norm(vecs: &[Vec<f32>], tolerance: f32) {
    for (row, vector) in vecs.iter().enumerate() {
        let sum_of_squares: f64 = vector
            .iter()
            .map(|&component| {
                let wide = f64::from(component);
                wide * wide
            })
            .sum();
        let norm = sum_of_squares.sqrt() as f32;
        let deviation = (norm - 1.0).abs();
        assert!(
            deviation <= tolerance,
            "vector {row}: ‖v‖ = {norm} (off from 1.0 by {deviation})",
        );
    }
}
|
||||
|
||||
// ── MockEmbedder (feature = "mock") ───────────────────────────────────────
|
||||
|
||||
#[cfg(feature = "mock")]
|
||||
mod mock;
|
||||
|
||||
#[cfg(feature = "mock")]
|
||||
pub use mock::MockEmbedder;
|
||||
146
crates/kb-embed/src/mock.rs
Normal file
146
crates/kb-embed/src/mock.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
//! Deterministic mock embedder for downstream tests.
|
||||
//!
|
||||
//! Compiled only when the `mock` feature is enabled. Default builds
|
||||
//! (`cargo build --release -p kb-embed`) MUST NOT contain the `MockEmbedder`
|
||||
//! symbol — verifiable by symbol scan (`nm`, `cargo bloat`).
|
||||
//!
|
||||
//! ## Determinism contract
|
||||
//!
|
||||
//! For every call to [`MockEmbedder::embed`], component `i` of the output
|
||||
//! vector for input `(text, kind)` is computed as:
|
||||
//!
|
||||
//! ```text
|
||||
//! h = blake3(seed_le8 || kind_byte || text_len_le8 || text_utf8 || i_le8)
|
||||
//! raw_i64 = i64::from_le_bytes(h[0..8])
|
||||
//! comp = (raw_i64 as f64 / i64::MAX as f64) as f32 // ∈ [-1.0, 1.0]
|
||||
//! ```
|
||||
//!
|
||||
//! `kind_byte` is `0u8` for [`EmbeddingKind::Document`] and `1u8` for
|
||||
//! [`EmbeddingKind::Query`] — mirrors the e5-style prefix behavior (the same
|
||||
//! text in different roles produces different vectors). `text_len_le8` is the
|
||||
//! length of `text_utf8` (in bytes) as a little-endian `u64`; it provides
|
||||
//! domain separation so the boundary between `text` and the trailing `i_le8`
|
||||
//! cannot be ambiguous (without it, e.g. `("ABCDEFGH", 0)` and
|
||||
//! `("", u64::from_le_bytes(*b"ABCDEFGH"))` would hash identically).
|
||||
//!
|
||||
//! After the per-component pass each vector is **L2-normalized to unit
|
||||
//! length** so downstream cosine-similarity tests can rely on a unit-norm
|
||||
//! input (‖v‖ ≈ 1.0 within f32 epsilon × √dims — the per-component f32
|
||||
//! truncation is bounded by `f32::EPSILON`, summed in quadrature gives
|
||||
//! roughly `√dims · EPSILON` in the L2 norm). If a vector ends up all-zeros
|
||||
//! (vanishingly unlikely from BLAKE3), it is left untouched rather than
|
||||
//! dividing by zero.
|
||||
//!
|
||||
//! Invariants the contract guarantees:
|
||||
//!
|
||||
//! * Identical `(seed, kind, text, dimensions)` → byte-identical output.
|
||||
//! * Different `kind` for the same text → different output (kind_byte differs).
|
||||
//! * Different `text` → different output with overwhelming probability.
|
||||
//! * All output components are finite (`is_finite()`).
|
||||
|
||||
use kb_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
|
||||
|
||||
/// Deterministic test double. See module docs for the hashing recipe.
|
||||
pub struct MockEmbedder {
|
||||
model_id: EmbeddingModelId,
|
||||
version: EmbeddingVersion,
|
||||
dimensions: usize,
|
||||
seed: u64,
|
||||
}
|
||||
|
||||
impl MockEmbedder {
|
||||
/// Construct with `seed = 0`. Use [`Self::with_seed`] to pick a different
|
||||
/// seed (e.g., to verify two embedders with the same identity but
|
||||
/// different seeds yield different vectors).
|
||||
pub fn new(
|
||||
model_id: EmbeddingModelId,
|
||||
version: EmbeddingVersion,
|
||||
dimensions: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
model_id,
|
||||
version,
|
||||
dimensions,
|
||||
seed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct with an explicit seed. Useful for differential tests.
|
||||
pub fn with_seed(
|
||||
model_id: EmbeddingModelId,
|
||||
version: EmbeddingVersion,
|
||||
dimensions: usize,
|
||||
seed: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
model_id,
|
||||
version,
|
||||
dimensions,
|
||||
seed,
|
||||
}
|
||||
}
|
||||
|
||||
fn kind_byte(kind: EmbeddingKind) -> u8 {
|
||||
match kind {
|
||||
EmbeddingKind::Document => 0,
|
||||
EmbeddingKind::Query => 1,
|
||||
}
|
||||
}
|
||||
|
||||
fn component(&self, kind: EmbeddingKind, text: &str, i: usize) -> f32 {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
hasher.update(&self.seed.to_le_bytes());
|
||||
hasher.update(&[Self::kind_byte(kind)]);
|
||||
// Length-prefix `text` (LE u64) so the boundary between `text` and the
|
||||
// trailing `i` field is unambiguous — without this, `("ABCDEFGH", 0)`
|
||||
// and `("", u64::from_le_bytes(*b"ABCDEFGH"))` would feed identical
|
||||
// bytes into the hasher.
|
||||
hasher.update(&(text.len() as u64).to_le_bytes());
|
||||
hasher.update(text.as_bytes());
|
||||
hasher.update(&(i as u64).to_le_bytes());
|
||||
let digest = hasher.finalize();
|
||||
let bytes = digest.as_bytes();
|
||||
let mut head = [0u8; 8];
|
||||
head.copy_from_slice(&bytes[..8]);
|
||||
let raw = i64::from_le_bytes(head);
|
||||
// Map to [-1.0, 1.0]. `i64::MAX` is finite in f64 so the ratio is
|
||||
// always finite. Casting back to f32 cannot produce a NaN/Inf for
|
||||
// values in this range.
|
||||
// Note: i64::MIN/i64::MAX gives -1.0000000000000002 → f32 cast rounds to -1.0; range [-1, 1] holds in f32 even with this asymmetry.
|
||||
((raw as f64) / (i64::MAX as f64)) as f32
|
||||
}
|
||||
}
|
||||
|
||||
impl Embedder for MockEmbedder {
|
||||
fn model_id(&self) -> EmbeddingModelId {
|
||||
self.model_id.clone()
|
||||
}
|
||||
|
||||
fn model_version(&self) -> EmbeddingVersion {
|
||||
self.version.clone()
|
||||
}
|
||||
|
||||
fn dimensions(&self) -> usize {
|
||||
self.dimensions
|
||||
}
|
||||
|
||||
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>> {
|
||||
let mut out = Vec::with_capacity(inputs.len());
|
||||
for input in inputs {
|
||||
let mut v: Vec<f32> = (0..self.dimensions)
|
||||
.map(|i| self.component(input.kind, input.text, i))
|
||||
.collect();
|
||||
|
||||
// L2-normalize. Skip the rare all-zero case to avoid 0/0 = NaN.
|
||||
let norm_sq: f64 = v.iter().map(|&x| (x as f64) * (x as f64)).sum();
|
||||
if norm_sq > 0.0 {
|
||||
let inv = (1.0 / norm_sq.sqrt()) as f32;
|
||||
for x in v.iter_mut() {
|
||||
*x *= inv;
|
||||
}
|
||||
}
|
||||
out.push(v);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
176
crates/kb-embed/tests/mock.rs
Normal file
176
crates/kb-embed/tests/mock.rs
Normal file
@@ -0,0 +1,176 @@
|
||||
//! Integration tests for `MockEmbedder`. Gated behind the `mock` feature.
|
||||
//!
|
||||
//! Canonical invocation: `cargo test -p kb-embed --features mock`.
|
||||
|
||||
#![cfg(feature = "mock")]
|
||||
|
||||
use kb_embed::{
|
||||
Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, MockEmbedder,
|
||||
assert_unit_norm, assert_vector_shape,
|
||||
};
|
||||
use proptest::prelude::*;
|
||||
|
||||
fn mk(dims: usize) -> MockEmbedder {
|
||||
MockEmbedder::new(
|
||||
EmbeddingModelId("mock-test".into()),
|
||||
EmbeddingVersion("0".into()),
|
||||
dims,
|
||||
)
|
||||
}
|
||||
|
||||
/// The mock must work behind `Box<dyn Embedder>`: the accessors report the
/// construction values and `embed` yields one well-shaped vector.
#[test]
fn dyn_dispatch_through_box() {
    let boxed: Box<dyn Embedder> = Box::new(mk(8));
    assert_eq!(boxed.dimensions(), 8);
    assert_eq!(boxed.model_id(), EmbeddingModelId("mock-test".into()));
    assert_eq!(boxed.model_version(), EmbeddingVersion("0".into()));

    let doc = EmbeddingInput {
        text: "a fox",
        kind: EmbeddingKind::Document,
    };
    let vectors = boxed.embed(&[doc]).expect("embed via box");
    assert_eq!(vectors.len(), 1);
    assert_vector_shape(&vectors, 8);
}
|
||||
|
||||
/// Embedding the same `(text, kind)` twice with one embedder must give equal
/// output.
#[test]
fn identical_input_yields_byte_identical_vector() {
    let embedder = mk(16);
    let embed_fox = || {
        embedder
            .embed(&[EmbeddingInput {
                text: "the quick brown fox",
                kind: EmbeddingKind::Document,
            }])
            .unwrap()
    };

    let first = embed_fox();
    let second = embed_fox();
    // The hash + normalization path is pure, so two independent calls must
    // agree component for component.
    assert_eq!(first, second);
}
|
||||
|
||||
/// The same text embedded as a document and as a query must produce two
/// different, well-shaped vectors.
#[test]
fn document_and_query_kinds_differ_for_same_text() {
    let embedder = mk(32);
    let text = "needle in haystack";
    let inputs = [EmbeddingKind::Document, EmbeddingKind::Query]
        .map(|kind| EmbeddingInput { text, kind });

    let vectors = embedder.embed(&inputs).unwrap();
    assert_eq!(vectors.len(), 2);
    assert_vector_shape(&vectors, 32);

    let (as_document, as_query) = (&vectors[0], &vectors[1]);
    assert_ne!(
        as_document, as_query,
        "Document and Query kinds must produce different vectors for identical text"
    );
}
|
||||
|
||||
/// For a spread of sizes, `dimensions()` echoes the constructor argument and
/// `embed` returns vectors of exactly that length.
#[test]
fn dimensions_match_construction() {
    const SIZES: [usize; 6] = [1, 4, 64, 384, 768, 1024];

    for dims in SIZES {
        let embedder = mk(dims);
        assert_eq!(embedder.dimensions(), dims);

        let probe = EmbeddingInput {
            text: "x",
            kind: EmbeddingKind::Document,
        };
        let vectors = embedder.embed(&[probe]).unwrap();
        assert_vector_shape(&vectors, dims);
    }
}
|
||||
|
||||
/// Two embedders that share id, version and dimensions but differ in seed
/// must embed the same input differently.
#[test]
fn different_seeds_produce_different_vectors() {
    let seeded = |seed: u64| {
        MockEmbedder::with_seed(
            EmbeddingModelId("m".into()),
            EmbeddingVersion("0".into()),
            16,
            seed,
        )
    };
    let seed_zero = seeded(0);
    let seed_one = seeded(1);

    let inputs = [EmbeddingInput {
        text: "same input",
        kind: EmbeddingKind::Document,
    }];
    let from_zero = seed_zero.embed(&inputs).unwrap();
    let from_one = seed_one.embed(&inputs).unwrap();
    assert_ne!(from_zero, from_one);
}
|
||||
|
||||
proptest! {
    #![proptest_config(ProptestConfig {
        cases: 100,
        ..ProptestConfig::default()
    })]

    /// Property test over 100 random cases. For a random `(text, kind, dims)`
    /// the output must be a single vector of length `dims`, with finite,
    /// non-NaN components and an L2 norm of 1 within tolerance. In addition,
    /// `Document(text)` must be reproducible across calls, must differ from
    /// `Query(text)`, and must differ from `Document(text2)` for a distinct
    /// second text.
    #[test]
    fn random_inputs_yield_well_formed_vectors(
        text in ".{0,256}",
        text2 in ".{0,256}",
        is_query in any::<bool>(),
        // dims ≥ 2: a 1-dim unit-norm vector can only be `[1.0]` or `[-1.0]`,
        // which would make the kind/text differential assertions degenerate.
        dims in 2usize..=128,
    ) {
        // The "distinct text → distinct vector" assertion only makes sense
        // when the two generated texts actually differ; discard collisions.
        prop_assume!(text != text2);

        let embedder = mk(dims);
        let embed_one = |input_text: &str, kind: EmbeddingKind| {
            embedder
                .embed(&[EmbeddingInput { text: input_text, kind }])
                .unwrap()
        };

        let kind = match is_query {
            true => EmbeddingKind::Query,
            false => EmbeddingKind::Document,
        };
        let v = embed_one(text.as_str(), kind);
        prop_assert_eq!(v.len(), 1);
        prop_assert_eq!(v[0].len(), dims);
        for x in &v[0] {
            prop_assert!(x.is_finite(), "component {x} not finite");
            prop_assert!(!x.is_nan(), "component {x} is NaN");
        }

        // L2 unit-norm within tolerance. `5e-4` is comfortably above the
        // expected rounding floor (about f32::EPSILON × √dims) for dims ≤ 128.
        assert_unit_norm(&v, 5e-4);

        // Re-determinism: embedding `text` as Document twice gives equal output.
        let doc_a = embed_one(text.as_str(), EmbeddingKind::Document);
        let doc_b = embed_one(text.as_str(), EmbeddingKind::Document);
        prop_assert_eq!(&doc_a, &doc_b, "Doc(text) must be byte-equal across calls");

        // Kind differential: Doc(text) != Query(text).
        let q = embed_one(text.as_str(), EmbeddingKind::Query);
        prop_assert_ne!(&doc_a, &q, "Doc(text) must differ from Query(text)");

        // Text differential: Doc(text) != Doc(text2) when text != text2.
        let doc_other = embed_one(text2.as_str(), EmbeddingKind::Document);
        prop_assert_ne!(&doc_a, &doc_other, "distinct texts must yield distinct Doc vectors");
    }
}
|
||||
61
crates/kb-embed/tests/reexports.rs
Normal file
61
crates/kb-embed/tests/reexports.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Compile-only test: verifies the crate's public surface (trait re-exports
|
||||
//! and the `assert_vector_shape` helper) is reachable without the `mock`
|
||||
//! feature.
|
||||
//!
|
||||
//! Runs under both `cargo test -p kb-embed` and
|
||||
//! `cargo test -p kb-embed --features mock`.
|
||||
|
||||
use kb_embed::{
|
||||
Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion,
|
||||
assert_vector_shape,
|
||||
};
|
||||
|
||||
/// A trivial in-test impl that does NOT rely on the `mock` feature — proves
|
||||
/// the trait surface alone is enough to write an `Embedder`.
|
||||
struct ZeroEmbedder {
|
||||
dims: usize,
|
||||
}
|
||||
|
||||
impl Embedder for ZeroEmbedder {
|
||||
fn model_id(&self) -> EmbeddingModelId {
|
||||
EmbeddingModelId("zero".into())
|
||||
}
|
||||
fn model_version(&self) -> EmbeddingVersion {
|
||||
EmbeddingVersion("0".into())
|
||||
}
|
||||
fn dimensions(&self) -> usize {
|
||||
self.dims
|
||||
}
|
||||
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>> {
|
||||
Ok(inputs.iter().map(|_| vec![0.0; self.dims]).collect())
|
||||
}
|
||||
}
|
||||
|
||||
/// Exercises every re-exported name plus `assert_vector_shape` through a
/// boxed `ZeroEmbedder`, without needing the `mock` feature.
#[test]
fn reexports_compile_without_mock_feature() {
    let embedder: Box<dyn Embedder> = Box::new(ZeroEmbedder { dims: 4 });

    let hello = EmbeddingInput {
        text: "hello",
        kind: EmbeddingKind::Document,
    };
    let world = EmbeddingInput {
        text: "world",
        kind: EmbeddingKind::Query,
    };

    let vectors = embedder.embed(&[hello, world]).expect("zero embed");
    assert_eq!(vectors.len(), 2);
    assert_vector_shape(&vectors, 4);
}
|
||||
|
||||
/// Placeholder that is compiled only when the `mock` feature is OFF.
///
/// It cannot inspect the built artifact for `MockEmbedder` symbols (that is
/// left to the external `nm`/`cargo bloat` scan mentioned in the PR notes).
/// All it shows is that the `not(feature = "mock")` gate compiles and the
/// test crate builds without `MockEmbedder`.
#[cfg(not(feature = "mock"))]
#[test]
fn mock_feature_off_compiles() {
    // Intentionally empty: the test existing and building is the whole check.
}
|
||||
Reference in New Issue
Block a user