Establishes the kb-embed trait crate so concrete embedding adapters
(p3-2 fastembed, future ollama-embed/candle) target a stable surface.
Pure re-export of kb_core::{Embedder, EmbeddingInput, EmbeddingKind,
EmbeddingModelId, EmbeddingVersion} plus a feature-gated deterministic
mock for downstream tests.
MockEmbedder (cfg(feature = "mock"), default OFF):
- Per-component hash recipe: blake3(seed_le8 || kind_byte ||
text_len_le8 || text || i_le8). Length-prefixed text avoids the
domain-separation ambiguity where two (text, i) pairs could shift
bytes between text tail and the i field.
- Document = 0u8, Query = 1u8 — same text different kind yields
different vectors (mirrors e5 prefix behaviour).
- Per component: blake3 first 8 bytes → u64 → reinterpret as i64 →
f64/i64::MAX → f32. i64::MAX is not representable in f64 and rounds up
to 2^63, so i64::MIN maps to exactly -1.0; range [-1, 1] holds.
- L2 unit-normalised. Norm sums in f64 (avoid catastrophic precision
loss) before f32 cast. Zero-norm guard skips the divide.
- with_seed(...) constructor lets two embedders share identity but
produce different vectors — useful for downstream parametric tests.
Helpers:
- assert_vector_shape(vecs, dims) — len + finite check.
- assert_unit_norm(vecs, tolerance) — caller-supplied tolerance;
5e-4 documented as safe for dims=384 under f32 epsilon × √dims.
Tests:
- cargo test -p kb-embed (no features): 2 reexport/dyn-dispatch tests.
- cargo test -p kb-embed --features mock: 7 tests including 100-case
proptest asserting len == dims, all finite, ‖v‖ ≈ 1.0 within
tolerance, Doc(text) byte-equal Doc(text), Doc(text) ≠ Query(text),
Doc(text1) ≠ Doc(text2).
- All 220 workspace tests pass; clippy clean for both default and
mock-on feature configurations.
Symbol gating: nm on the release rlib confirms zero MockEmbedder
symbols under default features; three trait impl symbols under
--features mock. Spec invariant "release builds MUST NOT include
MockEmbedder" verified at the symbol level.
Allowed deps respected: kb-core, kb-config, serde, thiserror, tracing,
plus anyhow (forced by trait return type) and blake3 (justified by
the determinism contract; already in workspace lockfile via kb-core).
No fastembed/ort/tokenizers anywhere.
Out of scope: real adapter (p3-2), reranker traits (P+).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
177 lines
5.6 KiB
Rust
177 lines
5.6 KiB
Rust
//! Integration tests for `MockEmbedder`. Gated behind the `mock` feature.
//!
//! Canonical invocation: `cargo test -p kb-embed --features mock`.
|
||
|
||
#![cfg(feature = "mock")]
|
||
|
||
use kb_embed::{
|
||
Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, MockEmbedder,
|
||
assert_unit_norm, assert_vector_shape,
|
||
};
|
||
use proptest::prelude::*;
|
||
|
||
fn mk(dims: usize) -> MockEmbedder {
|
||
MockEmbedder::new(
|
||
EmbeddingModelId("mock-test".into()),
|
||
EmbeddingVersion("0".into()),
|
||
dims,
|
||
)
|
||
}
|
||
|
||
/// The mock must be usable behind `Box<dyn Embedder>`: the identity
/// accessors and `embed` are all invoked through the trait object.
#[test]
fn dyn_dispatch_through_box() {
    let boxed: Box<dyn Embedder> = Box::new(mk(8));

    // The accessors must echo what `mk` configured.
    assert_eq!(boxed.dimensions(), 8);
    assert_eq!(boxed.model_id(), EmbeddingModelId("mock-test".into()));
    assert_eq!(boxed.model_version(), EmbeddingVersion("0".into()));

    let document = EmbeddingInput {
        text: "a fox",
        kind: EmbeddingKind::Document,
    };
    let vectors = boxed.embed(&[document]).expect("embed via box");

    // One input in, one vector of the configured width out.
    assert_eq!(vectors.len(), 1);
    assert_vector_shape(&vectors, 8);
}
|
||
|
||
/// Embedding the same `(text, kind)` twice on one embedder must give
/// equal results.
#[test]
fn identical_input_yields_byte_identical_vector() {
    let embedder = mk(16);

    // Same text and kind on every call; only the invocation differs.
    let embed_fox = || {
        embedder
            .embed(&[EmbeddingInput {
                text: "the quick brown fox",
                kind: EmbeddingKind::Document,
            }])
            .unwrap()
    };

    let first = embed_fox();
    let second = embed_fox();

    // Neither result is mutated between the calls, so any mismatch here
    // would mean the embedder itself is not deterministic.
    assert_eq!(first, second);
}
|
||
|
||
/// The same text embedded once as `Document` and once as `Query` must not
/// produce the same vector.
#[test]
fn document_and_query_kinds_differ_for_same_text() {
    const TEXT: &str = "needle in haystack";

    let embedder = mk(32);
    let as_document = EmbeddingInput {
        text: TEXT,
        kind: EmbeddingKind::Document,
    };
    let as_query = EmbeddingInput {
        text: TEXT,
        kind: EmbeddingKind::Query,
    };

    // Both inputs go through a single batched call.
    let vectors = embedder.embed(&[as_document, as_query]).unwrap();
    assert_eq!(vectors.len(), 2);
    assert_vector_shape(&vectors, 32);
    assert_ne!(
        vectors[0], vectors[1],
        "Document and Query kinds must produce different vectors for identical text"
    );
}
|
||
|
||
/// For a spread of sizes, the embedder must report the `dims` it was built
/// with and emit vectors that pass `assert_vector_shape` for that size.
#[test]
fn dimensions_match_construction() {
    // The probe input is the same for every size, so build it once.
    let probe = [EmbeddingInput {
        text: "x",
        kind: EmbeddingKind::Document,
    }];

    for dims in [1usize, 4, 64, 384, 768, 1024] {
        let embedder = mk(dims);
        assert_eq!(embedder.dimensions(), dims);

        let vectors = embedder.embed(&probe).unwrap();
        assert_vector_shape(&vectors, dims);
    }
}
|
||
|
||
/// Two embedders that share model id, version and dims but differ in seed
/// must embed the same input to different vectors.
#[test]
fn different_seeds_produce_different_vectors() {
    // Everything except the seed is held constant.
    let seeded = |seed| {
        MockEmbedder::with_seed(
            EmbeddingModelId("m".into()),
            EmbeddingVersion("0".into()),
            16,
            seed,
        )
    };
    let seed_zero = seeded(0);
    let seed_one = seeded(1);

    let inputs = [EmbeddingInput {
        text: "same input",
        kind: EmbeddingKind::Document,
    }];

    let from_zero = seed_zero.embed(&inputs).unwrap();
    let from_one = seed_one.embed(&inputs).unwrap();
    assert_ne!(from_zero, from_one);
}
|
||
|
||
proptest! {
    #![proptest_config(ProptestConfig {
        cases: 100,
        ..ProptestConfig::default()
    })]

    /// 100 random `(text, kind)` pairs: every output vector must have
    /// `len == dimensions`, contain only finite floats (which also rules out
    /// NaN), be L2 unit-norm within tolerance, be re-deterministic across
    /// calls, differ between Document/Query kinds, and differ between
    /// distinct texts.
    #[test]
    fn random_inputs_yield_well_formed_vectors(
        text in ".{0,256}",
        text2 in ".{0,256}",
        is_query in any::<bool>(),
        // dims ≥ 2: a 1-dim unit-norm vector has only two possible values
        // (`[1.0]` or `[-1.0]`), which makes the kind/text differential
        // assertions degenerate. Pick a floor of 2 so the differentials
        // exercise non-degenerate vector space.
        dims in 2usize..=128,
    ) {
        // Skip degenerate case where the two random texts collide; the
        // "distinct text → distinct vector" assertion below requires them to
        // differ.
        prop_assume!(text != text2);

        let e = mk(dims);
        let kind = if is_query { EmbeddingKind::Query } else { EmbeddingKind::Document };
        let v = e.embed(&[EmbeddingInput { text: &text, kind }]).unwrap();
        prop_assert_eq!(v.len(), 1);
        prop_assert_eq!(v[0].len(), dims);
        for x in &v[0] {
            // `is_finite()` is false for NaN as well as ±∞, so this single
            // check covers both; a separate `is_nan()` assertion could
            // never fire after it.
            prop_assert!(x.is_finite(), "component {x} not finite");
        }

        // L2 unit-norm within tolerance. `5e-4` is a safe upper bound up to
        // dims = 128 here (would-be floor: f32::EPSILON × √dims).
        assert_unit_norm(&v, 5e-4);

        // Re-determinism: embedding `text` as Document twice → byte-equal.
        let doc_a = e
            .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Document }])
            .unwrap();
        let doc_b = e
            .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Document }])
            .unwrap();
        prop_assert_eq!(&doc_a, &doc_b, "Doc(text) must be byte-equal across calls");

        // Kind differential: Doc(text) != Query(text).
        let q = e
            .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Query }])
            .unwrap();
        prop_assert_ne!(&doc_a, &q, "Doc(text) must differ from Query(text)");

        // Text differential: Doc(text) != Doc(text2) when text != text2.
        let doc_other = e
            .embed(&[EmbeddingInput { text: &text2, kind: EmbeddingKind::Document }])
            .unwrap();
        prop_assert_ne!(&doc_a, &doc_other, "distinct texts must yield distinct Doc vectors");
    }
}
|