Files
kebab/crates/kb-embed/tests/mock.rs
altair823 2e3eb8f437 feat(p3-1): kb-embed crate — Embedder trait re-export + MockEmbedder
Establishes the kb-embed trait crate so concrete embedding adapters
(p3-2 fastembed, future ollama-embed/candle) target a stable surface.
Pure re-export of kb_core::{Embedder, EmbeddingInput, EmbeddingKind,
EmbeddingModelId, EmbeddingVersion} plus a feature-gated deterministic
mock for downstream tests.

MockEmbedder (cfg(feature = "mock"), default OFF):
- Per-component hash recipe: blake3(seed_le8 || kind_byte ||
  text_len_le8 || text || i_le8). Length-prefixed text avoids the
  domain-separation ambiguity where two (text, i) pairs could shift
  bytes between text tail and the i field.
- Document = 0u8, Query = 1u8 — same text different kind yields
  different vectors (mirrors e5 prefix behaviour).
- Per component: blake3 first 8 bytes → u64 → reinterpret as i64 →
  f64/i64::MAX → f32. i64::MIN maps to -(2^63)/(2^63 - 1), a hair below
  -1 in exact arithmetic, which rounds to -1.0 in floating point; range
  [-1, 1] holds.
- L2 unit-normalised. Norm sums in f64 (avoid catastrophic precision
  loss) before f32 cast. Zero-norm guard skips the divide.
- with_seed(...) constructor lets two embedders share identity but
  produce different vectors — useful for downstream parametric tests.

Helpers:
- assert_vector_shape(vecs, dims) — len + finite check.
- assert_unit_norm(vecs, tolerance) — caller-supplied tolerance;
  5e-4 documented as safe for dims=384 under f32 epsilon × √dims.

Tests:
- cargo test -p kb-embed (no features): 2 reexport/dyn-dispatch tests.
- cargo test -p kb-embed --features mock: 7 tests including 100-case
  proptest asserting len == dims, all finite, ‖v‖ ≈ 1.0 within
  tolerance, Doc(text) byte-equal Doc(text), Doc(text) ≠ Query(text),
  Doc(text1) ≠ Doc(text2).
- All 220 workspace tests pass; clippy clean for both default and
  mock-on feature configurations.

Symbol gating: nm on the release rlib confirms zero MockEmbedder
symbols under default features; three trait impl symbols under
--features mock. Spec invariant "release builds MUST NOT include
MockEmbedder" verified at the symbol level.

Allowed deps respected: kb-core, kb-config, serde, thiserror, tracing,
plus anyhow (forced by trait return type) and blake3 (justified by
the determinism contract; already in workspace lockfile via kb-core).
No fastembed/ort/tokenizers anywhere.

Out of scope: real adapter (p3-2), reranker traits (P+).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 08:15:44 +00:00

177 lines
5.6 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Integration tests for `MockEmbedder`. Gated behind the `mock` feature.
//!
//! Canonical invocation: `cargo test -p kb-embed --features mock`.
#![cfg(feature = "mock")]
use kb_embed::{
Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, MockEmbedder,
assert_unit_norm, assert_vector_shape,
};
use proptest::prelude::*;
/// Builds a `MockEmbedder` carrying the fixed test identity
/// (model id `"mock-test"`, version `"0"`) with `dims` output dimensions.
fn mk(dims: usize) -> MockEmbedder {
    let model = EmbeddingModelId("mock-test".into());
    let version = EmbeddingVersion("0".into());
    MockEmbedder::new(model, version, dims)
}
/// A `MockEmbedder` must be usable behind `Box<dyn Embedder>`: the metadata
/// accessors and `embed` are all exercised through the trait object.
#[test]
fn dyn_dispatch_through_box() {
    let boxed: Box<dyn Embedder> = Box::new(mk(8));

    // Identity and dimensionality must match what `mk` configured.
    assert_eq!(boxed.dimensions(), 8);
    assert_eq!(boxed.model_id(), EmbeddingModelId("mock-test".into()));
    assert_eq!(boxed.model_version(), EmbeddingVersion("0".into()));

    let doc = EmbeddingInput {
        text: "a fox",
        kind: EmbeddingKind::Document,
    };
    let vectors = boxed.embed(&[doc]).expect("embed via box");

    // One input in, one vector of the configured shape out.
    assert_eq!(vectors.len(), 1);
    assert_vector_shape(&vectors, 8);
}
/// Embedding the same `(text, kind)` twice on one embedder must produce
/// bit-for-bit identical output.
///
/// `f32`'s `PartialEq` treats `+0.0` and `-0.0` as equal, so value equality
/// alone does not establish byte identity. The value comparison is kept (it
/// also rejects NaN components, since NaN never equals itself) and the raw
/// IEEE-754 bit patterns are compared on top of it.
#[test]
fn identical_input_yields_byte_identical_vector() {
    /// Maps every component of every vector to its raw bit pattern.
    fn bit_patterns(vectors: &[Vec<f32>]) -> Vec<Vec<u32>> {
        vectors
            .iter()
            .map(|v| v.iter().copied().map(f32::to_bits).collect())
            .collect()
    }

    let e = mk(16);
    let embed_once = || {
        e.embed(&[EmbeddingInput {
            text: "the quick brown fox",
            kind: EmbeddingKind::Document,
        }])
        .unwrap()
    };

    let a = embed_once();
    let b = embed_once();

    // Value-level equality, as in the original assertion.
    assert_eq!(a, b);
    // Bit-level equality: distinguishes +0.0 from -0.0, which `==` cannot.
    assert_eq!(
        bit_patterns(&a),
        bit_patterns(&b),
        "identical input must yield bit-identical vectors"
    );
}
/// The same text embedded once as `Document` and once as `Query` (in a single
/// batch) must come back as two distinct vectors of the configured shape.
#[test]
fn document_and_query_kinds_differ_for_same_text() {
    const TEXT: &str = "needle in haystack";

    let embedder = mk(32);
    // Same text for both entries; only the kind differs.
    let pair = [EmbeddingKind::Document, EmbeddingKind::Query]
        .map(|kind| EmbeddingInput { text: TEXT, kind });

    let vectors = embedder.embed(&pair).unwrap();

    assert_eq!(vectors.len(), 2);
    assert_vector_shape(&vectors, 32);
    assert_ne!(
        vectors[0], vectors[1],
        "Document and Query kinds must produce different vectors for identical text"
    );
}
/// For a spread of dimensionalities, the embedder must report the value it
/// was constructed with and emit vectors of exactly that shape.
#[test]
fn dimensions_match_construction() {
    const DIMS: [usize; 6] = [1, 4, 64, 384, 768, 1024];

    // The probe input is the same for every dimensionality, so build it once.
    let probe = [EmbeddingInput {
        text: "x",
        kind: EmbeddingKind::Document,
    }];

    for dims in DIMS {
        let embedder = mk(dims);
        assert_eq!(embedder.dimensions(), dims);

        let vectors = embedder.embed(&probe).unwrap();
        assert_vector_shape(&vectors, dims);
    }
}
/// Two embedders that share model id, version and dimensionality but were
/// built with different seeds must embed the same input to different vectors.
#[test]
fn different_seeds_produce_different_vectors() {
    // Everything except the seed is held constant.
    let seeded = |seed| {
        MockEmbedder::with_seed(
            EmbeddingModelId("m".into()),
            EmbeddingVersion("0".into()),
            16,
            seed,
        )
    };
    let first = seeded(0);
    let second = seeded(1);

    let inputs = [EmbeddingInput {
        text: "same input",
        kind: EmbeddingKind::Document,
    }];

    let from_first = first.embed(&inputs).unwrap();
    let from_second = second.embed(&inputs).unwrap();
    assert_ne!(from_first, from_second);
}
proptest! {
    #![proptest_config(ProptestConfig::with_cases(100))]

    /// Property test over 100 random `(text, text2, kind, dims)` cases.
    ///
    /// For each case the embedding of `text` must:
    /// - have exactly `dims` components, all finite (and therefore not NaN);
    /// - be L2 unit-norm within tolerance;
    /// - be reproduced exactly when embedded again as `Document`;
    /// - differ between `Document` and `Query` kinds;
    /// - differ from the `Document` embedding of the distinct `text2`.
    #[test]
    fn random_inputs_yield_well_formed_vectors(
        text in ".{0,256}",
        text2 in ".{0,256}",
        is_query in any::<bool>(),
        // Floor of 2 dimensions: a unit-norm vector with a single component
        // can only be `[1.0]` or `[-1.0]`, which would make the kind and text
        // differential checks below degenerate.
        dims in 2usize..=128,
    ) {
        // The text-differential check needs two different texts; discard
        // cases where the generator produced the same string twice.
        prop_assume!(text != text2);

        let embedder = mk(dims);
        // Embeds a single input and unwraps the resulting batch.
        let embed_as = |t: &str, kind: EmbeddingKind| {
            embedder.embed(&[EmbeddingInput { text: t, kind }]).unwrap()
        };

        let kind = if is_query {
            EmbeddingKind::Query
        } else {
            EmbeddingKind::Document
        };
        let vectors = embed_as(text.as_str(), kind);

        // Shape: one vector, `dims` components, every component well-formed.
        prop_assert_eq!(vectors.len(), 1);
        prop_assert_eq!(vectors[0].len(), dims);
        for x in vectors[0].iter() {
            prop_assert!(x.is_finite(), "component {x} not finite");
            prop_assert!(!x.is_nan(), "component {x} is NaN");
        }

        // Unit norm. A tolerance of `5e-4` sits well above the
        // `f32::EPSILON × √dims` rounding floor for the `dims ≤ 128` used here.
        assert_unit_norm(&vectors, 5e-4);

        // Determinism: two Document embeddings of `text` must be equal.
        let doc_a = embed_as(text.as_str(), EmbeddingKind::Document);
        let doc_b = embed_as(text.as_str(), EmbeddingKind::Document);
        prop_assert_eq!(&doc_a, &doc_b, "Doc(text) must be byte-equal across calls");

        // Kind differential: same text, Query vs Document.
        let q = embed_as(text.as_str(), EmbeddingKind::Query);
        prop_assert_ne!(&doc_a, &q, "Doc(text) must differ from Query(text)");

        // Text differential: different text, same kind.
        let doc_other = embed_as(text2.as_str(), EmbeddingKind::Document);
        prop_assert_ne!(&doc_a, &doc_other, "distinct texts must yield distinct Doc vectors");
    }
}