From 2e3eb8f437ee284efe33f448d30444ebd2c91d97 Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 1 May 2026 08:15:44 +0000 Subject: [PATCH] =?UTF-8?q?feat(p3-1):=20kb-embed=20crate=20=E2=80=94=20Em?= =?UTF-8?q?bedder=20trait=20re-export=20+=20MockEmbedder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Establishes the kb-embed trait crate so concrete embedding adapters (p3-2 fastembed, future ollama-embed/candle) target a stable surface. Pure re-export of kb_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion} plus a feature-gated deterministic mock for downstream tests. MockEmbedder (cfg(feature = "mock"), default OFF): - Per-component hash recipe: blake3(seed_le8 || kind_byte || text_len_le8 || text || i_le8). Length-prefixed text avoids the domain-separation ambiguity where two (text, i) pairs could shift bytes between text tail and the i field. - Document = 0u8, Query = 1u8 — same text different kind yields different vectors (mirrors e5 prefix behaviour). - Per component: blake3 first 8 bytes → u64 → reinterpret as i64 → f64/i64::MAX → f32. `i64::MAX as f64` rounds up to 2^63, so i64::MIN maps to exactly -1.0 (and i64::MAX to exactly 1.0); range [-1, 1] holds. - L2 unit-normalised. Norm sums in f64 (avoid catastrophic precision loss) before f32 cast. Zero-norm guard skips the divide. - with_seed(...) constructor lets two embedders share identity but produce different vectors — useful for downstream parametric tests. Helpers: - assert_vector_shape(vecs, dims) — len + finite check. - assert_unit_norm(vecs, tolerance) — caller-supplied tolerance; 5e-4 documented as safe for dims=384 under f32 epsilon × √dims. Tests: - cargo test -p kb-embed (no features): 2 reexport/dyn-dispatch tests. - cargo test -p kb-embed --features mock: 7 tests including 100-case proptest asserting len == dims, all finite, ‖v‖ ≈ 1.0 within tolerance, Doc(text) byte-equal Doc(text), Doc(text) ≠ Query(text), Doc(text1) ≠ Doc(text2). 
- All 220 workspace tests pass; clippy clean for both default and mock-on feature configurations. Symbol gating: nm on the release rlib confirms zero MockEmbedder symbols under default features; three trait impl symbols under --features mock. Spec invariant "release builds MUST NOT include MockEmbedder" verified at the symbol level. Allowed deps respected: kb-core, kb-config, serde, thiserror, tracing, plus anyhow (forced by trait return type) and blake3 (justified by the determinism contract; already in workspace lockfile via kb-core). No fastembed/ort/tokenizers anywhere. Out of scope: real adapter (p3-2), reranker traits (P+). Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 134 ++++++++++++++++++++++ Cargo.toml | 2 + crates/kb-embed/Cargo.toml | 30 +++++ crates/kb-embed/src/lib.rs | 78 +++++++++++++ crates/kb-embed/src/mock.rs | 146 ++++++++++++++++++++++++ crates/kb-embed/tests/mock.rs | 176 +++++++++++++++++++++++++++++ crates/kb-embed/tests/reexports.rs | 61 ++++++++++ 7 files changed, 627 insertions(+) create mode 100644 crates/kb-embed/Cargo.toml create mode 100644 crates/kb-embed/src/lib.rs create mode 100644 crates/kb-embed/src/mock.rs create mode 100644 crates/kb-embed/tests/mock.rs create mode 100644 crates/kb-embed/tests/reexports.rs diff --git a/Cargo.lock b/Cargo.lock index 1bd474c..8ca9e2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -108,6 +108,21 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + 
[[package]] name = "bitflags" version = "2.11.1" @@ -369,6 +384,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" @@ -761,6 +782,20 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kb-embed" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "kb-config", + "kb-core", + "proptest", + "serde", + "thiserror 2.0.18", + "tracing", +] + [[package]] name = "kb-normalize" version = "0.1.0" @@ -1090,6 +1125,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1109,6 +1153,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "pulldown-cmark" version = "0.13.3" @@ -1120,6 +1183,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + 
[[package]] name = "quote" version = "1.0.45" @@ -1141,6 +1210,44 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + [[package]] name = "rayon" version = "1.12.0" @@ -1287,6 +1394,18 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.23" @@ -1744,6 +1863,12 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.9.0" @@ -1819,6 +1944,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index 951f3ac..3e81b7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "crates/kb-chunk", "crates/kb-store-sqlite", "crates/kb-search", + "crates/kb-embed", "crates/kb-app", "crates/kb-cli", ] @@ -35,3 +36,4 @@ tracing = "0.1" rusqlite = { version = "0.32", features = ["bundled"] } globset = "0.4" tempfile = "3" +proptest = "1" diff --git a/crates/kb-embed/Cargo.toml b/crates/kb-embed/Cargo.toml new file mode 100644 index 0000000..2719095 --- /dev/null +++ b/crates/kb-embed/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kb-embed" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Embedder trait re-exports + opt-in deterministic MockEmbedder for downstream tests" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +serde = { workspace = true } +thiserror = { workspace = true } +tracing = { workspace = true } +anyhow = { workspace = true } +# Used only by `MockEmbedder` (feature = "mock") for deterministic per-component +# hashing. Kept as an unconditional dep because `blake3` is already in the +# workspace lockfile (transitively via kb-core); pulling it in here adds zero +# build cost and keeps Cargo.toml simple. 
+blake3 = { workspace = true } + +[features] +default = [] +# Opt-in `MockEmbedder`. Default OFF so release builds (no `--features mock`) +# compile the symbol out entirely (verifiable via `nm`/`cargo bloat`). +mock = [] + +[dev-dependencies] +proptest = { workspace = true } diff --git a/crates/kb-embed/src/lib.rs b/crates/kb-embed/src/lib.rs new file mode 100644 index 0000000..a4adba8 --- /dev/null +++ b/crates/kb-embed/src/lib.rs @@ -0,0 +1,78 @@ +//! `kb-embed` — thin re-export crate for the [`Embedder`] trait surface. +//! +//! This crate exists so downstream code (`kb-store-vector`, `kb-search`, +//! adapters in p3-2) can `use kb_embed::Embedder` and stay stable across +//! kb-core reorganizations. It defines **no new types**; everything is a +//! re-export of [`kb_core`]. +//! +//! ## Mock implementation +//! +//! [`MockEmbedder`] (gated behind the `mock` feature, default **OFF**) is a +//! deterministic test double. Real adapters (fastembed, candle, ollama-embed) +//! live in p3-2 and MUST NOT be implemented here. +//! +//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` §7.1, §7.2, +//! §11 for the contract. + +// ── Trait re-exports ────────────────────────────────────────────────────── +// +// Per spec §7.2 — these are the only public-surface types this crate offers. +// Adding new types is forbidden by the task contract. + +pub use kb_core::{ + Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, +}; + +// ── Test helper ─────────────────────────────────────────────────────────── + +/// Assert every vector has length `expected_dims` and contains only finite +/// floats. Intended for downstream test crates so they don't each rewrite the +/// shape check. +/// +/// Panics on mismatch (test-only helper — callers are tests). 
+pub fn assert_vector_shape(vecs: &[Vec<f32>], expected_dims: usize) { + for (i, v) in vecs.iter().enumerate() { + assert_eq!( + v.len(), + expected_dims, + "vector {i}: dims {} != expected {expected_dims}", + v.len(), + ); + for (j, x) in v.iter().enumerate() { + assert!(x.is_finite(), "vector {i}[{j}] = {x} is not finite"); + } + } +} + +/// Assert every vector has L2 norm within `tolerance` of `1.0`. +/// +/// L2 norm is computed in `f64` (per-component square accumulation in `f64` +/// then `sqrt`) before truncating back to `f32`, so the comparison is not +/// dominated by accumulation error in the check itself — only the f32 +/// truncation of the input vector's components contributes. +/// +/// Tolerance guidance: callers pass their own. For `dims = 384` and +/// f32-truncated unit vectors, `5e-4` is a safe upper bound under quadratic +/// accumulation of per-component f32 truncation (`f32::EPSILON × √dims`). +/// Smaller dims tolerate tighter bounds; larger dims need looser ones. +/// +/// Panics on mismatch (test-only helper — callers are tests). +pub fn assert_unit_norm(vecs: &[Vec<f32>], tolerance: f32) { + for (i, v) in vecs.iter().enumerate() { + let norm_sq: f64 = v.iter().map(|&x| (x as f64) * (x as f64)).sum(); + let norm = norm_sq.sqrt() as f32; + assert!( + (norm - 1.0).abs() <= tolerance, + "vector {i}: ‖v‖ = {norm} (off from 1.0 by {})", + (norm - 1.0).abs(), + ); + } +} + +// ── MockEmbedder (feature = "mock") ─────────────────────────────────────── + +#[cfg(feature = "mock")] +mod mock; + +#[cfg(feature = "mock")] +pub use mock::MockEmbedder; diff --git a/crates/kb-embed/src/mock.rs b/crates/kb-embed/src/mock.rs new file mode 100644 index 0000000..3ca846a --- /dev/null +++ b/crates/kb-embed/src/mock.rs @@ -0,0 +1,146 @@ +//! Deterministic mock embedder for downstream tests. +//! +//! Compiled only when the `mock` feature is enabled. Default builds +//! (`cargo build --release -p kb-embed`) MUST NOT contain the `MockEmbedder` +//! 
symbol — verifiable by symbol scan (`nm`, `cargo bloat`). +//! +//! ## Determinism contract +//! +//! For every call to [`MockEmbedder::embed`], component `i` of the output +//! vector for input `(text, kind)` is computed as: +//! +//! ```text +//! h = blake3(seed_le8 || kind_byte || text_len_le8 || text_utf8 || i_le8) +//! raw_i64 = i64::from_le_bytes(h[0..8]) +//! comp = (raw_i64 as f64 / i64::MAX as f64) as f32 // ∈ [-1.0, 1.0] +//! ``` +//! +//! `kind_byte` is `0u8` for [`EmbeddingKind::Document`] and `1u8` for +//! [`EmbeddingKind::Query`] — mirrors the e5-style prefix behavior (the same +//! text in different roles produces different vectors). `text_len_le8` is the +//! length of `text_utf8` (in bytes) as a little-endian `u64`; it provides +//! domain separation so the boundary between `text` and the trailing `i_le8` +//! cannot be ambiguous (without it, e.g. `("ABCDEFGH", 0)` and +//! `("", u64::from_le_bytes(*b"ABCDEFGH"))` would hash identically). +//! +//! After the per-component pass each vector is **L2-normalized to unit +//! length** so downstream cosine-similarity tests can rely on a unit-norm +//! input (‖v‖ ≈ 1.0 within f32 epsilon × √dims — the per-component f32 +//! truncation is bounded by `f32::EPSILON`, summed in quadrature gives +//! roughly `√dims · EPSILON` in the L2 norm). If a vector ends up all-zeros +//! (vanishingly unlikely from BLAKE3), it is left untouched rather than +//! dividing by zero. +//! +//! Invariants the contract guarantees: +//! +//! * Identical `(seed, kind, text, dimensions)` → byte-identical output. +//! * Different `kind` for the same text → different output (kind_byte differs). +//! * Different `text` → different output with overwhelming probability. +//! * All output components are finite (`is_finite()`). + +use kb_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion}; + +/// Deterministic test double. See module docs for the hashing recipe. 
+pub struct MockEmbedder { + model_id: EmbeddingModelId, + version: EmbeddingVersion, + dimensions: usize, + seed: u64, +} + +impl MockEmbedder { + /// Construct with `seed = 0`. Use [`Self::with_seed`] to pick a different + /// seed (e.g., to verify two embedders with the same identity but + /// different seeds yield different vectors). + pub fn new( + model_id: EmbeddingModelId, + version: EmbeddingVersion, + dimensions: usize, + ) -> Self { + Self { + model_id, + version, + dimensions, + seed: 0, + } + } + + /// Construct with an explicit seed. Useful for differential tests. + pub fn with_seed( + model_id: EmbeddingModelId, + version: EmbeddingVersion, + dimensions: usize, + seed: u64, + ) -> Self { + Self { + model_id, + version, + dimensions, + seed, + } + } + + fn kind_byte(kind: EmbeddingKind) -> u8 { + match kind { + EmbeddingKind::Document => 0, + EmbeddingKind::Query => 1, + } + } + + fn component(&self, kind: EmbeddingKind, text: &str, i: usize) -> f32 { + let mut hasher = blake3::Hasher::new(); + hasher.update(&self.seed.to_le_bytes()); + hasher.update(&[Self::kind_byte(kind)]); + // Length-prefix `text` (LE u64) so the boundary between `text` and the + // trailing `i` field is unambiguous — without this, `("ABCDEFGH", 0)` + // and `("", u64::from_le_bytes(*b"ABCDEFGH"))` would feed identical + // bytes into the hasher. + hasher.update(&(text.len() as u64).to_le_bytes()); + hasher.update(text.as_bytes()); + hasher.update(&(i as u64).to_le_bytes()); + let digest = hasher.finalize(); + let bytes = digest.as_bytes(); + let mut head = [0u8; 8]; + head.copy_from_slice(&bytes[..8]); + let raw = i64::from_le_bytes(head); + // Map to [-1.0, 1.0]. `i64::MAX` is finite in f64 so the ratio is + // always finite. Casting back to f32 cannot produce a NaN/Inf for + // values in this range. + // Note: `i64::MAX as f64` rounds up to 2^63 (f64 has a 53-bit significand), so `i64::MIN` maps to exactly -1.0 and `i64::MAX` to exactly 1.0; range [-1, 1] holds in f32. 
+ ((raw as f64) / (i64::MAX as f64)) as f32 + } +} + +impl Embedder for MockEmbedder { + fn model_id(&self) -> EmbeddingModelId { + self.model_id.clone() + } + + fn model_version(&self) -> EmbeddingVersion { + self.version.clone() + } + + fn dimensions(&self) -> usize { + self.dimensions + } + + fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>> { + let mut out = Vec::with_capacity(inputs.len()); + for input in inputs { + let mut v: Vec<f32> = (0..self.dimensions) + .map(|i| self.component(input.kind, input.text, i)) + .collect(); + + // L2-normalize. Skip the rare all-zero case to avoid 0/0 = NaN. + let norm_sq: f64 = v.iter().map(|&x| (x as f64) * (x as f64)).sum(); + if norm_sq > 0.0 { + let inv = (1.0 / norm_sq.sqrt()) as f32; + for x in v.iter_mut() { + *x *= inv; + } + } + out.push(v); + } + Ok(out) + } +} diff --git a/crates/kb-embed/tests/mock.rs b/crates/kb-embed/tests/mock.rs new file mode 100644 index 0000000..3923f1d --- /dev/null +++ b/crates/kb-embed/tests/mock.rs @@ -0,0 +1,176 @@ +//! Integration tests for `MockEmbedder`. Gated behind the `mock` feature. +//! +//! Canonical invocation: `cargo test -p kb-embed --features mock`. 
+ +#![cfg(feature = "mock")] + +use kb_embed::{ + Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, MockEmbedder, + assert_unit_norm, assert_vector_shape, +}; +use proptest::prelude::*; + +fn mk(dims: usize) -> MockEmbedder { + MockEmbedder::new( + EmbeddingModelId("mock-test".into()), + EmbeddingVersion("0".into()), + dims, + ) +} + +#[test] +fn dyn_dispatch_through_box() { + let e: Box<dyn Embedder> = Box::new(mk(8)); + assert_eq!(e.dimensions(), 8); + assert_eq!(e.model_id(), EmbeddingModelId("mock-test".into())); + assert_eq!(e.model_version(), EmbeddingVersion("0".into())); + + let inputs = [EmbeddingInput { + text: "a fox", + kind: EmbeddingKind::Document, + }]; + let v = e.embed(&inputs).expect("embed via box"); + assert_eq!(v.len(), 1); + assert_vector_shape(&v, 8); +} + +#[test] +fn identical_input_yields_byte_identical_vector() { + let e = mk(16); + let a = e + .embed(&[EmbeddingInput { + text: "the quick brown fox", + kind: EmbeddingKind::Document, + }]) + .unwrap(); + let b = e + .embed(&[EmbeddingInput { + text: "the quick brown fox", + kind: EmbeddingKind::Document, + }]) + .unwrap(); + // Vec<Vec<f32>> equality is byte-equal because we did not mutate + // either side and the hash + normalization path is pure. 
+ assert_eq!(a, b); +} + +#[test] +fn document_and_query_kinds_differ_for_same_text() { + let e = mk(32); + let inputs = [ + EmbeddingInput { + text: "needle in haystack", + kind: EmbeddingKind::Document, + }, + EmbeddingInput { + text: "needle in haystack", + kind: EmbeddingKind::Query, + }, + ]; + let v = e.embed(&inputs).unwrap(); + assert_eq!(v.len(), 2); + assert_vector_shape(&v, 32); + assert_ne!( + v[0], v[1], + "Document and Query kinds must produce different vectors for identical text" + ); +} + +#[test] +fn dimensions_match_construction() { + for dims in [1usize, 4, 64, 384, 768, 1024] { + let e = mk(dims); + assert_eq!(e.dimensions(), dims); + let v = e + .embed(&[EmbeddingInput { + text: "x", + kind: EmbeddingKind::Document, + }]) + .unwrap(); + assert_vector_shape(&v, dims); + } +} + +#[test] +fn different_seeds_produce_different_vectors() { + let a = MockEmbedder::with_seed( + EmbeddingModelId("m".into()), + EmbeddingVersion("0".into()), + 16, + 0, + ); + let b = MockEmbedder::with_seed( + EmbeddingModelId("m".into()), + EmbeddingVersion("0".into()), + 16, + 1, + ); + let inputs = [EmbeddingInput { + text: "same input", + kind: EmbeddingKind::Document, + }]; + assert_ne!(a.embed(&inputs).unwrap(), b.embed(&inputs).unwrap()); +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 100, + ..ProptestConfig::default() + })] + + /// 100 random `(text, kind)` pairs: every output vector must have + /// `len == dimensions`, contain only finite floats, contain no NaNs, + /// be L2 unit-norm within tolerance, be re-deterministic across calls, + /// differ between Document/Query kinds, and differ between distinct texts. + #[test] + fn random_inputs_yield_well_formed_vectors( + text in ".{0,256}", + text2 in ".{0,256}", + is_query in any::<bool>(), + // dims ≥ 2: a 1-dim unit-norm vector has only two possible values + // (`[1.0]` or `[-1.0]`), which makes the kind/text differential + // assertions degenerate. 
Pick a floor of 2 so the differentials + // exercise non-degenerate vector space. + dims in 2usize..=128, + ) { + // Skip degenerate case where the two random texts collide; the + // "distinct text → distinct vector" assertion below requires them to + // differ. + prop_assume!(text != text2); + + let e = mk(dims); + let kind = if is_query { EmbeddingKind::Query } else { EmbeddingKind::Document }; + let v = e.embed(&[EmbeddingInput { text: &text, kind }]).unwrap(); + prop_assert_eq!(v.len(), 1); + prop_assert_eq!(v[0].len(), dims); + for x in &v[0] { + prop_assert!(x.is_finite(), "component {x} not finite"); + prop_assert!(!x.is_nan(), "component {x} is NaN"); + } + + // L2 unit-norm within tolerance. `5e-4` is a safe upper bound up to + // dims = 128 here (would-be floor: f32::EPSILON × √dims). + assert_unit_norm(&v, 5e-4); + + // Re-determinism: embedding `text` as Document twice → byte-equal. + let doc_a = e + .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Document }]) + .unwrap(); + let doc_b = e + .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Document }]) + .unwrap(); + prop_assert_eq!(&doc_a, &doc_b, "Doc(text) must be byte-equal across calls"); + + // Kind differential: Doc(text) != Query(text). + let q = e + .embed(&[EmbeddingInput { text: &text, kind: EmbeddingKind::Query }]) + .unwrap(); + prop_assert_ne!(&doc_a, &q, "Doc(text) must differ from Query(text)"); + + // Text differential: Doc(text) != Doc(text2) when text != text2. + let doc_other = e + .embed(&[EmbeddingInput { text: &text2, kind: EmbeddingKind::Document }]) + .unwrap(); + prop_assert_ne!(&doc_a, &doc_other, "distinct texts must yield distinct Doc vectors"); + } +} diff --git a/crates/kb-embed/tests/reexports.rs b/crates/kb-embed/tests/reexports.rs new file mode 100644 index 0000000..cfa5c14 --- /dev/null +++ b/crates/kb-embed/tests/reexports.rs @@ -0,0 +1,61 @@ +//! Compile-only test: verifies the crate's public surface (trait re-exports +//! 
and the `assert_vector_shape` helper) is reachable without the `mock` +//! feature. +//! +//! Runs under both `cargo test -p kb-embed` and +//! `cargo test -p kb-embed --features mock`. + +use kb_embed::{ + Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, + assert_vector_shape, +}; + +/// A trivial in-test impl that does NOT rely on the `mock` feature — proves +/// the trait surface alone is enough to write an `Embedder`. +struct ZeroEmbedder { + dims: usize, +} + +impl Embedder for ZeroEmbedder { + fn model_id(&self) -> EmbeddingModelId { + EmbeddingModelId("zero".into()) + } + fn model_version(&self) -> EmbeddingVersion { + EmbeddingVersion("0".into()) + } + fn dimensions(&self) -> usize { + self.dims + } + fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>> { + Ok(inputs.iter().map(|_| vec![0.0; self.dims]).collect()) + } +} + +#[test] +fn reexports_compile_without_mock_feature() { + let e: Box<dyn Embedder> = Box::new(ZeroEmbedder { dims: 4 }); + let inputs = [ + EmbeddingInput { + text: "hello", + kind: EmbeddingKind::Document, + }, + EmbeddingInput { + text: "world", + kind: EmbeddingKind::Query, + }, + ]; + let v = e.embed(&inputs).expect("zero embed"); + assert_eq!(v.len(), 2); + assert_vector_shape(&v, 4); +} + +/// Sanity: when built WITHOUT `--features mock`, the `MockEmbedder` symbol +/// is absent. We can't usefully test `nm` from inside a unit test, but we +/// can at least confirm the cfg gate parses both ways. See PR notes for the +/// CI-side `nm`/`cargo bloat` symbol scan. +#[cfg(not(feature = "mock"))] +#[test] +fn mock_feature_off_compiles() { + // No-op — the test's existence proves the `not(feature = "mock")` gate + // compiles and the crate is usable without `MockEmbedder`. +}