Files
kebab/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs
altair823 cbcae69abf feat(embed): candle 모델 레지스트리 + arctic-embed-l-v2.0 (CLS pooling)
e5 하드코딩(HF_MODEL/SUPPORTED_MODEL/mean/query:+passage:) → 모델 레지스트리
EmbedModelSpec{name,hf_repo,pooling,query_prefix,doc_prefix,dim,version_tag}.
e5(mean, query:/passage:) + arctic(CLS, query:/무접두어). pooling 모델별 분기
(mean=attention-mask-weighted / CLS=hidden[:,0,:]), tokenize/forward/L2 공유.
arctic pooling=CLS 는 HF 1_Pooling/config.json(pooling_mode_cls_token:true) 확인.
model_version 은 arctic 일 때 +arctic-cls 태그(embedding_version cascade 트리거);
e5 는 fastembed-e5 호환(NUMA 드롭인) 위해 plain config.version 유지.

correctness 게이트: tests/arctic_ollama_parity.rs (#[ignore], live Ollama) —
candle arctic vs Ollama snowflake-arctic-embed2 per-sentence 코사인>0.99.
수동 실측 cosine_min=0.999984 (recall@10 130 재현 보장).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 04:59:11 +00:00

129 lines
5.5 KiB
Rust

//! arctic-embed-l-v2.0 correctness gate (`#[ignore]` — needs the ~2GB candle
//! model + a live Ollama serving `snowflake-arctic-embed2`).
//!
//! This is the load-bearing pooling/prefix check for the arctic integration.
//! The recall measurement that justified adopting arctic (recall@10 130/132)
//! went through Ollama's `snowflake-arctic-embed2`. The candle path
//! re-implements the model (XLM-RoBERTa-large + **CLS** pooling + `query: ` on
//! queries / no prefix on documents). If candle's pooling or prefix is wrong,
//! its vectors silently diverge from the measured route and the 130 number
//! does NOT carry over. This test pins them together: per-sentence cosine
//! between the candle vector and the Ollama vector must be **> 0.99**.
//!
//! `#[ignore]` because it depends on an external Ollama daemon (CI is
//! headless/offline). The leader MUST run it once before merge.
//!
//! ## Manual run
//!
//! 1. Confirm Ollama is reachable and has the model:
//! ```sh
//! curl -s http://192.168.0.47:11434/api/tags # should list snowflake-arctic-embed2
//! ```
//! 2. Run (downloads the ~2GB candle safetensors on first run):
//! ```sh
//! CARGO_TARGET_DIR=/build/out/cargo-target \
//! KEBAB_ARCTIC_OLLAMA_ENDPOINT=http://192.168.0.47:11434 \
//! cargo test -p kebab-embed-candle --test arctic_ollama_parity -- --ignored --nocapture
//! ```
//! The endpoint defaults to `http://192.168.0.47:11434` if the
//! `KEBAB_ARCTIC_OLLAMA_ENDPOINT` environment variable is unset.
//!
//! Record the printed `ARCTIC_PARITY_SUMMARY cosine_min=...` in
//! `/tmp/arctic-result.md` + `tasks/HOTFIXES.md`.
use kebab_config::Config;
use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind};
use kebab_embed_candle::CandleEmbedder;
use kebab_embed_ollama::OllamaEmbedder;
/// Path of the canonical dogfood config that `base_config` tries to load first.
const DOGFOOD_CONFIG: &str = "/build/dogfood/config.toml";
/// Ollama endpoint used when `KEBAB_ARCTIC_OLLAMA_ENDPOINT` is not set in the
/// environment.
const DEFAULT_OLLAMA_ENDPOINT: &str = "http://192.168.0.47:11434";
/// Probe sentences for the parity check: mixed Korean / English text in the
/// descriptive-recall shapes arctic was adopted for (synonym / abbreviation /
/// English term).
///
/// The sentences themselves carry no prefix. The test embeds every entry once
/// as a document and once as a query, which is how both prefix paths get
/// exercised.
const SENTENCES: &[&str] = &[
"스택 자료구조",
"후입선출 방식으로 동작하는 자료구조",
"큐는 선입선출 자료구조이다",
"Rust ownership and the borrow checker",
"소유권과 빌림 검사기는 메모리 안전성을 보장한다",
"SVM 은 support vector machine 의 약자이다",
"정렬 알고리즘의 시간 복잡도",
"The capital of France is Paris.",
];
/// Cosine similarity between `a` and `b`.
///
/// Returns `0.0` when the product of the two norms is zero (e.g. an all-zero
/// or empty vector) instead of the `NaN` that `0.0 / 0.0` would produce. A
/// `NaN` result would be silently dropped by an `f32::min` fold, letting a
/// degenerate embedding slip past a threshold check; `0.0` fails it loudly.
///
/// `NaN` components in the inputs still propagate to a `NaN` result, so
/// callers should check `is_finite()` on the returned value.
///
/// Callers are expected to pass slices of equal length: the dot product is
/// taken over the common prefix (`zip` stops at the shorter slice) while each
/// norm is taken over its full slice.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    let denom = na * nb;
    if denom == 0.0 {
        // Undefined direction: report "no similarity" rather than NaN.
        return 0.0;
    }
    dot / denom
}
/// Starting config shared by both embedders.
///
/// Tries the canonical dogfood config first (it supplies the storage/cache
/// roots). Any load failure falls back to `Config::defaults()`, so the test
/// still runs on a bare clone where that file does not exist.
fn base_config() -> Config {
    let dogfood_path = std::path::Path::new(DOGFOOD_CONFIG);
    match Config::load(Some(dogfood_path)) {
        Ok(loaded) => loaded,
        Err(_) => Config::defaults(),
    }
}
/// Parity gate: embeds every probe sentence with both the in-process candle
/// arctic model and the Ollama `snowflake-arctic-embed2` route, then requires
/// the worst per-sentence cosine to exceed 0.99.
///
/// Each sentence is embedded as both a document and a query so that both
/// prefix paths are compared. Every cosine must be finite: `f32::min` returns
/// the non-NaN operand, so without that check a NaN similarity would be
/// skipped by the running minimum and the gate could pass vacuously.
///
/// The Ollama endpoint comes from `KEBAB_ARCTIC_OLLAMA_ENDPOINT`, falling back
/// to `DEFAULT_OLLAMA_ENDPOINT`.
#[test]
#[ignore = "needs ~2GB candle model + live Ollama (snowflake-arctic-embed2); run manually before merge"]
fn candle_arctic_matches_ollama_arctic() {
    let endpoint = std::env::var("KEBAB_ARCTIC_OLLAMA_ENDPOINT")
        .unwrap_or_else(|_| DEFAULT_OLLAMA_ENDPOINT.to_string());

    // candle side: the in-process arctic model.
    let mut candle_cfg = base_config();
    candle_cfg.models.embedding.provider = "candle".to_string();
    candle_cfg.models.embedding.model = "snowflake-arctic-embed-l-v2.0".to_string();
    candle_cfg.models.embedding.dimensions = 1024;

    // Ollama side: the reference route the recall numbers came from.
    let mut ollama_cfg = base_config();
    ollama_cfg.models.embedding.provider = "ollama".to_string();
    ollama_cfg.models.embedding.model = "snowflake-arctic-embed2".to_string();
    ollama_cfg.models.embedding.dimensions = 1024;
    ollama_cfg.models.embedding.endpoint = Some(endpoint.clone());

    let candle = CandleEmbedder::new(&candle_cfg).expect("build candle arctic embedder");
    let ollama = OllamaEmbedder::new(&ollama_cfg).expect("build ollama arctic embedder");

    // Exercise BOTH prefix paths so a query-side divergence can't hide.
    let inputs: Vec<EmbeddingInput> = SENTENCES
        .iter()
        .flat_map(|s| {
            [EmbeddingKind::Document, EmbeddingKind::Query]
                .into_iter()
                .map(move |kind| EmbeddingInput { text: s, kind })
        })
        .collect();

    let cv = candle.embed(&inputs).expect("candle embed");
    let ov = ollama
        .embed(&inputs)
        .expect("ollama embed (is snowflake-arctic-embed2 pulled @ the endpoint?)");

    assert_eq!(cv.len(), ov.len(), "embedding counts must match");
    assert_eq!(cv.len(), inputs.len(), "one vector per input");
    assert_eq!(candle.dimensions(), 1024);

    let mut min_cos = f32::INFINITY;
    for (i, inp) in inputs.iter().enumerate() {
        assert_eq!(cv[i].len(), 1024, "candle dim");
        assert_eq!(ov[i].len(), 1024, "ollama dim");

        let c = cosine(&cv[i], &ov[i]);
        // A NaN here would be ignored by `f32::min` below and never reach the
        // threshold check, so reject it explicitly.
        assert!(
            c.is_finite(),
            "[{i}] cosine is not finite ({c}) — zero or NaN embedding from one of the routes"
        );
        min_cos = min_cos.min(c);

        let kind = match inp.kind {
            EmbeddingKind::Document => "doc",
            EmbeddingKind::Query => "qry",
        };
        let preview: String = inp.text.chars().take(36).collect();
        println!("[{i:>2}] {kind} cos={c:.6} {preview}");
    }

    println!("ARCTIC_PARITY_SUMMARY cosine_min={min_cos:.6} endpoint={endpoint}");

    // Still at the INFINITY seed means no pair was compared; don't let that
    // satisfy `> 0.99`.
    assert!(
        min_cos.is_finite(),
        "no cosine was measured (cosine_min={min_cos}); the parity gate compared nothing"
    );
    assert!(
        min_cos > 0.99,
        "candle arctic vs Ollama arctic cosine_min={min_cos:.6} ≤ 0.99 — \
         pooling/prefix mismatch; the recall=130 measurement will NOT reproduce"
    );
}