e5 하드코딩(HF_MODEL/SUPPORTED_MODEL/mean/query:+passage:) → 모델 레지스트리
EmbedModelSpec{name,hf_repo,pooling,query_prefix,doc_prefix,dim,version_tag}.
e5(mean, query:/passage:) + arctic(CLS, query:/무접두어). pooling 모델별 분기
(mean=attention-mask-weighted / CLS=hidden[:,0,:]), tokenize/forward/L2 공유.
arctic pooling=CLS 는 HF 1_Pooling/config.json(pooling_mode_cls_token:true) 확인.
model_version 은 arctic 일 때 +arctic-cls 태그(embedding_version cascade 트리거);
e5 는 fastembed-e5 호환(NUMA 드롭인) 위해 plain config.version 유지.
correctness 게이트: tests/arctic_ollama_parity.rs (#[ignore], live Ollama) —
candle arctic vs Ollama snowflake-arctic-embed2 per-sentence 코사인>0.99.
수동 실측 cosine_min=0.999984 (recall@10 130 재현 보장).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
51 lines
2.1 KiB
TOML
51 lines
2.1 KiB
TOML
[package]
name = "kebab-embed-candle"
# Every key below except `description` is inherited from the workspace root
# manifest (`[workspace.package]`).
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true
# NOTE(review): the text names only multilingual-e5-large and `kb_core`, while
# this manifest depends on `kebab-core` — confirm the wording is still current.
description = "Pure-Rust candle adapter implementing kb_core::Embedder (multilingual-e5-large, NUMA-safe thread cap)"
|
||
|
||
[dependencies]
anyhow.workspace = true
# candle-core / candle-nn / candle-transformers: held on the workspace-locked
# crates.io release (0.10.x) — the same versions the Phase 0 spike compiled —
# so existing build artifacts are reused.
candle-core = "0.10.2"
candle-nn = "0.10.2"
candle-transformers = "0.10.2"
hf-hub = { version = "0.4", features = ["ureq"] }
kebab-config = { path = "../kebab-config" }
kebab-core = { path = "../kebab-core" }
# Thread cap: a one-shot global rayon pool bounds candle's CPU thread count
# (the Phase 0 spike proved that RAYON_NUM_THREADS caps candle). This lets a
# NUMA host steer clear of the heap corruption tied to onnxruntime's
# hard-coded 48 intra-op threads.
rayon = "1"
serde_json.workspace = true
tokenizers = "0.21"
tracing.workspace = true
|
||
|
||
[features]
# Opt-in Apple Silicon GPU (Metal) execution; macOS-only. Enabling it turns on
# candle's metal backend, and `select_device()` then picks Metal, falling back
# to CPU on failure. An M-series Mac can thereby ingest e5-large on the GPU
# (10×+ vs CPU). The vectors are cross-compatible with the CPU path (same
# model), so the Linux server can keep serving queries on CPU candle.
metal = [
    "candle-core/metal",
    "candle-nn/metal",
    "candle-transformers/metal",
]
|
||
|
||
[dev-dependencies]
# Only test-only crates belong here. Cargo also links every [dependencies]
# entry into integration-test binaries, so kebab-config, kebab-core and rayon
# (used by tests/parity.rs and tests/thread_cap.rs) do not need repeating.
# NOTE(review): presumably the comparison implementation for tests/parity.rs — confirm.
kebab-embed-local = { path = "../kebab-embed-local" }
# Real Ollama adapter: supplies the reference vectors for the arctic↔Ollama
# parity test (tests/arctic_ollama_parity.rs, `#[ignore]` — needs live Ollama).
kebab-embed-ollama = { path = "../kebab-embed-ollama" }
tempfile = { workspace = true }
|
||
|
||
[lints]
# Adopt the lint configuration declared at the workspace root (`[workspace.lints]`).
workspace = true
|