Merge pull request 'feat(embed): arctic-embed-l-v2.0 임베더(candle+ollama)' (#203) from feat/arctic-embedder into main
Reviewed-on: #203
This commit was merged in pull request #203.
This commit is contained in:
63
Cargo.lock
generated
63
Cargo.lock
generated
@@ -4724,7 +4724,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -4739,6 +4739,7 @@ dependencies = [
|
||||
"kebab-embed",
|
||||
"kebab-embed-candle",
|
||||
"kebab-embed-local",
|
||||
"kebab-embed-ollama",
|
||||
"kebab-llm",
|
||||
"kebab-llm-local",
|
||||
"kebab-nli",
|
||||
@@ -4771,7 +4772,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-chunk"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4789,7 +4790,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-cli"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -4810,7 +4811,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-config"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs 5.0.1",
|
||||
@@ -4826,7 +4827,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-core"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4840,7 +4841,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4854,7 +4855,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-candle"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"candle-core",
|
||||
@@ -4864,6 +4865,7 @@ dependencies = [
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"kebab-embed-local",
|
||||
"kebab-embed-ollama",
|
||||
"rayon",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -4873,7 +4875,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-local"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fastembed",
|
||||
@@ -4884,9 +4886,24 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-ollama"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"reqwest 0.12.28",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-eval"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4905,7 +4922,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4914,7 +4931,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm-local"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4931,7 +4948,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-mcp"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4949,7 +4966,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-nli"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"hf-hub",
|
||||
@@ -4964,7 +4981,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-code"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"gix",
|
||||
@@ -4987,7 +5004,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
@@ -5011,7 +5028,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -5028,7 +5045,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5043,7 +5060,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-rag"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5065,7 +5082,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-search"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
@@ -5084,7 +5101,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-source-fs"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5102,7 +5119,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-sqlite"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -5122,7 +5139,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-vector"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrow",
|
||||
@@ -5146,7 +5163,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-tui"
|
||||
version = "0.25.0"
|
||||
version = "0.26.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossterm",
|
||||
|
||||
@@ -12,6 +12,7 @@ members = [
|
||||
"crates/kebab-embed",
|
||||
"crates/kebab-embed-local",
|
||||
"crates/kebab-embed-candle",
|
||||
"crates/kebab-embed-ollama",
|
||||
"crates/kebab-llm",
|
||||
"crates/kebab-llm-local",
|
||||
"crates/kebab-rag",
|
||||
@@ -31,7 +32,7 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.25.0" # v0.25.0 — doc-side expansion(별칭) 기능 완전 제거: Chunk.aliases / expansion.rs / IngestExpansionCfg / alias lexical arm / expansion_progress wire kind 제거, 신규 마이그레이션 V013 이 chunk_aliases_fts + chunks.aliases DROP. AssetTimings.expansion_ms 는 wire 호환 위해 값 0 유지. 별칭 default-off 였어 사용자 체감 0. — CLAUDE.md §Release
|
||||
version = "0.26.0" # v0.26.0 — arctic-embed-l-v2.0 임베더 통합: kebab-embed-candle 다중 모델 레지스트리(e5 mean + arctic CLS, 모델별 pooling/prefix 분기) + 신규 kebab-embed-ollama 크레이트(provider="ollama", POST /api/embed, L2 정규화, batch+fail-soft). config models.embedding.provider 에 "ollama" 추가 + endpoint: Option<String>. 기본 동작 불변(provider=fastembed e5), arctic 은 opt-in, embedding_version cascade(arctic-cls / ollama:{model} 태그). — CLAUDE.md §Release
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
|
||||
@@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
|
||||
|
||||
- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
|
||||
- **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`.
|
||||
- **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`.
|
||||
- **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. (1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29).
|
||||
|
||||
34
README.md
34
README.md
@@ -111,18 +111,46 @@ root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경
|
||||
|
||||
[models.embedding]
|
||||
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)
|
||||
# / "none"(lexical-only). candle 는 같은 모델·같은 벡터를
|
||||
# 순수 Rust 로 돌려 NUMA 서버의 onnxruntime 48-스레드
|
||||
# double-free 를 피하는 opt-in 백엔드 (재색인 불필요).
|
||||
# / "ollama"(원격 HTTP) / "none"(lexical-only).
|
||||
# candle 는 같은 모델·같은 벡터를 순수 Rust 로 돌려
|
||||
# NUMA 서버의 onnxruntime 48-스레드 double-free 를 피하는
|
||||
# opt-in 백엔드 (e5 는 재색인 불필요).
|
||||
model = "multilingual-e5-large" # 다국어 sentence embedding (1024-dim).
|
||||
# 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드.
|
||||
# candle provider 는 safetensors (~2GB) 다운로드.
|
||||
# candle/ollama 는 "snowflake-arctic-embed-l-v2.0"
|
||||
# (설명형 query 의 recall 보강) 도 지원 — 아래 참고.
|
||||
dimensions = 1024 # config 와 LanceDB stored dim 불일치 시 검색 0건.
|
||||
num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto=#cores).
|
||||
# env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은
|
||||
# numactl 과 조합. fastembed provider 는 무시.
|
||||
# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용 HTTP endpoint.
|
||||
# 생략 시 [models.llm].endpoint 로 폴백.
|
||||
# fastembed/candle provider 는 무시.
|
||||
```
|
||||
|
||||
**arctic-embed-l-v2.0 (설명형 query recall 보강)**: 기본 e5-large 대신
|
||||
Snowflake `arctic-embed-l-v2.0` 임베더를 쓸 수 있다 (1024-dim, opt-in). 측정에서
|
||||
설명형/약어/영문 용어 query 의 recall@10 이 e5 대비 향상됐다. 두 경로:
|
||||
|
||||
```toml
|
||||
# (A) candle 백엔드 — 순수 Rust, in-process (NUMA 안전, Metal GPU 가능):
|
||||
[models.embedding]
|
||||
provider = "candle"
|
||||
model = "snowflake-arctic-embed-l-v2.0" # CLS pooling, query 에 "query: " 접두어
|
||||
# (문서는 무접두어). safetensors ~2GB 다운로드.
|
||||
|
||||
# (B) ollama 백엔드 — 원격/로컬 Ollama 데몬에 위임 (POST /api/embed):
|
||||
[models.embedding]
|
||||
provider = "ollama"
|
||||
model = "snowflake-arctic-embed2" # Ollama 모델 태그 (ollama pull 필요)
|
||||
endpoint = "http://127.0.0.1:11434" # 생략 시 [models.llm].endpoint
|
||||
```
|
||||
|
||||
> ⚠️ e5 → arctic 전환은 `embedding_version` cascade 를 트리거한다 (모델이 다르면
|
||||
> 벡터도 다름). 기존 e5 KB 와 혼용 불가 — 전환 시 **재색인** 필요 (`kebab reset`
|
||||
> 후 재 ingest). 기본값은 e5 라 기존 사용자는 영향 없음.
|
||||
|
||||
**Apple Silicon GPU 가속 (candle / macOS)**: M-시리즈 맥에서 candle 임베딩을
|
||||
GPU(Metal)로 돌리면 CPU 대비 대용량 ingest 가 크게 빨라진다. 빌드 또는 설치 시
|
||||
`embed_metal` feature 를 켠다:
|
||||
|
||||
@@ -19,6 +19,7 @@ kebab-search = { path = "../kebab-search" }
|
||||
kebab-embed = { path = "../kebab-embed" }
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
kebab-embed-candle = { path = "../kebab-embed-candle" }
|
||||
kebab-embed-ollama = { path = "../kebab-embed-ollama" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
kebab-rag = { path = "../kebab-rag" }
|
||||
|
||||
@@ -45,6 +45,7 @@ use kebab_core::{
|
||||
};
|
||||
use kebab_embed_candle::CandleEmbedder;
|
||||
use kebab_embed_local::FastembedEmbedder;
|
||||
use kebab_embed_ollama::OllamaEmbedder;
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_code::{
|
||||
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
|
||||
@@ -834,11 +835,13 @@ impl App {
|
||||
if let Some(e) = self.embedder.get() {
|
||||
return Ok(Some(e.clone()));
|
||||
}
|
||||
// Provider branch (Track 1 spec §3). `embeddings_disabled()` above
|
||||
// already handled `"none"`; here we route the live providers.
|
||||
// `fastembed`/`onnx`/(empty) keep the default onnxruntime path
|
||||
// (vectors unchanged — `embedding_version` is preserved); `candle`
|
||||
// selects the pure-Rust NUMA-safe backend.
|
||||
// Provider branch (Track 1 spec §3 + arctic-embedder spec). The
|
||||
// `embeddings_disabled()` check above already handled `"none"`; here we
|
||||
// route the live providers. `fastembed`/`onnx`/(empty) keep the default
|
||||
// onnxruntime path (vectors unchanged — `embedding_version` is
|
||||
// preserved); `candle` selects the pure-Rust NUMA-safe backend (e5 or
|
||||
// arctic via its model registry); `ollama` offloads to a remote
|
||||
// `/api/embed` daemon.
|
||||
let provider = self.config.models.embedding.provider.as_str();
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = match provider {
|
||||
"fastembed" | "onnx" | "" => Arc::new(
|
||||
@@ -847,10 +850,13 @@ impl App {
|
||||
"candle" => Arc::new(
|
||||
CandleEmbedder::new(&self.config).context("kb-app: load CandleEmbedder")?,
|
||||
),
|
||||
"ollama" => Arc::new(
|
||||
OllamaEmbedder::new(&self.config).context("kb-app: load OllamaEmbedder")?,
|
||||
),
|
||||
other => {
|
||||
return Err(anyhow!(
|
||||
"kb-app: unknown embedding provider {other:?}; expected one of \
|
||||
`fastembed` (default), `candle`, or `none` (lexical-only)"
|
||||
`fastembed` (default), `candle`, `ollama`, or `none` (lexical-only)"
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -155,9 +155,10 @@ impl NliCfg {
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EmbeddingModelCfg {
|
||||
/// `fastembed` (default, onnxruntime) or `candle` (pure-Rust,
|
||||
/// NUMA-safe). `none` disables embeddings (lexical-only). Unknown
|
||||
/// values error at embedder construction.
|
||||
/// `fastembed` (default, onnxruntime), `candle` (pure-Rust, NUMA-safe),
|
||||
/// or `ollama` (remote HTTP embedding endpoint). `none` disables
|
||||
/// embeddings (lexical-only). Unknown values error at embedder
|
||||
/// construction.
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub version: String,
|
||||
@@ -170,6 +171,13 @@ pub struct EmbeddingModelCfg {
|
||||
/// provider. Defaulted on load so pre-0.22 config files still parse.
|
||||
#[serde(default)]
|
||||
pub num_threads: u32,
|
||||
/// HTTP endpoint for the `ollama` embedding provider (e.g.
|
||||
/// `"http://127.0.0.1:11434"`). `None` (or a missing key in TOML) means
|
||||
/// "fall back to `models.llm.endpoint`" — same convention as the OCR /
|
||||
/// vision endpoints. Ignored by the `fastembed` / `candle` providers.
|
||||
/// Defaulted on load so pre-0.26 config files still parse.
|
||||
#[serde(default)]
|
||||
pub endpoint: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -688,6 +696,7 @@ impl Config {
|
||||
dimensions: 1024,
|
||||
batch_size: 64,
|
||||
num_threads: 0,
|
||||
endpoint: None,
|
||||
},
|
||||
llm: LlmCfg {
|
||||
provider: "ollama".to_string(),
|
||||
@@ -950,6 +959,12 @@ impl Config {
|
||||
self.models.embedding.num_threads = n;
|
||||
}
|
||||
}
|
||||
"KEBAB_MODELS_EMBEDDING_ENDPOINT" => {
|
||||
// Empty value → None (= fall back to models.llm.endpoint),
|
||||
// mirroring the OCR endpoint override semantics.
|
||||
self.models.embedding.endpoint =
|
||||
if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
|
||||
// models.llm
|
||||
"KEBAB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(),
|
||||
|
||||
@@ -38,6 +38,9 @@ metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
|
||||
# not the library's own (non-dev) dependencies — so rayon/kebab-config/kebab-core
|
||||
# are repeated here for tests/parity.rs and tests/thread_cap.rs.
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
# arctic↔Ollama parity test drives the real Ollama adapter for the reference
|
||||
# vectors (tests/arctic_ollama_parity.rs, `#[ignore]` — live Ollama).
|
||||
kebab-embed-ollama = { path = "../kebab-embed-ollama" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
rayon = "1"
|
||||
|
||||
@@ -1,31 +1,44 @@
|
||||
//! `kebab-embed-candle` — [`CandleEmbedder`], a pure-Rust (candle)
|
||||
//! implementation of [`Embedder`](kebab_core::Embedder).
|
||||
//!
|
||||
//! Runs the same `intfloat/multilingual-e5-large` model as the default
|
||||
//! [`FastembedEmbedder`](kebab_embed_local) but through `candle`
|
||||
//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Motivation:
|
||||
//! fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which corrupts
|
||||
//! the heap (double-free) on dual-socket NUMA hosts. candle's CPU backend
|
||||
//! sizes its threads off the global rayon pool, so a one-shot
|
||||
//! Runs an XLM-RoBERTa-large embedding model through `candle`
|
||||
//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Two models
|
||||
//! are wired through a small **registry** ([`MODEL_REGISTRY`]):
|
||||
//!
|
||||
//! * `multilingual-e5-large` — the same weights the default
|
||||
//! [`FastembedEmbedder`](kebab_embed_local) uses (mean pooling,
|
||||
//! `query: `/`passage: ` prefixes). candle is the NUMA-safe drop-in:
|
||||
//! fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which
|
||||
//! corrupts the heap (double-free) on dual-socket NUMA hosts. candle's
|
||||
//! CPU backend sizes its threads off the global rayon pool, so a one-shot
|
||||
//! [`rayon::ThreadPoolBuilder`] cap (config `num_threads` / env
|
||||
//! `KEBAB_EMBED_THREADS`) keeps the worker count NUMA-safe.
|
||||
//! * `snowflake-arctic-embed-l-v2.0` — Snowflake's arctic-embed v2.0
|
||||
//! (CLS pooling, `query: ` on queries / no prefix on documents). Same
|
||||
//! XLM-RoBERTa-large architecture, dim 1024, so it rides the exact same
|
||||
//! tokenize → forward → L2 pipeline; only the pooling step and prefixes
|
||||
//! differ (both keyed off the per-model [`EmbedModelSpec`]).
|
||||
//!
|
||||
//! Output parity with the onnxruntime path was proven by the Phase 0 spike
|
||||
//! (cosine 1.000000); this crate absorbs that pipeline verbatim:
|
||||
//! Output parity with the onnxruntime path (for e5) was proven by the
|
||||
//! Phase 0 spike (cosine 1.000000); the arctic path's pooling/prefix
|
||||
//! correctness is pinned by an `#[ignore]`d cosine>0.99 cross-check against
|
||||
//! Ollama's `snowflake-arctic-embed2` (see `tests/arctic_ollama_parity.rs`).
|
||||
//! The shared pipeline:
|
||||
//!
|
||||
//! 1. e5 prefix (`passage: ` for documents, `query: ` for queries — the same
|
||||
//! convention as `kebab-embed-local`'s `prefix_input`);
|
||||
//! 1. instruction prefix per [`EmbedModelSpec`] (query/doc);
|
||||
//! 2. tokenize (max_len 512, batch-longest padding, special tokens);
|
||||
//! 3. XLM-RoBERTa forward on `Device::Cpu`;
|
||||
//! 4. attention-mask-weighted mean pooling;
|
||||
//! 3. XLM-RoBERTa forward on the selected [`Device`];
|
||||
//! 4. pooling — mean (attention-mask-weighted) or CLS (first token);
|
||||
//! 5. L2 normalization.
|
||||
//!
|
||||
//! Model files (`config.json`, `tokenizer.json`, `model.safetensors`) are
|
||||
//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/`.
|
||||
//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/` (hf-hub's
|
||||
//! cache layout namespaces by repo, so e5 and arctic never collide).
|
||||
//!
|
||||
//! This crate is **opt-in** (`config.models.embedding.provider = "candle"`);
|
||||
//! the default provider stays `fastembed`. See
|
||||
//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md`.
|
||||
//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md` and
|
||||
//! `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -42,22 +55,95 @@ use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer, TruncationParams};
|
||||
/// `fastembed/` subdir so the two backends never collide.
|
||||
const CANDLE_CACHE_SUBDIR: &str = "candle";
|
||||
|
||||
/// HuggingFace repo id for the multilingual e5 large model. Same weights the
|
||||
/// onnxruntime path uses, just the safetensors variant candle can read.
|
||||
const HF_MODEL: &str = "intfloat/multilingual-e5-large";
|
||||
|
||||
/// The only `config.models.embedding.model` value the candle adapter accepts
|
||||
/// (the e5-large weights `HF_MODEL` resolves to). Guards against silently
|
||||
/// downloading e5-large while `model_id()` reports a different name.
|
||||
const SUPPORTED_MODEL: &str = "multilingual-e5-large";
|
||||
|
||||
/// Token truncation length (e5 was trained at 512).
|
||||
/// Token truncation length (both e5 and arctic-embed-l-v2.0 train at 512).
|
||||
const MAX_LEN: usize = 512;
|
||||
|
||||
/// Env var that overrides `config.models.embedding.num_threads`. Read once in
|
||||
/// [`CandleEmbedder::new`]; `0`/unset/unparseable means "leave rayon default".
|
||||
const ENV_EMBED_THREADS: &str = "KEBAB_EMBED_THREADS";
|
||||
|
||||
/// Pooling strategy over the model's last hidden state. Keyed per-model by
|
||||
/// [`EmbedModelSpec::pooling`] — e5 is mean, arctic is CLS.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum Pooling {
|
||||
/// Attention-mask-weighted mean over all tokens (e5 / sentence-transformers
|
||||
/// `pooling_mode_mean_tokens`).
|
||||
Mean,
|
||||
/// First token (`<s>`/`[CLS]`) hidden state (arctic-embed v2.0 —
|
||||
/// `1_Pooling/config.json` has `pooling_mode_cls_token: true`).
|
||||
Cls,
|
||||
}
|
||||
|
||||
/// One supported embedding model: the HF repo candle downloads, the pooling
|
||||
/// strategy, and the e5-style instruction prefixes. [`MODEL_REGISTRY`] maps a
|
||||
/// `config.models.embedding.model` value to one of these.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct EmbedModelSpec {
|
||||
/// The short `config.models.embedding.model` value that selects this spec.
|
||||
pub name: &'static str,
|
||||
/// HuggingFace repo id candle fetches `config.json` / `tokenizer.json` /
|
||||
/// `model.safetensors` from.
|
||||
pub hf_repo: &'static str,
|
||||
/// Pooling over the last hidden state.
|
||||
pub pooling: Pooling,
|
||||
/// Prefix prepended to **query** inputs before tokenization.
|
||||
pub query_prefix: &'static str,
|
||||
/// Prefix prepended to **document** inputs before tokenization (arctic
|
||||
/// uses `""` — documents are embedded raw).
|
||||
pub doc_prefix: &'static str,
|
||||
/// Expected embedding dimension (model hidden size).
|
||||
pub dim: usize,
|
||||
/// Suffix folded into `model_version` so switching **to** this model
|
||||
/// triggers the `embedding_version` cascade even if the operator forgets
|
||||
/// to bump `config.version`. `None` keeps the bare `config.version` — used
|
||||
/// by e5 so candle-e5 and fastembed-e5 report the *same* version and stay
|
||||
/// interchangeable (the NUMA drop-in invariant — Phase 0 cosine 1.0).
|
||||
pub version_tag: Option<&'static str>,
|
||||
}
|
||||
|
||||
/// The models the candle adapter can load. Adding a model = one entry here
|
||||
/// (plus, for a non-XLM-R architecture, a new forward path — both current
|
||||
/// entries are XLM-RoBERTa-large so they share everything but pooling/prefix).
|
||||
static MODEL_REGISTRY: &[EmbedModelSpec] = &[
|
||||
EmbedModelSpec {
|
||||
name: "multilingual-e5-large",
|
||||
hf_repo: "intfloat/multilingual-e5-large",
|
||||
pooling: Pooling::Mean,
|
||||
query_prefix: "query: ",
|
||||
doc_prefix: "passage: ",
|
||||
dim: 1024,
|
||||
version_tag: None,
|
||||
},
|
||||
EmbedModelSpec {
|
||||
name: "snowflake-arctic-embed-l-v2.0",
|
||||
hf_repo: "Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||
pooling: Pooling::Cls,
|
||||
query_prefix: "query: ",
|
||||
doc_prefix: "",
|
||||
dim: 1024,
|
||||
version_tag: Some("arctic-cls"),
|
||||
},
|
||||
];
|
||||
|
||||
/// Look up a model spec by `config.models.embedding.model`. Accepts either the
|
||||
/// short `name` or the full `hf_repo` id (mirrors the old e5 guard, which
|
||||
/// accepted both `multilingual-e5-large` and `intfloat/multilingual-e5-large`).
|
||||
pub(crate) fn lookup_spec(model: &str) -> Option<&'static EmbedModelSpec> {
|
||||
MODEL_REGISTRY
|
||||
.iter()
|
||||
.find(|s| s.name == model || s.hf_repo == model)
|
||||
}
|
||||
|
||||
/// Comma-separated list of supported model names, for the
|
||||
/// unsupported-model error message.
|
||||
fn supported_models() -> String {
|
||||
MODEL_REGISTRY
|
||||
.iter()
|
||||
.map(|s| s.name)
|
||||
.collect::<Vec<_>>()
|
||||
.join("`, `")
|
||||
}
|
||||
|
||||
/// Pure-Rust candle adapter. Construct via [`CandleEmbedder::new`]; the
|
||||
/// constructor downloads the model on first use, so share one instance.
|
||||
pub struct CandleEmbedder {
|
||||
@@ -68,6 +154,9 @@ pub struct CandleEmbedder {
|
||||
model: Mutex<XLMRobertaModel>,
|
||||
tokenizer: Tokenizer,
|
||||
device: Device,
|
||||
/// The resolved model spec (pooling + prefixes) — drives `embed` and
|
||||
/// `embed_batch`.
|
||||
spec: &'static EmbedModelSpec,
|
||||
model_id: EmbeddingModelId,
|
||||
version: EmbeddingVersion,
|
||||
dimensions: usize,
|
||||
@@ -75,7 +164,8 @@ pub struct CandleEmbedder {
|
||||
}
|
||||
|
||||
impl CandleEmbedder {
|
||||
/// Build an embedder from `Config`. Applies the NUMA thread cap, fetches
|
||||
/// Build an embedder from `Config`. Resolves the model spec from
|
||||
/// `config.models.embedding.model`, applies the NUMA thread cap, fetches
|
||||
/// the model into `{model_dir}/candle/`, and validates that the model's
|
||||
/// hidden size matches `config.models.embedding.dimensions` before
|
||||
/// returning.
|
||||
@@ -104,21 +194,20 @@ impl CandleEmbedder {
|
||||
}
|
||||
}
|
||||
|
||||
// 1b. Model guard. `HF_MODEL` is hard-coded (candle currently only wires
|
||||
// e5-large), so if the operator configured a *different* model name
|
||||
// we must NOT silently download e5-large and then label its vectors
|
||||
// with the configured name via `model_id()` — that would mislabel
|
||||
// `embedding_version` and corrupt a mixed index. Fail fast, before
|
||||
// the ~2GB download.
|
||||
// 1b. Model registry lookup. If the operator configured a model the
|
||||
// candle adapter doesn't know, fail fast (BEFORE the ~2GB
|
||||
// download) — never silently download one model and then label its
|
||||
// vectors with another name via `model_id()`, which would mislabel
|
||||
// `embedding_version` and corrupt a mixed index.
|
||||
let want = config.models.embedding.model.as_str();
|
||||
if want != SUPPORTED_MODEL && want != HF_MODEL {
|
||||
anyhow::bail!(
|
||||
"candle provider currently supports only '{SUPPORTED_MODEL}' (or \
|
||||
the HF id '{HF_MODEL}'), but config.models.embedding.model = \
|
||||
'{want}'. Use provider=fastembed for other models, or set \
|
||||
model = \"{SUPPORTED_MODEL}\"."
|
||||
);
|
||||
}
|
||||
let spec = lookup_spec(want).ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"candle provider supports the models `{}`, but \
|
||||
config.models.embedding.model = '{want}'. Use provider=fastembed \
|
||||
for other models, or pick a supported one.",
|
||||
supported_models()
|
||||
)
|
||||
})?;
|
||||
|
||||
// 2. Resolve `{data_dir}/models/candle/` exactly like the fastembed
|
||||
// adapter resolves its own subdir.
|
||||
@@ -134,14 +223,15 @@ impl CandleEmbedder {
|
||||
tracing::info!(
|
||||
target: "kebab-embed-candle",
|
||||
cache_dir = %cache_dir.display(),
|
||||
model = HF_MODEL,
|
||||
model = spec.hf_repo,
|
||||
pooling = ?spec.pooling,
|
||||
"loading candle embedding model (first run downloads ~2GB safetensors)"
|
||||
);
|
||||
let api = hf_hub::api::sync::ApiBuilder::new()
|
||||
.with_cache_dir(cache_dir.clone())
|
||||
.build()
|
||||
.context("kb-embed-candle: build hf-hub api")?;
|
||||
let repo = api.model(HF_MODEL.to_string());
|
||||
let repo = api.model(spec.hf_repo.to_string());
|
||||
let config_path = repo.get("config.json").context("download config.json")?;
|
||||
let tokenizer_path = repo
|
||||
.get("tokenizer.json")
|
||||
@@ -180,10 +270,21 @@ impl CandleEmbedder {
|
||||
}))
|
||||
.map_err(|e| anyhow::anyhow!("kb-embed-candle: set truncation: {e}"))?;
|
||||
|
||||
// model_version: fold the model tag in for non-e5 models so a switch
|
||||
// triggers the embedding_version cascade; e5 keeps the bare
|
||||
// config.version to stay interchangeable with fastembed-e5.
|
||||
let version = match spec.version_tag {
|
||||
Some(tag) => {
|
||||
EmbeddingVersion(format!("{}+{}", config.models.embedding.version, tag))
|
||||
}
|
||||
None => EmbeddingVersion(config.models.embedding.version.clone()),
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
target: "kebab-embed-candle",
|
||||
dimensions = cfg.hidden_size,
|
||||
layers = cfg.num_hidden_layers,
|
||||
model = spec.name,
|
||||
"candle embedding model loaded"
|
||||
);
|
||||
|
||||
@@ -191,16 +292,17 @@ impl CandleEmbedder {
|
||||
model: Mutex::new(model),
|
||||
tokenizer,
|
||||
device,
|
||||
spec,
|
||||
model_id: EmbeddingModelId(config.models.embedding.model.clone()),
|
||||
version: EmbeddingVersion(config.models.embedding.version.clone()),
|
||||
version,
|
||||
dimensions: cfg.hidden_size,
|
||||
batch_size: config.models.embedding.batch_size.max(1),
|
||||
})
|
||||
}
|
||||
|
||||
/// Embed one batch of **already-prefixed** strings (the e5 `query:`/
|
||||
/// `passage:` prefix is applied by the caller [`CandleEmbedder::embed`])
|
||||
/// through the candle pipeline: tokenize → forward → masked mean pool → L2.
|
||||
/// Embed one batch of **already-prefixed** strings (the per-model prefix
|
||||
/// is applied by the caller [`CandleEmbedder::embed`]) through the candle
|
||||
/// pipeline: tokenize → forward → pool (mean|CLS) → L2.
|
||||
fn embed_batch(&self, prefixed: &[String]) -> Result<Vec<Vec<f32>>> {
|
||||
let encodings = self
|
||||
.tokenizer
|
||||
@@ -237,18 +339,30 @@ impl CandleEmbedder {
|
||||
guard.forward(&input_ids, &attn_f32, &token_type_ids, None, None, None)?
|
||||
};
|
||||
|
||||
// Pooling — per the model spec.
|
||||
let pooled = match self.spec.pooling {
|
||||
Pooling::Mean => {
|
||||
// attention-mask-weighted mean pooling
|
||||
let mask3 = attn_f32.unsqueeze(2)?; // (b, seq, 1)
|
||||
let summed = hidden.broadcast_mul(&mask3)?.sum(1)?; // (b, hidden)
|
||||
// counts ≥ 1 always: every input is e5-prefixed AND special tokens are
|
||||
// added (encode_batch(_, true)), so no row has an all-zero mask. If that
|
||||
// invariant ever breaks, broadcast_div would emit NaN vectors.
|
||||
// counts ≥ 1 always: every input is prefixed AND special
|
||||
// tokens are added (encode_batch(_, true)), so no row has an
|
||||
// all-zero mask. If that invariant ever breaks, broadcast_div
|
||||
// would emit NaN vectors.
|
||||
let counts = mask3.sum(1)?; // (b, 1)
|
||||
let mean = summed.broadcast_div(&counts)?;
|
||||
summed.broadcast_div(&counts)?
|
||||
}
|
||||
Pooling::Cls => {
|
||||
// CLS pooling: the first token's hidden state. arctic-embed
|
||||
// v2.0 prepends `<s>` (the XLM-R BOS/CLS) at index 0, so
|
||||
// `hidden[:, 0, :]` is the sentence embedding.
|
||||
hidden.narrow(1, 0, 1)?.squeeze(1)? // (b, hidden)
|
||||
}
|
||||
};
|
||||
|
||||
// L2 normalize
|
||||
let norm = mean.sqr()?.sum_keepdim(1)?.sqrt()?;
|
||||
let normalized = mean.broadcast_div(&norm)?;
|
||||
let norm = pooled.sqr()?.sum_keepdim(1)?.sqrt()?;
|
||||
let normalized = pooled.broadcast_div(&norm)?;
|
||||
|
||||
// `.contiguous()` before host copy: broadcast ops can leave a strided
|
||||
// view, which `to_vec2` rejects on the Metal backend (CPU tolerates it).
|
||||
@@ -274,9 +388,9 @@ impl Embedder for CandleEmbedder {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// e5 prefix per §11.3 BEFORE tokenization (same convention as
|
||||
// FastembedEmbedder so the two backends produce comparable vectors).
|
||||
let prefixed: Vec<String> = inputs.iter().map(prefix_input).collect();
|
||||
// Per-model instruction prefix BEFORE tokenization (same convention as
|
||||
// FastembedEmbedder for e5; arctic uses `query: `/no-prefix).
|
||||
let prefixed: Vec<String> = inputs.iter().map(|i| prefix_input(self.spec, i)).collect();
|
||||
|
||||
let mut out: Vec<Vec<f32>> = Vec::with_capacity(prefixed.len());
|
||||
for chunk in prefixed.chunks(self.batch_size) {
|
||||
@@ -298,22 +412,22 @@ impl Embedder for CandleEmbedder {
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the e5-prefixed string for one [`EmbeddingInput`]. Free function so
|
||||
/// a unit test can pin the format without loading the model. Byte-identical to
|
||||
/// `kebab-embed-local`'s `prefix_input` — the two backends MUST agree here or
|
||||
/// their vectors diverge.
|
||||
fn prefix_input(input: &EmbeddingInput<'_>) -> String {
|
||||
/// Build the prefixed string for one [`EmbeddingInput`] using the model spec.
|
||||
/// Free function so a unit test can pin the format without loading the model.
|
||||
/// For e5 this is byte-identical to `kebab-embed-local`'s `prefix_input` — the
|
||||
/// two backends MUST agree there or their vectors diverge.
|
||||
fn prefix_input(spec: &EmbedModelSpec, input: &EmbeddingInput<'_>) -> String {
|
||||
match input.kind {
|
||||
EmbeddingKind::Document => format!("passage: {}", input.text),
|
||||
EmbeddingKind::Query => format!("query: {}", input.text),
|
||||
EmbeddingKind::Document => format!("{}{}", spec.doc_prefix, input.text),
|
||||
EmbeddingKind::Query => format!("{}{}", spec.query_prefix, input.text),
|
||||
}
|
||||
}
|
||||
|
||||
/// Select the compute device. Built with the `metal` feature (Apple Silicon
|
||||
/// GPU), try Metal and fall back to CPU on failure; otherwise CPU. Metal only
|
||||
/// compiles/runs on macOS — the Linux server builds the CPU path. e5-large
|
||||
/// vectors are model-defined, so Metal-produced and CPU-produced embeddings are
|
||||
/// cross-compatible (a Mac can ingest on GPU, the server query on CPU).
|
||||
/// compiles/runs on macOS — the Linux server builds the CPU path. Embedding
|
||||
/// vectors are model-defined, so Metal-produced and CPU-produced embeddings
|
||||
/// are cross-compatible (a Mac can ingest on GPU, the server query on CPU).
|
||||
fn select_device() -> Device {
|
||||
#[cfg(feature = "metal")]
|
||||
{
|
||||
@@ -367,26 +481,85 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// ── prefix_input ─────────────────────────────────────────────────
|
||||
// Pin the exact e5 prefix strings; these MUST match
|
||||
// kebab-embed-local::prefix_input or candle vs fastembed parity breaks.
|
||||
fn e5_spec() -> &'static EmbedModelSpec {
|
||||
lookup_spec("multilingual-e5-large").expect("e5 in registry")
|
||||
}
|
||||
|
||||
fn arctic_spec() -> &'static EmbedModelSpec {
|
||||
lookup_spec("snowflake-arctic-embed-l-v2.0").expect("arctic in registry")
|
||||
}
|
||||
|
||||
// ── registry ─────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn prefix_document_uses_passage() {
|
||||
fn registry_resolves_e5_by_name_and_hf_repo() {
|
||||
assert_eq!(
|
||||
lookup_spec("multilingual-e5-large").map(|s| s.name),
|
||||
Some("multilingual-e5-large")
|
||||
);
|
||||
assert_eq!(
|
||||
lookup_spec("intfloat/multilingual-e5-large").map(|s| s.name),
|
||||
Some("multilingual-e5-large")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn registry_resolves_arctic_and_its_pooling_is_cls() {
|
||||
let s = arctic_spec();
|
||||
assert_eq!(s.name, "snowflake-arctic-embed-l-v2.0");
|
||||
assert_eq!(s.hf_repo, "Snowflake/snowflake-arctic-embed-l-v2.0");
|
||||
assert_eq!(s.pooling, Pooling::Cls);
|
||||
assert_eq!(s.dim, 1024);
|
||||
assert_eq!(s.version_tag, Some("arctic-cls"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn registry_e5_is_mean_pooling_no_version_tag() {
|
||||
let s = e5_spec();
|
||||
assert_eq!(s.pooling, Pooling::Mean);
|
||||
assert_eq!(s.version_tag, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn registry_rejects_unknown_model() {
|
||||
assert!(lookup_spec("multilingual-e5-small").is_none());
|
||||
}
|
||||
|
||||
// ── prefix_input ─────────────────────────────────────────────────
|
||||
// e5 prefixes MUST match kebab-embed-local::prefix_input or candle vs
|
||||
// fastembed parity breaks; arctic uses query-only prefixing.
|
||||
|
||||
#[test]
|
||||
fn e5_prefix_document_uses_passage() {
|
||||
let input = EmbeddingInput {
|
||||
text: "hello world",
|
||||
kind: EmbeddingKind::Document,
|
||||
};
|
||||
assert_eq!(prefix_input(&input), "passage: hello world");
|
||||
assert_eq!(prefix_input(e5_spec(), &input), "passage: hello world");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefix_query_uses_query() {
|
||||
fn e5_prefix_query_uses_query() {
|
||||
let input = EmbeddingInput {
|
||||
text: "hello world",
|
||||
kind: EmbeddingKind::Query,
|
||||
};
|
||||
assert_eq!(prefix_input(&input), "query: hello world");
|
||||
assert_eq!(prefix_input(e5_spec(), &input), "query: hello world");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn arctic_prefix_query_uses_query_doc_is_bare() {
|
||||
let doc = EmbeddingInput {
|
||||
text: "후입선출 자료구조",
|
||||
kind: EmbeddingKind::Document,
|
||||
};
|
||||
let qry = EmbeddingInput {
|
||||
text: "스택 자료구조",
|
||||
kind: EmbeddingKind::Query,
|
||||
};
|
||||
// arctic: documents are embedded raw, queries get `query: `.
|
||||
assert_eq!(prefix_input(arctic_spec(), &doc), "후입선출 자료구조");
|
||||
assert_eq!(prefix_input(arctic_spec(), &qry), "query: 스택 자료구조");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -399,8 +572,10 @@ mod tests {
|
||||
text: "",
|
||||
kind: EmbeddingKind::Query,
|
||||
};
|
||||
assert_eq!(prefix_input(&doc), "passage: ");
|
||||
assert_eq!(prefix_input(&qry), "query: ");
|
||||
assert_eq!(prefix_input(e5_spec(), &doc), "passage: ");
|
||||
assert_eq!(prefix_input(e5_spec(), &qry), "query: ");
|
||||
assert_eq!(prefix_input(arctic_spec(), &doc), "");
|
||||
assert_eq!(prefix_input(arctic_spec(), &qry), "query: ");
|
||||
}
|
||||
|
||||
// ── check_dim ────────────────────────────────────────────────────
|
||||
@@ -421,9 +596,9 @@ mod tests {
|
||||
}
|
||||
|
||||
// ── model guard ──────────────────────────────────────────────────
|
||||
// A non-e5-large model name must fail fast (BEFORE the ~2GB download),
|
||||
// so we never download e5-large yet label its vectors with another name
|
||||
// via model_id() — which would mislabel embedding_version.
|
||||
// A model name not in the registry must fail fast (BEFORE the ~2GB
|
||||
// download), so we never download one model yet label its vectors with
|
||||
// another name via model_id() — which would mislabel embedding_version.
|
||||
|
||||
#[test]
|
||||
fn new_rejects_unsupported_model() {
|
||||
@@ -437,8 +612,8 @@ mod tests {
|
||||
.expect("unsupported model must error");
|
||||
let msg = format!("{err:#}");
|
||||
assert!(
|
||||
msg.contains("candle provider currently supports only"),
|
||||
"expected model-guard error, got: {msg}"
|
||||
msg.contains("candle provider supports the models"),
|
||||
"expected model-registry error, got: {msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
128
crates/kebab-embed-candle/tests/arctic_ollama_parity.rs
Normal file
128
crates/kebab-embed-candle/tests/arctic_ollama_parity.rs
Normal file
@@ -0,0 +1,128 @@
|
||||
//! arctic-embed-l-v2.0 correctness gate (`#[ignore]` — needs the ~2GB candle
|
||||
//! model + a live Ollama serving `snowflake-arctic-embed2`).
|
||||
//!
|
||||
//! This is the load-bearing pooling/prefix check for the arctic integration.
|
||||
//! The recall measurement that justified adopting arctic (recall@10 130/132)
|
||||
//! went through Ollama's `snowflake-arctic-embed2`. The candle path
|
||||
//! re-implements the model (XLM-RoBERTa-large + **CLS** pooling + `query: ` on
|
||||
//! queries / no prefix on documents). If candle's pooling or prefix is wrong,
|
||||
//! its vectors silently diverge from the measured route and the 130 number
|
||||
//! does NOT carry over. This test pins them together: per-sentence cosine
|
||||
//! between the candle vector and the Ollama vector must be **> 0.99**.
|
||||
//!
|
||||
//! `#[ignore]` because it depends on an external Ollama daemon (CI is
|
||||
//! headless/offline). The leader MUST run it once before merge.
|
||||
//!
|
||||
//! ## Manual run
|
||||
//!
|
||||
//! 1. Confirm Ollama is reachable and has the model:
|
||||
//! ```sh
|
||||
//! curl -s http://192.168.0.47:11434/api/tags # should list snowflake-arctic-embed2
|
||||
//! ```
|
||||
//! 2. Run (downloads the ~2GB candle safetensors on first run):
|
||||
//! ```sh
|
||||
//! CARGO_TARGET_DIR=/build/out/cargo-target \
|
||||
//! KEBAB_ARCTIC_OLLAMA_ENDPOINT=http://192.168.0.47:11434 \
|
||||
//! cargo test -p kebab-embed-candle --test arctic_ollama_parity -- --ignored --nocapture
|
||||
//! ```
|
||||
//! The endpoint defaults to `http://192.168.0.47:11434` if the env is unset.
|
||||
//!
|
||||
//! Record the printed `ARCTIC_PARITY_SUMMARY cosine_min=...` in
|
||||
//! `/tmp/arctic-result.md` + `tasks/HOTFIXES.md`.
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind};
|
||||
use kebab_embed_candle::CandleEmbedder;
|
||||
use kebab_embed_ollama::OllamaEmbedder;
|
||||
|
||||
const DOGFOOD_CONFIG: &str = "/build/dogfood/config.toml";
|
||||
const DEFAULT_OLLAMA_ENDPOINT: &str = "http://192.168.0.47:11434";
|
||||
|
||||
/// Mixed Korean / English + the descriptive-recall shapes arctic was adopted
|
||||
/// for (synonym / abbreviation / English term). Covers both prefix paths.
|
||||
const SENTENCES: &[&str] = &[
|
||||
"스택 자료구조",
|
||||
"후입선출 방식으로 동작하는 자료구조",
|
||||
"큐는 선입선출 자료구조이다",
|
||||
"Rust ownership and the borrow checker",
|
||||
"소유권과 빌림 검사기는 메모리 안전성을 보장한다",
|
||||
"SVM 은 support vector machine 의 약자이다",
|
||||
"정렬 알고리즘의 시간 복잡도",
|
||||
"The capital of France is Paris.",
|
||||
];
|
||||
|
||||
fn cosine(a: &[f32], b: &[f32]) -> f32 {
|
||||
let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
|
||||
let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
dot / (na * nb)
|
||||
}
|
||||
|
||||
/// Base config: prefer the canonical dogfood config (for storage/cache roots),
|
||||
/// fall back to `Config::defaults()` so the test still runs on a bare clone.
|
||||
fn base_config() -> Config {
|
||||
Config::load(Some(std::path::Path::new(DOGFOOD_CONFIG))).unwrap_or_else(|_| Config::defaults())
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore = "needs ~2GB candle model + live Ollama (snowflake-arctic-embed2); run manually before merge"]
|
||||
fn candle_arctic_matches_ollama_arctic() {
|
||||
let endpoint = std::env::var("KEBAB_ARCTIC_OLLAMA_ENDPOINT")
|
||||
.unwrap_or_else(|_| DEFAULT_OLLAMA_ENDPOINT.to_string());
|
||||
|
||||
// candle side: the in-process arctic model.
|
||||
let mut candle_cfg = base_config();
|
||||
candle_cfg.models.embedding.provider = "candle".to_string();
|
||||
candle_cfg.models.embedding.model = "snowflake-arctic-embed-l-v2.0".to_string();
|
||||
candle_cfg.models.embedding.dimensions = 1024;
|
||||
|
||||
// Ollama side: the reference route the recall numbers came from.
|
||||
let mut ollama_cfg = base_config();
|
||||
ollama_cfg.models.embedding.provider = "ollama".to_string();
|
||||
ollama_cfg.models.embedding.model = "snowflake-arctic-embed2".to_string();
|
||||
ollama_cfg.models.embedding.dimensions = 1024;
|
||||
ollama_cfg.models.embedding.endpoint = Some(endpoint.clone());
|
||||
|
||||
let candle = CandleEmbedder::new(&candle_cfg).expect("build candle arctic embedder");
|
||||
let ollama = OllamaEmbedder::new(&ollama_cfg).expect("build ollama arctic embedder");
|
||||
|
||||
// Exercise BOTH prefix paths so a query-side divergence can't hide.
|
||||
let inputs: Vec<EmbeddingInput> = SENTENCES
|
||||
.iter()
|
||||
.flat_map(|s| {
|
||||
[EmbeddingKind::Document, EmbeddingKind::Query]
|
||||
.into_iter()
|
||||
.map(move |kind| EmbeddingInput { text: s, kind })
|
||||
})
|
||||
.collect();
|
||||
|
||||
let cv = candle.embed(&inputs).expect("candle embed");
|
||||
let ov = ollama
|
||||
.embed(&inputs)
|
||||
.expect("ollama embed (is snowflake-arctic-embed2 pulled @ the endpoint?)");
|
||||
|
||||
assert_eq!(cv.len(), ov.len(), "embedding counts must match");
|
||||
assert_eq!(cv.len(), inputs.len(), "one vector per input");
|
||||
assert_eq!(candle.dimensions(), 1024);
|
||||
|
||||
let mut min_cos = f32::INFINITY;
|
||||
for (i, inp) in inputs.iter().enumerate() {
|
||||
assert_eq!(cv[i].len(), 1024, "candle dim");
|
||||
assert_eq!(ov[i].len(), 1024, "ollama dim");
|
||||
let c = cosine(&cv[i], &ov[i]);
|
||||
min_cos = min_cos.min(c);
|
||||
let kind = match inp.kind {
|
||||
EmbeddingKind::Document => "doc",
|
||||
EmbeddingKind::Query => "qry",
|
||||
};
|
||||
let preview: String = inp.text.chars().take(36).collect();
|
||||
println!("[{i:>2}] {kind} cos={c:.6} {preview}");
|
||||
}
|
||||
|
||||
println!("ARCTIC_PARITY_SUMMARY cosine_min={min_cos:.6} endpoint={endpoint}");
|
||||
assert!(
|
||||
min_cos > 0.99,
|
||||
"candle arctic vs Ollama arctic cosine_min={min_cos:.6} ≤ 0.99 — \
|
||||
pooling/prefix mismatch; the recall=130 measurement will NOT reproduce"
|
||||
);
|
||||
}
|
||||
30
crates/kebab-embed-ollama/Cargo.toml
Normal file
30
crates/kebab-embed-ollama/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kebab-embed-ollama"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Ollama HTTP adapter implementing kebab_core::Embedder (POST /api/embed, L2-normalized, batched + fail-soft)"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
# `default-features = false` drops native-tls (system OpenSSL); we pin rustls.
|
||||
# reqwest 0.12's `blocking` feature wraps a private current-thread tokio
|
||||
# runtime — this crate exposes NO async surface (no `async`/`await`/`tokio::*`
|
||||
# symbols), matching the kebab-llm-local invariant.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
# wiremock hosts the mock /api/embed server (needs a tokio runtime); tokio is
|
||||
# also pulled transitively at runtime by reqwest's `blocking` feature.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["macros", "rt"] }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
310
crates/kebab-embed-ollama/src/lib.rs
Normal file
310
crates/kebab-embed-ollama/src/lib.rs
Normal file
@@ -0,0 +1,310 @@
|
||||
//! `kebab-embed-ollama` — [`OllamaEmbedder`], a `reqwest::blocking` adapter
|
||||
//! implementing [`Embedder`](kebab_core::Embedder) over Ollama's
|
||||
//! `POST /api/embed` endpoint.
|
||||
//!
|
||||
//! ## Why this exists
|
||||
//!
|
||||
//! The candle backend ([`kebab-embed-candle`]) runs arctic-embed-l-v2.0
|
||||
//! in-process (pure Rust, NUMA-safe). This crate is the **fallback** path:
|
||||
//! it offloads embedding to a local/remote Ollama daemon (`snowflake-arctic-embed2`),
|
||||
//! which is exactly the route the recall measurements used — so it reproduces
|
||||
//! the measured numbers (recall@10 130/132) byte-for-route. Opt-in via
|
||||
//! `config.models.embedding.provider = "ollama"`.
|
||||
//!
|
||||
//! ## Wire shape
|
||||
//!
|
||||
//! Request (`POST {endpoint}/api/embed`):
|
||||
//!
|
||||
//! ```json
|
||||
//! { "model": "snowflake-arctic-embed2", "input": ["query: 스택", "후입선출 ..."] }
|
||||
//! ```
|
||||
//!
|
||||
//! Response:
|
||||
//!
|
||||
//! ```json
|
||||
//! { "model": "...", "embeddings": [[0.01, ...], [0.02, ...]] }
|
||||
//! ```
|
||||
//!
|
||||
//! ## Pipeline
|
||||
//!
|
||||
//! 1. instruction prefix per model ([`prefixes_for`] — arctic: `query: ` on
|
||||
//! queries, no prefix on documents; e5: `query: `/`passage: `);
|
||||
//! 2. batch into `BATCH` (48) inputs per request;
|
||||
//! 3. `POST /api/embed`, with fail-soft retry (`MAX_RETRIES`);
|
||||
//! 4. **L2 normalize** each returned vector — Ollama returns raw (un-normalized)
|
||||
//! embeddings, so we normalize for cosine consistency with the candle path;
|
||||
//! 5. dim check against `config.models.embedding.dimensions`.
|
||||
//!
|
||||
//! ## Send-safety
|
||||
//!
|
||||
//! `reqwest::blocking::Client: Send + Sync`; the adapter holds only the client,
|
||||
//! an endpoint string, and small config scalars, so it is trivially `Send + Sync`
|
||||
//! as the [`Embedder`] trait requires.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Inputs per `/api/embed` request. Ollama handles arbitrary batch sizes, but
|
||||
/// a cap keeps a single HTTP body bounded and lets a partial failure retry a
|
||||
/// smaller unit.
|
||||
const BATCH: usize = 48;
|
||||
|
||||
/// Fail-soft retry attempts per batch before the error propagates. Cold model
|
||||
/// load on the Ollama side can transiently 500/timeout; a couple of retries
|
||||
/// smooth that over without masking a hard misconfiguration.
|
||||
const MAX_RETRIES: u32 = 3;
|
||||
|
||||
/// Default per-request HTTP timeout (seconds). Cold-loading an embedding model
|
||||
/// on first call can take tens of seconds; this matches the generous default
|
||||
/// used by the LLM adapter.
|
||||
const REQUEST_TIMEOUT_SECS: u64 = 300;
|
||||
|
||||
/// Resolve the (query_prefix, doc_prefix) for an Ollama embedding model tag.
|
||||
///
|
||||
/// Mirrors `kebab-embed-candle`'s `MODEL_REGISTRY`, but keyed on the **Ollama
|
||||
/// model tag** (which differs from the HF id — e.g. `snowflake-arctic-embed2`
|
||||
/// vs `Snowflake/snowflake-arctic-embed-l-v2.0`). Kept here rather than shared
|
||||
/// so this crate does not depend on the candle backend.
|
||||
///
|
||||
/// An unrecognized model gets no prefix (`("", "")`): many embedding models
|
||||
/// are not instruction-tuned, so embedding the raw text is the correct default
|
||||
/// — and a misspelled known model surfaces as a recall regression, not a silent
|
||||
/// wrong-prefix, because the dim check still passes either way.
|
||||
fn prefixes_for(model: &str) -> (&'static str, &'static str) {
|
||||
let m = model.to_ascii_lowercase();
|
||||
if m.contains("arctic-embed") {
|
||||
// arctic-embed v2.0: `query: ` on queries, documents embedded raw.
|
||||
("query: ", "")
|
||||
} else if m.contains("e5") {
|
||||
// multilingual-e5: `query: ` / `passage: `.
|
||||
("query: ", "passage: ")
|
||||
} else {
|
||||
("", "")
|
||||
}
|
||||
}
|
||||
|
||||
/// `reqwest::blocking` adapter implementing [`Embedder`] over Ollama's
|
||||
/// `/api/embed`. Construction is offline; the first network call happens in
|
||||
/// [`Embedder::embed`].
|
||||
pub struct OllamaEmbedder {
|
||||
client: reqwest::blocking::Client,
|
||||
/// Validated endpoint base (e.g. `"http://127.0.0.1:11434"`).
|
||||
endpoint: String,
|
||||
/// Ollama model tag (e.g. `"snowflake-arctic-embed2"`).
|
||||
model: String,
|
||||
query_prefix: &'static str,
|
||||
doc_prefix: &'static str,
|
||||
model_id: EmbeddingModelId,
|
||||
version: EmbeddingVersion,
|
||||
dimensions: usize,
|
||||
}
|
||||
|
||||
impl OllamaEmbedder {
|
||||
/// Build from a workspace [`kebab_config::Config`]. Reads
|
||||
/// `config.models.embedding.{model, dimensions}` and resolves the endpoint
|
||||
/// as `models.embedding.endpoint` → fallback `models.llm.endpoint`.
|
||||
///
|
||||
/// Does NOT touch the network. The caller (app layer) is expected to have
|
||||
/// validated `provider == "ollama"`.
|
||||
pub fn new(config: &kebab_config::Config) -> Result<Self> {
|
||||
let emb = &config.models.embedding;
|
||||
let endpoint = emb
|
||||
.endpoint
|
||||
.clone()
|
||||
.filter(|e| !e.is_empty())
|
||||
.unwrap_or_else(|| config.models.llm.endpoint.clone());
|
||||
if endpoint.is_empty() {
|
||||
anyhow::bail!(
|
||||
"ollama embedding provider needs an endpoint: set \
|
||||
`models.embedding.endpoint` (or `models.llm.endpoint`)"
|
||||
);
|
||||
}
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
|
||||
.build()
|
||||
.context("kb-embed-ollama: build reqwest client")?;
|
||||
let (query_prefix, doc_prefix) = prefixes_for(&emb.model);
|
||||
Ok(Self {
|
||||
client,
|
||||
endpoint,
|
||||
model: emb.model.clone(),
|
||||
query_prefix,
|
||||
doc_prefix,
|
||||
model_id: EmbeddingModelId(emb.model.clone()),
|
||||
// model_version = `ollama:{model}` so a provider/model switch
|
||||
// triggers the embedding_version cascade and never collides with
|
||||
// the candle path's version string for the same model.
|
||||
version: EmbeddingVersion(format!("ollama:{}", emb.model)),
|
||||
dimensions: emb.dimensions,
|
||||
})
|
||||
}
|
||||
|
||||
/// Embed one already-prefixed batch via `/api/embed`, with fail-soft retry.
|
||||
fn embed_batch(&self, prefixed: &[String]) -> Result<Vec<Vec<f32>>> {
|
||||
let url = format!("{}/api/embed", self.endpoint.trim_end_matches('/'));
|
||||
let body = EmbedRequest {
|
||||
model: &self.model,
|
||||
input: prefixed,
|
||||
};
|
||||
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
for attempt in 1..=MAX_RETRIES {
|
||||
match self.try_once(&url, &body) {
|
||||
Ok(resp) => return self.finalize(resp, prefixed.len()),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-embed-ollama",
|
||||
attempt,
|
||||
max = MAX_RETRIES,
|
||||
error = %e,
|
||||
"ollama /api/embed attempt failed; retrying"
|
||||
);
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(last_err.unwrap_or_else(|| {
|
||||
anyhow::anyhow!("kb-embed-ollama: all {MAX_RETRIES} attempts failed")
|
||||
}))
|
||||
}
|
||||
|
||||
/// One HTTP round-trip. Network / non-2xx / decode errors all map to
|
||||
/// `Err` so the retry loop can decide.
|
||||
fn try_once(&self, url: &str, body: &EmbedRequest<'_>) -> Result<EmbedResponse> {
|
||||
let resp = self
|
||||
.client
|
||||
.post(url)
|
||||
.json(body)
|
||||
.send()
|
||||
.with_context(|| format!("kb-embed-ollama: POST {url}"))?;
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let text = resp.text().unwrap_or_default();
|
||||
anyhow::bail!("kb-embed-ollama: /api/embed returned {status}: {text}");
|
||||
}
|
||||
resp.json::<EmbedResponse>()
|
||||
.context("kb-embed-ollama: decode /api/embed response")
|
||||
}
|
||||
|
||||
/// Validate count + dim, then L2-normalize each vector.
|
||||
fn finalize(&self, resp: EmbedResponse, expected: usize) -> Result<Vec<Vec<f32>>> {
|
||||
if resp.embeddings.len() != expected {
|
||||
anyhow::bail!(
|
||||
"kb-embed-ollama: expected {expected} embeddings, got {}",
|
||||
resp.embeddings.len()
|
||||
);
|
||||
}
|
||||
let mut out = Vec::with_capacity(resp.embeddings.len());
|
||||
for v in resp.embeddings {
|
||||
if v.len() != self.dimensions {
|
||||
anyhow::bail!(
|
||||
"kb-embed-ollama: model returned dim {} but config expects {} \
|
||||
(check models.embedding.dimensions vs the Ollama model)",
|
||||
v.len(),
|
||||
self.dimensions
|
||||
);
|
||||
}
|
||||
out.push(l2_normalize(v));
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
impl Embedder for OllamaEmbedder {
|
||||
fn model_id(&self) -> EmbeddingModelId {
|
||||
self.model_id.clone()
|
||||
}
|
||||
|
||||
fn model_version(&self) -> EmbeddingVersion {
|
||||
self.version.clone()
|
||||
}
|
||||
|
||||
fn dimensions(&self) -> usize {
|
||||
self.dimensions
|
||||
}
|
||||
|
||||
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> Result<Vec<Vec<f32>>> {
|
||||
if inputs.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let prefixed: Vec<String> = inputs.iter().map(|i| self.prefix(i)).collect();
|
||||
let mut out = Vec::with_capacity(prefixed.len());
|
||||
for chunk in prefixed.chunks(BATCH) {
|
||||
out.extend(self.embed_batch(chunk)?);
|
||||
}
|
||||
debug_assert_eq!(out.len(), inputs.len());
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
impl OllamaEmbedder {
|
||||
/// Prefix one input per the resolved model prefixes.
|
||||
fn prefix(&self, input: &EmbeddingInput<'_>) -> String {
|
||||
match input.kind {
|
||||
EmbeddingKind::Document => format!("{}{}", self.doc_prefix, input.text),
|
||||
EmbeddingKind::Query => format!("{}{}", self.query_prefix, input.text),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// L2-normalize a vector in place-ish (consumes + returns). A zero vector is
|
||||
/// returned unchanged (norm 0 → no division) so a degenerate embedding can
|
||||
/// never produce NaNs.
|
||||
fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
|
||||
let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm > 0.0 {
|
||||
for x in &mut v {
|
||||
*x /= norm;
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
// ── Wire types ──────────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct EmbedRequest<'a> {
|
||||
model: &'a str,
|
||||
input: &'a [String],
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct EmbedResponse {
|
||||
embeddings: Vec<Vec<f32>>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn prefixes_for_arctic_is_query_only() {
|
||||
assert_eq!(prefixes_for("snowflake-arctic-embed2"), ("query: ", ""));
|
||||
assert_eq!(prefixes_for("snowflake-arctic-embed2:latest"), ("query: ", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefixes_for_e5_is_query_passage() {
|
||||
assert_eq!(prefixes_for("multilingual-e5-large"), ("query: ", "passage: "));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefixes_for_unknown_is_bare() {
|
||||
assert_eq!(prefixes_for("nomic-embed-text"), ("", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn l2_normalize_unit_length() {
|
||||
let v = l2_normalize(vec![3.0, 4.0]);
|
||||
let norm = (v[0] * v[0] + v[1] * v[1]).sqrt();
|
||||
assert!((norm - 1.0).abs() < 1e-6, "norm = {norm}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn l2_normalize_zero_vector_is_unchanged() {
|
||||
assert_eq!(l2_normalize(vec![0.0, 0.0, 0.0]), vec![0.0, 0.0, 0.0]);
|
||||
}
|
||||
}
|
||||
99
crates/kebab-embed-ollama/tests/embed_mock.rs
Normal file
99
crates/kebab-embed-ollama/tests/embed_mock.rs
Normal file
@@ -0,0 +1,99 @@
|
||||
//! `/api/embed` behavior against a `wiremock`-hosted mock server.
|
||||
//!
|
||||
//! `wiremock` is async, so the tests are `#[tokio::test]`; the sync
|
||||
//! [`OllamaEmbedder`] is driven from `spawn_blocking` to keep `reqwest::blocking`
|
||||
//! off the async runtime (same pattern as `kebab-llm-local`'s streaming tests).
|
||||
//! tokio is a `dev-dependency` only.
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind};
|
||||
use kebab_embed_ollama::OllamaEmbedder;
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
/// Config pointing at the mock server, with a small dim so the mock body is
|
||||
/// tiny. `model` is an arctic tag so prefix resolution is exercised.
|
||||
fn cfg_for(endpoint: &str, dim: usize) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.models.embedding.provider = "ollama".to_string();
|
||||
cfg.models.embedding.model = "snowflake-arctic-embed2".to_string();
|
||||
cfg.models.embedding.dimensions = dim;
|
||||
cfg.models.embedding.endpoint = Some(endpoint.to_string());
|
||||
cfg
|
||||
}
|
||||
|
||||
async fn embed_blocking(
|
||||
cfg: Config,
|
||||
inputs: Vec<(String, EmbeddingKind)>,
|
||||
) -> anyhow::Result<Vec<Vec<f32>>> {
|
||||
tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Vec<f32>>> {
|
||||
let emb = OllamaEmbedder::new(&cfg)?;
|
||||
let refs: Vec<EmbeddingInput<'_>> = inputs
|
||||
.iter()
|
||||
.map(|(t, k)| EmbeddingInput { text: t, kind: *k })
|
||||
.collect();
|
||||
emb.embed(&refs)
|
||||
})
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn embed_returns_l2_normalized_vectors() {
|
||||
let server = MockServer::start().await;
|
||||
// Two raw (un-normalized) vectors of dim 2; the adapter must L2-normalize.
|
||||
let body = r#"{"model":"snowflake-arctic-embed2","embeddings":[[3.0,4.0],[0.0,5.0]]}"#;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/embed"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_string(body))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let out = embed_blocking(
|
||||
cfg_for(&server.uri(), 2),
|
||||
vec![
|
||||
("스택 자료구조".to_string(), EmbeddingKind::Query),
|
||||
("후입선출".to_string(), EmbeddingKind::Document),
|
||||
],
|
||||
)
|
||||
.await
|
||||
.expect("embed should succeed");
|
||||
|
||||
assert_eq!(out.len(), 2);
|
||||
for v in &out {
|
||||
let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
assert!((norm - 1.0).abs() < 1e-5, "expected unit norm, got {norm}");
|
||||
}
|
||||
// [3,4] → [0.6, 0.8].
|
||||
assert!((out[0][0] - 0.6).abs() < 1e-5 && (out[0][1] - 0.8).abs() < 1e-5);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn embed_rejects_dim_mismatch() {
|
||||
let server = MockServer::start().await;
|
||||
// Server returns dim 3, config expects dim 2 → hard error.
|
||||
let body = r#"{"model":"snowflake-arctic-embed2","embeddings":[[1.0,2.0,3.0]]}"#;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/embed"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_string(body))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let err = embed_blocking(
|
||||
cfg_for(&server.uri(), 2),
|
||||
vec![("q".to_string(), EmbeddingKind::Query)],
|
||||
)
|
||||
.await
|
||||
.expect_err("dim mismatch must error");
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("dim"), "expected dim error, got: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn embed_empty_input_is_noop() {
|
||||
// No mock needed — empty input must never hit the network.
|
||||
let out = embed_blocking(cfg_for("http://127.0.0.1:1", 2), vec![])
|
||||
.await
|
||||
.expect("empty embed should be Ok(empty)");
|
||||
assert!(out.is_empty());
|
||||
}
|
||||
@@ -16,7 +16,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab-
|
||||
| metadata | SQLite + FTS5 (lexical search + v0.20.1 한국어 형태소 tokenizer via lindera-ko-dic) |
|
||||
| vector | LanceDB (embedded, model 별 분리 table) |
|
||||
| Markdown parser | `pulldown-cmark`. frontmatter 에 title 없으면 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (`parser_version = md-frontmatter-v2`, 기존 doc 도 다음 ingest 에서 갱신) |
|
||||
| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드) |
|
||||
| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드). opt-in 대안: candle (e5 또는 `snowflake-arctic-embed-l-v2.0`) / Ollama `/api/embed`. arctic = 설명형 query recall 보강 (v0.26.0, 아래 결정표) |
|
||||
| 한국어 형태소분석 | `lindera-ko-dic` (FTS5 외부 tokenizer, v0.20.1) — 2자 이상 한국어 query 지원 |
|
||||
| LLM | Ollama HTTP (default `gemma4:e4b` ─ OCR / caption 와 family 통일. 사용자가 더 큰 variant `gemma4:26b` 등으로 override 가능) |
|
||||
| 음성 ASR | `whisper.cpp` (via `whisper-rs`) — P8 보류, 시스템 dep brainstorm 후 |
|
||||
@@ -67,7 +67,8 @@ flowchart TB
|
||||
subgraph Adapters ["traits + adapters"]
|
||||
embed["kebab-embed<br/>(trait)"]
|
||||
embedlocal["kebab-embed-local<br/>(fastembed, default)"]
|
||||
embedcandle["kebab-embed-candle<br/>(candle, NUMA-safe opt-in)"]
|
||||
embedcandle["kebab-embed-candle<br/>(candle, e5+arctic, NUMA-safe opt-in)"]
|
||||
embedollama["kebab-embed-ollama<br/>(Ollama /api/embed, opt-in)"]
|
||||
llm["kebab-llm<br/>(trait)"]
|
||||
llmlocal["kebab-llm-local<br/>(Ollama)"]
|
||||
search["kebab-search"]
|
||||
@@ -94,6 +95,7 @@ flowchart TB
|
||||
app --> vector
|
||||
app --> embedlocal
|
||||
app --> embedcandle
|
||||
app --> embedollama
|
||||
app --> llmlocal
|
||||
app --> search
|
||||
app --> rag
|
||||
@@ -108,6 +110,8 @@ flowchart TB
|
||||
embedlocal --> embed
|
||||
embedcandle --> core
|
||||
embedcandle --> config
|
||||
embedollama --> core
|
||||
embedollama --> config
|
||||
llmlocal --> llm
|
||||
rag --> search
|
||||
rag --> llm
|
||||
@@ -136,6 +140,23 @@ UI → store/llm/parse 직접 의존 금지. 모든 user-facing 진입은 `kebab
|
||||
|
||||
`kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가, P10-1C-Go 에서 `tree-sitter-go` 추가, P10-1C-JK 에서 `tree-sitter-java` / `tree-sitter-kotlin-ng` 추가, P10-1D 에서 `tree-sitter-c` / `tree-sitter-cpp` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). Kotlin 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착 — 사용 불가). v0.18.0+ 부터 `kebab-source-fs` 는 자체 `code_meta` 모듈 (lang detect + skip helpers + BUILTIN_BLACKLIST) 을 보유, kebab-parse-code 와 분리 (refactor 2026-05-26). v0.19.0 부터 `kebab-parse-md` 가 `kebab-parse-types` (parser intermediate types) + `kebab-normalize` (CanonicalDocument lift) 두 crate 를 흡수 — 24 → 22 crates, design §3.7b 재작성 (HOTFIXES 2026-05-26). v0.20.1 부터 `kebab-search` 가 `lindera-ko-dic` 를 의존해 한국어 FTS5 형태소 tokenizer 지원 — V009 migration 으로 2자 이상 한국어 query 매칭 (Bug #8 closure).
|
||||
|
||||
### 임베딩 백엔드 결정표 (v0.26.0)
|
||||
|
||||
| provider | 모델 | pooling / prefix | 위치 | 언제 |
|
||||
|---|---|---|---|---|
|
||||
| `fastembed` (기본) | `multilingual-e5-large` | mean / `query:`·`passage:` | in-process (onnxruntime) | 기본. 단일 소켓 호스트 |
|
||||
| `candle` | e5 또는 `snowflake-arctic-embed-l-v2.0` | 모델별 (e5=mean, arctic=CLS) / arctic=`query:`·무접두어 | in-process (pure Rust) | NUMA 서버 (onnxruntime 48-스레드 double-free 회피), Apple Silicon Metal GPU |
|
||||
| `ollama` | `snowflake-arctic-embed2` 등 | 모델 태그로 추론 / arctic=`query:`·무접두어 | 원격 HTTP (`/api/embed`) | candle 폴백, 측정에 쓴 경로 그대로 재현 |
|
||||
|
||||
**arctic-embed-l-v2.0 채택 근거**: 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형
|
||||
query 의 recall 보강책. 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`)에서
|
||||
arctic = recall@10 130/132 (e5 대비 +7, 색인 1회·per-query 0·LLM 0, 용어 무손실).
|
||||
candle 이 주 백엔드(in-process, NUMA 안전), Ollama 가 폴백(측정 경로 재현). 두 경로의
|
||||
pooling/prefix 정확성은 `kebab-embed-candle/tests/arctic_ollama_parity.rs`
|
||||
(candle arctic vs Ollama arctic 코사인>0.99, `#[ignore]`) 로 고정. e5 → arctic 전환은
|
||||
`embedding_version` cascade (모델별 벡터 상이) → 재색인 필요. 기본값 e5 유지라 기존
|
||||
사용자 무영향. 자세한 내용: [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 2026-06-03 arctic entry.
|
||||
|
||||
## 디렉토리 구조
|
||||
|
||||
```text
|
||||
@@ -184,7 +205,8 @@ kebab/
|
||||
│ ├── kebab-store-sqlite/ # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3). src/derivation_cache.rs = derivation_cache 테이블 저장소 (V012, v0.21.0)
|
||||
│ ├── kebab-search/ # Lexical + Vector + Hybrid retriever (P2-2, P3-4)
|
||||
│ ├── kebab-embed/ kebab-embed-local/ # Embedder trait + fastembed adapter (P3-1, P3-2)
|
||||
│ ├── kebab-embed-candle/ # candle (pure-Rust) Embedder, NUMA-safe opt-in provider=candle (Track 1, v0.22.0)
|
||||
│ ├── kebab-embed-candle/ # candle (pure-Rust) Embedder, 모델 레지스트리(e5 mean + arctic CLS), NUMA-safe opt-in provider=candle (Track 1, v0.22.0; arctic v0.26.0)
|
||||
│ ├── kebab-embed-ollama/ # Ollama /api/embed Embedder, opt-in provider=ollama (arctic 폴백 경로, v0.26.0)
|
||||
│ ├── kebab-store-vector/ # LanceDB VectorStore (P3-3, P7-3 follow-up)
|
||||
│ ├── kebab-llm/ kebab-llm-local/ # LanguageModel trait + Ollama adapter (P4-1, P4-2)
|
||||
│ ├── kebab-rag/ # RAG pipeline (P4-3)
|
||||
|
||||
@@ -107,16 +107,18 @@ respect_markdown_headings = true
|
||||
chunker_version = "md-heading-v1"
|
||||
|
||||
[models.embedding]
|
||||
provider = "fastembed" # "fastembed"(기본) / "candle"(순수 Rust, NUMA-안전)
|
||||
# / "none"(lexical-only — Ollama 불필요)
|
||||
# ⚠ provider="candle" 사용 시 아래 model/dimensions 도
|
||||
# multilingual-e5-large / 1024 로 바꿔야 함
|
||||
# (candle 은 현재 e5-large 만 지원).
|
||||
model = "multilingual-e5-small"
|
||||
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust, NUMA-안전)
|
||||
# / "ollama"(원격 HTTP /api/embed) / "none"(lexical-only — Ollama 불필요)
|
||||
# ⚠ provider/model 변경 시 아래 dimensions 도 맞춰야 함.
|
||||
model = "multilingual-e5-small" # candle/ollama 는 "snowflake-arctic-embed-l-v2.0"
|
||||
# (ollama 태그 "snowflake-arctic-embed2", 1024-dim) 도 지원 —
|
||||
# 설명형 query recall 보강. e5↔arctic 전환은
|
||||
# embedding_version cascade (재색인 필요).
|
||||
version = "v1"
|
||||
dimensions = 384
|
||||
dimensions = 384 # arctic / e5-large 는 1024.
|
||||
batch_size = 64
|
||||
num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto). env KEBAB_EMBED_THREADS 우선.
|
||||
# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용; 생략 시 [models.llm].endpoint fallback.
|
||||
|
||||
[models.llm]
|
||||
provider = "ollama"
|
||||
|
||||
@@ -14,6 +14,56 @@ historical contract that was implemented; this file accumulates the
|
||||
deltas so phase 5+ readers can find the live behavior without diffing
|
||||
git history.
|
||||
|
||||
## 2026-06-03 — arctic-embed-l-v2.0 임베더 통합 (candle + Ollama) (v0.26.0)
|
||||
|
||||
**무엇을 왜 추가했나.** 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형 query 의
|
||||
recall 보강책으로 `snowflake-arctic-embed-l-v2.0` 임베더를 두 백엔드로 통합했다.
|
||||
근거는 방법별 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`):
|
||||
arctic = recall@10 **130/132**, recall@50 **132/132**, **용어 무손실**(syn/abbr/en
|
||||
유지). e5-large 대비 +7, 색인 1회·per-query 0·LLM 0 = 살아있는 KB 에 지속 가능.
|
||||
별칭이 청크당 색인-시 LLM(나무위키 18문서 cold 2.5h)을 요구한 것과 대조.
|
||||
|
||||
**무엇을 건드렸나.**
|
||||
- `kebab-embed-candle`: e5 하드코딩(`HF_MODEL`/`SUPPORTED_MODEL`/mean/`query:`+`passage:`)을
|
||||
**모델 레지스트리**(`MODEL_REGISTRY`: `EmbedModelSpec { name, hf_repo, pooling, query_prefix, doc_prefix, dim, version_tag }`)로
|
||||
일반화. e5(mean, `query:`/`passage:`) + arctic(**CLS**, `query:`/무접두어). pooling
|
||||
은 모델별 분기(mean=attention-mask-weighted / CLS=`hidden[:,0,:]`), tokenize/forward/L2
|
||||
공유. arctic pooling=CLS 는 HF `1_Pooling/config.json`(`pooling_mode_cls_token:true`)로
|
||||
확인. `model_version` 은 arctic 일 때 `+arctic-cls` 태그(switch 시 embedding_version
|
||||
cascade 트리거); e5 는 fastembed-e5 와의 호환(NUMA 드롭인) 위해 plain `config.version` 유지.
|
||||
- `kebab-embed-ollama` (신규 크레이트): `Embedder` 구현, `reqwest::blocking` POST
|
||||
`/api/embed` `{model, input:[...]}` → `embeddings`. batch 48 + fail-soft 재시도 3,
|
||||
결과 **L2 정규화**(Ollama raw 반환), dim 검증, query/doc prefix 모델 태그로 추론
|
||||
(`arctic-embed`→`query:`/무접두어, `e5`→`query:`/`passage:`). `model_version=ollama:{model}`.
|
||||
endpoint = `models.embedding.endpoint` ?? `models.llm.endpoint`.
|
||||
- `kebab-config`: `EmbeddingModelCfg.endpoint: Option<String>`(serde default, ollama용) +
|
||||
`provider` 문서에 `ollama` 추가 + env `KEBAB_MODELS_EMBEDDING_ENDPOINT`.
|
||||
- `kebab-app::embedder()`: provider match 에 `ollama` 분기 추가(facade 경유).
|
||||
- workspace member += `kebab-embed-ollama`, version 0.25.0 → **0.26.0**(minor).
|
||||
|
||||
**correctness 게이트.** candle arctic 임베딩이 측정에 쓴 Ollama `snowflake-arctic-embed2`
|
||||
임베딩과 일치해야 pooling/prefix 정확성(=recall 130 재현)이 보장된다. 검증:
|
||||
`kebab-embed-candle/tests/arctic_ollama_parity.rs`(`#[ignore]`, live Ollama 의존) 가
|
||||
candle arctic vs 우리 Ollama 어댑터로 같은 문장(설명형/약어/영문 포함, doc+query
|
||||
양 경로)을 임베딩해 per-sentence **코사인 > 0.99** 를 assert. 수동 실행 결과(코사인값)는
|
||||
릴리스 전 본 entry 에 기록.
|
||||
|
||||
**수동 검증 결과** (2026-06-03 worker 실측, Ollama @192.168.0.47:11434
|
||||
`snowflake-arctic-embed2`): 8문장 × (doc+query) 16벡터 per-sentence 코사인
|
||||
**0.999984 ~ 0.999995**, `cosine_min = 0.999984` (게이트 0.99 대비 대폭 상회).
|
||||
설명형("후입선출 방식으로 동작하는 자료구조")·약어("SVM 은 support vector machine")·
|
||||
영문·한글 모두 일치. → candle arctic 의 CLS pooling + `query: ` prefix 가 Ollama 측정
|
||||
경로와 정확히 동일 = recall@10 130 재현 보장. Ollama raw 도 이미 L2-정규화(norm 1.0)라
|
||||
어댑터의 L2 정규화는 idempotent no-op. 로그: `/build/dogfood/logs/arctic-parity.log`,
|
||||
요약: `/tmp/arctic-result.md`.
|
||||
|
||||
**호환성.** 기본 provider=fastembed e5 동작/벡터 불변(arctic 은 opt-in). dim 1024
|
||||
동일이나 LanceDB 테이블명에 모델명 포함(`chunk_embeddings_{model}_{dim}`)이라 충돌
|
||||
없음. e5 → arctic 전환 = `embedding_version` cascade(모델별 벡터 상이) → **재색인 필요**
|
||||
(기존 e5 KB 와 혼용 불가, 명확). A(heading enrichment)는 측정상 arctic 에서 악화 →
|
||||
미적용. spec: `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`, plan: 동일
|
||||
디렉토리 `2026-06-03-arctic-embedder-plan.md`.
|
||||
|
||||
## 2026-06-03 — doc-side expansion(별칭) 기능 완전 제거 (v0.25.0)
|
||||
|
||||
**무엇을 왜 제거했나.** v0.21.0 (PR #195/#196) 에서 도입한 색인-시 청크당 LLM
|
||||
|
||||
Reference in New Issue
Block a user