Merge pull request 'feat(embed): arctic-embed-l-v2.0 임베더(candle+ollama)' (#203) from feat/arctic-embedder into main

Reviewed-on: #203
2026-06-03 06:27:55 +00:00
parent e2ae9a4589 095c9f37a2
commit d71ed2516b
16 changed files with 1018 additions and 130 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4724,7 +4724,7 @@ dependencies = [

 [[package]]
 name = "kebab-app"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "base64 0.22.1",
@@ -4739,6 +4739,7 @@ dependencies = [
 "kebab-embed",
 "kebab-embed-candle",
 "kebab-embed-local",
+ "kebab-embed-ollama",
 "kebab-llm",
 "kebab-llm-local",
 "kebab-nli",
@@ -4771,7 +4772,7 @@ dependencies = [

 [[package]]
 name = "kebab-chunk"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4789,7 +4790,7 @@ dependencies = [

 [[package]]
 name = "kebab-cli"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "clap",
@@ -4810,7 +4811,7 @@ dependencies = [

 [[package]]
 name = "kebab-config"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "dirs 5.0.1",
@@ -4826,7 +4827,7 @@ dependencies = [

 [[package]]
 name = "kebab-core"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4840,7 +4841,7 @@ dependencies = [

 [[package]]
 name = "kebab-embed"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -4854,7 +4855,7 @@ dependencies = [

 [[package]]
 name = "kebab-embed-candle"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "candle-core",
@@ -4864,6 +4865,7 @@ dependencies = [
 "kebab-config",
 "kebab-core",
 "kebab-embed-local",
+ "kebab-embed-ollama",
 "rayon",
 "serde_json",
 "tempfile",
@@ -4873,7 +4875,7 @@ dependencies = [

 [[package]]
 name = "kebab-embed-local"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "fastembed",
@@ -4884,9 +4886,24 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kebab-embed-ollama"
+version = "0.26.0"
+dependencies = [
+ "anyhow",
+ "kebab-config",
+ "kebab-core",
+ "reqwest 0.12.28",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tracing",
+ "wiremock",
+]
+
 [[package]]
 name = "kebab-eval"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "kebab-app",
@@ -4905,7 +4922,7 @@ dependencies = [

 [[package]]
 name = "kebab-llm"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "kebab-core",
@@ -4914,7 +4931,7 @@ dependencies = [

 [[package]]
 name = "kebab-llm-local"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "kebab-config",
@@ -4931,7 +4948,7 @@ dependencies = [

 [[package]]
 name = "kebab-mcp"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "kebab-app",
@@ -4949,7 +4966,7 @@ dependencies = [

 [[package]]
 name = "kebab-nli"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "hf-hub",
@@ -4964,7 +4981,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-code"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "gix",
@@ -4987,7 +5004,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-image"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "ab_glyph",
 "anyhow",
@@ -5011,7 +5028,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-md"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "kebab-core",
@@ -5028,7 +5045,7 @@ dependencies = [

 [[package]]
 name = "kebab-parse-pdf"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -5043,7 +5060,7 @@ dependencies = [

 [[package]]
 name = "kebab-rag"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -5065,7 +5082,7 @@ dependencies = [

 [[package]]
 name = "kebab-search"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "globset",
@@ -5084,7 +5101,7 @@ dependencies = [

 [[package]]
 name = "kebab-source-fs"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -5102,7 +5119,7 @@ dependencies = [

 [[package]]
 name = "kebab-store-sqlite"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "blake3",
@@ -5122,7 +5139,7 @@ dependencies = [

 [[package]]
 name = "kebab-store-vector"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "arrow",
@@ -5146,7 +5163,7 @@ dependencies = [

 [[package]]
 name = "kebab-tui"
-version = "0.25.0"
+version = "0.26.0"
 dependencies = [
 "anyhow",
 "crossterm",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ members = [
    "crates/kebab-embed",
    "crates/kebab-embed-local",
    "crates/kebab-embed-candle",
+    "crates/kebab-embed-ollama",
    "crates/kebab-llm",
    "crates/kebab-llm-local",
    "crates/kebab-rag",
@@ -31,7 +32,7 @@ edition       = "2024"
 rust-version  = "1.85"
 license       = "MIT OR Apache-2.0"
 repository    = "https://github.com/altair823/kebab"
-version       = "0.25.0"   # v0.25.0 — doc-side expansion(별칭) 기능 완전 제거: Chunk.aliases / expansion.rs / IngestExpansionCfg / alias lexical arm / expansion_progress wire kind 제거, 신규 마이그레이션 V013 이 chunk_aliases_fts + chunks.aliases DROP. AssetTimings.expansion_ms 는 wire 호환 위해 값 0 유지. 별칭 default-off 였어 사용자 체감 0. — CLAUDE.md §Release
+version       = "0.26.0"   # v0.26.0 — arctic-embed-l-v2.0 임베더 통합: kebab-embed-candle 다중 모델 레지스트리(e5 mean + arctic CLS, 모델별 pooling/prefix 분기) + 신규 kebab-embed-ollama 크레이트(provider="ollama", POST /api/embed, L2 정규화, batch+fail-soft). config models.embedding.provider 에 "ollama" 추가 + endpoint: Option<String>. 기본 동작 불변(provider=fastembed e5), arctic 은 opt-in, embedding_version cascade(arctic-cls / ollama:{model} 태그). — CLAUDE.md §Release

 # pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
 # intentional allow-list. The allowed lints are either cosmetic (doc style),
--- a/HANDOFF.md
+++ b/HANDOFF.md
@@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.

 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:

+- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
 - **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`.
 - **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`.
 - **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. (1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29).
--- a/README.md
+++ b/README.md
@@ -111,18 +111,46 @@ root = "~/KnowledgeBase"   # 색인할 폴더. 절대 / tilde / env / 상대 경

 [models.embedding]
 provider = "fastembed"            # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)
-                                  # / "none"(lexical-only). candle 는 같은 모델·같은 벡터를
-                                  # 순수 Rust 로 돌려 NUMA 서버의 onnxruntime 48-스레드
-                                  # double-free 를 피하는 opt-in 백엔드 (재색인 불필요).
+                                  # / "ollama"(원격 HTTP) / "none"(lexical-only).
+                                  # candle 는 같은 모델·같은 벡터를 순수 Rust 로 돌려
+                                  # NUMA 서버의 onnxruntime 48-스레드 double-free 를 피하는
+                                  # opt-in 백엔드 (e5 는 재색인 불필요).
 model = "multilingual-e5-large"   # 다국어 sentence embedding (1024-dim).
                                  # 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드.
                                  # candle provider 는 safetensors (~2GB) 다운로드.
+                                  # candle/ollama 는 arctic-embed-l-v2.0 (설명형 query 의
+                                  # recall 보강) 도 지원 — 모델명은 provider 별로 다름, 아래 참고.
 dimensions = 1024                 # config 와 LanceDB stored dim 불일치 시 검색 0건.
 num_threads = 0                   # candle 전용 CPU 스레드 캡 (0=auto=#cores).
                                  # env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은
                                  # numactl 과 조합. fastembed provider 는 무시.
+# endpoint = "http://127.0.0.1:11434"  # provider="ollama" 전용 HTTP endpoint.
+                                  # 생략 시 [models.llm].endpoint 로 폴백.
+                                  # fastembed/candle provider 는 무시.
 ```

+**arctic-embed-l-v2.0 (설명형 query recall 보강)**: 기본 e5-large 대신
+Snowflake `arctic-embed-l-v2.0` 임베더를 쓸 수 있다 (1024-dim, opt-in). 측정에서
+설명형/약어/영문 용어 query 의 recall@10 이 e5 대비 향상됐다. 두 경로:
+
+```toml
+# (A) candle 백엔드 — 순수 Rust, in-process (NUMA 안전, Metal GPU 가능):
+[models.embedding]
+provider = "candle"
+model    = "snowflake-arctic-embed-l-v2.0"   # CLS pooling, query 에 "query: " 접두어
+                                             # (문서는 무접두어). safetensors ~2GB 다운로드.
+
+# (B) ollama 백엔드 — 원격/로컬 Ollama 데몬에 위임 (POST /api/embed). (A) 와 택일:
+[models.embedding]
+provider = "ollama"
+model    = "snowflake-arctic-embed2"          # Ollama 모델 태그 (ollama pull 필요)
+endpoint = "http://127.0.0.1:11434"           # 생략 시 [models.llm].endpoint
+```
+
+> ⚠️ e5 → arctic 전환은 `embedding_version` cascade 를 트리거한다 (모델이 다르면
+> 벡터도 다름). 기존 e5 KB 와 혼용 불가 — 전환 시 **재색인** 필요 (`kebab reset`
+> 후 재 ingest). 기본값은 e5 라 기존 사용자는 영향 없음.
+
 **Apple Silicon GPU 가속 (candle / macOS)**: M-시리즈 맥에서 candle 임베딩을
 GPU(Metal)로 돌리면 CPU 대비 대용량 ingest 가 크게 빨라진다. 빌드 또는 설치 시
 `embed_metal` feature 를 켠다:
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -19,6 +19,7 @@ kebab-search = { path = "../kebab-search" }
 kebab-embed = { path = "../kebab-embed" }
 kebab-embed-local = { path = "../kebab-embed-local" }
 kebab-embed-candle = { path = "../kebab-embed-candle" }
+kebab-embed-ollama = { path = "../kebab-embed-ollama" }
 kebab-llm = { path = "../kebab-llm" }
 kebab-llm-local = { path = "../kebab-llm-local" }
 kebab-rag = { path = "../kebab-rag" }
--- a/crates/kebab-app/src/app.rs
+++ b/crates/kebab-app/src/app.rs
@@ -45,6 +45,7 @@ use kebab_core::{
 };
 use kebab_embed_candle::CandleEmbedder;
 use kebab_embed_local::FastembedEmbedder;
+use kebab_embed_ollama::OllamaEmbedder;
 use kebab_llm_local::OllamaLanguageModel;
 use kebab_parse_code::{
    CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
@@ -834,11 +835,13 @@ impl App {
        if let Some(e) = self.embedder.get() {
            return Ok(Some(e.clone()));
        }
-        // Provider branch (Track 1 spec §3). `embeddings_disabled()` above
-        // already handled `"none"`; here we route the live providers.
-        // `fastembed`/`onnx`/(empty) keep the default onnxruntime path
-        // (vectors unchanged — `embedding_version` is preserved); `candle`
-        // selects the pure-Rust NUMA-safe backend.
+        // Provider branch (Track 1 spec §3 + arctic-embedder spec). The
+        // `embeddings_disabled()` check above already handled `"none"`; here we
+        // route the live providers. `fastembed`/`onnx`/(empty) keep the default
+        // onnxruntime path (vectors unchanged — `embedding_version` is
+        // preserved); `candle` selects the pure-Rust NUMA-safe backend (e5 or
+        // arctic via its model registry); `ollama` offloads to a remote
+        // `/api/embed` daemon.
        let provider = self.config.models.embedding.provider.as_str();
        let emb: Arc<dyn Embedder + Send + Sync> = match provider {
            "fastembed" | "onnx" | "" => Arc::new(
@@ -847,10 +850,13 @@ impl App {
            "candle" => Arc::new(
                CandleEmbedder::new(&self.config).context("kb-app: load CandleEmbedder")?,
            ),
+            "ollama" => Arc::new(
+                OllamaEmbedder::new(&self.config).context("kb-app: load OllamaEmbedder")?,
+            ),
            other => {
                return Err(anyhow!(
                    "kb-app: unknown embedding provider {other:?}; expected one of \
-                     `fastembed` (default), `candle`, or `none` (lexical-only)"
+                     `fastembed` (default), `candle`, `ollama`, or `none` (lexical-only)"
                ));
            }
        };
--- a/crates/kebab-config/src/lib.rs
+++ b/crates/kebab-config/src/lib.rs
@@ -155,9 +155,10 @@ impl NliCfg {

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct EmbeddingModelCfg {
-    /// `fastembed` (default, onnxruntime) or `candle` (pure-Rust,
-    /// NUMA-safe). `none` disables embeddings (lexical-only). Unknown
-    /// values error at embedder construction.
+    /// `fastembed` (default, onnxruntime), `candle` (pure-Rust, NUMA-safe),
+    /// or `ollama` (remote HTTP embedding endpoint). `none` disables
+    /// embeddings (lexical-only). Unknown values error at embedder
+    /// construction.
    pub provider: String,
    pub model: String,
    pub version: String,
@@ -170,6 +171,13 @@ pub struct EmbeddingModelCfg {
    /// provider. Defaulted on load so pre-0.22 config files still parse.
    #[serde(default)]
    pub num_threads: u32,
+    /// HTTP endpoint for the `ollama` embedding provider (e.g.
+    /// `"http://127.0.0.1:11434"`). `None` (or a missing key in TOML) means
+    /// "fall back to `models.llm.endpoint`" — same convention as the OCR /
+    /// vision endpoints. Ignored by the `fastembed` / `candle` providers.
+    /// Defaulted on load so pre-0.26 config files still parse.
+    #[serde(default)]
+    pub endpoint: Option<String>,
 }

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -688,6 +696,7 @@ impl Config {
                    dimensions: 1024,
                    batch_size: 64,
                    num_threads: 0,
+                    endpoint: None,
                },
                llm: LlmCfg {
                    provider: "ollama".to_string(),
@@ -950,6 +959,12 @@ impl Config {
                        self.models.embedding.num_threads = n;
                    }
                }
+                "KEBAB_MODELS_EMBEDDING_ENDPOINT" => {
+                    // Empty value → None (= fall back to models.llm.endpoint),
+                    // mirroring the OCR endpoint override semantics.
+                    self.models.embedding.endpoint =
+                        if v.is_empty() { None } else { Some(v.clone()) };
+                }

                // models.llm
                "KEBAB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(),
--- a/crates/kebab-embed-candle/Cargo.toml
+++ b/crates/kebab-embed-candle/Cargo.toml
@@ -38,6 +38,9 @@ metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
 # not the library's own (non-dev) dependencies — so rayon/kebab-config/kebab-core
 # are repeated here for tests/parity.rs and tests/thread_cap.rs.
 kebab-embed-local = { path = "../kebab-embed-local" }
+# arctic↔Ollama parity test drives the real Ollama adapter for the reference
+# vectors (tests/arctic_ollama_parity.rs, `#[ignore]` — live Ollama).
+kebab-embed-ollama = { path = "../kebab-embed-ollama" }
 kebab-config = { path = "../kebab-config" }
 kebab-core = { path = "../kebab-core" }
 rayon = "1"
--- a/crates/kebab-embed-candle/src/lib.rs
+++ b/crates/kebab-embed-candle/src/lib.rs
@@ -1,31 +1,44 @@
 //! `kebab-embed-candle` — [`CandleEmbedder`], a pure-Rust (candle)
 //! implementation of [`Embedder`](kebab_core::Embedder).
 //!
-//! Runs the same `intfloat/multilingual-e5-large` model as the default
-//! [`FastembedEmbedder`](kebab_embed_local) but through `candle`
-//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Motivation:
-//! fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which corrupts
-//! the heap (double-free) on dual-socket NUMA hosts. candle's CPU backend
-//! sizes its threads off the global rayon pool, so a one-shot
-//! [`rayon::ThreadPoolBuilder`] cap (config `num_threads` / env
-//! `KEBAB_EMBED_THREADS`) keeps the worker count NUMA-safe.
+//! Runs an XLM-RoBERTa-large embedding model through `candle`
+//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Two models
+//! are wired through a small **registry** ([`MODEL_REGISTRY`]):
 //!
-//! Output parity with the onnxruntime path was proven by the Phase 0 spike
-//! (cosine 1.000000); this crate absorbs that pipeline verbatim:
+//! * `multilingual-e5-large` — the same weights the default
+//!   [`FastembedEmbedder`](kebab_embed_local) uses (mean pooling,
+//!   `query: `/`passage: ` prefixes). candle is the NUMA-safe drop-in:
+//!   fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which
+//!   corrupts the heap (double-free) on dual-socket NUMA hosts. candle's
+//!   CPU backend sizes its threads off the global rayon pool, so a one-shot
+//!   [`rayon::ThreadPoolBuilder`] cap (config `num_threads` / env
+//!   `KEBAB_EMBED_THREADS`) keeps the worker count NUMA-safe.
+//! * `snowflake-arctic-embed-l-v2.0` — Snowflake's arctic-embed v2.0
+//!   (CLS pooling, `query: ` on queries / no prefix on documents). Same
+//!   XLM-RoBERTa-large architecture, dim 1024, so it rides the exact same
+//!   tokenize → forward → L2 pipeline; only the pooling step and prefixes
+//!   differ (both keyed off the per-model [`EmbedModelSpec`]).
 //!
-//! 1. e5 prefix (`passage: ` for documents, `query: ` for queries — the same
-//!    convention as `kebab-embed-local`'s `prefix_input`);
+//! Output parity with the onnxruntime path (for e5) was proven by the
+//! Phase 0 spike (cosine 1.000000); the arctic path's pooling/prefix
+//! correctness is pinned by an `#[ignore]`d cosine>0.99 cross-check against
+//! Ollama's `snowflake-arctic-embed2` (see `tests/arctic_ollama_parity.rs`).
+//! The shared pipeline:
+//!
+//! 1. instruction prefix per [`EmbedModelSpec`] (query/doc);
 //! 2. tokenize (max_len 512, batch-longest padding, special tokens);
-//! 3. XLM-RoBERTa forward on `Device::Cpu`;
-//! 4. attention-mask-weighted mean pooling;
+//! 3. XLM-RoBERTa forward on the selected [`Device`];
+//! 4. pooling — mean (attention-mask-weighted) or CLS (first token);
 //! 5. L2 normalization.
 //!
 //! Model files (`config.json`, `tokenizer.json`, `model.safetensors`) are
-//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/`.
+//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/` (hf-hub's
+//! cache layout namespaces by repo, so e5 and arctic never collide).
 //!
 //! This crate is **opt-in** (`config.models.embedding.provider = "candle"`);
 //! the default provider stays `fastembed`. See
-//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md`.
+//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md` and
+//! `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.

 use std::sync::Mutex;

@@ -42,22 +55,95 @@ use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer, TruncationParams};
 /// `fastembed/` subdir so the two backends never collide.
 const CANDLE_CACHE_SUBDIR: &str = "candle";

-/// HuggingFace repo id for the multilingual e5 large model. Same weights the
-/// onnxruntime path uses, just the safetensors variant candle can read.
-const HF_MODEL: &str = "intfloat/multilingual-e5-large";
-
-/// The only `config.models.embedding.model` value the candle adapter accepts
-/// (the e5-large weights `HF_MODEL` resolves to). Guards against silently
-/// downloading e5-large while `model_id()` reports a different name.
-const SUPPORTED_MODEL: &str = "multilingual-e5-large";
-
-/// Token truncation length (e5 was trained at 512).
+/// Token truncation length (e5 was trained at 512; arctic-embed-l-v2.0 inputs are capped at the same length).
 const MAX_LEN: usize = 512;

 /// Env var that overrides `config.models.embedding.num_threads`. Read once in
 /// [`CandleEmbedder::new`]; `0`/unset/unparseable means "leave rayon default".
 const ENV_EMBED_THREADS: &str = "KEBAB_EMBED_THREADS";

+/// Pooling strategy over the model's last hidden state. Keyed per-model by
+/// [`EmbedModelSpec::pooling`] — e5 is mean, arctic is CLS.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Pooling {
+    /// Attention-mask-weighted mean over all tokens (e5 / sentence-transformers
+    /// `pooling_mode_mean_tokens`).
+    Mean,
+    /// First token (`<s>`/`[CLS]`) hidden state (arctic-embed v2.0 —
+    /// `1_Pooling/config.json` has `pooling_mode_cls_token: true`).
+    Cls,
+}
+
+/// One supported embedding model: the HF repo candle downloads, the pooling
+/// strategy, and the e5-style instruction prefixes. [`MODEL_REGISTRY`] maps a
+/// `config.models.embedding.model` value to one of these.
+#[derive(Clone, Copy, Debug)]
+pub struct EmbedModelSpec {
+    /// The short `config.models.embedding.model` value that selects this spec.
+    pub name: &'static str,
+    /// HuggingFace repo id candle fetches `config.json` / `tokenizer.json` /
+    /// `model.safetensors` from.
+    pub hf_repo: &'static str,
+    /// Pooling over the last hidden state.
+    pub pooling: Pooling,
+    /// Prefix prepended to **query** inputs before tokenization.
+    pub query_prefix: &'static str,
+    /// Prefix prepended to **document** inputs before tokenization (arctic
+    /// uses `""` — documents are embedded raw).
+    pub doc_prefix: &'static str,
+    /// Expected embedding dimension (model hidden size).
+    pub dim: usize,
+    /// Suffix folded into `model_version` so switching **to** this model
+    /// triggers the `embedding_version` cascade even if the operator forgets
+    /// to bump `config.version`. `None` keeps the bare `config.version` — used
+    /// by e5 so candle-e5 and fastembed-e5 report the *same* version and stay
+    /// interchangeable (the NUMA drop-in invariant — Phase 0 cosine 1.0).
+    pub version_tag: Option<&'static str>,
+}
+
+/// The models the candle adapter can load. Adding a model = one entry here
+/// (plus, for a non-XLM-R architecture, a new forward path — both current
+/// entries are XLM-RoBERTa-large so they share everything but pooling/prefix).
+static MODEL_REGISTRY: &[EmbedModelSpec] = &[
+    EmbedModelSpec {
+        name: "multilingual-e5-large",
+        hf_repo: "intfloat/multilingual-e5-large",
+        pooling: Pooling::Mean,
+        query_prefix: "query: ",
+        doc_prefix: "passage: ",
+        dim: 1024,
+        version_tag: None,
+    },
+    EmbedModelSpec {
+        name: "snowflake-arctic-embed-l-v2.0",
+        hf_repo: "Snowflake/snowflake-arctic-embed-l-v2.0",
+        pooling: Pooling::Cls,
+        query_prefix: "query: ",
+        doc_prefix: "",
+        dim: 1024,
+        version_tag: Some("arctic-cls"),
+    },
+];
+
+/// Look up a model spec by `config.models.embedding.model`. Accepts either the
+/// short `name` or the full `hf_repo` id (mirrors the old e5 guard, which
+/// accepted both `multilingual-e5-large` and `intfloat/multilingual-e5-large`).
+pub(crate) fn lookup_spec(model: &str) -> Option<&'static EmbedModelSpec> {
+    MODEL_REGISTRY
+        .iter()
+        .find(|s| s.name == model || s.hf_repo == model)
+}
+
+/// Supported model names for the unsupported-model error message, joined so
+/// that the message's surrounding backticks quote each name individually.
+fn supported_models() -> String {
+    MODEL_REGISTRY
+        .iter()
+        .map(|s| s.name)
+        .collect::<Vec<_>>()
+        .join("`, `")
+}
+
 /// Pure-Rust candle adapter. Construct via [`CandleEmbedder::new`]; the
 /// constructor downloads the model on first use, so share one instance.
 pub struct CandleEmbedder {
@@ -68,6 +154,9 @@ pub struct CandleEmbedder {
    model: Mutex<XLMRobertaModel>,
    tokenizer: Tokenizer,
    device: Device,
+    /// The resolved model spec (pooling + prefixes) — drives `embed` and
+    /// `embed_batch`.
+    spec: &'static EmbedModelSpec,
    model_id: EmbeddingModelId,
    version: EmbeddingVersion,
    dimensions: usize,
@@ -75,7 +164,8 @@ pub struct CandleEmbedder {
 }

 impl CandleEmbedder {
-    /// Build an embedder from `Config`. Applies the NUMA thread cap, fetches
+    /// Build an embedder from `Config`. Resolves the model spec from
+    /// `config.models.embedding.model`, applies the NUMA thread cap, fetches
    /// the model into `{model_dir}/candle/`, and validates that the model's
    /// hidden size matches `config.models.embedding.dimensions` before
    /// returning.
@@ -104,21 +194,20 @@ impl CandleEmbedder {
            }
        }

-        // 1b. Model guard. `HF_MODEL` is hard-coded (candle currently only wires
-        //     e5-large), so if the operator configured a *different* model name
-        //     we must NOT silently download e5-large and then label its vectors
-        //     with the configured name via `model_id()` — that would mislabel
-        //     `embedding_version` and corrupt a mixed index. Fail fast, before
-        //     the ~2GB download.
+        // 1b. Model registry lookup. If the operator configured a model the
+        //     candle adapter doesn't know, fail fast (BEFORE the ~2GB
+        //     download) — never silently download one model and then label its
+        //     vectors with another name via `model_id()`, which would mislabel
+        //     `embedding_version` and corrupt a mixed index.
        let want = config.models.embedding.model.as_str();
-        if want != SUPPORTED_MODEL && want != HF_MODEL {
-            anyhow::bail!(
-                "candle provider currently supports only '{SUPPORTED_MODEL}' (or \
-                 the HF id '{HF_MODEL}'), but config.models.embedding.model = \
-                 '{want}'. Use provider=fastembed for other models, or set \
-                 model = \"{SUPPORTED_MODEL}\"."
-            );
-        }
+        let spec = lookup_spec(want).ok_or_else(|| {
+            anyhow::anyhow!(
+                "candle provider supports the models `{}`, but \
+                 config.models.embedding.model = '{want}'. Use provider=fastembed \
+                 for other models, or pick a supported one.",
+                supported_models()
+            )
+        })?;

        // 2. Resolve `{data_dir}/models/candle/` exactly like the fastembed
        //    adapter resolves its own subdir.
@@ -134,14 +223,15 @@ impl CandleEmbedder {
        tracing::info!(
            target: "kebab-embed-candle",
            cache_dir = %cache_dir.display(),
-            model = HF_MODEL,
+            model = spec.hf_repo,
+            pooling = ?spec.pooling,
            "loading candle embedding model (first run downloads ~2GB safetensors)"
        );
        let api = hf_hub::api::sync::ApiBuilder::new()
            .with_cache_dir(cache_dir.clone())
            .build()
            .context("kb-embed-candle: build hf-hub api")?;
-        let repo = api.model(HF_MODEL.to_string());
+        let repo = api.model(spec.hf_repo.to_string());
        let config_path = repo.get("config.json").context("download config.json")?;
        let tokenizer_path = repo
            .get("tokenizer.json")
@@ -180,10 +270,21 @@ impl CandleEmbedder {
            }))
            .map_err(|e| anyhow::anyhow!("kb-embed-candle: set truncation: {e}"))?;

+        // model_version: fold the model tag in for non-e5 models so a switch
+        // triggers the embedding_version cascade; e5 keeps the bare
+        // config.version to stay interchangeable with fastembed-e5.
+        let version = match spec.version_tag {
+            Some(tag) => {
+                EmbeddingVersion(format!("{}+{}", config.models.embedding.version, tag))
+            }
+            None => EmbeddingVersion(config.models.embedding.version.clone()),
+        };
+
        tracing::info!(
            target: "kebab-embed-candle",
            dimensions = cfg.hidden_size,
            layers = cfg.num_hidden_layers,
+            model = spec.name,
            "candle embedding model loaded"
        );

@@ -191,16 +292,17 @@ impl CandleEmbedder {
            model: Mutex::new(model),
            tokenizer,
            device,
+            spec,
            model_id: EmbeddingModelId(config.models.embedding.model.clone()),
-            version: EmbeddingVersion(config.models.embedding.version.clone()),
+            version,
            dimensions: cfg.hidden_size,
            batch_size: config.models.embedding.batch_size.max(1),
        })
    }

-    /// Embed one batch of **already-prefixed** strings (the e5 `query:`/
-    /// `passage:` prefix is applied by the caller [`CandleEmbedder::embed`])
-    /// through the candle pipeline: tokenize → forward → masked mean pool → L2.
+    /// Embed one batch of **already-prefixed** strings (the per-model prefix
+    /// is applied by the caller [`CandleEmbedder::embed`]) through the candle
+    /// pipeline: tokenize → forward → pool (mean|CLS) → L2.
    fn embed_batch(&self, prefixed: &[String]) -> Result<Vec<Vec<f32>>> {
        let encodings = self
            .tokenizer
@@ -237,18 +339,30 @@ impl CandleEmbedder {
            guard.forward(&input_ids, &attn_f32, &token_type_ids, None, None, None)?
        };

-        // attention-mask-weighted mean pooling
-        let mask3 = attn_f32.unsqueeze(2)?; // (b, seq, 1)
-        let summed = hidden.broadcast_mul(&mask3)?.sum(1)?; // (b, hidden)
-        // counts ≥ 1 always: every input is e5-prefixed AND special tokens are
-        // added (encode_batch(_, true)), so no row has an all-zero mask. If that
-        // invariant ever breaks, broadcast_div would emit NaN vectors.
-        let counts = mask3.sum(1)?; // (b, 1)
-        let mean = summed.broadcast_div(&counts)?;
+        // Pooling — per the model spec.
+        let pooled = match self.spec.pooling {
+            Pooling::Mean => {
+                // attention-mask-weighted mean pooling
+                let mask3 = attn_f32.unsqueeze(2)?; // (b, seq, 1)
+                let summed = hidden.broadcast_mul(&mask3)?.sum(1)?; // (b, hidden)
+                // counts ≥ 1 always: special tokens are added
+                // (encode_batch(_, true)) and e5, the only mean-pooled model,
+                // prefixes every input, so no row has an all-zero mask. If that
+                // invariant ever breaks, broadcast_div would emit NaN vectors.
+                let counts = mask3.sum(1)?; // (b, 1)
+                summed.broadcast_div(&counts)?
+            }
+            Pooling::Cls => {
+                // CLS pooling: the first token's hidden state. arctic-embed
+                // v2.0 prepends `<s>` (the XLM-R BOS/CLS) at index 0, so
+                // `hidden[:, 0, :]` is the sentence embedding.
+                hidden.narrow(1, 0, 1)?.squeeze(1)? // (b, hidden)
+            }
+        };

        // L2 normalize
-        let norm = mean.sqr()?.sum_keepdim(1)?.sqrt()?;
-        let normalized = mean.broadcast_div(&norm)?;
+        let norm = pooled.sqr()?.sum_keepdim(1)?.sqrt()?;
+        let normalized = pooled.broadcast_div(&norm)?;

        // `.contiguous()` before host copy: broadcast ops can leave a strided
        // view, which `to_vec2` rejects on the Metal backend (CPU tolerates it).
@@ -274,9 +388,9 @@ impl Embedder for CandleEmbedder {
            return Ok(Vec::new());
        }

-        // e5 prefix per §11.3 BEFORE tokenization (same convention as
-        // FastembedEmbedder so the two backends produce comparable vectors).
-        let prefixed: Vec<String> = inputs.iter().map(prefix_input).collect();
+        // Per-model instruction prefix BEFORE tokenization (same convention as
+        // FastembedEmbedder for e5; arctic uses `query: `/no-prefix).
+        let prefixed: Vec<String> = inputs.iter().map(|i| prefix_input(self.spec, i)).collect();

        let mut out: Vec<Vec<f32>> = Vec::with_capacity(prefixed.len());
        for chunk in prefixed.chunks(self.batch_size) {
@@ -298,22 +412,22 @@ impl Embedder for CandleEmbedder {
    }
 }

-/// Build the e5-prefixed string for one [`EmbeddingInput`]. Free function so
-/// a unit test can pin the format without loading the model. Byte-identical to
-/// `kebab-embed-local`'s `prefix_input` — the two backends MUST agree here or
-/// their vectors diverge.
-fn prefix_input(input: &EmbeddingInput<'_>) -> String {
+/// Build the prefixed string for one [`EmbeddingInput`] using the model spec.
+/// Free function so a unit test can pin the format without loading the model.
+/// For e5 this is byte-identical to `kebab-embed-local`'s `prefix_input` — the
+/// two backends MUST agree there or their vectors diverge.
+fn prefix_input(spec: &EmbedModelSpec, input: &EmbeddingInput<'_>) -> String {
    match input.kind {
-        EmbeddingKind::Document => format!("passage: {}", input.text),
-        EmbeddingKind::Query => format!("query: {}", input.text),
+        EmbeddingKind::Document => format!("{}{}", spec.doc_prefix, input.text),
+        EmbeddingKind::Query => format!("{}{}", spec.query_prefix, input.text),
    }
 }

 /// Select the compute device. Built with the `metal` feature (Apple Silicon
 /// GPU), try Metal and fall back to CPU on failure; otherwise CPU. Metal only
-/// compiles/runs on macOS — the Linux server builds the CPU path. e5-large
-/// vectors are model-defined, so Metal-produced and CPU-produced embeddings are
-/// cross-compatible (a Mac can ingest on GPU, the server query on CPU).
+/// compiles/runs on macOS — the Linux server builds the CPU path. Embedding
+/// vectors are model-defined, so Metal-produced and CPU-produced embeddings
+/// are cross-compatible (a Mac can ingest on GPU, the server query on CPU).
 fn select_device() -> Device {
    #[cfg(feature = "metal")]
    {
@@ -367,26 +481,85 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
 mod tests {
    use super::*;

-    // ── prefix_input ─────────────────────────────────────────────────
-    // Pin the exact e5 prefix strings; these MUST match
-    // kebab-embed-local::prefix_input or candle vs fastembed parity breaks.
+    fn e5_spec() -> &'static EmbedModelSpec {
+        lookup_spec("multilingual-e5-large").expect("e5 in registry")
+    }
+
+    fn arctic_spec() -> &'static EmbedModelSpec {
+        lookup_spec("snowflake-arctic-embed-l-v2.0").expect("arctic in registry")
+    }
+
+    // ── registry ─────────────────────────────────────────────────────

    #[test]
-    fn prefix_document_uses_passage() {
+    fn registry_resolves_e5_by_name_and_hf_repo() {
+        assert_eq!(
+            lookup_spec("multilingual-e5-large").map(|s| s.name),
+            Some("multilingual-e5-large")
+        );
+        assert_eq!(
+            lookup_spec("intfloat/multilingual-e5-large").map(|s| s.name),
+            Some("multilingual-e5-large")
+        );
+    }
+
+    #[test]
+    fn registry_resolves_arctic_and_its_pooling_is_cls() {
+        let s = arctic_spec();
+        assert_eq!(s.name, "snowflake-arctic-embed-l-v2.0");
+        assert_eq!(s.hf_repo, "Snowflake/snowflake-arctic-embed-l-v2.0");
+        assert_eq!(s.pooling, Pooling::Cls);
+        assert_eq!(s.dim, 1024);
+        assert_eq!(s.version_tag, Some("arctic-cls"));
+    }
+
+    #[test]
+    fn registry_e5_is_mean_pooling_no_version_tag() {
+        let s = e5_spec();
+        assert_eq!(s.pooling, Pooling::Mean);
+        assert_eq!(s.version_tag, None);
+    }
+
+    #[test]
+    fn registry_rejects_unknown_model() {
+        assert!(lookup_spec("multilingual-e5-small").is_none());
+    }
+
+    // ── prefix_input ─────────────────────────────────────────────────
+    // e5 prefixes MUST match kebab-embed-local::prefix_input or candle vs
+    // fastembed parity breaks; arctic uses query-only prefixing.
+
+    #[test]
+    fn e5_prefix_document_uses_passage() {
        let input = EmbeddingInput {
            text: "hello world",
            kind: EmbeddingKind::Document,
        };
-        assert_eq!(prefix_input(&input), "passage: hello world");
+        assert_eq!(prefix_input(e5_spec(), &input), "passage: hello world");
    }

    #[test]
-    fn prefix_query_uses_query() {
+    fn e5_prefix_query_uses_query() {
        let input = EmbeddingInput {
            text: "hello world",
            kind: EmbeddingKind::Query,
        };
-        assert_eq!(prefix_input(&input), "query: hello world");
+        assert_eq!(prefix_input(e5_spec(), &input), "query: hello world");
+    }
+
+    #[test]
+    fn arctic_prefix_query_uses_query_doc_is_bare() {
+        let doc = EmbeddingInput {
+            text: "후입선출 자료구조",
+            kind: EmbeddingKind::Document,
+        };
+        let qry = EmbeddingInput {
+            text: "스택 자료구조",
+            kind: EmbeddingKind::Query,
+        };
+        // arctic: documents are embedded raw, queries get `query: `.
+        assert_eq!(prefix_input(arctic_spec(), &doc), "후입선출 자료구조");
+        assert_eq!(prefix_input(arctic_spec(), &qry), "query: 스택 자료구조");
    }

    #[test]
@@ -399,8 +572,10 @@ mod tests {
            text: "",
            kind: EmbeddingKind::Query,
        };
-        assert_eq!(prefix_input(&doc), "passage: ");
-        assert_eq!(prefix_input(&qry), "query: ");
+        assert_eq!(prefix_input(e5_spec(), &doc), "passage: ");
+        assert_eq!(prefix_input(e5_spec(), &qry), "query: ");
+        assert_eq!(prefix_input(arctic_spec(), &doc), "");
+        assert_eq!(prefix_input(arctic_spec(), &qry), "query: ");
    }

    // ── check_dim ────────────────────────────────────────────────────
@@ -421,9 +596,9 @@ mod tests {
    }

    // ── model guard ──────────────────────────────────────────────────
-    // A non-e5-large model name must fail fast (BEFORE the ~2GB download),
-    // so we never download e5-large yet label its vectors with another name
-    // via model_id() — which would mislabel embedding_version.
+    // A model name not in the registry must fail fast (BEFORE the ~2GB
+    // download), so we never download one model yet label its vectors with
+    // another name via model_id() — which would mislabel embedding_version.

    #[test]
    fn new_rejects_unsupported_model() {
@@ -437,8 +612,8 @@ mod tests {
            .expect("unsupported model must error");
        let msg = format!("{err:#}");
        assert!(
-            msg.contains("candle provider currently supports only"),
-            "expected model-guard error, got: {msg}"
+            msg.contains("candle provider supports the models"),
+            "expected model-registry error, got: {msg}"
        );
    }
 }
--- a/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs
+++ b/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs
@@ -0,0 +1,128 @@
+//! arctic-embed-l-v2.0 correctness gate (`#[ignore]` — needs the ~2GB candle
+//! model + a live Ollama serving `snowflake-arctic-embed2`).
+//!
+//! This is the load-bearing pooling/prefix check for the arctic integration.
+//! The recall measurement that justified adopting arctic (recall@10 130/132)
+//! went through Ollama's `snowflake-arctic-embed2`. The candle path
+//! re-implements the model (XLM-RoBERTa-large + **CLS** pooling + `query: ` on
+//! queries / no prefix on documents). If candle's pooling or prefix is wrong,
+//! its vectors silently diverge from the measured route and the 130 number
+//! does NOT carry over. This test pins them together: per-sentence cosine
+//! between the candle vector and the Ollama vector must be **> 0.99**.
+//!
+//! `#[ignore]` because it depends on an external Ollama daemon (CI is
+//! headless/offline). The leader MUST run it once before merge.
+//!
+//! ## Manual run
+//!
+//! 1. Confirm Ollama is reachable and has the model:
+//!    ```sh
+//!    curl -s http://192.168.0.47:11434/api/tags        # should list snowflake-arctic-embed2
+//!    ```
+//! 2. Run (downloads the ~2GB candle safetensors on first run):
+//!    ```sh
+//!    CARGO_TARGET_DIR=/build/out/cargo-target \
+//!    KEBAB_ARCTIC_OLLAMA_ENDPOINT=http://192.168.0.47:11434 \
+//!    cargo test -p kebab-embed-candle --test arctic_ollama_parity -- --ignored --nocapture
+//!    ```
+//!    The endpoint defaults to `http://192.168.0.47:11434` if the env is unset.
+//!
+//! Record the printed `ARCTIC_PARITY_SUMMARY cosine_min=...` in
+//! `/tmp/arctic-result.md` + `tasks/HOTFIXES.md`.
+
+use kebab_config::Config;
+use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind};
+use kebab_embed_candle::CandleEmbedder;
+use kebab_embed_ollama::OllamaEmbedder;
+
+/// Path of the dogfood config that `base_config` tries to load first; when it
+/// cannot be loaded the test falls back to `Config::defaults()`.
+const DOGFOOD_CONFIG: &str = "/build/dogfood/config.toml";
+/// Ollama endpoint used when the `KEBAB_ARCTIC_OLLAMA_ENDPOINT` environment
+/// variable is unset.
+const DEFAULT_OLLAMA_ENDPOINT: &str = "http://192.168.0.47:11434";
+
+/// Mixed Korean / English + the descriptive-recall shapes arctic was adopted
+/// for (synonym / abbreviation / English term). Covers both prefix paths.
+const SENTENCES: &[&str] = &[
+    "스택 자료구조",
+    "후입선출 방식으로 동작하는 자료구조",
+    "큐는 선입선출 자료구조이다",
+    "Rust ownership and the borrow checker",
+    "소유권과 빌림 검사기는 메모리 안전성을 보장한다",
+    "SVM 은 support vector machine 의 약자이다",
+    "정렬 알고리즘의 시간 복잡도",
+    "The capital of France is Paris.",
+];
+
+/// Cosine similarity `a·b / (‖a‖·‖b‖)` of two vectors.
+///
+/// Elements are paired with `zip`, so the longer slice is truncated to the
+/// shorter one. If either vector has zero norm the division is `0/0` and the
+/// result is NaN.
+fn cosine(a: &[f32], b: &[f32]) -> f32 {
+    let l2 = |v: &[f32]| -> f32 { v.iter().map(|e| e * e).sum::<f32>().sqrt() };
+    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
+    let denom = l2(a) * l2(b);
+    dot / denom
+}
+
+/// Starting config for both embedders. Tries to load the dogfood config at
+/// `DOGFOOD_CONFIG`; any load error (for example the file is absent on a bare
+/// clone) is discarded and `Config::defaults()` is used instead.
+fn base_config() -> Config {
+    let dogfood = std::path::Path::new(DOGFOOD_CONFIG);
+    match Config::load(Some(dogfood)) {
+        Ok(cfg) => cfg,
+        Err(_) => Config::defaults(),
+    }
+}
+
+/// Parity gate: embeds every sentence in `SENTENCES` as both a document and a
+/// query through the candle embedder and through the Ollama embedder, then
+/// requires the smallest per-input cosine similarity to exceed 0.99.
+///
+/// The Ollama endpoint comes from `KEBAB_ARCTIC_OLLAMA_ENDPOINT`, defaulting to
+/// `DEFAULT_OLLAMA_ENDPOINT`.
+#[test]
+#[ignore = "needs ~2GB candle model + live Ollama (snowflake-arctic-embed2); run manually before merge"]
+fn candle_arctic_matches_ollama_arctic() {
+    let endpoint = std::env::var("KEBAB_ARCTIC_OLLAMA_ENDPOINT")
+        .unwrap_or_else(|_| DEFAULT_OLLAMA_ENDPOINT.to_string());
+
+    // candle side: the in-process arctic model.
+    let mut candle_cfg = base_config();
+    candle_cfg.models.embedding.provider = "candle".to_string();
+    candle_cfg.models.embedding.model = "snowflake-arctic-embed-l-v2.0".to_string();
+    candle_cfg.models.embedding.dimensions = 1024;
+
+    // Ollama side: the reference route the recall numbers came from.
+    let mut ollama_cfg = base_config();
+    ollama_cfg.models.embedding.provider = "ollama".to_string();
+    ollama_cfg.models.embedding.model = "snowflake-arctic-embed2".to_string();
+    ollama_cfg.models.embedding.dimensions = 1024;
+    ollama_cfg.models.embedding.endpoint = Some(endpoint.clone());
+
+    let candle = CandleEmbedder::new(&candle_cfg).expect("build candle arctic embedder");
+    let ollama = OllamaEmbedder::new(&ollama_cfg).expect("build ollama arctic embedder");
+
+    // Exercise BOTH prefix paths so a query-side divergence can't hide.
+    let inputs: Vec<EmbeddingInput> = SENTENCES
+        .iter()
+        .flat_map(|s| {
+            [EmbeddingKind::Document, EmbeddingKind::Query]
+                .into_iter()
+                .map(move |kind| EmbeddingInput { text: s, kind })
+        })
+        .collect();
+
+    let cv = candle.embed(&inputs).expect("candle embed");
+    let ov = ollama
+        .embed(&inputs)
+        .expect("ollama embed (is snowflake-arctic-embed2 pulled @ the endpoint?)");
+
+    assert_eq!(cv.len(), ov.len(), "embedding counts must match");
+    assert_eq!(cv.len(), inputs.len(), "one vector per input");
+    assert_eq!(candle.dimensions(), 1024);
+
+    let mut min_cos = f32::INFINITY;
+    for (i, inp) in inputs.iter().enumerate() {
+        assert_eq!(cv[i].len(), 1024, "candle dim");
+        assert_eq!(ov[i].len(), 1024, "ollama dim");
+        let c = cosine(&cv[i], &ov[i]);
+        // `f32::min` returns the non-NaN operand, so a NaN cosine (a zero or
+        // NaN vector from either backend) would silently drop out of
+        // `min_cos` and the gate could pass with `min_cos` still at
+        // `INFINITY`. Reject non-finite similarities explicitly.
+        assert!(
+            c.is_finite(),
+            "non-finite cosine ({c}) for input {i}: a backend returned a zero or NaN vector"
+        );
+        min_cos = min_cos.min(c);
+        let kind = match inp.kind {
+            EmbeddingKind::Document => "doc",
+            EmbeddingKind::Query => "qry",
+        };
+        let preview: String = inp.text.chars().take(36).collect();
+        println!("[{i:>2}] {kind} cos={c:.6}  {preview}");
+    }
+
+    println!("ARCTIC_PARITY_SUMMARY cosine_min={min_cos:.6} endpoint={endpoint}");
+    assert!(
+        min_cos > 0.99,
+        "candle arctic vs Ollama arctic cosine_min={min_cos:.6} ≤ 0.99 — \
+         pooling/prefix mismatch; the recall=130 measurement will NOT reproduce"
+    );
+}
--- a/crates/kebab-embed-ollama/Cargo.toml
+++ b/crates/kebab-embed-ollama/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "kebab-embed-ollama"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Ollama HTTP adapter implementing kebab_core::Embedder (POST /api/embed, L2-normalized, batched + fail-soft)"
+
+[dependencies]
+kebab-core = { path = "../kebab-core" }
+kebab-config = { path = "../kebab-config" }
+# `default-features = false` drops native-tls (system OpenSSL); we pin rustls.
+# reqwest 0.12's `blocking` feature wraps a private current-thread tokio
+# runtime — this crate exposes NO async surface (no `async`/`await`/`tokio::*`
+# symbols), matching the kebab-llm-local invariant.
+reqwest    = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+serde      = { workspace = true, features = ["derive"] }
+serde_json = { workspace = true }
+tracing    = { workspace = true }
+anyhow     = { workspace = true }
+
+[dev-dependencies]
+# wiremock hosts the mock /api/embed server (needs a tokio runtime); tokio is
+# also pulled transitively at runtime by reqwest's `blocking` feature.
+wiremock   = { workspace = true }
+tokio      = { workspace = true, features = ["macros", "rt"] }
+
+[lints]
+workspace = true
--- a/crates/kebab-embed-ollama/src/lib.rs
+++ b/crates/kebab-embed-ollama/src/lib.rs
@@ -0,0 +1,310 @@
+//! `kebab-embed-ollama` — [`OllamaEmbedder`], a `reqwest::blocking` adapter
+//! implementing [`Embedder`](kebab_core::Embedder) over Ollama's
+//! `POST /api/embed` endpoint.
+//!
+//! ## Why this exists
+//!
+//! The candle backend ([`kebab-embed-candle`]) runs arctic-embed-l-v2.0
+//! in-process (pure Rust, NUMA-safe). This crate is the **fallback** path:
+//! it offloads embedding to a local/remote Ollama daemon (`snowflake-arctic-embed2`),
+//! which is exactly the route the recall measurements used — so it reproduces
+//! the measured numbers (recall@10 130/132) via the same route. Opt-in via
+//! `config.models.embedding.provider = "ollama"`.
+//!
+//! ## Wire shape
+//!
+//! Request (`POST {endpoint}/api/embed`):
+//!
+//! ```json
+//! { "model": "snowflake-arctic-embed2", "input": ["query: 스택", "후입선출 ..."] }
+//! ```
+//!
+//! Response:
+//!
+//! ```json
+//! { "model": "...", "embeddings": [[0.01, ...], [0.02, ...]] }
+//! ```
+//!
+//! ## Pipeline
+//!
+//! 1. instruction prefix per model ([`prefixes_for`] — arctic: `query: ` on
+//!    queries, no prefix on documents; e5: `query: `/`passage: `);
+//! 2. batch into `BATCH` (48) inputs per request;
+//! 3. `POST /api/embed`, with fail-soft retry (`MAX_RETRIES`);
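+//! 4. **L2 normalize** each returned vector — the response is not assumed to be
+//!    unit-length, so we normalize for cosine consistency with the candle path
+//!    (NOTE(review): Ollama's docs describe `/api/embed` output as already
+//!    L2-normalized, in which case this step is a harmless no-op — confirm);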
+//! 5. dim check against `config.models.embedding.dimensions`.
+//!
+//! ## Send-safety
+//!
+//! `reqwest::blocking::Client: Send + Sync`; the adapter holds only the client,
+//! an endpoint string, and small config scalars, so it is trivially `Send + Sync`
+//! as the [`Embedder`] trait requires.
+
+use std::time::Duration;
+
+use anyhow::{Context, Result};
+use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
+use serde::{Deserialize, Serialize};
+
+/// Inputs per `/api/embed` request. Ollama handles arbitrary batch sizes, but
+/// a cap keeps a single HTTP body bounded and lets a partial failure retry a
+/// smaller unit.
+const BATCH: usize = 48;
+
+/// Fail-soft attempts per batch before the error propagates. This is the
+/// TOTAL number of attempts (the first try included), not the number of
+/// retries after it. Cold model load on the Ollama side can transiently
+/// 500/timeout; a couple of retries smooth that over without masking a hard
+/// misconfiguration.
+const MAX_RETRIES: u32 = 3;
+
+/// Default per-request HTTP timeout (seconds). Cold-loading an embedding model
+/// on first call can take tens of seconds; this matches the generous default
+/// used by the LLM adapter. The timeout applies to each attempt separately, so
+/// one batch can block for roughly `MAX_RETRIES` × this value before its error
+/// surfaces.
+const REQUEST_TIMEOUT_SECS: u64 = 300;
+
+/// Map an Ollama model tag to its `(query_prefix, doc_prefix)` pair.
+///
+/// Matching is a case-insensitive substring test on the tag, checked in this
+/// order:
+///
+/// - contains `arctic-embed` → `("query: ", "")` (queries prefixed, documents raw);
+/// - contains `e5` → `("query: ", "passage: ")`;
+/// - anything else → `("", "")`, i.e. the text is embedded unprefixed.
+///
+/// The table is keyed on the **Ollama tag** (e.g. `snowflake-arctic-embed2`),
+/// which differs from the HF id, and lives in this crate so it does not have
+/// to depend on the candle backend. Note that a misspelled tag falls through
+/// to the unprefixed default without any error.
+fn prefixes_for(model: &str) -> (&'static str, &'static str) {
+    // (needle, (query_prefix, doc_prefix)); first match wins.
+    const RULES: [(&str, (&str, &str)); 2] = [
+        ("arctic-embed", ("query: ", "")),
+        ("e5", ("query: ", "passage: ")),
+    ];
+    let tag = model.to_ascii_lowercase();
+    RULES
+        .iter()
+        .find(|(needle, _)| tag.contains(*needle))
+        .map_or(("", ""), |(_, prefixes)| *prefixes)
+}
+
+/// `reqwest::blocking` adapter implementing [`Embedder`] over Ollama's
+/// `/api/embed`. Construction is offline; the first network call happens in
+/// [`Embedder::embed`].
+pub struct OllamaEmbedder {
+    /// Blocking HTTP client, built once with the per-request timeout.
+    client: reqwest::blocking::Client,
+    /// Validated endpoint base (e.g. `"http://127.0.0.1:11434"`).
+    endpoint: String,
+    /// Ollama model tag (e.g. `"snowflake-arctic-embed2"`).
+    model: String,
+    /// Prefix prepended to `EmbeddingKind::Query` inputs (may be empty).
+    query_prefix: &'static str,
+    /// Prefix prepended to `EmbeddingKind::Document` inputs (may be empty).
+    doc_prefix: &'static str,
+    /// Model id reported via [`Embedder::model_id`]; wraps the model tag.
+    model_id: EmbeddingModelId,
+    /// Version reported via [`Embedder::model_version`]; `ollama:{model}`.
+    version: EmbeddingVersion,
+    /// Expected vector length; every returned embedding is checked against it.
+    dimensions: usize,
+}
+
+impl OllamaEmbedder {
+    /// Construct an embedder from the workspace config without any network I/O.
+    ///
+    /// Takes `models.embedding.model` and `models.embedding.dimensions` as-is.
+    /// The endpoint is `models.embedding.endpoint` when it is set and
+    /// non-empty, otherwise `models.llm.endpoint`.
+    ///
+    /// The caller (app layer) is expected to have validated
+    /// `provider == "ollama"`; that is not re-checked here.
+    ///
+    /// # Errors
+    /// Fails when the resolved endpoint is empty or the HTTP client cannot be
+    /// built.
+    pub fn new(config: &kebab_config::Config) -> Result<Self> {
+        let embedding = &config.models.embedding;
+        let endpoint = match embedding.endpoint.as_deref() {
+            Some(explicit) if !explicit.is_empty() => explicit.to_owned(),
+            _ => config.models.llm.endpoint.clone(),
+        };
+        if endpoint.is_empty() {
+            anyhow::bail!(
+                "ollama embedding provider needs an endpoint: set \
+                 `models.embedding.endpoint` (or `models.llm.endpoint`)"
+            );
+        }
+
+        let timeout = Duration::from_secs(REQUEST_TIMEOUT_SECS);
+        let client = reqwest::blocking::Client::builder()
+            .timeout(timeout)
+            .build()
+            .context("kb-embed-ollama: build reqwest client")?;
+
+        let model = embedding.model.clone();
+        let (query_prefix, doc_prefix) = prefixes_for(&model);
+        let model_id = EmbeddingModelId(model.clone());
+        // The version string is `ollama:{model}`, so switching provider or
+        // model changes the embedding version and cannot collide with the
+        // candle path's version for the same model.
+        let version = EmbeddingVersion(format!("ollama:{}", model));
+
+        Ok(Self {
+            client,
+            endpoint,
+            model,
+            query_prefix,
+            doc_prefix,
+            model_id,
+            version,
+            dimensions: embedding.dimensions,
+        })
+    }
+
+    /// Embed one already-prefixed batch via `POST {endpoint}/api/embed`.
+    ///
+    /// Makes up to `MAX_RETRIES` attempts in total. A failed attempt (transport
+    /// error, non-2xx status, or undecodable body — see `try_once`) is logged
+    /// and, unless it was the last one, followed by a short, linearly growing
+    /// pause so a cold-loading model gets time to come up. A response that
+    /// decodes but fails validation in `finalize` (wrong count or dimension)
+    /// is returned immediately and is NOT retried.
+    ///
+    /// # Errors
+    /// Returns the last attempt's error once every attempt has failed.
+    fn embed_batch(&self, prefixed: &[String]) -> Result<Vec<Vec<f32>>> {
+        /// Base pause between attempts; after failed attempt `k` we wait `k`
+        /// times this long.
+        const RETRY_BACKOFF_MS: u64 = 250;
+
+        let url = format!("{}/api/embed", self.endpoint.trim_end_matches('/'));
+        let body = EmbedRequest {
+            model: &self.model,
+            input: prefixed,
+        };
+
+        let mut last_err: Option<anyhow::Error> = None;
+        for attempt in 1..=MAX_RETRIES {
+            match self.try_once(&url, &body) {
+                Ok(resp) => return self.finalize(resp, prefixed.len()),
+                Err(e) => {
+                    // Only claim a retry when one will actually happen; the
+                    // final failure must not be logged as "retrying".
+                    let will_retry = attempt < MAX_RETRIES;
+                    tracing::warn!(
+                        target: "kebab-embed-ollama",
+                        attempt,
+                        max = MAX_RETRIES,
+                        will_retry,
+                        error = %e,
+                        "ollama /api/embed attempt failed"
+                    );
+                    last_err = Some(e);
+                    if will_retry {
+                        let pause_ms = RETRY_BACKOFF_MS * u64::from(attempt);
+                        std::thread::sleep(Duration::from_millis(pause_ms));
+                    }
+                }
+            }
+        }
+        Err(last_err.unwrap_or_else(|| {
+            anyhow::anyhow!("kb-embed-ollama: all {MAX_RETRIES} attempts failed")
+        }))
+    }
+
+    /// Perform a single `POST` of `body` (as JSON) to `url` and decode the
+    /// reply.
+    ///
+    /// Transport failures, non-success statuses, and undecodable bodies are
+    /// all returned as `Err`, leaving the retry decision to the caller.
+    fn try_once(&self, url: &str, body: &EmbedRequest<'_>) -> Result<EmbedResponse> {
+        let request = self.client.post(url).json(body);
+        let response = request
+            .send()
+            .with_context(|| format!("kb-embed-ollama: POST {url}"))?;
+
+        let status = response.status();
+        if status.is_success() {
+            return response
+                .json::<EmbedResponse>()
+                .context("kb-embed-ollama: decode /api/embed response");
+        }
+
+        // If the error body itself cannot be read, report an empty one.
+        let text = response.text().unwrap_or_default();
+        Err(anyhow::anyhow!(
+            "kb-embed-ollama: /api/embed returned {status}: {text}"
+        ))
+    }
+
+    /// Check the response against the request, then L2-normalize every vector.
+    ///
+    /// # Errors
+    /// Fails when the number of embeddings differs from `expected`, or when
+    /// any embedding's length differs from the configured `dimensions`.
+    fn finalize(&self, resp: EmbedResponse, expected: usize) -> Result<Vec<Vec<f32>>> {
+        let got = resp.embeddings.len();
+        if got != expected {
+            anyhow::bail!(
+                "kb-embed-ollama: expected {expected} embeddings, got {}",
+                got
+            );
+        }
+
+        // Collecting into `Result` stops at the first wrong-sized vector.
+        resp.embeddings
+            .into_iter()
+            .map(|raw| {
+                if raw.len() == self.dimensions {
+                    Ok(l2_normalize(raw))
+                } else {
+                    Err(anyhow::anyhow!(
+                        "kb-embed-ollama: model returned dim {} but config expects {} \
+                         (check models.embedding.dimensions vs the Ollama model)",
+                        raw.len(),
+                        self.dimensions
+                    ))
+                }
+            })
+            .collect()
+    }
+}
+
+impl Embedder for OllamaEmbedder {
+    /// The configured Ollama model tag, wrapped as an [`EmbeddingModelId`].
+    fn model_id(&self) -> EmbeddingModelId {
+        self.model_id.clone()
+    }
+
+    /// The `ollama:{model}` version string fixed at construction.
+    fn model_version(&self) -> EmbeddingVersion {
+        self.version.clone()
+    }
+
+    /// The configured vector length.
+    fn dimensions(&self) -> usize {
+        self.dimensions
+    }
+
+    /// Prefix every input, send the texts in chunks of `BATCH`, and return the
+    /// vectors in input order. Empty input returns an empty `Vec` without
+    /// touching the network; the first failing batch aborts the whole call.
+    fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> Result<Vec<Vec<f32>>> {
+        if inputs.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let prefixed: Vec<String> = inputs.iter().map(|input| self.prefix(input)).collect();
+        let vectors = prefixed.chunks(BATCH).try_fold(
+            Vec::with_capacity(prefixed.len()),
+            |mut acc, batch| -> Result<Vec<Vec<f32>>> {
+                acc.extend(self.embed_batch(batch)?);
+                Ok(acc)
+            },
+        )?;
+
+        debug_assert_eq!(vectors.len(), inputs.len());
+        Ok(vectors)
+    }
+}
+
+impl OllamaEmbedder {
+    /// Return `input.text` with the prefix for its kind prepended: the
+    /// document prefix for `Document`, the query prefix for `Query`.
+    fn prefix(&self, input: &EmbeddingInput<'_>) -> String {
+        let lead = match input.kind {
+            EmbeddingKind::Document => self.doc_prefix,
+            EmbeddingKind::Query => self.query_prefix,
+        };
+        let mut text = String::with_capacity(lead.len() + input.text.len());
+        text.push_str(lead);
+        text.push_str(input.text);
+        text
+    }
+}
+
+/// Scale `v` to unit Euclidean length, consuming it and returning the result.
+///
+/// When the norm is not strictly positive (an all-zero vector, or a NaN norm)
+/// the vector is returned untouched, so a zero vector never turns into NaNs
+/// through division by zero.
+fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
+    let squared_sum: f32 = v.iter().map(|x| x * x).sum();
+    let norm = squared_sum.sqrt();
+    if norm <= 0.0 || norm.is_nan() {
+        return v;
+    }
+    v.iter_mut().for_each(|x| *x /= norm);
+    v
+}
+
+// ── Wire types ──────────────────────────────────────────────────────────────
+
+/// JSON body sent to `/api/embed`: the model tag plus the (already prefixed)
+/// texts of one batch. Borrows both so building a request copies nothing.
+#[derive(Serialize)]
+struct EmbedRequest<'a> {
+    /// Ollama model tag to embed with.
+    model: &'a str,
+    /// Texts to embed, in the order their vectors are expected back.
+    input: &'a [String],
+}
+
+/// The part of the `/api/embed` reply this crate reads. Only `embeddings` is
+/// declared; there is no `deny_unknown_fields`, so other keys in the reply
+/// (such as `model`) are ignored when decoding.
+#[derive(Deserialize)]
+struct EmbedResponse {
+    /// One vector per input text.
+    embeddings: Vec<Vec<f32>>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The bare arctic tag and its `:latest` form both get the query-only
+    /// prefix pair.
+    #[test]
+    fn prefixes_for_arctic_is_query_only() {
+        for tag in ["snowflake-arctic-embed2", "snowflake-arctic-embed2:latest"] {
+            assert_eq!(prefixes_for(tag), ("query: ", ""));
+        }
+    }
+
+    /// An e5 tag gets both the query and the passage prefix.
+    #[test]
+    fn prefixes_for_e5_is_query_passage() {
+        let (query, doc) = prefixes_for("multilingual-e5-large");
+        assert_eq!((query, doc), ("query: ", "passage: "));
+    }
+
+    /// A tag matching no rule gets empty prefixes.
+    #[test]
+    fn prefixes_for_unknown_is_bare() {
+        let (query, doc) = prefixes_for("nomic-embed-text");
+        assert_eq!((query, doc), ("", ""));
+    }
+
+    /// A 3-4-5 vector comes back with norm 1.
+    #[test]
+    fn l2_normalize_unit_length() {
+        let unit = l2_normalize(vec![3.0, 4.0]);
+        let norm = unit.iter().map(|x| x * x).sum::<f32>().sqrt();
+        assert!((norm - 1.0).abs() < 1e-6, "norm = {norm}");
+    }
+
+    /// The zero vector is passed through as-is.
+    #[test]
+    fn l2_normalize_zero_vector_is_unchanged() {
+        let zeros = vec![0.0, 0.0, 0.0];
+        assert_eq!(l2_normalize(zeros.clone()), zeros);
+    }
+}
--- a/crates/kebab-embed-ollama/tests/embed_mock.rs
+++ b/crates/kebab-embed-ollama/tests/embed_mock.rs
@@ -0,0 +1,99 @@
+//! `/api/embed` behavior against a `wiremock`-hosted mock server.
+//!
+//! `wiremock` is async, so the tests are `#[tokio::test]`; the sync
+//! [`OllamaEmbedder`] is driven from `spawn_blocking` to keep `reqwest::blocking`
+//! off the async runtime (same pattern as `kebab-llm-local`'s streaming tests).
+//! tokio is a `dev-dependency` only.
+
+use kebab_config::Config;
+use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind};
+use kebab_embed_ollama::OllamaEmbedder;
+use wiremock::matchers::{method, path};
+use wiremock::{Mock, MockServer, ResponseTemplate};
+
+/// Default config retargeted at `endpoint` with provider `ollama` and
+/// embedding dimension `dim` (kept small so mock bodies stay tiny). The model
+/// is an arctic tag, so the adapter resolves the arctic prefix pair.
+fn cfg_for(endpoint: &str, dim: usize) -> Config {
+    let mut cfg = Config::defaults();
+    {
+        let embedding = &mut cfg.models.embedding;
+        embedding.provider = String::from("ollama");
+        embedding.model = String::from("snowflake-arctic-embed2");
+        embedding.dimensions = dim;
+        embedding.endpoint = Some(endpoint.to_owned());
+    }
+    cfg
+}
+
+/// Build an `OllamaEmbedder` from `cfg` and embed `inputs` inside
+/// `tokio::task::spawn_blocking`, so the blocking HTTP client never runs on
+/// the async test runtime. Returns the embedder's own result; panics only if
+/// the blocking task itself panicked.
+async fn embed_blocking(
+    cfg: Config,
+    inputs: Vec<(String, EmbeddingKind)>,
+) -> anyhow::Result<Vec<Vec<f32>>> {
+    let job = move || -> anyhow::Result<Vec<Vec<f32>>> {
+        let embedder = OllamaEmbedder::new(&cfg)?;
+        let mut borrowed: Vec<EmbeddingInput<'_>> = Vec::with_capacity(inputs.len());
+        for (text, kind) in &inputs {
+            borrowed.push(EmbeddingInput { text, kind: *kind });
+        }
+        embedder.embed(&borrowed)
+    };
+    let joined = tokio::task::spawn_blocking(job).await;
+    joined.expect("blocking task panicked")
+}
+
+/// A 200 reply carrying two un-normalized dim-2 vectors must come back from
+/// the adapter as unit-length vectors.
+#[tokio::test]
+async fn embed_returns_l2_normalized_vectors() {
+    // Raw vectors [3,4] and [0,5]; neither has unit length.
+    let raw_reply = r#"{"model":"snowflake-arctic-embed2","embeddings":[[3.0,4.0],[0.0,5.0]]}"#;
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/embed"))
+        .respond_with(ResponseTemplate::new(200).set_body_string(raw_reply))
+        .mount(&server)
+        .await;
+
+    let inputs = vec![
+        ("스택 자료구조".to_string(), EmbeddingKind::Query),
+        ("후입선출".to_string(), EmbeddingKind::Document),
+    ];
+    let vectors = embed_blocking(cfg_for(&server.uri(), 2), inputs)
+        .await
+        .expect("embed should succeed");
+
+    assert_eq!(vectors.len(), 2);
+    for v in &vectors {
+        let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
+        assert!((norm - 1.0).abs() < 1e-5, "expected unit norm, got {norm}");
+    }
+    // [3,4] scaled by 1/5 is [0.6, 0.8].
+    let first = &vectors[0];
+    assert!((first[0] - 0.6).abs() < 1e-5 && (first[1] - 0.8).abs() < 1e-5);
+}
+
+/// A reply whose vector length differs from the configured dimension must
+/// surface as an error mentioning "dim".
+#[tokio::test]
+async fn embed_rejects_dim_mismatch() {
+    // The mock replies with a 3-dim vector while the config declares dim 2.
+    let three_dim_reply = r#"{"model":"snowflake-arctic-embed2","embeddings":[[1.0,2.0,3.0]]}"#;
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/embed"))
+        .respond_with(ResponseTemplate::new(200).set_body_string(three_dim_reply))
+        .mount(&server)
+        .await;
+
+    let inputs = vec![("q".to_string(), EmbeddingKind::Query)];
+    let outcome = embed_blocking(cfg_for(&server.uri(), 2), inputs).await;
+
+    let err = outcome.expect_err("dim mismatch must error");
+    let msg = format!("{err:#}");
+    assert!(msg.contains("dim"), "expected dim error, got: {msg}");
+}
+
+/// Empty input yields `Ok(empty)`.
+#[tokio::test]
+async fn embed_empty_input_is_noop() {
+    // No mock server is started: an empty input must return before any
+    // request is attempted, so the endpoint is never contacted.
+    let cfg = cfg_for("http://127.0.0.1:1", 2);
+    let no_inputs: Vec<(String, EmbeddingKind)> = Vec::new();
+    let out = embed_blocking(cfg, no_inputs)
+        .await
+        .expect("empty embed should be Ok(empty)");
+    assert!(out.is_empty());
+}
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -16,7 +16,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab-
 | metadata | SQLite + FTS5 (lexical search + v0.20.1 한국어 형태소 tokenizer via lindera-ko-dic) |
 | vector | LanceDB (embedded, model 별 분리 table) |
 | Markdown parser | `pulldown-cmark`. frontmatter 에 title 없으면 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (`parser_version = md-frontmatter-v2`, 기존 doc 도 다음 ingest 에서 갱신) |
-| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드) |
+| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드). opt-in 대안: candle (e5 또는 `snowflake-arctic-embed-l-v2.0`) / Ollama `/api/embed`. arctic = 설명형 query recall 보강 (v0.26.0, 아래 결정표) |
 | 한국어 형태소분석 | `lindera-ko-dic` (FTS5 외부 tokenizer, v0.20.1) — 2자 이상 한국어 query 지원 |
 | LLM | Ollama HTTP (default `gemma4:e4b` ─ OCR / caption 와 family 통일. 사용자가 더 큰 variant `gemma4:26b` 등으로 override 가능) |
 | 음성 ASR | `whisper.cpp` (via `whisper-rs`) — P8 보류, 시스템 dep brainstorm 후 |
@@ -67,7 +67,8 @@ flowchart TB
    subgraph Adapters ["traits + adapters"]
        embed["kebab-embed<br/>(trait)"]
        embedlocal["kebab-embed-local<br/>(fastembed, default)"]
-        embedcandle["kebab-embed-candle<br/>(candle, NUMA-safe opt-in)"]
+        embedcandle["kebab-embed-candle<br/>(candle, e5+arctic, NUMA-safe opt-in)"]
+        embedollama["kebab-embed-ollama<br/>(Ollama /api/embed, opt-in)"]
        llm["kebab-llm<br/>(trait)"]
        llmlocal["kebab-llm-local<br/>(Ollama)"]
        search["kebab-search"]
@@ -94,6 +95,7 @@ flowchart TB
    app --> vector
    app --> embedlocal
    app --> embedcandle
+    app --> embedollama
    app --> llmlocal
    app --> search
    app --> rag
@@ -108,6 +110,8 @@ flowchart TB
    embedlocal --> embed
    embedcandle --> core
    embedcandle --> config
+    embedollama --> core
+    embedollama --> config
    llmlocal --> llm
    rag --> search
    rag --> llm
@@ -136,6 +140,23 @@ UI → store/llm/parse 직접 의존 금지. 모든 user-facing 진입은 `kebab

 `kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가, P10-1C-Go 에서 `tree-sitter-go` 추가, P10-1C-JK 에서 `tree-sitter-java` / `tree-sitter-kotlin-ng` 추가, P10-1D 에서 `tree-sitter-c` / `tree-sitter-cpp` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). Kotlin 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착 — 사용 불가). v0.18.0+ 부터 `kebab-source-fs` 는 자체 `code_meta` 모듈 (lang detect + skip helpers + BUILTIN_BLACKLIST) 을 보유, kebab-parse-code 와 분리 (refactor 2026-05-26). v0.19.0 부터 `kebab-parse-md` 가 `kebab-parse-types` (parser intermediate types) + `kebab-normalize` (CanonicalDocument lift) 두 crate 를 흡수 — 24 → 22 crates, design §3.7b 재작성 (HOTFIXES 2026-05-26). v0.20.1 부터 `kebab-search` 가 `lindera-ko-dic` 를 의존해 한국어 FTS5 형태소 tokenizer 지원 — V009 migration 으로 2자 이상 한국어 query 매칭 (Bug #8 closure).

+### 임베딩 백엔드 결정표 (v0.26.0)
+
+| provider | 모델 | pooling / prefix | 위치 | 언제 |
+|---|---|---|---|---|
+| `fastembed` (기본) | `multilingual-e5-large` | mean / `query:`·`passage:` | in-process (onnxruntime) | 기본. 단일 소켓 호스트 |
+| `candle` | e5 또는 `snowflake-arctic-embed-l-v2.0` | 모델별 (e5=mean, arctic=CLS) / arctic=`query:`·무접두어 | in-process (pure Rust) | NUMA 서버 (onnxruntime 48-스레드 double-free 회피), Apple Silicon Metal GPU |
+| `ollama` | `snowflake-arctic-embed2` 등 | 모델 태그로 추론 / arctic=`query:`·무접두어 | 원격 HTTP (`/api/embed`) | candle 폴백, 측정에 쓴 경로 그대로 재현 |
+
+**arctic-embed-l-v2.0 채택 근거**: 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형
+query 의 recall 보강책. 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`)에서
+arctic = recall@10 130/132 (e5 대비 +7, 색인 1회·per-query 0·LLM 0, 용어 무손실).
+candle 이 주 백엔드(in-process, NUMA 안전), Ollama 가 폴백(측정 경로 재현). 두 경로의
+pooling/prefix 정확성은 `kebab-embed-candle/tests/arctic_ollama_parity.rs`
+(candle arctic vs Ollama arctic 코사인>0.99, `#[ignore]`) 로 고정. e5 → arctic 전환은
+`embedding_version` cascade (모델별 벡터 상이) → 재색인 필요. 기본값 e5 유지라 기존
+사용자 무영향. 자세한 내용: [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 2026-06-03 arctic entry.
+
 ## 디렉토리 구조

 ```text
@@ -184,7 +205,8 @@ kebab/
 │   ├── kebab-store-sqlite/                            # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3). src/derivation_cache.rs = derivation_cache 테이블 저장소 (V012, v0.21.0)
 │   ├── kebab-search/                                  # Lexical + Vector + Hybrid retriever (P2-2, P3-4)
 │   ├── kebab-embed/  kebab-embed-local/                  # Embedder trait + fastembed adapter (P3-1, P3-2)
-│   ├── kebab-embed-candle/                             # candle (pure-Rust) Embedder, NUMA-safe opt-in provider=candle (Track 1, v0.22.0)
+│   ├── kebab-embed-candle/                             # candle (pure-Rust) Embedder, 모델 레지스트리(e5 mean + arctic CLS), NUMA-safe opt-in provider=candle (Track 1, v0.22.0; arctic v0.26.0)
+│   ├── kebab-embed-ollama/                             # Ollama /api/embed Embedder, opt-in provider=ollama (arctic 폴백 경로, v0.26.0)
 │   ├── kebab-store-vector/                            # LanceDB VectorStore (P3-3, P7-3 follow-up)
 │   ├── kebab-llm/  kebab-llm-local/                      # LanguageModel trait + Ollama adapter (P4-1, P4-2)
 │   ├── kebab-rag/                                     # RAG pipeline (P4-3)
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -107,16 +107,18 @@ respect_markdown_headings = true
 chunker_version = "md-heading-v1"

 [models.embedding]
-provider = "fastembed"               # "fastembed"(기본) / "candle"(순수 Rust, NUMA-안전)
-                                     # / "none"(lexical-only — Ollama 불필요)
-                                     # ⚠ provider="candle" 사용 시 아래 model/dimensions 도
-                                     #   multilingual-e5-large / 1024 로 바꿔야 함
-                                     #   (candle 은 현재 e5-large 만 지원).
-model = "multilingual-e5-small"
+provider = "fastembed"               # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust, NUMA-안전)
+                                     # / "ollama"(원격 HTTP /api/embed) / "none"(lexical-only — Ollama 불필요)
+                                     # ⚠ provider/model 변경 시 dimensions 도 맞춰야 함 (예: candle+e5 → multilingual-e5-large / 1024).
+model = "multilingual-e5-small"      # candle/ollama 는 "snowflake-arctic-embed-l-v2.0"
+                                     # (ollama 태그 "snowflake-arctic-embed2", 1024-dim) 도 지원 —
+                                     # 설명형 query recall 보강. e5↔arctic 전환은
+                                     # embedding_version cascade (재색인 필요).
 version = "v1"
-dimensions = 384
+dimensions = 384                     # arctic / e5-large 는 1024.
 batch_size = 64
 num_threads = 0                      # candle 전용 CPU 스레드 캡 (0=auto). env KEBAB_EMBED_THREADS 우선.
+# endpoint = "http://127.0.0.1:11434"  # provider="ollama" 전용; 생략 시 [models.llm].endpoint fallback.

 [models.llm]
 provider = "ollama"
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -14,6 +14,56 @@ historical contract that was implemented; this file accumulates the
 deltas so phase 5+ readers can find the live behavior without diffing
 git history.

+## 2026-06-03 — arctic-embed-l-v2.0 임베더 통합 (candle + Ollama) (v0.26.0)
+
+**무엇을 왜 추가했나.** 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형 query 의
+recall 보강책으로 `snowflake-arctic-embed-l-v2.0` 임베더를 두 백엔드로 통합했다.
+근거는 방법별 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`):
+arctic = recall@10 **130/132**, recall@50 **132/132**, **용어 무손실**(syn/abbr/en
+유지). e5-large 대비 +7, 색인 1회·per-query 0·LLM 0 = 살아있는 KB 에 지속 가능.
+별칭이 청크당 색인-시 LLM(나무위키 18문서 cold 2.5h)을 요구한 것과 대조.
+
+**무엇을 건드렸나.**
+- `kebab-embed-candle`: e5 하드코딩(`HF_MODEL`/`SUPPORTED_MODEL`/mean/`query:`+`passage:`)을
+  **모델 레지스트리**(`MODEL_REGISTRY`: `EmbedModelSpec { name, hf_repo, pooling, query_prefix, doc_prefix, dim, version_tag }`)로
+  일반화. e5(mean, `query:`/`passage:`) + arctic(**CLS**, `query:`/무접두어). pooling
+  은 모델별 분기(mean=attention-mask-weighted / CLS=`hidden[:,0,:]`), tokenize/forward/L2
+  공유. arctic pooling=CLS 는 HF `1_Pooling/config.json`(`pooling_mode_cls_token:true`)로
+  확인. `model_version` 은 arctic 일 때 `+arctic-cls` 태그(switch 시 embedding_version
+  cascade 트리거); e5 는 fastembed-e5 와의 호환(NUMA 드롭인) 위해 plain `config.version` 유지.
+- `kebab-embed-ollama` (신규 크레이트): `Embedder` 구현, `reqwest::blocking` POST
+  `/api/embed` `{model, input:[...]}` → `embeddings`. batch 48 + fail-soft 재시도 3,
+  결과 **L2 정규화**(Ollama 가 반환한 raw 벡터에 적용 — arctic 실측상 이미 norm 1.0 이라 no-op), dim 검증, query/doc prefix 모델 태그로 추론
+  (`arctic-embed`→`query:`/무접두어, `e5`→`query:`/`passage:`). `model_version=ollama:{model}`.
+  endpoint = `models.embedding.endpoint` ?? `models.llm.endpoint`.
+- `kebab-config`: `EmbeddingModelCfg.endpoint: Option<String>`(serde default, ollama용) +
+  `provider` 문서에 `ollama` 추가 + env `KEBAB_MODELS_EMBEDDING_ENDPOINT`.
+- `kebab-app::embedder()`: provider match 에 `ollama` 분기 추가(facade 경유).
+- workspace member += `kebab-embed-ollama`, version 0.25.0 → **0.26.0**(minor).
+
+**correctness 게이트.** candle arctic 임베딩이 측정에 쓴 Ollama `snowflake-arctic-embed2`
+임베딩과 일치해야 pooling/prefix 정확성(=recall 130 재현)이 보장된다. 검증:
+`kebab-embed-candle/tests/arctic_ollama_parity.rs`(`#[ignore]`, live Ollama 의존) 가
+candle arctic vs 우리 Ollama 어댑터로 같은 문장(설명형/약어/영문 포함, doc+query
+양 경로)을 임베딩해 per-sentence **코사인 > 0.99** 를 assert. `#[ignore]` 라 기본
+`cargo test` 에서는 건너뛰므로, 수동 실행 결과(코사인값)를 바로 아래에 기록한다.
+
+**수동 검증 결과** (2026-06-03 worker 실측, Ollama @192.168.0.47:11434
+`snowflake-arctic-embed2`): 8문장 × (doc+query) 16벡터 per-sentence 코사인
+**0.999984 ~ 0.999995**, `cosine_min = 0.999984` (게이트 0.99 대비 대폭 상회).
+설명형("후입선출 방식으로 동작하는 자료구조")·약어("SVM 은 support vector machine")·
+영문·한글 모두 일치. → candle arctic 의 CLS pooling + `query: ` prefix 가 Ollama 측정
+경로와 정확히 동일 = recall@10 130 재현 보장. Ollama raw 도 이미 L2-정규화(norm 1.0)라
+어댑터의 L2 정규화는 idempotent no-op. 로그: `/build/dogfood/logs/arctic-parity.log`,
+요약: `/tmp/arctic-result.md`.
+
+**호환성.** 기본 provider=fastembed e5 동작/벡터 불변(arctic 은 opt-in). dim 1024
+동일이나 LanceDB 테이블명에 모델명 포함(`chunk_embeddings_{model}_{dim}`)이라 충돌
+없음. e5 → arctic 전환 = `embedding_version` cascade(모델별 벡터 상이) → **재색인 필요**
+(기존 e5 KB 와 혼용 불가, 명확). 방법 A(heading enrichment)는 측정상 arctic 에서 악화 →
+미적용. spec: `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`, plan: 동일
+디렉토리 `2026-06-03-arctic-embedder-plan.md`.
+
 ## 2026-06-03 — doc-side expansion(별칭) 기능 완전 제거 (v0.25.0)

 **무엇을 왜 제거했나.** v0.21.0 (PR #195/#196) 에서 도입한 색인-시 청크당 LLM