diff --git a/Cargo.lock b/Cargo.lock index 4e8a1a8..860e41a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4724,7 +4724,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4739,6 +4739,7 @@ dependencies = [ "kebab-embed", "kebab-embed-candle", "kebab-embed-local", + "kebab-embed-ollama", "kebab-llm", "kebab-llm-local", "kebab-nli", @@ -4771,7 +4772,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -4789,7 +4790,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "clap", @@ -4810,7 +4811,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4826,7 +4827,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -4840,7 +4841,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -4854,7 +4855,7 @@ dependencies = [ [[package]] name = "kebab-embed-candle" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "candle-core", @@ -4864,6 +4865,7 @@ dependencies = [ "kebab-config", "kebab-core", "kebab-embed-local", + "kebab-embed-ollama", "rayon", "serde_json", "tempfile", @@ -4873,7 +4875,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "fastembed", @@ -4884,9 +4886,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "kebab-embed-ollama" +version = "0.26.0" +dependencies = [ + "anyhow", + "kebab-config", + "kebab-core", + "reqwest 0.12.28", + "serde", + "serde_json", + "tokio", + "tracing", + "wiremock", +] + [[package]] name = "kebab-eval" -version = "0.25.0" +version = "0.26.0" dependencies = [ 
"anyhow", "kebab-app", @@ -4905,7 +4922,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "kebab-core", @@ -4914,7 +4931,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "kebab-config", @@ -4931,7 +4948,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "kebab-app", @@ -4949,7 +4966,7 @@ dependencies = [ [[package]] name = "kebab-nli" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "hf-hub", @@ -4964,7 +4981,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "gix", @@ -4987,7 +5004,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.25.0" +version = "0.26.0" dependencies = [ "ab_glyph", "anyhow", @@ -5011,7 +5028,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "kebab-core", @@ -5028,7 +5045,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -5043,7 +5060,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -5065,7 +5082,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "globset", @@ -5084,7 +5101,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -5102,7 +5119,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "blake3", @@ -5122,7 +5139,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", 
"arrow", @@ -5146,7 +5163,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.25.0" +version = "0.26.0" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index b5e04f8..dfccea5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "crates/kebab-embed", "crates/kebab-embed-local", "crates/kebab-embed-candle", + "crates/kebab-embed-ollama", "crates/kebab-llm", "crates/kebab-llm-local", "crates/kebab-rag", @@ -31,7 +32,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.25.0" # v0.25.0 — doc-side expansion(별칭) 기능 완전 제거: Chunk.aliases / expansion.rs / IngestExpansionCfg / alias lexical arm / expansion_progress wire kind 제거, 신규 마이그레이션 V013 이 chunk_aliases_fts + chunks.aliases DROP. AssetTimings.expansion_ms 는 wire 호환 위해 값 0 유지. 별칭 default-off 였어 사용자 체감 0. — CLAUDE.md §Release +version = "0.26.0" # v0.26.0 — arctic-embed-l-v2.0 임베더 통합: kebab-embed-candle 다중 모델 레지스트리(e5 mean + arctic CLS, 모델별 pooling/prefix 분기) + 신규 kebab-embed-ollama 크레이트(provider="ollama", POST /api/embed, L2 정규화, batch+fail-soft). config models.embedding.provider 에 "ollama" 추가 + endpoint: Option. 기본 동작 불변(provider=fastembed e5), arctic 은 opt-in, embedding_version cascade(arctic-cls / ollama:{model} 태그). — CLAUDE.md §Release # pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with # intentional allow-list. The allowed lints are either cosmetic (doc style), diff --git a/HANDOFF.md b/HANDOFF.md index 1d9142c..977d949 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만: +- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). 
`kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`. - **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`. - **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`. - **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. 
(1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29). diff --git a/README.md b/README.md index df49cb1..91605e0 100644 --- a/README.md +++ b/README.md @@ -111,18 +111,46 @@ root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경 [models.embedding] provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust) - # / "none"(lexical-only). candle 는 같은 모델·같은 벡터를 - # 순수 Rust 로 돌려 NUMA 서버의 onnxruntime 48-스레드 - # double-free 를 피하는 opt-in 백엔드 (재색인 불필요). + # / "ollama"(원격 HTTP) / "none"(lexical-only). + # candle 는 같은 모델·같은 벡터를 순수 Rust 로 돌려 + # NUMA 서버의 onnxruntime 48-스레드 double-free 를 피하는 + # opt-in 백엔드 (e5 는 재색인 불필요). model = "multilingual-e5-large" # 다국어 sentence embedding (1024-dim). # 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드. # candle provider 는 safetensors (~2GB) 다운로드. + # candle/ollama 는 "snowflake-arctic-embed-l-v2.0" + # (설명형 query 의 recall 보강) 도 지원 — 아래 참고. dimensions = 1024 # config 와 LanceDB stored dim 불일치 시 검색 0건. num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto=#cores). # env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은 # numactl 과 조합. fastembed provider 는 무시. +# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용 HTTP endpoint. + # 생략 시 [models.llm].endpoint 로 폴백. + # fastembed/candle provider 는 무시. ``` +**arctic-embed-l-v2.0 (설명형 query recall 보강)**: 기본 e5-large 대신 +Snowflake `arctic-embed-l-v2.0` 임베더를 쓸 수 있다 (1024-dim, opt-in). 측정에서 +설명형/약어/영문 용어 query 의 recall@10 이 e5 대비 향상됐다. 두 경로: + +```toml +# (A) candle 백엔드 — 순수 Rust, in-process (NUMA 안전, Metal GPU 가능): +[models.embedding] +provider = "candle" +model = "snowflake-arctic-embed-l-v2.0" # CLS pooling, query 에 "query: " 접두어 + # (문서는 무접두어). safetensors ~2GB 다운로드. 
+ +# (B) ollama 백엔드 — 원격/로컬 Ollama 데몬에 위임 (POST /api/embed): +[models.embedding] +provider = "ollama" +model = "snowflake-arctic-embed2" # Ollama 모델 태그 (ollama pull 필요) +endpoint = "http://127.0.0.1:11434" # 생략 시 [models.llm].endpoint +``` + +> ⚠️ e5 → arctic 전환은 `embedding_version` cascade 를 트리거한다 (모델이 다르면 +> 벡터도 다름). 기존 e5 KB 와 혼용 불가 — 전환 시 **재색인** 필요 (`kebab reset` +> 후 재 ingest). 기본값은 e5 라 기존 사용자는 영향 없음. + **Apple Silicon GPU 가속 (candle / macOS)**: M-시리즈 맥에서 candle 임베딩을 GPU(Metal)로 돌리면 CPU 대비 대용량 ingest 가 크게 빨라진다. 빌드 또는 설치 시 `embed_metal` feature 를 켠다: diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml index d8dea8a..6db0722 100644 --- a/crates/kebab-app/Cargo.toml +++ b/crates/kebab-app/Cargo.toml @@ -19,6 +19,7 @@ kebab-search = { path = "../kebab-search" } kebab-embed = { path = "../kebab-embed" } kebab-embed-local = { path = "../kebab-embed-local" } kebab-embed-candle = { path = "../kebab-embed-candle" } +kebab-embed-ollama = { path = "../kebab-embed-ollama" } kebab-llm = { path = "../kebab-llm" } kebab-llm-local = { path = "../kebab-llm-local" } kebab-rag = { path = "../kebab-rag" } diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 7860f70..0bd4650 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -45,6 +45,7 @@ use kebab_core::{ }; use kebab_embed_candle::CandleEmbedder; use kebab_embed_local::FastembedEmbedder; +use kebab_embed_ollama::OllamaEmbedder; use kebab_llm_local::OllamaLanguageModel; use kebab_parse_code::{ CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, @@ -834,11 +835,13 @@ impl App { if let Some(e) = self.embedder.get() { return Ok(Some(e.clone())); } - // Provider branch (Track 1 spec §3). `embeddings_disabled()` above - // already handled `"none"`; here we route the live providers. 
- // `fastembed`/`onnx`/(empty) keep the default onnxruntime path - // (vectors unchanged — `embedding_version` is preserved); `candle` - // selects the pure-Rust NUMA-safe backend. + // Provider branch (Track 1 spec §3 + arctic-embedder spec). The + // `embeddings_disabled()` check above already handled `"none"`; here we + // route the live providers. `fastembed`/`onnx`/(empty) keep the default + // onnxruntime path (vectors unchanged — `embedding_version` is + // preserved); `candle` selects the pure-Rust NUMA-safe backend (e5 or + // arctic via its model registry); `ollama` offloads to a remote + // `/api/embed` daemon. let provider = self.config.models.embedding.provider.as_str(); let emb: Arc = match provider { "fastembed" | "onnx" | "" => Arc::new( @@ -847,10 +850,13 @@ impl App { "candle" => Arc::new( CandleEmbedder::new(&self.config).context("kb-app: load CandleEmbedder")?, ), + "ollama" => Arc::new( + OllamaEmbedder::new(&self.config).context("kb-app: load OllamaEmbedder")?, + ), other => { return Err(anyhow!( "kb-app: unknown embedding provider {other:?}; expected one of \ - `fastembed` (default), `candle`, or `none` (lexical-only)" + `fastembed` (default), `candle`, `ollama`, or `none` (lexical-only)" )); } }; diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 49c47f9..cec773e 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -155,9 +155,10 @@ impl NliCfg { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct EmbeddingModelCfg { - /// `fastembed` (default, onnxruntime) or `candle` (pure-Rust, - /// NUMA-safe). `none` disables embeddings (lexical-only). Unknown - /// values error at embedder construction. + /// `fastembed` (default, onnxruntime), `candle` (pure-Rust, NUMA-safe), + /// or `ollama` (remote HTTP embedding endpoint). `none` disables + /// embeddings (lexical-only). Unknown values error at embedder + /// construction. 
pub provider: String, pub model: String, pub version: String, @@ -170,6 +171,13 @@ pub struct EmbeddingModelCfg { /// provider. Defaulted on load so pre-0.22 config files still parse. #[serde(default)] pub num_threads: u32, + /// HTTP endpoint for the `ollama` embedding provider (e.g. + /// `"http://127.0.0.1:11434"`). `None` (or a missing key in TOML) means + /// "fall back to `models.llm.endpoint`" — same convention as the OCR / + /// vision endpoints. Ignored by the `fastembed` / `candle` providers. + /// Defaulted on load so pre-0.26 config files still parse. + #[serde(default)] + pub endpoint: Option, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -688,6 +696,7 @@ impl Config { dimensions: 1024, batch_size: 64, num_threads: 0, + endpoint: None, }, llm: LlmCfg { provider: "ollama".to_string(), @@ -950,6 +959,12 @@ impl Config { self.models.embedding.num_threads = n; } } + "KEBAB_MODELS_EMBEDDING_ENDPOINT" => { + // Empty value → None (= fall back to models.llm.endpoint), + // mirroring the OCR endpoint override semantics. + self.models.embedding.endpoint = + if v.is_empty() { None } else { Some(v.clone()) }; + } // models.llm "KEBAB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(), diff --git a/crates/kebab-embed-candle/Cargo.toml b/crates/kebab-embed-candle/Cargo.toml index 40a17d9..f9d2c67 100644 --- a/crates/kebab-embed-candle/Cargo.toml +++ b/crates/kebab-embed-candle/Cargo.toml @@ -38,6 +38,9 @@ metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"] # not the library's own (non-dev) dependencies — so rayon/kebab-config/kebab-core # are repeated here for tests/parity.rs and tests/thread_cap.rs. kebab-embed-local = { path = "../kebab-embed-local" } +# arctic↔Ollama parity test drives the real Ollama adapter for the reference +# vectors (tests/arctic_ollama_parity.rs, `#[ignore]` — live Ollama). 
+kebab-embed-ollama = { path = "../kebab-embed-ollama" } kebab-config = { path = "../kebab-config" } kebab-core = { path = "../kebab-core" } rayon = "1" diff --git a/crates/kebab-embed-candle/src/lib.rs b/crates/kebab-embed-candle/src/lib.rs index 0649e14..f45d529 100644 --- a/crates/kebab-embed-candle/src/lib.rs +++ b/crates/kebab-embed-candle/src/lib.rs @@ -1,31 +1,44 @@ //! `kebab-embed-candle` — [`CandleEmbedder`], a pure-Rust (candle) //! implementation of [`Embedder`](kebab_core::Embedder). //! -//! Runs the same `intfloat/multilingual-e5-large` model as the default -//! [`FastembedEmbedder`](kebab_embed_local) but through `candle` -//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Motivation: -//! fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which corrupts -//! the heap (double-free) on dual-socket NUMA hosts. candle's CPU backend -//! sizes its threads off the global rayon pool, so a one-shot -//! [`rayon::ThreadPoolBuilder`] cap (config `num_threads` / env -//! `KEBAB_EMBED_THREADS`) keeps the worker count NUMA-safe. +//! Runs an XLM-RoBERTa-large embedding model through `candle` +//! (`candle-transformers`' XLM-RoBERTa) instead of onnxruntime. Two models +//! are wired through a small **registry** ([`MODEL_REGISTRY`]): //! -//! Output parity with the onnxruntime path was proven by the Phase 0 spike -//! (cosine 1.000000); this crate absorbs that pipeline verbatim: +//! * `multilingual-e5-large` — the same weights the default +//! [`FastembedEmbedder`](kebab_embed_local) uses (mean pooling, +//! `query: `/`passage: ` prefixes). candle is the NUMA-safe drop-in: +//! fastembed 4.9's onnxruntime hard-codes 48 intra-op threads, which +//! corrupts the heap (double-free) on dual-socket NUMA hosts. candle's +//! CPU backend sizes its threads off the global rayon pool, so a one-shot +//! [`rayon::ThreadPoolBuilder`] cap (config `num_threads` / env +//! `KEBAB_EMBED_THREADS`) keeps the worker count NUMA-safe. +//! 
* `snowflake-arctic-embed-l-v2.0` — Snowflake's arctic-embed v2.0 +//! (CLS pooling, `query: ` on queries / no prefix on documents). Same +//! XLM-RoBERTa-large architecture, dim 1024, so it rides the exact same +//! tokenize → forward → L2 pipeline; only the pooling step and prefixes +//! differ (both keyed off the per-model [`EmbedModelSpec`]). //! -//! 1. e5 prefix (`passage: ` for documents, `query: ` for queries — the same -//! convention as `kebab-embed-local`'s `prefix_input`); +//! Output parity with the onnxruntime path (for e5) was proven by the +//! Phase 0 spike (cosine 1.000000); the arctic path's pooling/prefix +//! correctness is pinned by an `#[ignore]`d cosine>0.99 cross-check against +//! Ollama's `snowflake-arctic-embed2` (see `tests/arctic_ollama_parity.rs`). +//! The shared pipeline: +//! +//! 1. instruction prefix per [`EmbedModelSpec`] (query/doc); //! 2. tokenize (max_len 512, batch-longest padding, special tokens); -//! 3. XLM-RoBERTa forward on `Device::Cpu`; -//! 4. attention-mask-weighted mean pooling; +//! 3. XLM-RoBERTa forward on the selected [`Device`]; +//! 4. pooling — mean (attention-mask-weighted) or CLS (first token); //! 5. L2 normalization. //! //! Model files (`config.json`, `tokenizer.json`, `model.safetensors`) are -//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/`. +//! fetched via `hf-hub` into `{config.storage.model_dir}/candle/` (hf-hub's +//! cache layout namespaces by repo, so e5 and arctic never collide). //! //! This crate is **opt-in** (`config.models.embedding.provider = "candle"`); //! the default provider stays `fastembed`. See -//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md`. +//! `docs/superpowers/specs/2026-06-01-embed-candle-track-spec.md` and +//! `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`. 
use std::sync::Mutex; @@ -42,22 +55,95 @@ use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer, TruncationParams}; /// `fastembed/` subdir so the two backends never collide. const CANDLE_CACHE_SUBDIR: &str = "candle"; -/// HuggingFace repo id for the multilingual e5 large model. Same weights the -/// onnxruntime path uses, just the safetensors variant candle can read. -const HF_MODEL: &str = "intfloat/multilingual-e5-large"; - -/// The only `config.models.embedding.model` value the candle adapter accepts -/// (the e5-large weights `HF_MODEL` resolves to). Guards against silently -/// downloading e5-large while `model_id()` reports a different name. -const SUPPORTED_MODEL: &str = "multilingual-e5-large"; - -/// Token truncation length (e5 was trained at 512). +/// Token truncation length (both e5 and arctic-embed-l-v2.0 train at 512). const MAX_LEN: usize = 512; /// Env var that overrides `config.models.embedding.num_threads`. Read once in /// [`CandleEmbedder::new`]; `0`/unset/unparseable means "leave rayon default". const ENV_EMBED_THREADS: &str = "KEBAB_EMBED_THREADS"; +/// Pooling strategy over the model's last hidden state. Keyed per-model by +/// [`EmbedModelSpec::pooling`] — e5 is mean, arctic is CLS. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Pooling { + /// Attention-mask-weighted mean over all tokens (e5 / sentence-transformers + /// `pooling_mode_mean_tokens`). + Mean, + /// First token (``/`[CLS]`) hidden state (arctic-embed v2.0 — + /// `1_Pooling/config.json` has `pooling_mode_cls_token: true`). + Cls, +} + +/// One supported embedding model: the HF repo candle downloads, the pooling +/// strategy, and the e5-style instruction prefixes. [`MODEL_REGISTRY`] maps a +/// `config.models.embedding.model` value to one of these. +#[derive(Clone, Copy, Debug)] +pub struct EmbedModelSpec { + /// The short `config.models.embedding.model` value that selects this spec. 
+ pub name: &'static str, + /// HuggingFace repo id candle fetches `config.json` / `tokenizer.json` / + /// `model.safetensors` from. + pub hf_repo: &'static str, + /// Pooling over the last hidden state. + pub pooling: Pooling, + /// Prefix prepended to **query** inputs before tokenization. + pub query_prefix: &'static str, + /// Prefix prepended to **document** inputs before tokenization (arctic + /// uses `""` — documents are embedded raw). + pub doc_prefix: &'static str, + /// Expected embedding dimension (model hidden size). + pub dim: usize, + /// Suffix folded into `model_version` so switching **to** this model + /// triggers the `embedding_version` cascade even if the operator forgets + /// to bump `config.version`. `None` keeps the bare `config.version` — used + /// by e5 so candle-e5 and fastembed-e5 report the *same* version and stay + /// interchangeable (the NUMA drop-in invariant — Phase 0 cosine 1.0). + pub version_tag: Option<&'static str>, +} + +/// The models the candle adapter can load. Adding a model = one entry here +/// (plus, for a non-XLM-R architecture, a new forward path — both current +/// entries are XLM-RoBERTa-large so they share everything but pooling/prefix). +static MODEL_REGISTRY: &[EmbedModelSpec] = &[ + EmbedModelSpec { + name: "multilingual-e5-large", + hf_repo: "intfloat/multilingual-e5-large", + pooling: Pooling::Mean, + query_prefix: "query: ", + doc_prefix: "passage: ", + dim: 1024, + version_tag: None, + }, + EmbedModelSpec { + name: "snowflake-arctic-embed-l-v2.0", + hf_repo: "Snowflake/snowflake-arctic-embed-l-v2.0", + pooling: Pooling::Cls, + query_prefix: "query: ", + doc_prefix: "", + dim: 1024, + version_tag: Some("arctic-cls"), + }, +]; + +/// Look up a model spec by `config.models.embedding.model`. Accepts either the +/// short `name` or the full `hf_repo` id (mirrors the old e5 guard, which +/// accepted both `multilingual-e5-large` and `intfloat/multilingual-e5-large`). 
+pub(crate) fn lookup_spec(model: &str) -> Option<&'static EmbedModelSpec> { + MODEL_REGISTRY + .iter() + .find(|s| s.name == model || s.hf_repo == model) +} + +/// Comma-separated list of supported model names, for the +/// unsupported-model error message. +fn supported_models() -> String { + MODEL_REGISTRY + .iter() + .map(|s| s.name) + .collect::>() + .join("`, `") +} + /// Pure-Rust candle adapter. Construct via [`CandleEmbedder::new`]; the /// constructor downloads the model on first use, so share one instance. pub struct CandleEmbedder { @@ -68,6 +154,9 @@ pub struct CandleEmbedder { model: Mutex, tokenizer: Tokenizer, device: Device, + /// The resolved model spec (pooling + prefixes) — drives `embed` and + /// `embed_batch`. + spec: &'static EmbedModelSpec, model_id: EmbeddingModelId, version: EmbeddingVersion, dimensions: usize, @@ -75,7 +164,8 @@ pub struct CandleEmbedder { } impl CandleEmbedder { - /// Build an embedder from `Config`. Applies the NUMA thread cap, fetches + /// Build an embedder from `Config`. Resolves the model spec from + /// `config.models.embedding.model`, applies the NUMA thread cap, fetches /// the model into `{model_dir}/candle/`, and validates that the model's /// hidden size matches `config.models.embedding.dimensions` before /// returning. @@ -104,21 +194,20 @@ impl CandleEmbedder { } } - // 1b. Model guard. `HF_MODEL` is hard-coded (candle currently only wires - // e5-large), so if the operator configured a *different* model name - // we must NOT silently download e5-large and then label its vectors - // with the configured name via `model_id()` — that would mislabel - // `embedding_version` and corrupt a mixed index. Fail fast, before - // the ~2GB download. + // 1b. Model registry lookup. 
If the operator configured a model the + // candle adapter doesn't know, fail fast (BEFORE the ~2GB + // download) — never silently download one model and then label its + // vectors with another name via `model_id()`, which would mislabel + // `embedding_version` and corrupt a mixed index. let want = config.models.embedding.model.as_str(); - if want != SUPPORTED_MODEL && want != HF_MODEL { - anyhow::bail!( - "candle provider currently supports only '{SUPPORTED_MODEL}' (or \ - the HF id '{HF_MODEL}'), but config.models.embedding.model = \ - '{want}'. Use provider=fastembed for other models, or set \ - model = \"{SUPPORTED_MODEL}\"." - ); - } + let spec = lookup_spec(want).ok_or_else(|| { + anyhow::anyhow!( + "candle provider supports the models `{}`, but \ + config.models.embedding.model = '{want}'. Use provider=fastembed \ + for other models, or pick a supported one.", + supported_models() + ) + })?; // 2. Resolve `{data_dir}/models/candle/` exactly like the fastembed // adapter resolves its own subdir. @@ -134,14 +223,15 @@ impl CandleEmbedder { tracing::info!( target: "kebab-embed-candle", cache_dir = %cache_dir.display(), - model = HF_MODEL, + model = spec.hf_repo, + pooling = ?spec.pooling, "loading candle embedding model (first run downloads ~2GB safetensors)" ); let api = hf_hub::api::sync::ApiBuilder::new() .with_cache_dir(cache_dir.clone()) .build() .context("kb-embed-candle: build hf-hub api")?; - let repo = api.model(HF_MODEL.to_string()); + let repo = api.model(spec.hf_repo.to_string()); let config_path = repo.get("config.json").context("download config.json")?; let tokenizer_path = repo .get("tokenizer.json") @@ -180,10 +270,21 @@ impl CandleEmbedder { })) .map_err(|e| anyhow::anyhow!("kb-embed-candle: set truncation: {e}"))?; + // model_version: fold the model tag in for non-e5 models so a switch + // triggers the embedding_version cascade; e5 keeps the bare + // config.version to stay interchangeable with fastembed-e5. 
+ let version = match spec.version_tag { + Some(tag) => { + EmbeddingVersion(format!("{}+{}", config.models.embedding.version, tag)) + } + None => EmbeddingVersion(config.models.embedding.version.clone()), + }; + tracing::info!( target: "kebab-embed-candle", dimensions = cfg.hidden_size, layers = cfg.num_hidden_layers, + model = spec.name, "candle embedding model loaded" ); @@ -191,16 +292,17 @@ impl CandleEmbedder { model: Mutex::new(model), tokenizer, device, + spec, model_id: EmbeddingModelId(config.models.embedding.model.clone()), - version: EmbeddingVersion(config.models.embedding.version.clone()), + version, dimensions: cfg.hidden_size, batch_size: config.models.embedding.batch_size.max(1), }) } - /// Embed one batch of **already-prefixed** strings (the e5 `query:`/ - /// `passage:` prefix is applied by the caller [`CandleEmbedder::embed`]) - /// through the candle pipeline: tokenize → forward → masked mean pool → L2. + /// Embed one batch of **already-prefixed** strings (the per-model prefix + /// is applied by the caller [`CandleEmbedder::embed`]) through the candle + /// pipeline: tokenize → forward → pool (mean|CLS) → L2. fn embed_batch(&self, prefixed: &[String]) -> Result>> { let encodings = self .tokenizer @@ -237,18 +339,30 @@ impl CandleEmbedder { guard.forward(&input_ids, &attn_f32, &token_type_ids, None, None, None)? }; - // attention-mask-weighted mean pooling - let mask3 = attn_f32.unsqueeze(2)?; // (b, seq, 1) - let summed = hidden.broadcast_mul(&mask3)?.sum(1)?; // (b, hidden) - // counts ≥ 1 always: every input is e5-prefixed AND special tokens are - // added (encode_batch(_, true)), so no row has an all-zero mask. If that - // invariant ever breaks, broadcast_div would emit NaN vectors. - let counts = mask3.sum(1)?; // (b, 1) - let mean = summed.broadcast_div(&counts)?; + // Pooling — per the model spec. 
+ let pooled = match self.spec.pooling { + Pooling::Mean => { + // attention-mask-weighted mean pooling + let mask3 = attn_f32.unsqueeze(2)?; // (b, seq, 1) + let summed = hidden.broadcast_mul(&mask3)?.sum(1)?; // (b, hidden) + // counts ≥ 1 always: every input is prefixed AND special + // tokens are added (encode_batch(_, true)), so no row has an + // all-zero mask. If that invariant ever breaks, broadcast_div + // would emit NaN vectors. + let counts = mask3.sum(1)?; // (b, 1) + summed.broadcast_div(&counts)? + } + Pooling::Cls => { + // CLS pooling: the first token's hidden state. arctic-embed + // v2.0 prepends `` (the XLM-R BOS/CLS) at index 0, so + // `hidden[:, 0, :]` is the sentence embedding. + hidden.narrow(1, 0, 1)?.squeeze(1)? // (b, hidden) + } + }; // L2 normalize - let norm = mean.sqr()?.sum_keepdim(1)?.sqrt()?; - let normalized = mean.broadcast_div(&norm)?; + let norm = pooled.sqr()?.sum_keepdim(1)?.sqrt()?; + let normalized = pooled.broadcast_div(&norm)?; // `.contiguous()` before host copy: broadcast ops can leave a strided // view, which `to_vec2` rejects on the Metal backend (CPU tolerates it). @@ -274,9 +388,9 @@ impl Embedder for CandleEmbedder { return Ok(Vec::new()); } - // e5 prefix per §11.3 BEFORE tokenization (same convention as - // FastembedEmbedder so the two backends produce comparable vectors). - let prefixed: Vec = inputs.iter().map(prefix_input).collect(); + // Per-model instruction prefix BEFORE tokenization (same convention as + // FastembedEmbedder for e5; arctic uses `query: `/no-prefix). + let prefixed: Vec = inputs.iter().map(|i| prefix_input(self.spec, i)).collect(); let mut out: Vec> = Vec::with_capacity(prefixed.len()); for chunk in prefixed.chunks(self.batch_size) { @@ -298,22 +412,22 @@ impl Embedder for CandleEmbedder { } } -/// Build the e5-prefixed string for one [`EmbeddingInput`]. Free function so -/// a unit test can pin the format without loading the model. 
Byte-identical to -/// `kebab-embed-local`'s `prefix_input` — the two backends MUST agree here or -/// their vectors diverge. -fn prefix_input(input: &EmbeddingInput<'_>) -> String { +/// Build the prefixed string for one [`EmbeddingInput`] using the model spec. +/// Free function so a unit test can pin the format without loading the model. +/// For e5 this is byte-identical to `kebab-embed-local`'s `prefix_input` — the +/// two backends MUST agree there or their vectors diverge. +fn prefix_input(spec: &EmbedModelSpec, input: &EmbeddingInput<'_>) -> String { match input.kind { - EmbeddingKind::Document => format!("passage: {}", input.text), - EmbeddingKind::Query => format!("query: {}", input.text), + EmbeddingKind::Document => format!("{}{}", spec.doc_prefix, input.text), + EmbeddingKind::Query => format!("{}{}", spec.query_prefix, input.text), } } /// Select the compute device. Built with the `metal` feature (Apple Silicon /// GPU), try Metal and fall back to CPU on failure; otherwise CPU. Metal only -/// compiles/runs on macOS — the Linux server builds the CPU path. e5-large -/// vectors are model-defined, so Metal-produced and CPU-produced embeddings are -/// cross-compatible (a Mac can ingest on GPU, the server query on CPU). +/// compiles/runs on macOS — the Linux server builds the CPU path. Embedding +/// vectors are model-defined, so Metal-produced and CPU-produced embeddings +/// are cross-compatible (a Mac can ingest on GPU, the server query on CPU). fn select_device() -> Device { #[cfg(feature = "metal")] { @@ -367,26 +481,85 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> { mod tests { use super::*; - // ── prefix_input ───────────────────────────────────────────────── - // Pin the exact e5 prefix strings; these MUST match - // kebab-embed-local::prefix_input or candle vs fastembed parity breaks. 
+ fn e5_spec() -> &'static EmbedModelSpec { + lookup_spec("multilingual-e5-large").expect("e5 in registry") + } + + fn arctic_spec() -> &'static EmbedModelSpec { + lookup_spec("snowflake-arctic-embed-l-v2.0").expect("arctic in registry") + } + + // ── registry ───────────────────────────────────────────────────── #[test] - fn prefix_document_uses_passage() { + fn registry_resolves_e5_by_name_and_hf_repo() { + assert_eq!( + lookup_spec("multilingual-e5-large").map(|s| s.name), + Some("multilingual-e5-large") + ); + assert_eq!( + lookup_spec("intfloat/multilingual-e5-large").map(|s| s.name), + Some("multilingual-e5-large") + ); + } + + #[test] + fn registry_resolves_arctic_and_its_pooling_is_cls() { + let s = arctic_spec(); + assert_eq!(s.name, "snowflake-arctic-embed-l-v2.0"); + assert_eq!(s.hf_repo, "Snowflake/snowflake-arctic-embed-l-v2.0"); + assert_eq!(s.pooling, Pooling::Cls); + assert_eq!(s.dim, 1024); + assert_eq!(s.version_tag, Some("arctic-cls")); + } + + #[test] + fn registry_e5_is_mean_pooling_no_version_tag() { + let s = e5_spec(); + assert_eq!(s.pooling, Pooling::Mean); + assert_eq!(s.version_tag, None); + } + + #[test] + fn registry_rejects_unknown_model() { + assert!(lookup_spec("multilingual-e5-small").is_none()); + } + + // ── prefix_input ───────────────────────────────────────────────── + // e5 prefixes MUST match kebab-embed-local::prefix_input or candle vs + // fastembed parity breaks; arctic uses query-only prefixing. 
+ + #[test] + fn e5_prefix_document_uses_passage() { let input = EmbeddingInput { text: "hello world", kind: EmbeddingKind::Document, }; - assert_eq!(prefix_input(&input), "passage: hello world"); + assert_eq!(prefix_input(e5_spec(), &input), "passage: hello world"); } #[test] - fn prefix_query_uses_query() { + fn e5_prefix_query_uses_query() { let input = EmbeddingInput { text: "hello world", kind: EmbeddingKind::Query, }; - assert_eq!(prefix_input(&input), "query: hello world"); + assert_eq!(prefix_input(e5_spec(), &input), "query: hello world"); + } + + #[test] + fn arctic_prefix_query_uses_query_doc_is_bare() { + let doc = EmbeddingInput { + text: "후입선출 자료구조", + kind: EmbeddingKind::Document, + }; + let qry = EmbeddingInput { + text: "스택 자료구조", + kind: EmbeddingKind::Query, + }; + // arctic: documents are embedded raw, queries get `query: `. + assert_eq!(prefix_input(arctic_spec(), &doc), "후입선출 자료구조"); + assert_eq!(prefix_input(arctic_spec(), &qry), "query: 스택 자료구조"); } #[test] @@ -399,8 +572,10 @@ mod tests { text: "", kind: EmbeddingKind::Query, }; - assert_eq!(prefix_input(&doc), "passage: "); - assert_eq!(prefix_input(&qry), "query: "); + assert_eq!(prefix_input(e5_spec(), &doc), "passage: "); + assert_eq!(prefix_input(e5_spec(), &qry), "query: "); + assert_eq!(prefix_input(arctic_spec(), &doc), ""); + assert_eq!(prefix_input(arctic_spec(), &qry), "query: "); } // ── check_dim ──────────────────────────────────────────────────── @@ -421,9 +596,9 @@ mod tests { } // ── model guard ────────────────────────────────────────────────── - // A non-e5-large model name must fail fast (BEFORE the ~2GB download), - // so we never download e5-large yet label its vectors with another name - // via model_id() — which would mislabel embedding_version. + // A model name not in the registry must fail fast (BEFORE the ~2GB + // download), so we never download one model yet label its vectors with + // another name via model_id() — which would mislabel embedding_version. 
#[test] fn new_rejects_unsupported_model() { @@ -437,8 +612,8 @@ mod tests { .expect("unsupported model must error"); let msg = format!("{err:#}"); assert!( - msg.contains("candle provider currently supports only"), - "expected model-guard error, got: {msg}" + msg.contains("candle provider supports the models"), + "expected model-registry error, got: {msg}" ); } } diff --git a/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs b/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs new file mode 100644 index 0000000..ccc3504 --- /dev/null +++ b/crates/kebab-embed-candle/tests/arctic_ollama_parity.rs @@ -0,0 +1,128 @@ +//! arctic-embed-l-v2.0 correctness gate (`#[ignore]` — needs the ~2GB candle +//! model + a live Ollama serving `snowflake-arctic-embed2`). +//! +//! This is the load-bearing pooling/prefix check for the arctic integration. +//! The recall measurement that justified adopting arctic (recall@10 130/132) +//! went through Ollama's `snowflake-arctic-embed2`. The candle path +//! re-implements the model (XLM-RoBERTa-large + **CLS** pooling + `query: ` on +//! queries / no prefix on documents). If candle's pooling or prefix is wrong, +//! its vectors silently diverge from the measured route and the 130 number +//! does NOT carry over. This test pins them together: per-sentence cosine +//! between the candle vector and the Ollama vector must be **> 0.99**. +//! +//! `#[ignore]` because it depends on an external Ollama daemon (CI is +//! headless/offline). The leader MUST run it once before merge. +//! +//! ## Manual run +//! +//! 1. Confirm Ollama is reachable and has the model: +//! ```sh +//! curl -s http://192.168.0.47:11434/api/tags # should list snowflake-arctic-embed2 +//! ``` +//! 2. Run (downloads the ~2GB candle safetensors on first run): +//! ```sh +//! CARGO_TARGET_DIR=/build/out/cargo-target \ +//! KEBAB_ARCTIC_OLLAMA_ENDPOINT=http://192.168.0.47:11434 \ +//! 
cargo test -p kebab-embed-candle --test arctic_ollama_parity -- --ignored --nocapture +//! ``` +//! The endpoint defaults to `http://192.168.0.47:11434` if the env is unset. +//! +//! Record the printed `ARCTIC_PARITY_SUMMARY cosine_min=...` in +//! `/tmp/arctic-result.md` + `tasks/HOTFIXES.md`. + +use kebab_config::Config; +use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind}; +use kebab_embed_candle::CandleEmbedder; +use kebab_embed_ollama::OllamaEmbedder; + +const DOGFOOD_CONFIG: &str = "/build/dogfood/config.toml"; +const DEFAULT_OLLAMA_ENDPOINT: &str = "http://192.168.0.47:11434"; + +/// Mixed Korean / English + the descriptive-recall shapes arctic was adopted +/// for (synonym / abbreviation / English term). Covers both prefix paths. +const SENTENCES: &[&str] = &[ + "스택 자료구조", + "후입선출 방식으로 동작하는 자료구조", + "큐는 선입선출 자료구조이다", + "Rust ownership and the borrow checker", + "소유권과 빌림 검사기는 메모리 안전성을 보장한다", + "SVM 은 support vector machine 의 약자이다", + "정렬 알고리즘의 시간 복잡도", + "The capital of France is Paris.", +]; + +fn cosine(a: &[f32], b: &[f32]) -> f32 { + let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + dot / (na * nb) +} + +/// Base config: prefer the canonical dogfood config (for storage/cache roots), +/// fall back to `Config::defaults()` so the test still runs on a bare clone. +fn base_config() -> Config { + Config::load(Some(std::path::Path::new(DOGFOOD_CONFIG))).unwrap_or_else(|_| Config::defaults()) +} + +#[test] +#[ignore = "needs ~2GB candle model + live Ollama (snowflake-arctic-embed2); run manually before merge"] +fn candle_arctic_matches_ollama_arctic() { + let endpoint = std::env::var("KEBAB_ARCTIC_OLLAMA_ENDPOINT") + .unwrap_or_else(|_| DEFAULT_OLLAMA_ENDPOINT.to_string()); + + // candle side: the in-process arctic model. 
+ let mut candle_cfg = base_config(); + candle_cfg.models.embedding.provider = "candle".to_string(); + candle_cfg.models.embedding.model = "snowflake-arctic-embed-l-v2.0".to_string(); + candle_cfg.models.embedding.dimensions = 1024; + + // Ollama side: the reference route the recall numbers came from. + let mut ollama_cfg = base_config(); + ollama_cfg.models.embedding.provider = "ollama".to_string(); + ollama_cfg.models.embedding.model = "snowflake-arctic-embed2".to_string(); + ollama_cfg.models.embedding.dimensions = 1024; + ollama_cfg.models.embedding.endpoint = Some(endpoint.clone()); + + let candle = CandleEmbedder::new(&candle_cfg).expect("build candle arctic embedder"); + let ollama = OllamaEmbedder::new(&ollama_cfg).expect("build ollama arctic embedder"); + + // Exercise BOTH prefix paths so a query-side divergence can't hide. + let inputs: Vec = SENTENCES + .iter() + .flat_map(|s| { + [EmbeddingKind::Document, EmbeddingKind::Query] + .into_iter() + .map(move |kind| EmbeddingInput { text: s, kind }) + }) + .collect(); + + let cv = candle.embed(&inputs).expect("candle embed"); + let ov = ollama + .embed(&inputs) + .expect("ollama embed (is snowflake-arctic-embed2 pulled @ the endpoint?)"); + + assert_eq!(cv.len(), ov.len(), "embedding counts must match"); + assert_eq!(cv.len(), inputs.len(), "one vector per input"); + assert_eq!(candle.dimensions(), 1024); + + let mut min_cos = f32::INFINITY; + for (i, inp) in inputs.iter().enumerate() { + assert_eq!(cv[i].len(), 1024, "candle dim"); + assert_eq!(ov[i].len(), 1024, "ollama dim"); + let c = cosine(&cv[i], &ov[i]); + min_cos = min_cos.min(c); + let kind = match inp.kind { + EmbeddingKind::Document => "doc", + EmbeddingKind::Query => "qry", + }; + let preview: String = inp.text.chars().take(36).collect(); + println!("[{i:>2}] {kind} cos={c:.6} {preview}"); + } + + println!("ARCTIC_PARITY_SUMMARY cosine_min={min_cos:.6} endpoint={endpoint}"); + assert!( + min_cos > 0.99, + "candle arctic vs Ollama arctic 
cosine_min={min_cos:.6} ≤ 0.99 — \ + pooling/prefix mismatch; the recall=130 measurement will NOT reproduce" + ); +} diff --git a/crates/kebab-embed-ollama/Cargo.toml b/crates/kebab-embed-ollama/Cargo.toml new file mode 100644 index 0000000..9a90c9a --- /dev/null +++ b/crates/kebab-embed-ollama/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kebab-embed-ollama" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Ollama HTTP adapter implementing kebab_core::Embedder (POST /api/embed, L2-normalized, batched + fail-soft)" + +[dependencies] +kebab-core = { path = "../kebab-core" } +kebab-config = { path = "../kebab-config" } +# `default-features = false` drops native-tls (system OpenSSL); we pin rustls. +# reqwest 0.12's `blocking` feature wraps a private current-thread tokio +# runtime — this crate exposes NO async surface (no `async`/`await`/`tokio::*` +# symbols), matching the kebab-llm-local invariant. +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +tracing = { workspace = true } +anyhow = { workspace = true } + +[dev-dependencies] +# wiremock hosts the mock /api/embed server (needs a tokio runtime); tokio is +# also pulled transitively at runtime by reqwest's `blocking` feature. +wiremock = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt"] } + +[lints] +workspace = true diff --git a/crates/kebab-embed-ollama/src/lib.rs b/crates/kebab-embed-ollama/src/lib.rs new file mode 100644 index 0000000..afc9e06 --- /dev/null +++ b/crates/kebab-embed-ollama/src/lib.rs @@ -0,0 +1,310 @@ +//! `kebab-embed-ollama` — [`OllamaEmbedder`], a `reqwest::blocking` adapter +//! implementing [`Embedder`](kebab_core::Embedder) over Ollama's +//! `POST /api/embed` endpoint. +//! +//! 
## Why this exists
+//!
+//! The candle backend ([`kebab-embed-candle`]) runs arctic-embed-l-v2.0
+//! in-process (pure Rust, NUMA-safe). This crate is the **fallback** path:
+//! it offloads embedding to a local/remote Ollama daemon (`snowflake-arctic-embed2`),
+//! which is exactly the route the recall measurements used — so it reproduces
+//! the measured numbers (recall@10 130/132) over the very same route. Opt-in via
+//! `config.models.embedding.provider = "ollama"`.
+//!
+//! ## Wire shape
+//!
+//! Request (`POST {endpoint}/api/embed`):
+//!
+//! ```json
+//! { "model": "snowflake-arctic-embed2", "input": ["query: 스택", "후입선출 ..."] }
+//! ```
+//!
+//! Response:
+//!
+//! ```json
+//! { "model": "...", "embeddings": [[0.01, ...], [0.02, ...]] }
+//! ```
+//!
+//! ## Pipeline
+//!
+//! 1. instruction prefix per model ([`prefixes_for`] — arctic: `query: ` on
+//!    queries, no prefix on documents; e5: `query: `/`passage: `);
+//! 2. batch into `BATCH` (48) inputs per request;
+//! 3. `POST /api/embed`, with fail-soft retry (`MAX_RETRIES`);
+//! 4. **L2 normalize** each returned vector — a safety net for cosine
+//!    consistency with the candle path (Ollama's vectors were already
+//!    unit-norm in the 2026-06-03 parity run, so this is normally an
+//!    idempotent no-op);
+//! 5. embedding-count and dim check against
+//!    `config.models.embedding.dimensions` (each vector is checked before it
+//!    is normalized).
+//!
+//! ## Send-safety
+//!
+//! `reqwest::blocking::Client: Send + Sync`; the adapter holds only the client,
+//! an endpoint string, and small config scalars, so it is trivially `Send + Sync`
+//! as the [`Embedder`] trait requires.
+
+use std::time::Duration;
+
+use anyhow::{Context, Result};
+use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
+use serde::{Deserialize, Serialize};
+
+/// Inputs per `/api/embed` request. Ollama handles arbitrary batch sizes, but
+/// a cap keeps a single HTTP body bounded and lets a partial failure retry a
+/// smaller unit.
+const BATCH: usize = 48;
+
+/// Fail-soft retry attempts per batch before the error propagates.
Cold model
+/// load on the Ollama side can transiently 500/timeout; a couple of retries
+/// smooth that over without masking a hard misconfiguration. The value is the
+/// total number of attempts (the retry loop runs `1..=MAX_RETRIES`).
+const MAX_RETRIES: u32 = 3;
+
+/// Default per-request HTTP timeout (seconds). Cold-loading an embedding model
+/// on first call can take tens of seconds; this matches the generous default
+/// used by the LLM adapter.
+const REQUEST_TIMEOUT_SECS: u64 = 300;
+
+/// Resolve the (query_prefix, doc_prefix) for an Ollama embedding model tag.
+///
+/// Mirrors `kebab-embed-candle`'s `MODEL_REGISTRY`, but keyed on the **Ollama
+/// model tag** (which differs from the HF id — e.g. `snowflake-arctic-embed2`
+/// vs `Snowflake/snowflake-arctic-embed-l-v2.0`). Kept here rather than shared
+/// so this crate does not depend on the candle backend.
+///
+/// Matching is a case-insensitive substring test, so a tag with a `:latest`
+/// style suffix resolves the same way as the bare tag.
+///
+/// An unrecognized model gets no prefix (`("", "")`): many embedding models
+/// are not instruction-tuned, so embedding the raw text is the correct default.
+/// Caveat: a misspelled known model also lands here silently — the dim check
+/// still passes, so the missing prefix only shows up as a recall regression.
+fn prefixes_for(model: &str) -> (&'static str, &'static str) {
+    let m = model.to_ascii_lowercase();
+    if m.contains("arctic-embed") {
+        // arctic-embed v2.0: `query: ` on queries, documents embedded raw.
+        ("query: ", "")
+    } else if m.contains("e5") {
+        // multilingual-e5: `query: ` / `passage: `.
+        // NOTE(review): bare "e5" is a broad substring — any tag that happens
+        // to contain it gets e5 prefixes; confirm that is intended.
+        ("query: ", "passage: ")
+    } else {
+        ("", "")
+    }
+}
+
+/// `reqwest::blocking` adapter implementing [`Embedder`] over Ollama's
+/// `/api/embed`. Construction is offline; the first network call happens in
+/// [`Embedder::embed`].
+pub struct OllamaEmbedder {
+    client: reqwest::blocking::Client,
+    /// Non-empty endpoint base (e.g. `"http://127.0.0.1:11434"`). Only
+    /// emptiness is checked at construction; a trailing `/` is trimmed when
+    /// the request URL is built.
+    endpoint: String,
+    /// Ollama model tag (e.g. `"snowflake-arctic-embed2"`).
+    model: String,
+    query_prefix: &'static str,
+    doc_prefix: &'static str,
+    model_id: EmbeddingModelId,
+    version: EmbeddingVersion,
+    dimensions: usize,
+}
+
+impl OllamaEmbedder {
+    /// Build from a workspace [`kebab_config::Config`]. Reads
+    /// `config.models.embedding.{model, dimensions}` and resolves the endpoint
+    /// as `models.embedding.endpoint` → fallback `models.llm.endpoint`.
+    ///
+    /// Does NOT touch the network. The caller (app layer) is expected to have
+    /// validated `provider == "ollama"`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when both endpoints are empty, or when the HTTP client cannot be
+    /// built.
+    pub fn new(config: &kebab_config::Config) -> Result<Self> {
+        let emb = &config.models.embedding;
+        // An explicitly empty embedding endpoint counts as "unset" and falls
+        // through to the LLM endpoint.
+        let endpoint = emb
+            .endpoint
+            .clone()
+            .filter(|e| !e.is_empty())
+            .unwrap_or_else(|| config.models.llm.endpoint.clone());
+        if endpoint.is_empty() {
+            anyhow::bail!(
+                "ollama embedding provider needs an endpoint: set \
+                 `models.embedding.endpoint` (or `models.llm.endpoint`)"
+            );
+        }
+        let client = reqwest::blocking::Client::builder()
+            .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
+            .build()
+            .context("kb-embed-ollama: build reqwest client")?;
+        let (query_prefix, doc_prefix) = prefixes_for(&emb.model);
+        Ok(Self {
+            client,
+            endpoint,
+            model: emb.model.clone(),
+            query_prefix,
+            doc_prefix,
+            model_id: EmbeddingModelId(emb.model.clone()),
+            // model_version = `ollama:{model}` so a provider/model switch
+            // triggers the embedding_version cascade and never collides with
+            // the candle path's version string for the same model.
+            version: EmbeddingVersion(format!("ollama:{}", emb.model)),
+            dimensions: emb.dimensions,
+        })
+    }
+
+    /// Embed one already-prefixed batch via `/api/embed`, with fail-soft retry.
+    fn embed_batch(&self, prefixed: &[String]) -> Result<Vec<Vec<f32>>> {
+        // Tolerate a trailing `/` on the configured endpoint.
+        let url = format!("{}/api/embed", self.endpoint.trim_end_matches('/'));
+        let body = EmbedRequest {
+            model: &self.model,
+            input: prefixed,
+        };
+
+        let mut last_err: Option<anyhow::Error> = None;
+        for attempt in 1..=MAX_RETRIES {
+            match self.try_once(&url, &body) {
+                // Only transport / HTTP-status / decode failures are retried;
+                // a count or dim mismatch from `finalize` propagates at once.
+                Ok(resp) => return self.finalize(resp, prefixed.len()),
+                Err(e) => {
+                    // False on the final attempt, so the log never claims a
+                    // retry that will not happen.
+                    let will_retry = attempt < MAX_RETRIES;
+                    tracing::warn!(
+                        target: "kebab-embed-ollama",
+                        attempt,
+                        max = MAX_RETRIES,
+                        will_retry,
+                        error = %e,
+                        "ollama /api/embed attempt failed"
+                    );
+                    last_err = Some(e);
+                }
+            }
+        }
+        // `last_err` is `None` only if the loop never ran (MAX_RETRIES == 0).
+        Err(last_err.unwrap_or_else(|| {
+            anyhow::anyhow!("kb-embed-ollama: all {MAX_RETRIES} attempts failed")
+        }))
+    }
+
+    /// One HTTP round-trip. Network / non-2xx / decode errors all map to
+    /// `Err` so the retry loop can decide.
+    fn try_once(&self, url: &str, body: &EmbedRequest<'_>) -> Result<EmbedResponse> {
+        let resp = self
+            .client
+            .post(url)
+            .json(body)
+            .send()
+            .with_context(|| format!("kb-embed-ollama: POST {url}"))?;
+        let status = resp.status();
+        if !status.is_success() {
+            // Best effort: include the server's error body, or an empty
+            // string if it cannot be read.
+            let text = resp.text().unwrap_or_default();
+            anyhow::bail!("kb-embed-ollama: /api/embed returned {status}: {text}");
+        }
+        resp.json::<EmbedResponse>()
+            .context("kb-embed-ollama: decode /api/embed response")
+    }
+
+    /// Validate count + dim, then L2-normalize each vector.
+    fn finalize(&self, resp: EmbedResponse, expected: usize) -> Result<Vec<Vec<f32>>> {
+        // The server must return exactly one embedding per input sent.
+        if resp.embeddings.len() != expected {
+            anyhow::bail!(
+                "kb-embed-ollama: expected {expected} embeddings, got {}",
+                resp.embeddings.len()
+            );
+        }
+        let mut out = Vec::with_capacity(resp.embeddings.len());
+        for v in resp.embeddings {
+            // Dim is checked per vector, before normalization.
+            if v.len() != self.dimensions {
+                anyhow::bail!(
+                    "kb-embed-ollama: model returned dim {} but config expects {} \
+                     (check models.embedding.dimensions vs the Ollama model)",
+                    v.len(),
+                    self.dimensions
+                );
+            }
+            out.push(l2_normalize(v));
+        }
+        Ok(out)
+    }
+}
+
+impl Embedder for OllamaEmbedder {
+    fn model_id(&self) -> EmbeddingModelId {
+        self.model_id.clone()
+    }
+
+    fn model_version(&self) -> EmbeddingVersion {
+        self.version.clone()
+    }
+
+    fn dimensions(&self) -> usize {
+        self.dimensions
+    }
+
+    fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> Result<Vec<Vec<f32>>> {
+        // Empty input short-circuits before any network call.
+        if inputs.is_empty() {
+            return Ok(Vec::new());
+        }
+        let prefixed: Vec<String> = inputs.iter().map(|i| self.prefix(i)).collect();
+        let mut out = Vec::with_capacity(prefixed.len());
+        // One request per `BATCH`-sized chunk; the first failing chunk aborts
+        // the whole call.
+        for chunk in prefixed.chunks(BATCH) {
+            out.extend(self.embed_batch(chunk)?);
+        }
+        debug_assert_eq!(out.len(), inputs.len());
+        Ok(out)
+    }
+}
+
+impl OllamaEmbedder {
+    /// Prefix one input per the resolved model prefixes.
+    fn prefix(&self, input: &EmbeddingInput<'_>) -> String {
+        match input.kind {
+            EmbeddingKind::Document => format!("{}{}", self.doc_prefix, input.text),
+            EmbeddingKind::Query => format!("{}{}", self.query_prefix, input.text),
+        }
+    }
+}
+
+/// L2-normalize a vector in place-ish (consumes + returns). A zero vector is
+/// returned unchanged (norm 0 → no division) so a degenerate embedding can
+/// never produce NaNs.
+fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
+    let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
+    // Skip the division for a zero norm; the vector is returned as-is.
+    if norm > 0.0 {
+        for x in &mut v {
+            *x /= norm;
+        }
+    }
+    v
+}
+
+// ── Wire types ──────────────────────────────────────────────────────────────
+
+/// JSON request body for `POST /api/embed`; borrows the model tag and the
+/// already-prefixed inputs.
+#[derive(Serialize)]
+struct EmbedRequest<'a> {
+    model: &'a str,
+    input: &'a [String],
+}
+
+/// JSON response body; only the `embeddings` field is read.
+#[derive(Deserialize)]
+struct EmbedResponse {
+    embeddings: Vec<Vec<f32>>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn prefixes_for_arctic_is_query_only() {
+        assert_eq!(prefixes_for("snowflake-arctic-embed2"), ("query: ", ""));
+        assert_eq!(prefixes_for("snowflake-arctic-embed2:latest"), ("query: ", ""));
+    }
+
+    #[test]
+    fn prefixes_for_e5_is_query_passage() {
+        assert_eq!(prefixes_for("multilingual-e5-large"), ("query: ", "passage: "));
+    }
+
+    #[test]
+    fn prefixes_for_unknown_is_bare() {
+        assert_eq!(prefixes_for("nomic-embed-text"), ("", ""));
+    }
+
+    #[test]
+    fn l2_normalize_unit_length() {
+        let v = l2_normalize(vec![3.0, 4.0]);
+        let norm = (v[0] * v[0] + v[1] * v[1]).sqrt();
+        assert!((norm - 1.0).abs() < 1e-6, "norm = {norm}");
+    }
+
+    #[test]
+    fn l2_normalize_zero_vector_is_unchanged() {
+        assert_eq!(l2_normalize(vec![0.0, 0.0, 0.0]), vec![0.0, 0.0, 0.0]);
+    }
+}
diff --git a/crates/kebab-embed-ollama/tests/embed_mock.rs b/crates/kebab-embed-ollama/tests/embed_mock.rs
new file mode 100644
index 0000000..52a4c79
--- /dev/null
+++ b/crates/kebab-embed-ollama/tests/embed_mock.rs
@@ -0,0 +1,99 @@
+//! `/api/embed` behavior against a `wiremock`-hosted mock server.
+//!
+//! `wiremock` is async, so the tests are `#[tokio::test]`; the sync
+//! [`OllamaEmbedder`] is driven from `spawn_blocking` to keep `reqwest::blocking`
+//! off the async runtime (same pattern as `kebab-llm-local`'s streaming tests).
+//! tokio is a `dev-dependency` only.
+ +use kebab_config::Config; +use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind}; +use kebab_embed_ollama::OllamaEmbedder; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +/// Config pointing at the mock server, with a small dim so the mock body is +/// tiny. `model` is an arctic tag so prefix resolution is exercised. +fn cfg_for(endpoint: &str, dim: usize) -> Config { + let mut cfg = Config::defaults(); + cfg.models.embedding.provider = "ollama".to_string(); + cfg.models.embedding.model = "snowflake-arctic-embed2".to_string(); + cfg.models.embedding.dimensions = dim; + cfg.models.embedding.endpoint = Some(endpoint.to_string()); + cfg +} + +async fn embed_blocking( + cfg: Config, + inputs: Vec<(String, EmbeddingKind)>, +) -> anyhow::Result>> { + tokio::task::spawn_blocking(move || -> anyhow::Result>> { + let emb = OllamaEmbedder::new(&cfg)?; + let refs: Vec> = inputs + .iter() + .map(|(t, k)| EmbeddingInput { text: t, kind: *k }) + .collect(); + emb.embed(&refs) + }) + .await + .expect("blocking task panicked") +} + +#[tokio::test] +async fn embed_returns_l2_normalized_vectors() { + let server = MockServer::start().await; + // Two raw (un-normalized) vectors of dim 2; the adapter must L2-normalize. + let body = r#"{"model":"snowflake-arctic-embed2","embeddings":[[3.0,4.0],[0.0,5.0]]}"#; + Mock::given(method("POST")) + .and(path("/api/embed")) + .respond_with(ResponseTemplate::new(200).set_body_string(body)) + .mount(&server) + .await; + + let out = embed_blocking( + cfg_for(&server.uri(), 2), + vec![ + ("스택 자료구조".to_string(), EmbeddingKind::Query), + ("후입선출".to_string(), EmbeddingKind::Document), + ], + ) + .await + .expect("embed should succeed"); + + assert_eq!(out.len(), 2); + for v in &out { + let norm = v.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-5, "expected unit norm, got {norm}"); + } + // [3,4] → [0.6, 0.8]. 
+ assert!((out[0][0] - 0.6).abs() < 1e-5 && (out[0][1] - 0.8).abs() < 1e-5); +} + +#[tokio::test] +async fn embed_rejects_dim_mismatch() { + let server = MockServer::start().await; + // Server returns dim 3, config expects dim 2 → hard error. + let body = r#"{"model":"snowflake-arctic-embed2","embeddings":[[1.0,2.0,3.0]]}"#; + Mock::given(method("POST")) + .and(path("/api/embed")) + .respond_with(ResponseTemplate::new(200).set_body_string(body)) + .mount(&server) + .await; + + let err = embed_blocking( + cfg_for(&server.uri(), 2), + vec![("q".to_string(), EmbeddingKind::Query)], + ) + .await + .expect_err("dim mismatch must error"); + let msg = format!("{err:#}"); + assert!(msg.contains("dim"), "expected dim error, got: {msg}"); +} + +#[tokio::test] +async fn embed_empty_input_is_noop() { + // No mock needed — empty input must never hit the network. + let out = embed_blocking(cfg_for("http://127.0.0.1:1", 2), vec![]) + .await + .expect("empty embed should be Ok(empty)"); + assert!(out.is_empty()); +} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 4bc23dc..7b5b70e 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -16,7 +16,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab- | metadata | SQLite + FTS5 (lexical search + v0.20.1 한국어 형태소 tokenizer via lindera-ko-dic) | | vector | LanceDB (embedded, model 별 분리 table) | | Markdown parser | `pulldown-cmark`. frontmatter 에 title 없으면 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (`parser_version = md-frontmatter-v2`, 기존 doc 도 다음 ingest 에서 갱신) | -| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드) | +| embedding | `fastembed-rs` (`multilingual-e5-large`, 1024d, v0.18.0부터 default 업그레이드). opt-in 대안: candle (e5 또는 `snowflake-arctic-embed-l-v2.0`) / Ollama `/api/embed`. 
arctic = 설명형 query recall 보강 (v0.26.0, 아래 결정표) | | 한국어 형태소분석 | `lindera-ko-dic` (FTS5 외부 tokenizer, v0.20.1) — 2자 이상 한국어 query 지원 | | LLM | Ollama HTTP (default `gemma4:e4b` ─ OCR / caption 와 family 통일. 사용자가 더 큰 variant `gemma4:26b` 등으로 override 가능) | | 음성 ASR | `whisper.cpp` (via `whisper-rs`) — P8 보류, 시스템 dep brainstorm 후 | @@ -67,7 +67,8 @@ flowchart TB subgraph Adapters ["traits + adapters"] embed["kebab-embed
(trait)"] embedlocal["kebab-embed-local
(fastembed, default)"] - embedcandle["kebab-embed-candle
(candle, NUMA-safe opt-in)"] + embedcandle["kebab-embed-candle
(candle, e5+arctic, NUMA-safe opt-in)"] + embedollama["kebab-embed-ollama
(Ollama /api/embed, opt-in)"] llm["kebab-llm
(trait)"] llmlocal["kebab-llm-local
(Ollama)"] search["kebab-search"] @@ -94,6 +95,7 @@ flowchart TB app --> vector app --> embedlocal app --> embedcandle + app --> embedollama app --> llmlocal app --> search app --> rag @@ -108,6 +110,8 @@ flowchart TB embedlocal --> embed embedcandle --> core embedcandle --> config + embedollama --> core + embedollama --> config llmlocal --> llm rag --> search rag --> llm @@ -136,6 +140,23 @@ UI → store/llm/parse 직접 의존 금지. 모든 user-facing 진입은 `kebab `kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가, P10-1C-Go 에서 `tree-sitter-go` 추가, P10-1C-JK 에서 `tree-sitter-java` / `tree-sitter-kotlin-ng` 추가, P10-1D 에서 `tree-sitter-c` / `tree-sitter-cpp` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). Kotlin 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착 — 사용 불가). v0.18.0+ 부터 `kebab-source-fs` 는 자체 `code_meta` 모듈 (lang detect + skip helpers + BUILTIN_BLACKLIST) 을 보유, kebab-parse-code 와 분리 (refactor 2026-05-26). v0.19.0 부터 `kebab-parse-md` 가 `kebab-parse-types` (parser intermediate types) + `kebab-normalize` (CanonicalDocument lift) 두 crate 를 흡수 — 24 → 22 crates, design §3.7b 재작성 (HOTFIXES 2026-05-26). v0.20.1 부터 `kebab-search` 가 `lindera-ko-dic` 를 의존해 한국어 FTS5 형태소 tokenizer 지원 — V009 migration 으로 2자 이상 한국어 query 매칭 (Bug #8 closure). +### 임베딩 백엔드 결정표 (v0.26.0) + +| provider | 모델 | pooling / prefix | 위치 | 언제 | +|---|---|---|---|---| +| `fastembed` (기본) | `multilingual-e5-large` | mean / `query:`·`passage:` | in-process (onnxruntime) | 기본. 
단일 소켓 호스트 | +| `candle` | e5 또는 `snowflake-arctic-embed-l-v2.0` | 모델별 (e5=mean, arctic=CLS) / arctic=`query:`·무접두어 | in-process (pure Rust) | NUMA 서버 (onnxruntime 48-스레드 double-free 회피), Apple Silicon Metal GPU | +| `ollama` | `snowflake-arctic-embed2` 등 | 모델 태그로 추론 / arctic=`query:`·무접두어 | 원격 HTTP (`/api/embed`) | candle 폴백, 측정에 쓴 경로 그대로 재현 | + +**arctic-embed-l-v2.0 채택 근거**: 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형 +query 의 recall 보강책. 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`)에서 +arctic = recall@10 130/132 (e5 대비 +7, 색인 1회·per-query 0·LLM 0, 용어 무손실). +candle 이 주 백엔드(in-process, NUMA 안전), Ollama 가 폴백(측정 경로 재현). 두 경로의 +pooling/prefix 정확성은 `kebab-embed-candle/tests/arctic_ollama_parity.rs` +(candle arctic vs Ollama arctic 코사인>0.99, `#[ignore]`) 로 고정. e5 → arctic 전환은 +`embedding_version` cascade (모델별 벡터 상이) → 재색인 필요. 기본값 e5 유지라 기존 +사용자 무영향. 자세한 내용: [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 2026-06-03 arctic entry. + ## 디렉토리 구조 ```text @@ -184,7 +205,8 @@ kebab/ │ ├── kebab-store-sqlite/ # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3). 
src/derivation_cache.rs = derivation_cache 테이블 저장소 (V012, v0.21.0) │ ├── kebab-search/ # Lexical + Vector + Hybrid retriever (P2-2, P3-4) │ ├── kebab-embed/ kebab-embed-local/ # Embedder trait + fastembed adapter (P3-1, P3-2) -│ ├── kebab-embed-candle/ # candle (pure-Rust) Embedder, NUMA-safe opt-in provider=candle (Track 1, v0.22.0) +│ ├── kebab-embed-candle/ # candle (pure-Rust) Embedder, 모델 레지스트리(e5 mean + arctic CLS), NUMA-safe opt-in provider=candle (Track 1, v0.22.0; arctic v0.26.0) +│ ├── kebab-embed-ollama/ # Ollama /api/embed Embedder, opt-in provider=ollama (arctic 폴백 경로, v0.26.0) │ ├── kebab-store-vector/ # LanceDB VectorStore (P3-3, P7-3 follow-up) │ ├── kebab-llm/ kebab-llm-local/ # LanguageModel trait + Ollama adapter (P4-1, P4-2) │ ├── kebab-rag/ # RAG pipeline (P4-3) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index c5124ff..941e105 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -107,16 +107,18 @@ respect_markdown_headings = true chunker_version = "md-heading-v1" [models.embedding] -provider = "fastembed" # "fastembed"(기본) / "candle"(순수 Rust, NUMA-안전) - # / "none"(lexical-only — Ollama 불필요) - # ⚠ provider="candle" 사용 시 아래 model/dimensions 도 - # multilingual-e5-large / 1024 로 바꿔야 함 - # (candle 은 현재 e5-large 만 지원). -model = "multilingual-e5-small" +provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust, NUMA-안전) + # / "ollama"(원격 HTTP /api/embed) / "none"(lexical-only — Ollama 불필요) + # ⚠ provider/model 변경 시 아래 dimensions 도 맞춰야 함. +model = "multilingual-e5-small" # candle/ollama 는 "snowflake-arctic-embed-l-v2.0" + # (ollama 태그 "snowflake-arctic-embed2", 1024-dim) 도 지원 — + # 설명형 query recall 보강. e5↔arctic 전환은 + # embedding_version cascade (재색인 필요). version = "v1" -dimensions = 384 +dimensions = 384 # arctic / e5-large 는 1024. batch_size = 64 num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto). env KEBAB_EMBED_THREADS 우선. +# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용; 생략 시 [models.llm].endpoint fallback. 
[models.llm] provider = "ollama" diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 151338f..c594f27 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,56 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-06-03 — arctic-embed-l-v2.0 임베더 통합 (candle + Ollama) (v0.26.0) + +**무엇을 왜 추가했나.** 별칭(doc-side expansion) 제거(v0.25.0) 후 설명형 query 의 +recall 보강책으로 `snowflake-arctic-embed-l-v2.0` 임베더를 두 백엔드로 통합했다. +근거는 방법별 측정(`/build/dogfood/logs/2026-06-03-method-measurements.md`): +arctic = recall@10 **130/132**, recall@50 **132/132**, **용어 무손실**(syn/abbr/en +유지). e5-large 대비 +7, 색인 1회·per-query 0·LLM 0 = 살아있는 KB 에 지속 가능. +별칭이 청크당 색인-시 LLM(나무위키 18문서 cold 2.5h)을 요구한 것과 대조. + +**무엇을 건드렸나.** +- `kebab-embed-candle`: e5 하드코딩(`HF_MODEL`/`SUPPORTED_MODEL`/mean/`query:`+`passage:`)을 + **모델 레지스트리**(`MODEL_REGISTRY`: `EmbedModelSpec { name, hf_repo, pooling, query_prefix, doc_prefix, dim, version_tag }`)로 + 일반화. e5(mean, `query:`/`passage:`) + arctic(**CLS**, `query:`/무접두어). pooling + 은 모델별 분기(mean=attention-mask-weighted / CLS=`hidden[:,0,:]`), tokenize/forward/L2 + 공유. arctic pooling=CLS 는 HF `1_Pooling/config.json`(`pooling_mode_cls_token:true`)로 + 확인. `model_version` 은 arctic 일 때 `+arctic-cls` 태그(switch 시 embedding_version + cascade 트리거); e5 는 fastembed-e5 와의 호환(NUMA 드롭인) 위해 plain `config.version` 유지. +- `kebab-embed-ollama` (신규 크레이트): `Embedder` 구현, `reqwest::blocking` POST + `/api/embed` `{model, input:[...]}` → `embeddings`. batch 48 + fail-soft 재시도 3, + 결과 **L2 정규화**(Ollama raw 반환), dim 검증, query/doc prefix 모델 태그로 추론 + (`arctic-embed`→`query:`/무접두어, `e5`→`query:`/`passage:`). `model_version=ollama:{model}`. + endpoint = `models.embedding.endpoint` ?? `models.llm.endpoint`. +- `kebab-config`: `EmbeddingModelCfg.endpoint: Option`(serde default, ollama용) + + `provider` 문서에 `ollama` 추가 + env `KEBAB_MODELS_EMBEDDING_ENDPOINT`. 
+- `kebab-app::embedder()`: provider match 에 `ollama` 분기 추가(facade 경유). +- workspace member += `kebab-embed-ollama`, version 0.25.0 → **0.26.0**(minor). + +**correctness 게이트.** candle arctic 임베딩이 측정에 쓴 Ollama `snowflake-arctic-embed2` +임베딩과 일치해야 pooling/prefix 정확성(=recall 130 재현)이 보장된다. 검증: +`kebab-embed-candle/tests/arctic_ollama_parity.rs`(`#[ignore]`, live Ollama 의존) 가 +candle arctic vs 우리 Ollama 어댑터로 같은 문장(설명형/약어/영문 포함, doc+query +양 경로)을 임베딩해 per-sentence **코사인 > 0.99** 를 assert. 수동 실행 결과(코사인값)는 +릴리스 전 본 entry 에 기록. + +**수동 검증 결과** (2026-06-03 worker 실측, Ollama @192.168.0.47:11434 +`snowflake-arctic-embed2`): 8문장 × (doc+query) 16벡터 per-sentence 코사인 +**0.999984 ~ 0.999995**, `cosine_min = 0.999984` (게이트 0.99 대비 대폭 상회). +설명형("후입선출 방식으로 동작하는 자료구조")·약어("SVM 은 support vector machine")· +영문·한글 모두 일치. → candle arctic 의 CLS pooling + `query: ` prefix 가 Ollama 측정 +경로와 정확히 동일 = recall@10 130 재현 보장. Ollama raw 도 이미 L2-정규화(norm 1.0)라 +어댑터의 L2 정규화는 idempotent no-op. 로그: `/build/dogfood/logs/arctic-parity.log`, +요약: `/tmp/arctic-result.md`. + +**호환성.** 기본 provider=fastembed e5 동작/벡터 불변(arctic 은 opt-in). dim 1024 +동일이나 LanceDB 테이블명에 모델명 포함(`chunk_embeddings_{model}_{dim}`)이라 충돌 +없음. e5 → arctic 전환 = `embedding_version` cascade(모델별 벡터 상이) → **재색인 필요** +(기존 e5 KB 와 혼용 불가, 명확). A(heading enrichment)는 측정상 arctic 에서 악화 → +미적용. spec: `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`, plan: 동일 +디렉토리 `2026-06-03-arctic-embedder-plan.md`. + ## 2026-06-03 — doc-side expansion(별칭) 기능 완전 제거 (v0.25.0) **무엇을 왜 제거했나.** v0.21.0 (PR #195/#196) 에서 도입한 색인-시 청크당 LLM