feat(embed): candle Metal (Apple Silicon GPU) opt-in build feature + v0.23.0

- kebab-embed-candle: `metal` feature → candle metal backend; select_device() picks Device::new_metal(0) (CPU fallback) under the feature, else Device::Cpu. .contiguous() before to_vec2 (Metal rejects strided views; CPU tolerates).
- feature passthrough: kebab-cli/embed_metal → kebab-app/embed_metal → kebab-embed-candle/metal. Build on macOS: cargo build --release --features embed_metal.
- default (non-metal) path unchanged: clippy 0, candle units + thread_cap + parity pass.
- README + HOTFIXES: Mac-GPU-ingest → copy sqlite+lancedb → server CPU-query workflow.
- version 0.22.0 → 0.23.0 (opt-in build surface).

macOS-only compile; Metal execution/speed/parity validated by user on M4 Pro (not buildable on the Linux CI/dev machine).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 11:37:08 +00:00
parent 99f8cfa691
commit 369aeb3d24
8 changed files with 485 additions and 50 deletions
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -100,6 +100,8 @@ reqwest      = { version = "0.12", default-features = false, features = ["blocki
 # disable path 없음; 이 feature 는 spec §6.3 명시를 honor 하는 role 만.
 default = ["fts_korean_morphological"]
 fts_korean_morphological = []
+# opt-in (macOS): candle embedder runs on the Apple Silicon GPU. See kebab-embed-candle.
+embed_metal = ["kebab-embed-candle/metal"]

 [lints]
 workspace = true
--- a/crates/kebab-cli/Cargo.toml
+++ b/crates/kebab-cli/Cargo.toml
@@ -51,5 +51,10 @@ tempfile     = { workspace = true }
 rusqlite     = { workspace = true }
 time         = { workspace = true }

+[features]
+# opt-in (macOS): build the `kebab` binary with candle on the Apple Silicon GPU.
+#   cargo build --release --features embed_metal
+embed_metal = ["kebab-app/embed_metal"]
+
 [lints]
 workspace = true
--- a/crates/kebab-embed-candle/Cargo.toml
+++ b/crates/kebab-embed-candle/Cargo.toml
@@ -25,6 +25,14 @@ rayon = "1"
 anyhow = { workspace = true }
 tracing = { workspace = true }

+[features]
+# opt-in: run candle on the Apple Silicon GPU (Metal). macOS-only — the build
+# enables candle's metal backend and `select_device()` picks Metal (CPU fallback
+# on failure). Lets an M-series Mac ingest e5-large on GPU (10×+ vs CPU); the
+# resulting vectors are cross-compatible with the CPU path (same model), so the
+# Linux server can serve queries on CPU candle.
+metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
+
 [dev-dependencies]
 # Integration-test binaries can only see the library's public API + these,
 # not the library's own (non-dev) dependencies — so rayon/kebab-config/kebab-core
--- a/crates/kebab-embed-candle/src/lib.rs
+++ b/crates/kebab-embed-candle/src/lib.rs
@@ -128,7 +128,7 @@ impl CandleEmbedder {
        std::fs::create_dir_all(&cache_dir)
            .with_context(|| format!("create candle cache dir {}", cache_dir.display()))?;

-        let device = Device::Cpu;
+        let device = select_device();

        // 3. Fetch model files via hf-hub into the candle cache.
        tracing::info!(
@@ -250,7 +250,9 @@ impl CandleEmbedder {
        let norm = mean.sqr()?.sum_keepdim(1)?.sqrt()?;
        let normalized = mean.broadcast_div(&norm)?;

-        Ok(normalized.to_vec2::<f32>()?)
+        // `.contiguous()` before host copy: broadcast ops can leave a strided
+        // view, which `to_vec2` rejects on the Metal backend (CPU tolerates it).
+        Ok(normalized.contiguous()?.to_vec2::<f32>()?)
    }
 }

@@ -307,6 +309,32 @@ fn prefix_input(input: &EmbeddingInput<'_>) -> String {
    }
 }

+/// Select the candle compute device; infallible, because CPU is always the fallback.
+/// Built with the `metal` feature (Apple Silicon GPU), try Metal and fall back to CPU
+/// (with a warning) on failure; otherwise CPU. Metal only compiles/runs on macOS — the
+/// Linux server builds the CPU path. e5-large vectors are model-defined, so Metal- and
+/// CPU-produced embeddings are cross-compatible (Mac ingests on GPU, server queries on CPU).
+fn select_device() -> Device {
+    #[cfg(feature = "metal")] // block below exists only in `metal` builds
+    {
+        match Device::new_metal(0) { // ordinal 0 — presumably the first/default GPU
+            Ok(d) => {
+                tracing::info!(target: "kebab-embed-candle", "candle device = Metal (GPU)");
+                return d; // early return: skips the CPU path below
+            }
+            Err(e) => {
+                tracing::warn!(
+                    target: "kebab-embed-candle",
+                    error = %e,
+                    "Metal device unavailable; falling back to CPU"
+                );
+            }
+        }
+    }
+    tracing::info!(target: "kebab-embed-candle", "candle device = CPU"); // fallback / non-metal
+    Device::Cpu
+}
+
 /// Apply a one-shot global rayon thread cap (the NUMA-safety lever). Returns
 /// `true` if this call set the pool, `false` if it was already initialized
 /// (cap not applied) or `n_threads == 0`. `#[doc(hidden)] pub` so the