Merge pull request 'feat(p5-1): kb-eval crate — golden-fixture runner + eval persistence' (#27) from feat/p5-1-golden-fixture-runner into main

Reviewed-on: altair823-org/kb#27
2026-05-02 02:47:21 +00:00
parent 2c0607ae95 e6ff9c412c
commit 33ec13bad7
22 changed files with 1817 additions and 420 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3478,6 +3478,25 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kb-eval"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "kb-app",
+ "kb-config",
+ "kb-core",
+ "kb-store-sqlite",
+ "rusqlite",
+ "serde",
+ "serde_json",
+ "serde_yaml",
+ "tempfile",
+ "time",
+ "tracing",
+ "uuid",
+]
+
 [[package]]
 name = "kb-llm"
 version = "0.1.0"
@@ -6380,6 +6399,19 @@ dependencies = [
 "syn 2.0.117",
 ]

+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap 2.14.0",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
 [[package]]
 name = "serde_yaml_ng"
 version = "0.10.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,7 @@ members = [
    "crates/kb-rag",
    "crates/kb-app",
    "crates/kb-cli",
+    "crates/kb-eval",
 ]

 [workspace.package]
@@ -32,6 +33,9 @@ anyhow       = "1"
 thiserror    = "2"
 serde        = { version = "1", features = ["derive"] }
 serde_json   = "1"
+# Golden-fixture loader (P5-1, kb-eval) parses YAML; pinned in the
+# workspace so future eval-adjacent crates share the same major.
+serde_yaml   = "0.9"
 time         = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
 uuid         = { version = "1", features = ["v7", "serde"] }
 blake3       = "1"
--- a/crates/kb-app/src/app.rs
+++ b/crates/kb-app/src/app.rs
@@ -1,24 +1,22 @@
-//! `App` — internal lifecycle struct (§7).
+//! `App` — facade lifecycle struct (§7).
 //!
-//! A single `App` represents one CLI invocation's worth of state: a
-//! resolved `Config`, an open `SqliteStore`, and (when embeddings are
-//! enabled) an `Embedder` + `LanceVectorStore`. Each public free
-//! function on `kb-app` wraps `App::open(config)` once, runs the
-//! requested op, and drops everything on return.
-//!
-//! The struct is `pub(crate)` because it is an internal seam: `kb-cli`
-//! calls only the free functions on the crate root. `kb-tui` (P9) is
-//! expected to hold one `App` for the session, at which point the
-//! struct may need to be promoted to `pub`. Until then, keep it
-//! private to insulate the wiring shape from downstream callers.
+//! A single `App` represents one CLI invocation's (or one TUI
+//! session's / one eval-runner suite's) worth of state: a resolved
+//! `Config`, an open `SqliteStore`, and (when embeddings are enabled)
+//! an `Embedder` + `LanceVectorStore`. Each public free function on
+//! `kb-app` builds an `App` once, runs the requested op, and drops
+//! everything on return; long-lived callers (kb-eval, the future P9
+//! TUI session) hold onto an `App` across many calls so the per-query
+//! cost is just a method dispatch.
 //!
 //! ## Embedder + Vector store lifetime
 //!
-//! `App::open` builds the SQLite store unconditionally. The embedder
-//! and vector store are *lazy + memoized* — built on first call to
-//! [`App::embedder`] / [`App::vector`] and cached in `OnceLock`s — so
-//! a long-lived `App` (e.g., the P9 TUI session) pays the ~470 MB
-//! ONNX init plus Lance reopen cost exactly once.
+//! `App::open_with_config` builds the SQLite store unconditionally.
+//! The embedder and vector store are *lazy + memoized* — built on
+//! first call to [`App::embedder`] / [`App::vector`] and cached in
+//! `OnceLock`s — so a long-lived `App` (kb-eval driving 50 queries,
+//! the P9 TUI session) pays the ~470 MB ONNX init plus Lance reopen
+//! cost exactly once.
 //!
 //! - `kb list` / `kb inspect` never need them.
 //! - `kb search --mode lexical` never needs them.
@@ -27,7 +25,8 @@
 //! Building eagerly would force every CLI invocation to load ~470 MB of
 //! ONNX weights, which is the dominant cold-start cost. The lazy
 //! pattern keeps the lexical-only paths instant; the memoization makes
-//! the TUI's repeated searches cheap after the first.
+//! the TUI's repeated searches and the eval runner's per-query loop
+//! cheap after the first invocation.
 //!
 //! Embeddings can also be **disabled** workspace-wide via
 //! `config.models.embedding.provider = "none"` (or `dimensions = 0`);
@@ -36,15 +35,26 @@

 use std::sync::{Arc, OnceLock};

-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};

-use kb_core::Embedder;
+use kb_core::{
+    Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode,
+    SearchQuery, VectorStore,
+};
 use kb_embed_local::FastembedEmbedder;
+use kb_llm_local::OllamaLanguageModel;
+use kb_rag::{AskOpts, RagPipeline};
+use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
 use kb_store_sqlite::SqliteStore;
 use kb_store_vector::LanceVectorStore;

-/// Internal facade state. See module docs for lifetime rules.
-pub(crate) struct App {
+/// Facade state — see module docs for lifetime rules.
+///
+/// The struct is public so long-lived callers (kb-eval, the future P9
+/// TUI session) can construct one and reuse it across many search /
+/// ask calls. The OnceLock-backed `embedder` / `vector` fields ensure
+/// the cold-start cost is paid exactly once per instance.
+pub struct App {
    pub(crate) config: kb_config::Config,
    pub(crate) sqlite: Arc<SqliteStore>,
    /// Memoized embedder — built lazily on first `embedder()` call when
@@ -54,6 +64,11 @@ pub(crate) struct App {
    /// Memoized vector store — built lazily on first `vector()` call
    /// when embeddings are enabled. Same rationale as `embedder`.
    vector: OnceLock<Arc<LanceVectorStore>>,
+    /// Memoized LLM — built lazily on first `ask()` call. Sharing one
+    /// across the eval runner avoids re-handshaking the Ollama HTTP
+    /// client per query (cheap, but still measurable on a 50-query
+    /// suite).
+    llm: OnceLock<Arc<dyn LanguageModel>>,
 }

 impl App {
@@ -65,7 +80,7 @@ impl App {
    /// Downstream `LanceVectorStore::new` (called by [`Self::vector`])
    /// internally drives a `tokio::Runtime::block_on`, which panics if
    /// invoked from inside another tokio runtime.
-    pub(crate) fn open(config: kb_config::Config) -> Result<Self> {
+    pub fn open_with_config(config: kb_config::Config) -> Result<Self> {
        let sqlite = SqliteStore::open(&config).context("kb-app: open SqliteStore")?;
        sqlite
            .run_migrations()
@@ -75,9 +90,112 @@ impl App {
            sqlite: Arc::new(sqlite),
            embedder: OnceLock::new(),
            vector: OnceLock::new(),
+            llm: OnceLock::new(),
        })
    }

+    /// Run a [`SearchQuery`] through the configured retriever stack and
+    /// return the top-k hits.
+    ///
+    /// Reuses any previously-built embedder / vector store on this `App`
+    /// — long-lived callers (kb-eval, future TUI) get amortized cost
+    /// across calls.
+    pub fn search(&self, query: SearchQuery) -> Result<Vec<SearchHit>> {
+        match query.mode {
+            SearchMode::Lexical => {
+                let lex = LexicalRetriever::with_settings(
+                    self.sqlite.clone(),
+                    lexical_index_version(&self.config),
+                    self.config.search.snippet_chars,
+                );
+                lex.search(&query)
+            }
+            SearchMode::Vector => {
+                let (emb, vec_store) = self.require_embeddings()?;
+                let vec_iv = vector_index_version(emb.as_ref());
+                let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
+                let emb_dyn: Arc<dyn Embedder> = emb;
+                let retr = VectorRetriever::with_settings(
+                    vec_dyn,
+                    emb_dyn,
+                    self.sqlite.clone(),
+                    vec_iv,
+                    self.config.search.snippet_chars,
+                );
+                retr.search(&query)
+            }
+            SearchMode::Hybrid => {
+                let lex = Arc::new(LexicalRetriever::with_settings(
+                    self.sqlite.clone(),
+                    lexical_index_version(&self.config),
+                    self.config.search.snippet_chars,
+                )) as Arc<dyn Retriever>;
+                let (emb, vec_store) = self.require_embeddings()?;
+                let vec_iv = vector_index_version(emb.as_ref());
+                let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
+                let emb_dyn: Arc<dyn Embedder> = emb;
+                let vec_retr = Arc::new(VectorRetriever::with_settings(
+                    vec_dyn,
+                    emb_dyn,
+                    self.sqlite.clone(),
+                    vec_iv,
+                    self.config.search.snippet_chars,
+                )) as Arc<dyn Retriever>;
+                let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
+                hybrid.search(&query)
+            }
+        }
+    }
+
+    /// Run a RAG `ask` against the configured retriever + LLM. Reuses
+    /// the memoized embedder / vector / LLM where applicable.
+    pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
+        let retriever: Arc<dyn Retriever> = match opts.mode {
+            SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
+                self.sqlite.clone(),
+                lexical_index_version(&self.config),
+                self.config.search.snippet_chars,
+            )),
+            SearchMode::Vector => {
+                let (emb, vec_store) = self.require_embeddings()?;
+                let vec_iv = vector_index_version(emb.as_ref());
+                let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
+                let emb_dyn: Arc<dyn Embedder> = emb;
+                Arc::new(VectorRetriever::with_settings(
+                    vec_dyn,
+                    emb_dyn,
+                    self.sqlite.clone(),
+                    vec_iv,
+                    self.config.search.snippet_chars,
+                ))
+            }
+            SearchMode::Hybrid => {
+                let lex = Arc::new(LexicalRetriever::with_settings(
+                    self.sqlite.clone(),
+                    lexical_index_version(&self.config),
+                    self.config.search.snippet_chars,
+                )) as Arc<dyn Retriever>;
+                let (emb, vec_store) = self.require_embeddings()?;
+                let vec_iv = vector_index_version(emb.as_ref());
+                let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
+                let emb_dyn: Arc<dyn Embedder> = emb;
+                let vec_retr = Arc::new(VectorRetriever::with_settings(
+                    vec_dyn,
+                    emb_dyn,
+                    self.sqlite.clone(),
+                    vec_iv,
+                    self.config.search.snippet_chars,
+                )) as Arc<dyn Retriever>;
+                Arc::new(HybridRetriever::new(&self.config, lex, vec_retr))
+            }
+        };
+
+        let llm = self.llm()?;
+        let pipeline =
+            RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
+        pipeline.ask(query, opts)
+    }
+
    /// Returns `true` when the workspace has embeddings turned off
    /// (`provider = "none"` or `dimensions = 0`). Lexical-only mode.
    pub(crate) fn embeddings_disabled(&self) -> bool {
@@ -123,4 +241,64 @@ impl App {
        let _ = self.vector.set(store.clone());
        Ok(Some(self.vector.get().cloned().unwrap_or(store)))
    }
+
+    /// Build (or reuse) the configured LLM. Currently always Ollama;
+    /// when a second provider lands this is the place to switch on
+    /// `config.models.llm.provider`.
+    fn llm(&self) -> Result<Arc<dyn LanguageModel>> {
+        if let Some(l) = self.llm.get() {
+            return Ok(l.clone());
+        }
+        let llm: Arc<dyn LanguageModel> = Arc::new(
+            OllamaLanguageModel::new(&self.config)
+                .context("kb-app::ask: build OllamaLanguageModel")?,
+        );
+        let _ = self.llm.set(llm.clone());
+        Ok(self.llm.get().cloned().unwrap_or(llm))
+    }
+
+    /// Resolve the embedder + vector store, surfacing the user-friendly
+    /// "switch to --mode lexical" error when embeddings are disabled.
+    fn require_embeddings(
+        &self,
+    ) -> Result<(
+        Arc<dyn Embedder + Send + Sync>,
+        Arc<LanceVectorStore>,
+    )> {
+        let emb = self.embedder()?.ok_or_else(|| {
+            anyhow!(
+                "embeddings disabled (config.models.embedding.provider == \"none\" \
+                 or dimensions == 0); vector / hybrid search require embeddings — \
+                 switch to --mode lexical or enable an embedding provider in config.toml"
+            )
+        })?;
+        let vec_store = self.vector()?.ok_or_else(|| {
+            anyhow!(
+                "vector store unavailable while embedder is configured — this should \
+                 not happen; check `kb doctor` and the data_dir permissions"
+            )
+        })?;
+        Ok((emb, vec_store))
+    }
+}
+
+/// Compose a stable `IndexVersion` for the lexical retriever from
+/// the active config. This token surfaces in `SearchHit.index_version`
+/// and on snapshot tests; including the chunker version pins it to
+/// the chunking policy in effect.
+fn lexical_index_version(config: &kb_config::Config) -> IndexVersion {
+    IndexVersion(format!("lex:{}", config.chunking.chunker_version))
+}
+
+/// Compose a stable `IndexVersion` for the vector retriever. Tracks
+/// `(embedding_model, embedding_version, dimensions)` so a model swap
+/// flags drift via the existing index_version mismatch warning in
+/// `HybridRetriever::new`.
+fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
+    IndexVersion(format!(
+        "vec:{}@{}:{}",
+        embedder.model_id().0,
+        embedder.model_version().0,
+        embedder.dimensions(),
+    ))
 }
--- a/crates/kb-app/src/lib.rs
+++ b/crates/kb-app/src/lib.rs
@@ -43,22 +43,18 @@ use kb_chunk::MdHeadingV1Chunker;
 use kb_core::{
    Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
    DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
-    EmbeddingKind, IndexVersion, IngestReport, LanguageModel, ParserVersion, RawAsset,
-    Retriever, SearchHit, SearchMode, SearchQuery, SourceConnector, SourceScope,
-    SourceUri, VectorRecord, VectorStore,
+    EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
+    SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
 };
-use kb_llm_local::OllamaLanguageModel;
 use kb_normalize::build_canonical_document;
 use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
-use kb_rag::RagPipeline;
-use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
 use kb_source_fs::FsSourceConnector;

 mod app;
 pub mod doctor_signal;
 pub mod logging;

-use app::App;
+pub use app::App;

 /// Parser-version label persisted in `documents.parser_version` for
 /// every Markdown file ingested through the `kb-parse-md` pipeline.
@@ -168,7 +164,7 @@ pub fn ingest_with_config(
 ) -> anyhow::Result<IngestReport> {
    let started_instant = std::time::Instant::now();

-    let app = App::open(config)?;
+    let app = App::open_with_config(config)?;

    // Walk the workspace.
    let connector = FsSourceConnector::new(&app.config)
@@ -667,7 +663,7 @@ pub fn list_docs_with_config(
    config: kb_config::Config,
    filter: DocFilter,
 ) -> anyhow::Result<Vec<DocSummary>> {
-    let app = App::open(config)?;
+    let app = App::open_with_config(config)?;
    app.sqlite.list_documents(&filter)
 }

@@ -683,7 +679,7 @@ pub fn inspect_doc_with_config(
    config: kb_config::Config,
    id: &DocumentId,
 ) -> anyhow::Result<CanonicalDocument> {
-    let app = App::open(config)?;
+    let app = App::open_with_config(config)?;
    app.sqlite
        .get_document(id)?
        .ok_or_else(|| anyhow!("document not found: {} (try `kb list docs`)", id.0))
@@ -701,7 +697,7 @@ pub fn inspect_chunk_with_config(
    config: kb_config::Config,
    id: &ChunkId,
 ) -> anyhow::Result<Chunk> {
-    let app = App::open(config)?;
+    let app = App::open_with_config(config)?;
    app.sqlite
        .get_chunk(id)?
        .ok_or_else(|| anyhow!("chunk not found: {} (try `kb inspect doc <id>`)", id.0))
@@ -715,101 +711,15 @@ pub fn search(query: SearchQuery) -> anyhow::Result<Vec<SearchHit>> {
 }

 /// Test-only seam — kb-cli must call the public free function
-/// ([`search`]), not this.
+/// ([`search`]), not this. Builds a one-shot `App` and delegates to
+/// [`App::search`]; long-lived callers should hold an `App` instance
+/// directly to amortize the embedder / vector-store cold start.
 #[doc(hidden)]
 pub fn search_with_config(
    config: kb_config::Config,
    query: SearchQuery,
 ) -> anyhow::Result<Vec<SearchHit>> {
-    let app = App::open(config)?;
-
-    match query.mode {
-        SearchMode::Lexical => {
-            let lex = LexicalRetriever::with_settings(
-                app.sqlite.clone(),
-                lexical_index_version(&app.config),
-                app.config.search.snippet_chars,
-            );
-            lex.search(&query)
-        }
-        SearchMode::Vector => {
-            let (emb, vec_store) = require_embeddings(&app)?;
-            let vec_iv = vector_index_version(emb.as_ref());
-            let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
-            let emb_dyn: Arc<dyn Embedder> = emb;
-            let retr = VectorRetriever::with_settings(
-                vec_dyn,
-                emb_dyn,
-                app.sqlite.clone(),
-                vec_iv,
-                app.config.search.snippet_chars,
-            );
-            retr.search(&query)
-        }
-        SearchMode::Hybrid => {
-            let lex = Arc::new(LexicalRetriever::with_settings(
-                app.sqlite.clone(),
-                lexical_index_version(&app.config),
-                app.config.search.snippet_chars,
-            )) as Arc<dyn Retriever>;
-            let (emb, vec_store) = require_embeddings(&app)?;
-            let vec_iv = vector_index_version(emb.as_ref());
-            let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
-            let emb_dyn: Arc<dyn Embedder> = emb;
-            let vec_retr = Arc::new(VectorRetriever::with_settings(
-                vec_dyn,
-                emb_dyn,
-                app.sqlite.clone(),
-                vec_iv,
-                app.config.search.snippet_chars,
-            )) as Arc<dyn Retriever>;
-            let hybrid = HybridRetriever::new(&app.config, lex, vec_retr);
-            hybrid.search(&query)
-        }
-    }
-}
-
-fn require_embeddings(
-    app: &App,
-) -> anyhow::Result<(
-    Arc<dyn Embedder + Send + Sync>,
-    Arc<kb_store_vector::LanceVectorStore>,
-)> {
-    let emb = app.embedder()?.ok_or_else(|| {
-        anyhow!(
-            "embeddings disabled (config.models.embedding.provider == \"none\" \
-             or dimensions == 0); vector / hybrid search require embeddings — \
-             switch to --mode lexical or enable an embedding provider in config.toml"
-        )
-    })?;
-    let vec_store = app.vector()?.ok_or_else(|| {
-        anyhow!(
-            "vector store unavailable while embedder is configured — this should \
-             not happen; check `kb doctor` and the data_dir permissions"
-        )
-    })?;
-    Ok((emb, vec_store))
-}
-
-/// Compose a stable `IndexVersion` for the lexical retriever from
-/// the active config. This token surfaces in `SearchHit.index_version`
-/// and on snapshot tests; including the chunker version pins it to
-/// the chunking policy in effect.
-fn lexical_index_version(config: &kb_config::Config) -> IndexVersion {
-    IndexVersion(format!("lex:{}", config.chunking.chunker_version))
-}
-
-/// Compose a stable `IndexVersion` for the vector retriever. Tracks
-/// `(embedding_model, embedding_version, dimensions)` so a model swap
-/// flags drift via the existing index_version mismatch warning in
-/// `HybridRetriever::new`.
-fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
-    IndexVersion(format!(
-        "vec:{}@{}:{}",
-        embedder.model_id().0,
-        embedder.model_version().0,
-        embedder.dimensions(),
-    ))
+    App::open_with_config(config)?.search(query)
 }

 // ── ask ──────────────────────────────────────────────────────────────────
@@ -826,64 +736,15 @@ pub fn ask(query: &str, opts: AskOpts) -> anyhow::Result<Answer> {
 }

 /// Test-only seam — kb-cli must call the public free function
-/// ([`ask`]), not this. Mirrors the `*_with_config` pattern documented
-/// at the top of this module.
+/// ([`ask`]), not this. Builds a one-shot `App` and delegates to
+/// [`App::ask`].
 #[doc(hidden)]
 pub fn ask_with_config(
    config: kb_config::Config,
    query: &str,
    opts: AskOpts,
 ) -> anyhow::Result<Answer> {
-    let app = App::open(config)?;
-
-    let retriever: Arc<dyn Retriever> = match opts.mode {
-        SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
-            app.sqlite.clone(),
-            lexical_index_version(&app.config),
-            app.config.search.snippet_chars,
-        )),
-        SearchMode::Vector => {
-            let (emb, vec_store) = require_embeddings(&app)?;
-            let vec_iv = vector_index_version(emb.as_ref());
-            let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
-            let emb_dyn: Arc<dyn Embedder> = emb;
-            Arc::new(VectorRetriever::with_settings(
-                vec_dyn,
-                emb_dyn,
-                app.sqlite.clone(),
-                vec_iv,
-                app.config.search.snippet_chars,
-            ))
-        }
-        SearchMode::Hybrid => {
-            let lex = Arc::new(LexicalRetriever::with_settings(
-                app.sqlite.clone(),
-                lexical_index_version(&app.config),
-                app.config.search.snippet_chars,
-            )) as Arc<dyn Retriever>;
-            let (emb, vec_store) = require_embeddings(&app)?;
-            let vec_iv = vector_index_version(emb.as_ref());
-            let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
-            let emb_dyn: Arc<dyn Embedder> = emb;
-            let vec_retr = Arc::new(VectorRetriever::with_settings(
-                vec_dyn,
-                emb_dyn,
-                app.sqlite.clone(),
-                vec_iv,
-                app.config.search.snippet_chars,
-            )) as Arc<dyn Retriever>;
-            Arc::new(HybridRetriever::new(&app.config, lex, vec_retr))
-        }
-    };
-
-    let llm: Arc<dyn LanguageModel> = Arc::new(
-        OllamaLanguageModel::new(&app.config)
-            .context("kb-app::ask: build OllamaLanguageModel")?,
-    );
-
-    let pipeline =
-        RagPipeline::new(app.config.clone(), retriever, llm, app.sqlite.clone());
-    pipeline.ask(query, opts)
+    App::open_with_config(config)?.ask(query, opts)
 }

 /// Run the doctor checks against the explicit config path the user
--- a/crates/kb-config/src/lib.rs
+++ b/crates/kb-config/src/lib.rs
@@ -8,6 +8,9 @@ use std::path::{Path, PathBuf};

 use serde::{Deserialize, Serialize};

+mod paths;
+pub use paths::expand_path;
+
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct Config {
    pub schema_version: u32,
--- a/crates/kb-config/src/paths.rs
+++ b/crates/kb-config/src/paths.rs
@@ -0,0 +1,186 @@
+//! Shared path expansion helper.
+//!
+//! `Config::storage.*` fields are stored as raw template strings (e.g.
+//! `${XDG_DATA_HOME:-~/.local/share}/kb`, `{data_dir}/runs`). Every
+//! crate that turns one of those strings into a real filesystem path
+//! needs to apply the same set of substitutions; this module is the
+//! single source of truth so the behavior cannot drift.
+//!
+//! Substitutions, applied in order:
+//!
+//! 1. `{data_dir}` → caller-supplied `data_dir`.
+//!    - When the caller passes an empty `data_dir` (because they ARE
+//!      resolving `data_dir` itself), the substitution is a no-op so
+//!      a literal `{data_dir}` is left in place rather than producing
+//!      a `/{data_dir}/...` artifact.
+//! 2. `${XDG_DATA_HOME:-<default>}` (or the bare `${XDG_DATA_HOME}`) →
+//!    the env var if set + non-empty, else the default after `:-`.
+//!    Mimics POSIX shell's `${VAR:-default}` semantics. Mid-string
+//!    occurrences are supported; only the first match is replaced.
+//! 3. Leading `~` / `~/...` → `$HOME`. Any non-leading `~` is left
+//!    literal (matches shell behavior — only the first segment expands).
+//!
+//! The result is a `PathBuf` regardless of whether all substitutions
+//! were applicable; relative paths are kept relative to the caller's
+//! CWD (not resolved here).
+
+use std::path::PathBuf;
+
+/// Expand storage-path templates. See module docs for the substitution
+/// rules.
+///
+/// Pass an empty `data_dir` when resolving `data_dir` itself; the
+/// `{data_dir}` substitution becomes a no-op in that case so the
+/// recursive shape (`data_dir = "${XDG_DATA_HOME:-…}/kb"`) resolves
+/// without producing a literal `{data_dir}` token in the output.
+pub fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
+    let mut s = raw.to_string();
+
+    // 1. {data_dir} substitution (skipped when resolving data_dir
+    //    itself; see module docs).
+    if !data_dir.is_empty() {
+        s = s.replace("{data_dir}", data_dir);
+    }
+
+    // 2. ${XDG_DATA_HOME:-<default>}: env override else default.
+    if let Some(start) = s.find("${XDG_DATA_HOME") {
+        if let Some(rel_end) = s[start..].find('}') {
+            let end = start + rel_end + 1; // include trailing '}'
+            let inner = &s[start + 2..end - 1]; // strip ${ and }
+            let replacement = match std::env::var("XDG_DATA_HOME") {
+                Ok(v) if !v.is_empty() => v,
+                _ => match inner.split_once(":-") {
+                    Some((_, default)) => default.to_string(),
+                    None => String::new(),
+                },
+            };
+            s.replace_range(start..end, &replacement);
+        }
+    }
+
+    // 3. Leading `~` → $HOME.
+    if let Some(rest) = s.strip_prefix('~') {
+        if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
+            return home.join(rest.trim_start_matches('/'));
+        }
+    }
+
+    PathBuf::from(s)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Mutex as StdMutex;
+
+    /// `XDG_DATA_HOME` / `HOME` env mutations must be serialized so
+    /// concurrent test runs (cargo's default parallel runner) don't
+    /// observe each other's transient values.
+    static ENV_LOCK: StdMutex<()> = StdMutex::new(());
+
+    /// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores
+    /// it on drop.
+    struct XdgGuard {
+        prior: Option<String>,
+    }
+
+    impl XdgGuard {
+        fn capture() -> Self {
+            Self {
+                prior: std::env::var("XDG_DATA_HOME").ok(),
+            }
+        }
+    }
+
+    impl Drop for XdgGuard {
+        fn drop(&mut self) {
+            // SAFETY: edition 2024 marks set_var/remove_var unsafe
+            // because env mutation is not thread-safe. The ENV_LOCK
+            // guard at the call site prevents concurrent observation.
+            unsafe {
+                match &self.prior {
+                    Some(v) => std::env::set_var("XDG_DATA_HOME", v),
+                    None => std::env::remove_var("XDG_DATA_HOME"),
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn substitutes_data_dir_template() {
+        let p = expand_path("{data_dir}/runs", "/tmp/kbtest");
+        assert_eq!(p, PathBuf::from("/tmp/kbtest/runs"));
+    }
+
+    #[test]
+    fn data_dir_substitution_skipped_when_empty() {
+        // Empty `data_dir` is the "resolving data_dir itself" signal;
+        // the literal `{data_dir}` token must survive.
+        let p = expand_path("{data_dir}/runs", "");
+        assert_eq!(p, PathBuf::from("{data_dir}/runs"));
+    }
+
+    #[test]
+    fn passthrough_absolute_path() {
+        let p = expand_path("/abs/runs", "/ignored");
+        assert_eq!(p, PathBuf::from("/abs/runs"));
+    }
+
+    #[test]
+    fn xdg_data_home_set_replaces_var() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") };
+
+        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
+        assert_eq!(p, PathBuf::from("/custom/path/kb"));
+    }
+
+    #[test]
+    fn xdg_data_home_unset_uses_default() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::remove_var("XDG_DATA_HOME") };
+
+        let home = std::env::var("HOME").expect("HOME must be set in tests");
+        let expected = PathBuf::from(home).join(".local/share/kb");
+        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
+        assert_eq!(p, expected);
+    }
+
+    #[test]
+    fn xdg_with_no_default_resolves_to_empty_when_unset() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::remove_var("XDG_DATA_HOME") };
+
+        // No `:-default` clause, no env var → empty string substitution.
+        let p = expand_path("${XDG_DATA_HOME}/kb", "");
+        assert_eq!(p, PathBuf::from("/kb"));
+    }
+
+    #[test]
+    fn leading_tilde_expands_to_home() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let home = std::env::var("HOME").expect("HOME must be set in tests");
+        let p = expand_path("~/runs", "");
+        assert_eq!(p, PathBuf::from(home).join("runs"));
+    }
+
+    #[test]
+    fn data_dir_then_xdg_then_tilde_compose() {
+        // Order matters: substitute `{data_dir}` (which itself contains
+        // an unexpanded `${XDG_DATA_HOME}` and `~`), then the other two
+        // resolve the result.
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        let _guard = XdgGuard::capture();
+        // SAFETY: lock held for the duration of this test.
+        unsafe { std::env::set_var("XDG_DATA_HOME", "/xdg/data") };
+
+        let p = expand_path("{data_dir}/runs", "/xdg/data/kb");
+        assert_eq!(p, PathBuf::from("/xdg/data/kb/runs"));
+    }
+}
--- a/crates/kb-embed-local/src/lib.rs
+++ b/crates/kb-embed-local/src/lib.rs
@@ -22,11 +22,11 @@
 //! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md`
 //! §7.2 (Embedder), §6.4 ([models.embedding]), §9 (versioning).

-use std::path::PathBuf;
 use std::sync::Mutex;

 use anyhow::{Context, Result};
 use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
+use kb_config::expand_path;
 use kb_embed::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};

 /// Subdirectory under `config.storage.model_dir` where the fastembed
@@ -60,9 +60,8 @@ impl FastembedEmbedder {
    /// first `embed`).
    pub fn new(config: &kb_config::Config) -> Result<Self> {
        // 1. Resolve `{data_dir}/models/fastembed/` from the config
-        //    templates. `kb-config` does not expose a public path
-        //    resolver yet, so we hand-roll a tiny one mirroring
-        //    kb-store-sqlite's `expand_data_dir`.
+        //    templates. Goes through the shared `kb_config::expand_path`
+        //    so every crate resolves storage paths identically.
        let data_dir = expand_path(&config.storage.data_dir, "");
        let model_dir = expand_path(&config.storage.model_dir, &data_dir.to_string_lossy());
        let cache_dir = model_dir.join(FASTEMBED_CACHE_SUBDIR);
@@ -222,58 +221,6 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
    Ok(())
 }

-/// Expand the limited template language `kb-config` uses for storage
-/// paths.
-///
-/// Supported substitutions, applied in order:
-/// 1. `{data_dir}` → `data_dir` (caller-supplied resolved string). This
-///    is a no-op when `data_dir` is empty (used by the recursive call
-///    that resolves `data_dir` itself).
-/// 2. `${XDG_DATA_HOME:-~/.local/share}` (and the bare
-///    `${XDG_DATA_HOME}`) → env var if set, else the default after
-///    `:-`.
-/// 3. Leading `~` → `$HOME`.
-///
-/// Mirrors `kb-store-sqlite::store::expand_data_dir`. Kept private to
-/// this crate; promoting it to a public `kb-config` API is a separate
-/// task (see task p3-2 risks: "don't expand kb-config's public API").
-fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
-    let mut s = raw.to_string();
-
-    if !data_dir.is_empty() {
-        s = s.replace("{data_dir}", data_dir);
-    }
-
-    // ${XDG_DATA_HOME:-~/.local/share}: respect env override, else fall
-    // back to the suffix after `:-`.
-    if let Some(start) = s.find("${XDG_DATA_HOME") {
-        if let Some(rel_end) = s[start..].find('}') {
-            let end = start + rel_end + 1; // include trailing '}'
-            let inner = &s[start + 2..end - 1]; // strip ${ and }
-            let replacement = match std::env::var("XDG_DATA_HOME") {
-                Ok(v) if !v.is_empty() => v,
-                _ => {
-                    if let Some((_, default)) = inner.split_once(":-") {
-                        default.to_string()
-                    } else {
-                        String::new()
-                    }
-                }
-            };
-            s.replace_range(start..end, &replacement);
-        }
-    }
-
-    // Leading `~` → $HOME.
-    if let Some(rest) = s.strip_prefix('~') {
-        if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
-            return home.join(rest.trim_start_matches('/'));
-        }
-    }
-
-    PathBuf::from(s)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -354,80 +301,6 @@ mod tests {
        assert!(msg.contains("unsupported embedding model"), "msg={msg}");
    }

-    // ── expand_path ──────────────────────────────────────────────────
-
-    #[test]
-    fn expand_path_substitutes_data_dir_template() {
-        let p = expand_path("{data_dir}/models", "/tmp/kbtest");
-        assert_eq!(p, PathBuf::from("/tmp/kbtest/models"));
-    }
-
-    #[test]
-    fn expand_path_no_op_without_template() {
-        let p = expand_path("/abs/path", "/tmp/kbtest");
-        assert_eq!(p, PathBuf::from("/abs/path"));
-    }
-
-    // ── expand_path: XDG_DATA_HOME fallback ──────────────────────────
-    //
-    // These two tests mutate the process-wide `XDG_DATA_HOME` env var,
-    // which is unsafe under edition 2024 and racy under cargo's default
-    // parallel test runner. The shared `ENV_LOCK` serializes them; each
-    // test snapshots the prior value and restores it on exit.
-
-    use std::sync::Mutex as StdMutex;
-    static ENV_LOCK: StdMutex<()> = StdMutex::new(());
-
-    /// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores
-    /// it on drop. Pair with the `ENV_LOCK` guard for serial access.
-    struct XdgGuard {
-        prior: Option<String>,
-    }
-
-    impl XdgGuard {
-        fn capture() -> Self {
-            Self {
-                prior: std::env::var("XDG_DATA_HOME").ok(),
-            }
-        }
-    }
-
-    impl Drop for XdgGuard {
-        fn drop(&mut self) {
-            // SAFETY: edition 2024 marks `set_var`/`remove_var` unsafe
-            // because env mutation is not thread-safe. Callers hold
-            // `ENV_LOCK` for the duration of the test, so no other
-            // thread observes the mutation.
-            unsafe {
-                match &self.prior {
-                    Some(v) => std::env::set_var("XDG_DATA_HOME", v),
-                    None => std::env::remove_var("XDG_DATA_HOME"),
-                }
-            }
-        }
-    }
-
-    #[test]
-    fn expand_path_xdg_data_home_set() {
-        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
-        let _guard = XdgGuard::capture();
-        // SAFETY: lock held for the duration of this test.
-        unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") };
-
-        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
-        assert_eq!(p, PathBuf::from("/custom/path/kb"));
-    }
-
-    #[test]
-    fn expand_path_xdg_data_home_unset_falls_back_to_home() {
-        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
-        let _guard = XdgGuard::capture();
-        // SAFETY: lock held for the duration of this test.
-        unsafe { std::env::remove_var("XDG_DATA_HOME") };
-
-        let home = std::env::var("HOME").expect("HOME must be set in tests");
-        let expected = PathBuf::from(home).join(".local/share/kb");
-        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
-        assert_eq!(p, expected);
-    }
+    // expand_path tests live in `kb-config::paths`. The adapter imports
+    // it and trusts the upstream coverage rather than duplicating it.
 }
--- a/crates/kb-eval/Cargo.toml
+++ b/crates/kb-eval/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name          = "kb-eval"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Golden-fixture eval runner: load YAML, drive kb-app search/ask, persist eval_runs / eval_query_results / per_query.jsonl"
+
+[dependencies]
+# Allowed deps per p5-1 spec — domain types + facade only.
+kb-core         = { path = "../kb-core" }
+kb-config       = { path = "../kb-config" }
+kb-app          = { path = "../kb-app" }
+kb-store-sqlite = { path = "../kb-store-sqlite" }
+serde           = { workspace = true }
+serde_json      = { workspace = true }
+serde_yaml      = { workspace = true }
+time            = { workspace = true }
+tracing         = { workspace = true }
+anyhow          = { workspace = true }
+# `uuid::Uuid::now_v7()` powers the `run_<ulid_lower>`-shaped run_id;
+# v7 UUIDs are timestamp-ordered (same monotonicity as ULID) and `uuid`
+# is already in workspace deps, so we avoid pulling a new ULID crate
+# just for the lower-cased timestamp prefix.
+uuid            = { workspace = true }
+
+[dev-dependencies]
+tempfile        = { workspace = true }
+rusqlite        = { workspace = true }
--- a/crates/kb-eval/src/lib.rs
+++ b/crates/kb-eval/src/lib.rs
@@ -0,0 +1,29 @@
+//! `kb-eval` — golden-fixture eval runner (P5-1).
+//!
+//! Loads `fixtures/golden_queries.yaml`, runs each entry through the
+//! [`kb_app`] facade (lexical / vector / hybrid + optional RAG), and
+//! persists results into `eval_runs` / `eval_query_results` plus
+//! `runs_dir/<run_id>/per_query.jsonl` (design §5.7, §6.3).
+//!
+//! Metric computation lives in P5-2 (`kb-eval::metrics`); this crate is
+//! the **data collector** only.
+//!
+//! ## Allowed deps (per task spec)
+//!
+//! `kb-core`, `kb-config`, `kb-app`, `kb-store-sqlite`, plus `serde`,
+//! `serde_yaml`, `serde_json`, `time`, `tracing`,
+//! `anyhow`, `uuid`. Retrieval / embedding / LLM crates are NOT
+//! reachable here — every retrieval and `ask` call must go through
+//! `kb-app`.
+//!
+//! ## `run_id` recipe
+//!
+//! `run_id` uses UUIDv7 simple — timestamp-ordered, lowercase hex.
+
+mod loader;
+mod runner;
+mod types;
+
+pub use loader::load_golden_set;
+pub use runner::{run_eval, run_eval_with_config};
+pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
--- a/crates/kb-eval/src/loader.rs
+++ b/crates/kb-eval/src/loader.rs
@@ -0,0 +1,229 @@
+//! Golden-set YAML loader.
+//!
+//! Two entry points:
+//!
+//! - [`load_golden_set`] — pure YAML parse + uniqueness check. Used by
+//!   tests that don't have a SQLite store handy.
+//! - [`load_golden_set_validated`] — additionally verifies every
+//!   `expected_doc_id` / `expected_chunk_id` exists in the SQLite DB
+//!   the supplied [`kb_config::Config`] points at. Used by
+//!   [`crate::run_eval`] in production so a stale golden set fails
+//!   fast at run start.
+
+use std::collections::{BTreeSet, HashSet};
+use std::path::Path;
+
+use anyhow::{Context, Result, anyhow};
+use kb_store_sqlite::SqliteStore;
+
+use crate::types::GoldenQuery;
+
+/// Parse the YAML at `path` into a `Vec<GoldenQuery>` and check that
+/// every `id` is unique.
+///
+/// The YAML is expected to be a top-level list of mappings. Required
+/// fields per entry: `id`, `query`. All other fields default to empty /
+/// `None` per [`GoldenQuery`]'s `serde(default)` annotations.
+pub fn load_golden_set(path: &Path) -> Result<Vec<GoldenQuery>> {
+    let bytes =
+        std::fs::read(path).with_context(|| format!("read golden YAML from {}", path.display()))?;
+    let queries: Vec<GoldenQuery> = serde_yaml::from_slice(&bytes)
+        .with_context(|| format!("parse golden YAML at {}", path.display()))?;
+    check_unique_ids(&queries)?;
+    Ok(queries)
+}
+
+/// Same as [`load_golden_set`] but additionally validates that every
+/// `expected_doc_id` and `expected_chunk_id` referenced by the loaded
+/// entries actually exists in the SQLite database `cfg` resolves to.
+///
+/// Missing IDs are surfaced as a single sorted error listing every
+/// offender, so curators can fix the whole set in one pass.
+///
+/// Currently used only by the in-module tests below; production code
+/// inlines `load_golden_set` + `validate_against_db` in
+/// [`crate::run_eval_with_config`] so the validation can run against
+/// an already-opened [`kb_config::Config`] without re-parsing YAML.
+#[cfg(test)]
+pub(crate) fn load_golden_set_validated(
+    yaml_path: &Path,
+    cfg: &kb_config::Config,
+) -> Result<Vec<GoldenQuery>> {
+    let queries = load_golden_set(yaml_path)?;
+    validate_against_db(&queries, cfg)?;
+    Ok(queries)
+}
+
+fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> {
+    let mut seen: HashSet<&str> = HashSet::new();
+    let mut dups: BTreeSet<String> = BTreeSet::new();
+    for q in queries {
+        if !seen.insert(q.id.as_str()) {
+            dups.insert(q.id.clone());
+        }
+    }
+    if dups.is_empty() {
+        Ok(())
+    } else {
+        let list: Vec<String> = dups.into_iter().collect();
+        Err(anyhow!("duplicate query id(s): {}", list.join(", ")))
+    }
+}
+
+/// Read every doc_id / chunk_id referenced by `queries` and confirm
+/// SQLite has rows for them. Builds a sorted, deduplicated error
+/// message listing every missing ID.
+pub(crate) fn validate_against_db(queries: &[GoldenQuery], cfg: &kb_config::Config) -> Result<()> {
+    // Short-circuit when there is nothing to validate — saves opening
+    // SQLite for golden sets that omit expected_*_ids entirely.
+    let needs_check = queries
+        .iter()
+        .any(|q| !q.expected_doc_ids.is_empty() || !q.expected_chunk_ids.is_empty());
+    if !needs_check {
+        return Ok(());
+    }
+
+    let store = SqliteStore::open(cfg).context("open SqliteStore for golden validation")?;
+    store
+        .run_migrations()
+        .context("run migrations for golden validation")?;
+
+    let mut missing_docs: BTreeSet<String> = BTreeSet::new();
+    let mut missing_chunks: BTreeSet<String> = BTreeSet::new();
+
+    for q in queries {
+        for did in &q.expected_doc_ids {
+            let exists = store
+                .document_exists(&did.0)
+                .with_context(|| format!("probe document {}", did.0))?;
+            if !exists {
+                missing_docs.insert(did.0.clone());
+            }
+        }
+        for cid in &q.expected_chunk_ids {
+            let exists = store
+                .chunk_exists(&cid.0)
+                .with_context(|| format!("probe chunk {}", cid.0))?;
+            if !exists {
+                missing_chunks.insert(cid.0.clone());
+            }
+        }
+    }
+
+    if missing_docs.is_empty() && missing_chunks.is_empty() {
+        return Ok(());
+    }
+
+    let mut parts: Vec<String> = Vec::new();
+    if !missing_docs.is_empty() {
+        parts.push(format!(
+            "missing doc_ids: {}",
+            missing_docs.into_iter().collect::<Vec<_>>().join(", ")
+        ));
+    }
+    if !missing_chunks.is_empty() {
+        parts.push(format!(
+            "missing chunk_ids: {}",
+            missing_chunks.into_iter().collect::<Vec<_>>().join(", ")
+        ));
+    }
+    Err(anyhow!(
+        "golden set references unknown IDs — {}",
+        parts.join("; ")
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    //! Tests that exercise the crate-private
+    //! [`load_golden_set_validated`]. The pure-parser cases live in
+    //! `tests/loader.rs`; only the validated-variant cases need to sit
+    //! next to the function so they can see the `pub(crate)` symbol.
+    use super::*;
+    use kb_config::Config;
+    use kb_store_sqlite::SqliteStore;
+    use rusqlite::params;
+    use std::fs;
+    use tempfile::tempdir;
+
+    #[test]
+    fn rejects_unknown_expected_chunk_id() {
+        let tmp = tempdir().unwrap();
+        let mut config = Config::defaults();
+        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();
+
+        let store = SqliteStore::open(&config).unwrap();
+        store.run_migrations().unwrap();
+        seed_one_chunk(&store, "doc_present", "chunk_present");
+
+        let yaml_path = tmp.path().join("golden.yaml");
+        fs::write(
+            &yaml_path,
+            "- id: g1\n  query: hello\n  expected_chunk_ids: [\"chunk_present\", \"chunk_missing\"]\n",
+        )
+        .unwrap();
+
+        let err = load_golden_set_validated(&yaml_path, &config).unwrap_err();
+        let msg = format!("{err:#}");
+        assert!(msg.contains("missing chunk_ids"), "msg: {msg}");
+        assert!(msg.contains("chunk_missing"), "msg: {msg}");
+        assert!(!msg.contains("chunk_present"), "msg: {msg}");
+    }
+
+    #[test]
+    fn accepts_resolved_ids() {
+        let tmp = tempdir().unwrap();
+        let mut config = Config::defaults();
+        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();
+
+        let store = SqliteStore::open(&config).unwrap();
+        store.run_migrations().unwrap();
+        seed_one_chunk(&store, "doc_present", "chunk_present");
+
+        let yaml_path = tmp.path().join("golden.yaml");
+        fs::write(
+            &yaml_path,
+            "- id: g1\n  query: hello\n  expected_doc_ids: [\"doc_present\"]\n  expected_chunk_ids: [\"chunk_present\"]\n",
+        )
+        .unwrap();
+
+        let qs = load_golden_set_validated(&yaml_path, &config).unwrap();
+        assert_eq!(qs.len(), 1);
+    }
+
+    fn seed_one_chunk(store: &SqliteStore, doc_id: &str, chunk_id: &str) {
+        let conn = store.read_conn();
+        let asset_id = format!("a_{doc_id}");
+        conn.execute(
+            "INSERT OR IGNORE INTO assets (
+                asset_id, source_uri, workspace_path, media_type, byte_len,
+                checksum, storage_kind, storage_path, discovered_at
+             ) VALUES (?, ?, ?, '\"markdown\"', 0,
+                       'deadbeefdeadbeefdeadbeefdeadbeef',
+                       'reference', ?, '1970-01-01T00:00:00Z')",
+            params![asset_id, "file:///tmp/x.md", "x.md", "x.md"],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT OR IGNORE INTO documents (
+                doc_id, asset_id, workspace_path, title, lang, source_type,
+                trust_level, parser_version, doc_version, schema_version,
+                metadata_json, provenance_json, created_at, updated_at
+             ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
+                       '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
+            params![doc_id, asset_id, "x.md"],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT OR IGNORE INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+             ) VALUES (?, ?, 'hi', '[]', NULL,
+                       '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
+                       1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
+            params![chunk_id, doc_id],
+        )
+        .unwrap();
+    }
+}
--- a/crates/kb-eval/src/runner.rs
+++ b/crates/kb-eval/src/runner.rs
@@ -0,0 +1,270 @@
+//! Per-query eval runner. See [`run_eval`] / [`run_eval_with_config`].
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::PathBuf;
+use std::time::Instant;
+
+use anyhow::{Context, Result};
+use kb_app::App;
+use kb_config::expand_path;
+use kb_core::{SearchFilters, SearchQuery};
+use kb_store_sqlite::{EvalRunRow, SqliteStore};
+use time::OffsetDateTime;
+
+use crate::loader::{load_golden_set, validate_against_db};
+use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
+
+/// Convert a wall-clock duration since `start` into milliseconds clamped
+/// to `u32::MAX`. The `QueryResult.elapsed_ms` and `eval_runs.duration_ms`
+/// fields are `u32`; saturate (rather than wrap) so a stuck run never
+/// reports a misleading sub-second duration.
+fn elapsed_ms_u32(start: Instant) -> u32 {
+    start.elapsed().as_millis().min(u128::from(u32::MAX)) as u32
+}
+
+/// Env var that overrides the default `fixtures/golden_queries.yaml`
+/// path. Resolved relative to the current working directory.
+const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN";
+
+/// Default golden YAML path (relative to CWD when set).
+const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml";
+
+/// Run the golden suite end-to-end against the active XDG-loaded
+/// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with
+/// `Config::load(None)`.
+pub fn run_eval(opts: &EvalRunOpts) -> Result<EvalRun> {
+    let cfg = kb_config::Config::load(None).context("load Config for run_eval")?;
+    run_eval_with_config(&cfg, opts)
+}
+
+/// Run the golden suite end-to-end against an explicit
+/// [`kb_config::Config`]. Used by integration tests (TempDir-backed
+/// data_dir) and any future caller that wants to drive the runner
+/// against a non-default config.
+pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Result<EvalRun> {
+    let started = Instant::now();
+
+    // ── 1. Load golden set ────────────────────────────────────────────────
+    //
+    // `with_context` already names the path on error, so a separate
+    // `tracing::debug!` here would just be noise.
+    let golden_path = resolve_golden_path();
+    let queries = load_golden_set(&golden_path).with_context(|| {
+        format!(
+            "load golden set from {} (override via KB_EVAL_GOLDEN)",
+            golden_path.display()
+        )
+    })?;
+    validate_against_db(&queries, cfg)?;
+
+    // ── 2. Mint identifiers + open store ──────────────────────────────────
+    let run_id = mint_run_id();
+    let created_at = OffsetDateTime::now_utc();
+    let commit_hash = std::env::var("KB_COMMIT_HASH")
+        .ok()
+        .filter(|s| !s.is_empty());
+
+    // Open the store once so every per-query write reuses the same
+    // connection-mutex lifetime.
+    let store = SqliteStore::open(cfg).context("open SqliteStore for run_eval")?;
+    store
+        .run_migrations()
+        .context("run migrations for run_eval")?;
+
+    // ── 3. Build config_snapshot_json ─────────────────────────────────────
+    let config_snapshot_json = build_config_snapshot(cfg)?;
+    let config_snapshot_text =
+        serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?;
+
+    // ── 4. Per-query execution ────────────────────────────────────────────
+    //
+    // Open one `App` for the whole suite. The embedder / vector store /
+    // LLM are memoized on the App, so a 50-query run pays the ~470 MB
+    // ONNX init + Lance reopen + Ollama handshake exactly once.
+    let app = App::open_with_config(cfg.clone()).context("open App for run_eval")?;
+
+    let mut per_query: Vec<QueryResult> = Vec::with_capacity(queries.len());
+    for gq in &queries {
+        let qr = execute_query(&app, gq, opts);
+        per_query.push(qr);
+    }
+
+    // ── 5. Persist eval_runs + eval_query_results ────────────────────────
+    // Serialize per-query JSON up front so the SQLite transaction below
+    // never holds the connection mutex through serde failures.
+    let mut results: Vec<(String, String)> = Vec::with_capacity(per_query.len());
+    for qr in &per_query {
+        let json = serde_json::to_string(qr)
+            .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
+        results.push((qr.query_id.clone(), json));
+    }
+    let row = EvalRunRow {
+        run_id: &run_id,
+        suite: opts.suite.as_str(),
+        config_snapshot_json: &config_snapshot_text,
+        aggregate_json: "{}",
+        commit_hash: commit_hash.as_deref(),
+        created_at,
+    };
+    store
+        .record_eval_run_with_results(&row, &results)
+        .context("record eval_runs + eval_query_results (transactional)")?;
+
+    // ── 6. Mirror to runs_dir/<run_id>/per_query.jsonl ────────────────────
+    write_per_query_jsonl(cfg, &run_id, &per_query)?;
+
+    let duration_ms = elapsed_ms_u32(started);
+    tracing::info!(
+        target: "kb-eval",
+        run_id = %run_id,
+        suite = %opts.suite,
+        queries = per_query.len(),
+        duration_ms,
+        "kb-eval: run complete"
+    );
+
+    Ok(EvalRun {
+        run_id,
+        created_at,
+        commit_hash,
+        config_snapshot_json,
+        per_query,
+    })
+}
+
+/// Mint a `run_<lower>` identifier. UUIDv7 stands in for ULID — same
+/// timestamp-ordered monotonicity, already in workspace deps. Lower-
+/// case simple form to match the `ulid_lower()` shape the spec asks
+/// for.
+fn mint_run_id() -> String {
+    let id = uuid::Uuid::now_v7().simple().to_string();
+    format!("run_{id}")
+}
+
+/// Resolve the golden YAML path. Honors the `KB_EVAL_GOLDEN` env
+/// override; otherwise relative to CWD. The path is NOT expanded for
+/// `~` / `${...}` placeholders — direct file paths only.
+fn resolve_golden_path() -> PathBuf {
+    match std::env::var(KB_EVAL_GOLDEN) {
+        Ok(s) if !s.is_empty() => PathBuf::from(s),
+        _ => PathBuf::from(DEFAULT_GOLDEN_PATH),
+    }
+}
+
+/// Run one [`GoldenQuery`] through the kb-app facade. Errors are
+/// captured into `QueryResult.error` so the run continues.
+fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult {
+    let started = Instant::now();
+
+    let search_query = SearchQuery {
+        text: gq.query.clone(),
+        mode: opts.mode,
+        k: opts.k,
+        filters: SearchFilters::default(),
+    };
+
+    let (hits_top_k, mut error) = match app.search(search_query) {
+        Ok(hits) => (hits, None),
+        Err(e) => (Vec::new(), Some(format!("{e:#}"))),
+    };
+
+    // Optional RAG path: only attempted when `with_rag` and the search
+    // call did not already error out (we want one error per query, not
+    // a duplicated one).
+    let answer = if opts.with_rag && error.is_none() {
+        let ask_opts = kb_app::AskOpts {
+            k: opts.k,
+            explain: true,
+            mode: opts.mode,
+            temperature: opts.temperature,
+            seed: opts.seed,
+            stream_sink: None,
+        };
+        match app.ask(&gq.query, ask_opts) {
+            Ok(ans) => Some(ans),
+            Err(e) => {
+                error = Some(format!("{e:#}"));
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    QueryResult {
+        query_id: gq.id.clone(),
+        query: gq.query.clone(),
+        mode: opts.mode,
+        hits_top_k,
+        answer,
+        elapsed_ms: elapsed_ms_u32(started),
+        error,
+    }
+}
+
+/// Build the `config_snapshot_json` value: full Config as `config` plus
+/// the auxiliary version fields the spec calls out.
+///
+/// `index_version` is intentionally `None` here — it is composed
+/// dynamically by `kb-app` on a per-call basis from the configured
+/// embedder (e.g., `vec:<model>@<version>:<dim>`), so it is not a
+/// stable run-time property of the config alone. P5-2 may compose it
+/// from `embedding.{model,version,dimensions}` if it needs the field
+/// for compare reports.
+fn build_config_snapshot(cfg: &kb_config::Config) -> Result<serde_json::Value> {
+    let cfg_value = serde_json::to_value(cfg).context("serialize Config")?;
+    Ok(serde_json::json!({
+        "config": cfg_value,
+        "chunker_version": cfg.chunking.chunker_version,
+        "embedding": {
+            "model": cfg.models.embedding.model,
+            "version": cfg.models.embedding.version,
+            "dimensions": cfg.models.embedding.dimensions,
+            "provider": cfg.models.embedding.provider,
+        },
+        "llm": {
+            "model_id": cfg.models.llm.model,
+            "provider": cfg.models.llm.provider,
+        },
+        "prompt_template_version": cfg.rag.prompt_template_version,
+        "score_gate": cfg.rag.score_gate,
+        "rrf_k": cfg.search.rrf_k,
+        "index_version": serde_json::Value::Null,
+    }))
+}
+
+/// Write the `runs_dir/<run_id>/per_query.jsonl` mirror (design §6.3).
+/// Each `QueryResult` is one line, separator `\n`. The directory is
+/// created if it doesn't exist; an existing file is overwritten (a
+/// `run_id` collision would already have failed the `eval_runs`
+/// PRIMARY KEY upstream).
+fn write_per_query_jsonl(
+    cfg: &kb_config::Config,
+    run_id: &str,
+    per_query: &[QueryResult],
+) -> Result<()> {
+    // `data_dir` may itself contain `${XDG_DATA_HOME:-…}` / `~` (the
+    // workspace-default does); resolve it before threading it into the
+    // `{data_dir}` substitution of `runs_dir`.
+    let resolved_data_dir = expand_path(&cfg.storage.data_dir, "");
+    let runs_dir = expand_path(
+        &cfg.storage.runs_dir,
+        &resolved_data_dir.to_string_lossy(),
+    );
+    let run_dir = runs_dir.join(run_id);
+    std::fs::create_dir_all(&run_dir)
+        .with_context(|| format!("create run dir {}", run_dir.display()))?;
+    let path = run_dir.join("per_query.jsonl");
+    let file = File::create(&path)
+        .with_context(|| format!("create per_query.jsonl at {}", path.display()))?;
+    let mut w = BufWriter::new(file);
+    for qr in per_query {
+        serde_json::to_writer(&mut w, qr)
+            .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
+        w.write_all(b"\n")
+            .context("write newline separator in per_query.jsonl")?;
+    }
+    w.flush().context("flush per_query.jsonl")?;
+    Ok(())
+}
--- a/crates/kb-eval/src/types.rs
+++ b/crates/kb-eval/src/types.rs
@@ -0,0 +1,87 @@
+//! Public domain types for the eval runner (signatures pinned by
+//! `tasks/p5/p5-1-golden-fixture-runner.md` "Public surface").
+
+use serde::{Deserialize, Serialize};
+use time::OffsetDateTime;
+
+use kb_core::{Answer, ChunkId, DocumentId, Lang, SearchHit, SearchMode};
+
+/// One golden query loaded from `fixtures/golden_queries.yaml`.
+///
+/// Required fields: `id`, `query`. Everything else defaults to
+/// empty / `None` per the loader contract.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct GoldenQuery {
+    pub id: String,
+    pub query: String,
+    #[serde(default = "default_lang")]
+    pub lang: Lang,
+    #[serde(default)]
+    pub expected_doc_ids: Vec<DocumentId>,
+    #[serde(default)]
+    pub expected_chunk_ids: Vec<ChunkId>,
+    #[serde(default)]
+    pub must_contain: Vec<String>,
+    #[serde(default)]
+    pub forbidden: Vec<String>,
+    #[serde(default)]
+    pub difficulty: Option<String>,
+}
+
+fn default_lang() -> Lang {
+    // `Lang` is a BCP-47 string newtype (§3.3); the empty string is
+    // the safe default for golden entries that omit `lang`. Curators
+    // may fill it in later; the runner does not branch on this field.
+    Lang(String::new())
+}
+
+/// Caller-supplied knobs for one [`crate::run_eval`] invocation.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct EvalRunOpts {
+    /// Suite label persisted into `eval_runs.suite`. The shipped
+    /// fixture is `"golden"`; other suites can reuse the same runner.
+    pub suite: String,
+    /// Retrieval mode forwarded to every `kb_app::search` /
+    /// `kb_app::ask` call inside the run.
+    pub mode: SearchMode,
+    /// When `true`, also call `kb_app::ask` per query and record the
+    /// resulting `Answer` on the `QueryResult`.
+    pub with_rag: bool,
+    /// Top-k forwarded to retrieval (and `AskOpts.k` when `with_rag`).
+    pub k: usize,
+    /// Override `config.models.llm.temperature` when `with_rag`.
+    /// Determinism contract requires `Some(0.0)` + a fixed `seed`.
+    pub temperature: Option<f32>,
+    /// Override `config.models.llm.seed` when `with_rag`.
+    pub seed: Option<u64>,
+}
+
+/// One full eval run. Persisted to `eval_runs` + `eval_query_results`
+/// (design §5.7) and mirrored to `runs_dir/<run_id>/per_query.jsonl`
+/// (design §6.3).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct EvalRun {
+    pub run_id: String,
+    #[serde(with = "time::serde::rfc3339")]
+    pub created_at: OffsetDateTime,
+    pub commit_hash: Option<String>,
+    /// Snapshot of the `Config` plus auxiliary version fields
+    /// (`chunker_version`, embedding/llm/prompt versions, fusion
+    /// params, `index_version`). See [`crate::run_eval`] for the
+    /// exact shape.
+    pub config_snapshot_json: serde_json::Value,
+    pub per_query: Vec<QueryResult>,
+}
+
+/// One per-query record. Every row in `eval_query_results` has its
+/// `result_json` filled with `serde_json::to_string(&QueryResult)`.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct QueryResult {
+    pub query_id: String,
+    pub query: String,
+    pub mode: SearchMode,
+    pub hits_top_k: Vec<SearchHit>,
+    pub answer: Option<Answer>,
+    pub elapsed_ms: u32,
+    pub error: Option<String>,
+}
--- a/crates/kb-eval/tests/fixtures/eval/run-1.json
+++ b/crates/kb-eval/tests/fixtures/eval/run-1.json
@@ -0,0 +1,30 @@
+[
+  {
+    "error": null,
+    "first_hit": {
+      "chunk_id": "chunk000000000000000000000000000000",
+      "doc_id": "doc00000000000000000000000000000000",
+      "heading_path": [],
+      "score": 0.3429983854293823
+    },
+    "has_answer": false,
+    "hits_count": 1,
+    "mode": "lexical",
+    "query": "ownership",
+    "query_id": "q1"
+  },
+  {
+    "error": null,
+    "first_hit": {
+      "chunk_id": "chunk000000000000000000000000000002",
+      "doc_id": "doc00000000000000000000000000000002",
+      "heading_path": [],
+      "score": 0.3585492968559265
+    },
+    "has_answer": false,
+    "hits_count": 1,
+    "mode": "lexical",
+    "query": "heading",
+    "query_id": "q2"
+  }
+]
--- a/crates/kb-eval/tests/loader.rs
+++ b/crates/kb-eval/tests/loader.rs
@@ -0,0 +1,59 @@
+//! Loader tests for the golden-fixture YAML parser (P5-1).
+//!
+//! These tests exercise pure parsing and duplicate-id detection. The
+//! DB-validation tests for the crate-private
+//! `load_golden_set_validated` live next to the function in
+//! `src/loader.rs` (they need `pub(crate)` visibility, which integration
+//! tests can't see).
+
+use std::fs;
+
+use kb_eval::load_golden_set;
+use tempfile::tempdir;
+
+// ── 1. parser accepts well-formed YAML with optional fields ──────────────────
+
+#[test]
+fn loads_minimal_well_formed_yaml() {
+    let tmp = tempdir().unwrap();
+    let yaml_path = tmp.path().join("golden.yaml");
+    fs::write(
+        &yaml_path,
+        "- id: g1\n  query: hello\n- id: g2\n  query: \"another\"\n  lang: en\n  must_contain: [\"foo\"]\n  forbidden: [\"bar\"]\n  difficulty: easy\n",
+    )
+    .unwrap();
+
+    let qs = load_golden_set(&yaml_path).unwrap();
+    assert_eq!(qs.len(), 2);
+    assert_eq!(qs[0].id, "g1");
+    assert_eq!(qs[0].query, "hello");
+    assert!(qs[0].must_contain.is_empty());
+    assert!(qs[0].forbidden.is_empty());
+    assert!(qs[0].difficulty.is_none());
+
+    assert_eq!(qs[1].id, "g2");
+    assert_eq!(qs[1].lang.0, "en");
+    assert_eq!(qs[1].must_contain, vec!["foo".to_string()]);
+    assert_eq!(qs[1].forbidden, vec!["bar".to_string()]);
+    assert_eq!(qs[1].difficulty.as_deref(), Some("easy"));
+}
+
+// ── 2. duplicate IDs error lists every offender (sorted, deduplicated) ───────
+
+#[test]
+fn rejects_duplicate_ids() {
+    let tmp = tempdir().unwrap();
+    let yaml_path = tmp.path().join("dup.yaml");
+    fs::write(
+        &yaml_path,
+        "- id: g1\n  query: a\n- id: g2\n  query: b\n- id: g1\n  query: c\n- id: g2\n  query: d\n- id: g1\n  query: e\n",
+    )
+    .unwrap();
+
+    let err = load_golden_set(&yaml_path).unwrap_err();
+    let msg = format!("{err:#}");
+    assert!(msg.contains("duplicate query id"), "msg: {msg}");
+    // Both dup IDs should appear, sorted (BTreeSet) and deduplicated.
+    assert!(msg.contains("g1"), "msg: {msg}");
+    assert!(msg.contains("g2"), "msg: {msg}");
+}
--- a/crates/kb-eval/tests/runner.rs
+++ b/crates/kb-eval/tests/runner.rs
@@ -0,0 +1,419 @@
+//! Runner integration tests for `kb-eval` (P5-1).
+//!
+//! Drives [`kb_eval::run_eval_with_config`] end-to-end against a
+//! TempDir-backed config:
+//!
+//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the
+//!   workspace's source-of-truth,
+//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is
+//!   required (`models.embedding.provider = "none"`),
+//! - golden YAML pointed at via `KB_EVAL_GOLDEN`.
+//!
+//! Determinism: lexical-only with a fixed seed corpus produces
+//! byte-identical `per_query.jsonl` content (modulo `run_id` /
+//! `created_at`, which we strip when comparing).
+
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::sync::Mutex;
+
+use kb_config::Config;
+use kb_core::SearchMode;
+use kb_eval::{EvalRunOpts, QueryResult, run_eval_with_config};
+use kb_store_sqlite::SqliteStore;
+use rusqlite::params;
+use tempfile::TempDir;
+
+/// `KB_EVAL_GOLDEN` is process-global state. Tests touching it must
+/// serialize so they don't trample each other when `cargo test`
+/// runs them in parallel.
+static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(());
+
+// ── shared scaffolding ───────────────────────────────────────────────────────
+
+struct RunEnv {
+    temp: TempDir,
+    config: Config,
+}
+
+impl RunEnv {
+    fn new() -> Self {
+        let temp = tempfile::tempdir().unwrap();
+        let mut config = Config::defaults();
+        config.storage.data_dir = temp.path().to_string_lossy().into_owned();
+        // Force lexical-only behavior so the runner never tries to
+        // load fastembed during integration tests.
+        config.models.embedding.provider = "none".to_string();
+        config.models.embedding.dimensions = 0;
+        // Pin search defaults so test asserts are stable.
+        config.search.default_k = 5;
+
+        let store = SqliteStore::open(&config).unwrap();
+        store.run_migrations().unwrap();
+        seed_corpus(&store);
+        Self { temp, config }
+    }
+
+    fn data_dir(&self) -> PathBuf {
+        self.temp.path().to_path_buf()
+    }
+}
+
+/// Seed three (asset, document, chunk) triples with text the test
+/// queries can match against the FTS5 lexical index.
+fn seed_corpus(store: &SqliteStore) {
+    let conn = store.read_conn();
+    for (i, text) in [
+        "Rust ownership and borrow checker basics.",
+        "Cargo workspace members are listed in workspace.members.",
+        "Markdown chunking respects heading boundaries.",
+    ]
+    .iter()
+    .enumerate()
+    {
+        let doc_id = format!("doc{i:032}");
+        let chunk_id = format!("chunk{i:030}");
+        let asset_id = format!("asset{i:030}");
+        let path = format!("notes/{i}.md");
+        conn.execute(
+            "INSERT INTO assets (
+                asset_id, source_uri, workspace_path, media_type, byte_len,
+                checksum, storage_kind, storage_path, discovered_at
+             ) VALUES (?, ?, ?, '\"markdown\"', 0,
+                       'deadbeefdeadbeefdeadbeefdeadbeef',
+                       'reference', ?, '1970-01-01T00:00:00Z')",
+            params![asset_id, format!("file:///{path}"), path, path],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO documents (
+                doc_id, asset_id, workspace_path, title, lang, source_type,
+                trust_level, parser_version, doc_version, schema_version,
+                metadata_json, provenance_json, created_at, updated_at
+             ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
+                       '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
+            params![doc_id, asset_id, path],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+             ) VALUES (?, ?, ?, '[]', NULL,
+                       '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
+                       1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')",
+            params![chunk_id, doc_id, text],
+        )
+        .unwrap();
+    }
+    // Build the FTS index so lexical search returns hits. Reuses the
+    // same connection guard rather than reopening — the SAVEPOINT
+    // protocol nests correctly under the existing read_conn lock.
+    kb_store_sqlite::rebuild_chunks_fts(&conn).unwrap();
+    drop(conn);
+}
+
+fn write_golden(dir: &Path, body: &str) -> PathBuf {
+    let path = dir.join("golden.yaml");
+    fs::write(&path, body).unwrap();
+    path
+}
+
+/// Bind a fresh ephemeral port, then release it. The returned URL
+/// points at a port that was just freed; very likely still unbound
+/// when the test issues its outbound connection a moment later, in
+/// which case `connect()` fails fast with `ECONNREFUSED`. Beats
+/// hard-coding port 1 which can timeout slowly on hardened hosts.
+fn unreachable_endpoint() -> String {
+    let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
+    let port = listener.local_addr().unwrap().port();
+    drop(listener);
+    format!("http://127.0.0.1:{port}")
+}
+
+fn lexical_opts() -> EvalRunOpts {
+    EvalRunOpts {
+        suite: "test".to_string(),
+        mode: SearchMode::Lexical,
+        with_rag: false,
+        k: 5,
+        temperature: Some(0.0),
+        seed: Some(0),
+    }
+}
+
+/// Run the eval after pointing `KB_EVAL_GOLDEN` at `yaml`. The env
+/// guard must outlive the call so concurrent tests don't reset the
+/// var mid-run.
+fn run_with_golden<F: FnOnce() -> R, R>(yaml: &Path, f: F) -> R {
+    let _g = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+    // SAFETY: `KB_EVAL_GOLDEN` is a benign env var; the GOLDEN_ENV_LOCK
+    // serializes mutations so concurrent tests don't race.
+    unsafe {
+        std::env::set_var("KB_EVAL_GOLDEN", yaml);
+    }
+    let out = f();
+    unsafe {
+        std::env::remove_var("KB_EVAL_GOLDEN");
+    }
+    out
+}
+
+// ── 1. elapsed_ms recorded for every query ──────────────────────────────────
+
+#[test]
+fn runner_records_elapsed_for_every_query() {
+    let env = RunEnv::new();
+    let yaml = write_golden(
+        env.data_dir().as_path(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n- id: q3\n  query: workspace\n",
+    );
+
+    let run = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    assert_eq!(run.per_query.len(), 3);
+    for qr in &run.per_query {
+        assert_eq!(qr.mode, SearchMode::Lexical);
+        // `elapsed_ms` is `u32`; the assertion that it's a valid
+        // unsigned value is implicit. We additionally bound it well
+        // below the 4G ceiling to detect a stuck/overflow path.
+        assert!(
+            qr.elapsed_ms < 60_000,
+            "elapsed_ms suspicious: {}",
+            qr.elapsed_ms
+        );
+    }
+    // The id-list round-trips into the per-query records.
+    let ids: Vec<&str> = run.per_query.iter().map(|q| q.query_id.as_str()).collect();
+    assert_eq!(ids, vec!["q1", "q2", "q3"]);
+}
+
+// ── 2. config snapshot carries the documented version fields ────────────────
+
+#[test]
+fn runner_records_config_snapshot_with_versions() {
+    let env = RunEnv::new();
+    let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n  query: ownership\n");
+
+    let run = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    let snap = &run.config_snapshot_json;
+    assert!(snap.get("config").is_some(), "config field missing");
+    assert_eq!(
+        snap.pointer("/chunker_version"),
+        Some(&serde_json::Value::String("md-heading-v1".to_string())),
+    );
+    assert!(snap.pointer("/embedding/model").is_some());
+    assert!(snap.pointer("/embedding/dimensions").is_some());
+    assert!(snap.pointer("/llm/model_id").is_some());
+    assert_eq!(
+        snap.pointer("/prompt_template_version"),
+        Some(&serde_json::Value::String("rag-v1".to_string())),
+    );
+    assert!(snap.pointer("/score_gate").is_some());
+    assert!(snap.pointer("/rrf_k").is_some());
+}
+
+// ── 3. failing query (ask path with no Ollama) records an error ─────────────
+
+#[test]
+fn runner_captures_per_query_error_when_rag_unreachable() {
+    let env = RunEnv::new();
+    // Point Ollama at an unbound port so `ask_with_config` surfaces a
+    // connection error per query. We use bind-then-release rather than
+    // a hard-coded `:1` because port 1 is reserved-but-not-guaranteed-
+    // unbound (some hardened systems answer with ICMP unreachable
+    // instantly, others timeout slowly). TOCTOU race is theoretically
+    // possible but rare in practice and faster-failing than `:1`.
+    let mut config = env.config.clone();
+    config.models.llm.endpoint = unreachable_endpoint();
+
+    let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n  query: ownership\n");
+
+    let opts = EvalRunOpts {
+        with_rag: true,
+        ..lexical_opts()
+    };
+    let run = run_with_golden(&yaml, || run_eval_with_config(&config, &opts).unwrap());
+
+    let qr = &run.per_query[0];
+    // hits_top_k still populated by lexical search before the RAG attempt.
+    assert!(
+        !qr.hits_top_k.is_empty(),
+        "lexical hits should populate before RAG attempt"
+    );
+    assert!(qr.answer.is_none(), "no answer when RAG fails");
+    assert!(qr.error.is_some(), "error must be recorded");
+}
+
+// ── 4. eval_runs + eval_query_results rows persisted ────────────────────────
+
+#[test]
+fn runner_persists_eval_run_and_query_result_rows() {
+    let env = RunEnv::new();
+    let yaml = write_golden(
+        env.data_dir().as_path(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+
+    let run = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    // Reopen the same SQLite file with a new store handle and read
+    // the rows back. We use the inherent `read_conn` helper rather
+    // than rusqlite directly because the latter would require kb-eval
+    // to add a runtime rusqlite dep (forbidden by the spec).
+    let store = SqliteStore::open(&env.config).unwrap();
+    let conn = store.read_conn();
+
+    let n_runs: i64 = conn
+        .query_row(
+            "SELECT COUNT(*) FROM eval_runs WHERE run_id = ?",
+            params![run.run_id],
+            |r| r.get(0),
+        )
+        .unwrap();
+    assert_eq!(n_runs, 1);
+
+    let n_results: i64 = conn
+        .query_row(
+            "SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?",
+            params![run.run_id],
+            |r| r.get(0),
+        )
+        .unwrap();
+    assert_eq!(n_results, 2);
+}
+
+// ── 5. per_query.jsonl mirror exists and round-trips ────────────────────────
+
+#[test]
+fn runner_writes_per_query_jsonl_mirror() {
+    let env = RunEnv::new();
+    let yaml = write_golden(
+        env.data_dir().as_path(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+
+    let run = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    let mirror = env
+        .data_dir()
+        .join("runs")
+        .join(&run.run_id)
+        .join("per_query.jsonl");
+    assert!(
+        mirror.exists(),
+        "per_query.jsonl missing at {}",
+        mirror.display()
+    );
+    let body = fs::read_to_string(&mirror).unwrap();
+    let lines: Vec<&str> = body.lines().collect();
+    assert_eq!(lines.len(), 2);
+    let parsed: Vec<QueryResult> = lines
+        .iter()
+        .map(|l| serde_json::from_str::<QueryResult>(l).expect("valid JSONL line"))
+        .collect();
+    assert_eq!(parsed[0].query_id, "q1");
+    assert_eq!(parsed[1].query_id, "q2");
+}
+
+// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─
+
+#[test]
+fn runner_lexical_is_deterministic_per_query_payload() {
+    let env = RunEnv::new();
+    let yaml = write_golden(
+        env.data_dir().as_path(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+
+    let run_a = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+    let run_b = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    // Run-level fields (`run_id`, `created_at`) intentionally diverge;
+    // the per-query payload (which is what the snapshot fixture pins)
+    // must be byte-identical.
+    let a_json = serde_json::to_string(&run_a.per_query).unwrap();
+    let b_json = serde_json::to_string(&run_b.per_query).unwrap();
+    assert_eq!(
+        a_json, b_json,
+        "lexical-only per_query payload must be byte-identical across runs"
+    );
+}
+
+// ── 7. snapshot — per_query JSON pinned to fixtures/eval/run-1.json ─────────
+
+#[test]
+fn runner_per_query_snapshot_matches_fixture() {
+    let env = RunEnv::new();
+    let yaml = write_golden(
+        env.data_dir().as_path(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+
+    let run = run_with_golden(&yaml, || {
+        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+    });
+
+    // Fixture pins the *shape* of the per-query payload, including the
+    // first hit's stable scalar fields (chunk_id, doc_id, heading_path,
+    // fusion_score). FTS scores depend on the SQLite version, so the
+    // fusion_score is captured into the fixture from one passing run
+    // and must remain stable across re-runs against the same seeded
+    // corpus. Timing-sensitive fields (`elapsed_ms`, raw `Instant`
+    // byproducts) are excluded. Verifying byte stability is the
+    // determinism test (#6); this test verifies the field set +
+    // ordering is stable.
+    let projection: Vec<_> = run
+        .per_query
+        .iter()
+        .map(|qr| {
+            let first_hit = qr.hits_top_k.first().map(|h| {
+                serde_json::json!({
+                    "chunk_id": h.chunk_id,
+                    "doc_id": h.doc_id,
+                    "heading_path": h.heading_path,
+                    "score": h.retrieval.fusion_score,
+                })
+            });
+            serde_json::json!({
+                "query_id": qr.query_id,
+                "query": qr.query,
+                "mode": qr.mode,
+                "hits_count": qr.hits_top_k.len(),
+                "first_hit": first_hit,
+                "has_answer": qr.answer.is_some(),
+                "error": qr.error,
+            })
+        })
+        .collect();
+    let actual = serde_json::to_string_pretty(&projection).unwrap();
+
+    let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval/run-1.json");
+
+    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
+        fs::create_dir_all(fixture_path.parent().unwrap()).unwrap();
+        fs::write(&fixture_path, &actual).unwrap();
+    }
+
+    let expected = fs::read_to_string(&fixture_path)
+        .unwrap_or_else(|e| panic!("read snapshot {}: {e}", fixture_path.display()));
+    assert_eq!(
+        actual.trim(),
+        expected.trim(),
+        "snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh"
+    );
+}
--- a/crates/kb-store-sqlite/src/eval.rs
+++ b/crates/kb-store-sqlite/src/eval.rs
@@ -0,0 +1,161 @@
+//! `eval_runs` / `eval_query_results` row writers (P5-1 — design §5.7).
+//!
+//! `kb-eval` calls these directly via the inherent methods on
+//! [`SqliteStore`]. The pattern mirrors [`crate::answers`]: the trait
+//! `kb_core::DocumentStore` is the document surface, and run-level
+//! audit rows (jobs, ingest_runs, answers, eval_runs) are inherent
+//! methods so the trait surface stays small.
+
+use anyhow::{Context, Result};
+use rusqlite::params;
+use time::OffsetDateTime;
+
+use crate::error::StoreError;
+use crate::store::SqliteStore;
+
+/// One row about to land in `eval_runs` (per V001 schema).
+///
+/// `aggregate_json` is filled by P5-1 with the literal `"{}"` —
+/// metric computation lives in P5-2 and updates the row in place.
+#[derive(Clone, Debug)]
+pub struct EvalRunRow<'a> {
+    pub run_id: &'a str,
+    pub suite: &'a str,
+    pub config_snapshot_json: &'a str,
+    pub aggregate_json: &'a str,
+    pub commit_hash: Option<&'a str>,
+    pub created_at: OffsetDateTime,
+}
+
+impl SqliteStore {
+    /// Return `true` iff a row with `doc_id = ?` exists in
+    /// `documents`. Lightweight existence probe used by
+    /// `kb-eval`'s golden-fixture validator — full
+    /// `DocumentStore::get_document` deserializes blocks + metadata
+    /// JSON, which is overkill for "does this ID exist?"
+    pub fn document_exists(&self, doc_id: &str) -> Result<bool> {
+        let conn = self.lock_conn();
+        let row: Result<i64, rusqlite::Error> = conn.query_row(
+            "SELECT 1 FROM documents WHERE doc_id = ? LIMIT 1",
+            params![doc_id],
+            |r| r.get(0),
+        );
+        match row {
+            Ok(_) => Ok(true),
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false),
+            Err(e) => Err(StoreError::from(e).into()),
+        }
+    }
+
+    /// Same shape as [`Self::document_exists`] but probes the
+    /// `chunks` table by `chunk_id`.
+    pub fn chunk_exists(&self, chunk_id: &str) -> Result<bool> {
+        let conn = self.lock_conn();
+        let row: Result<i64, rusqlite::Error> = conn.query_row(
+            "SELECT 1 FROM chunks WHERE chunk_id = ? LIMIT 1",
+            params![chunk_id],
+            |r| r.get(0),
+        );
+        match row {
+            Ok(_) => Ok(true),
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false),
+            Err(e) => Err(StoreError::from(e).into()),
+        }
+    }
+
+    /// Insert one row into `eval_runs`. Mirrors the schema in
+    /// `migrations/V001__init.sql` (§5.7). Called by
+    /// `kb-eval::run_eval` once per run, after every per-query result
+    /// row has been written.
+    pub fn record_eval_run(&self, row: &EvalRunRow<'_>) -> Result<()> {
+        let created_at = row
+            .created_at
+            .format(&time::format_description::well_known::Rfc3339)
+            .context("format eval_runs.created_at")?;
+        let conn = self.lock_conn();
+        conn.execute(
+            "INSERT INTO eval_runs (
+                run_id, suite, config_snapshot_json, aggregate_json,
+                commit_hash, created_at
+            ) VALUES (?, ?, ?, ?, ?, ?)",
+            params![
+                row.run_id,
+                row.suite,
+                row.config_snapshot_json,
+                row.aggregate_json,
+                row.commit_hash,
+                created_at,
+            ],
+        )
+        .map_err(StoreError::from)?;
+        Ok(())
+    }
+
+    /// Insert one row into `eval_query_results`. PRIMARY KEY is
+    /// `(run_id, query_id)` so writing the same `(run, query)` twice
+    /// surfaces a `UNIQUE` violation through `StoreError`.
+    pub fn record_eval_query_result(
+        &self,
+        run_id: &str,
+        query_id: &str,
+        result_json: &str,
+    ) -> Result<()> {
+        let conn = self.lock_conn();
+        conn.execute(
+            "INSERT INTO eval_query_results (run_id, query_id, result_json)
+             VALUES (?, ?, ?)",
+            params![run_id, query_id, result_json],
+        )
+        .map_err(StoreError::from)?;
+        Ok(())
+    }
+
+    /// Insert the `eval_runs` row plus every `eval_query_results` row
+    /// for the same run inside a single SQLite transaction. This is the
+    /// preferred path for `kb-eval::run_eval` — a panic between the run
+    /// row and the per-query rows can't leave orphan run rows.
+    ///
+    /// `results` is a slice of `(query_id, result_json)` tuples mirroring
+    /// the per-call `record_eval_query_result` arguments.
+    pub fn record_eval_run_with_results(
+        &self,
+        row: &EvalRunRow<'_>,
+        results: &[(String, String)],
+    ) -> Result<()> {
+        let created_at = row
+            .created_at
+            .format(&time::format_description::well_known::Rfc3339)
+            .context("format eval_runs.created_at")?;
+        let mut conn = self.lock_conn();
+        let tx = conn.transaction().map_err(StoreError::from)?;
+        tx.execute(
+            "INSERT INTO eval_runs (
+                run_id, suite, config_snapshot_json, aggregate_json,
+                commit_hash, created_at
+            ) VALUES (?, ?, ?, ?, ?, ?)",
+            params![
+                row.run_id,
+                row.suite,
+                row.config_snapshot_json,
+                row.aggregate_json,
+                row.commit_hash,
+                created_at,
+            ],
+        )
+        .map_err(StoreError::from)?;
+        {
+            let mut stmt = tx
+                .prepare(
+                    "INSERT INTO eval_query_results (run_id, query_id, result_json)
+                     VALUES (?, ?, ?)",
+                )
+                .map_err(StoreError::from)?;
+            for (query_id, result_json) in results {
+                stmt.execute(params![row.run_id, query_id, result_json])
+                    .map_err(StoreError::from)?;
+            }
+        }
+        tx.commit().map_err(StoreError::from)?;
+        Ok(())
+    }
+}
--- a/crates/kb-store-sqlite/src/lib.rs
+++ b/crates/kb-store-sqlite/src/lib.rs
@@ -21,6 +21,7 @@ mod answers;
 mod documents;
 mod embeddings;
 mod error;
+mod eval;
 mod filters;
 mod fts;
 mod jobs;
@@ -29,6 +30,7 @@ mod store;

 pub use embeddings::EmbeddingRecordRow;
 pub use error::StoreError;
+pub use eval::EvalRunRow;
 pub use fts::rebuild_chunks_fts;
 pub use jobs::IngestRunRow;
 pub use store::SqliteStore;
--- a/crates/kb-store-sqlite/src/store.rs
+++ b/crates/kb-store-sqlite/src/store.rs
@@ -64,7 +64,7 @@ impl SqliteStore {
    /// temp_store=MEMORY), and create parent directories as needed.
    /// **Does not run migrations** — call [`Self::run_migrations`] next.
    pub fn open(config: &kb_config::Config) -> Result<Self> {
-        let data_dir = expand_data_dir(&config.storage.data_dir);
+        let data_dir = kb_config::expand_path(&config.storage.data_dir, "");
        std::fs::create_dir_all(&data_dir)
            .with_context(|| format!("create data_dir {}", data_dir.display()))?;
        let db_path = data_dir.join(SQLITE_FILE);
@@ -363,53 +363,3 @@ fn apply_pragmas(conn: &Connection) -> Result<()> {
    Ok(())
 }

-/// Expand the placeholders / `~` / env-vars used by `Config::storage.data_dir`.
-///
-/// Supported substitutions, in order:
-/// - `${XDG_DATA_HOME:-~/.local/share}` (and the bare `${XDG_DATA_HOME}`)
-/// - leading `~` → `$HOME`
-///
-/// If neither produces an absolute path, the input is returned as-is
-/// (relative paths are kept relative to the caller's CWD).
-fn expand_data_dir(raw: &str) -> PathBuf {
-    let mut s = raw.to_string();
-
-    // ${XDG_DATA_HOME:-~/.local/share}: respect the env override, else
-    // fall back to the suffix after `:-`.
-    if let Some(start) = s.find("${XDG_DATA_HOME") {
-        if let Some(rel_end) = s[start..].find('}') {
-            let end = start + rel_end + 1; // include trailing '}'
-            let inner = &s[start + 2..end - 1]; // strip ${ and }
-            let replacement = match std::env::var("XDG_DATA_HOME") {
-                Ok(v) if !v.is_empty() => v,
-                _ => {
-                    // inner is e.g. `XDG_DATA_HOME:-~/.local/share`.
-                    if let Some((_, default)) = inner.split_once(":-") {
-                        default.to_string()
-                    } else {
-                        // No default supplied; mimic Bash and yield "".
-                        String::new()
-                    }
-                }
-            };
-            s.replace_range(start..end, &replacement);
-        }
-    }
-
-    // ~ at the front → $HOME (or `dirs::home_dir`).
-    if let Some(rest) = s.strip_prefix('~') {
-        if let Some(home) = std::env::var_os("HOME").map(PathBuf::from).or_else(dirs_home_fallback)
-        {
-            return home.join(rest.trim_start_matches('/'));
-        }
-    }
-
-    PathBuf::from(s)
-}
-
-/// Tiny shim to avoid pulling in the `dirs` crate as a direct dep — we
-/// only fall back when `$HOME` is unset, which is exotic on the platforms
-/// we target. Returns `None` so the caller keeps the literal `~`.
-fn dirs_home_fallback() -> Option<PathBuf> {
-    None
-}
--- a/crates/kb-store-vector/src/paths.rs
+++ b/crates/kb-store-vector/src/paths.rs
@@ -1,51 +1,10 @@
 //! Path expansion + table-name sanitization.
 //!
-//! Mirrors `kb-store-sqlite::store::expand_data_dir` and
-//! `kb-embed-local::expand_path` so the three crates resolve
-//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. A
-//! shared helper would live in `kb-config`, but the task spec forbids
-//! adding new types to `kb-config`, so we keep a private clone.
-
-use std::path::PathBuf;
-
-/// Expand `{data_dir}` → `data_dir`, `${XDG_DATA_HOME:-…}` → env or
-/// default, leading `~` → `$HOME`. Pass an empty `data_dir` when
-/// resolving `data_dir` itself (the `{data_dir}` substitution is a
-/// no-op in that case).
-pub(crate) fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
-    let mut s = raw.to_string();
-
-    if !data_dir.is_empty() {
-        s = s.replace("{data_dir}", data_dir);
-    }
-
-    // ${XDG_DATA_HOME:-~/.local/share}: env override, else default after `:-`.
-    if let Some(start) = s.find("${XDG_DATA_HOME") {
-        if let Some(rel_end) = s[start..].find('}') {
-            let end = start + rel_end + 1;
-            let inner = &s[start + 2..end - 1];
-            let replacement = match std::env::var("XDG_DATA_HOME") {
-                Ok(v) if !v.is_empty() => v,
-                _ => {
-                    if let Some((_, default)) = inner.split_once(":-") {
-                        default.to_string()
-                    } else {
-                        String::new()
-                    }
-                }
-            };
-            s.replace_range(start..end, &replacement);
-        }
-    }
-
-    if let Some(rest) = s.strip_prefix('~') {
-        if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
-            return home.join(rest.trim_start_matches('/'));
-        }
-    }
-
-    PathBuf::from(s)
-}
+//! `expand_path` lives in `kb-config` so `kb-store-vector`,
+//! `kb-store-sqlite`, `kb-embed-local`, and `kb-eval` all resolve
+//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. This
+//! module re-exports nothing; consumers within the crate `use
+//! kb_config::expand_path` directly.

 /// Build the per-model Lance table name. Per design §6.3:
 /// `chunk_embeddings_<model>_<dim>.lance`. Model IDs may contain
@@ -104,16 +63,4 @@ mod tests {
            "chunk_embeddings_BAAI_bge-small-en_384"
        );
    }
-
-    #[test]
-    fn expand_path_substitutes_data_dir() {
-        let p = expand_path("{data_dir}/lancedb", "/tmp/kbtest");
-        assert_eq!(p, PathBuf::from("/tmp/kbtest/lancedb"));
-    }
-
-    #[test]
-    fn expand_path_passthrough_absolute() {
-        let p = expand_path("/abs/dir", "/ignored");
-        assert_eq!(p, PathBuf::from("/abs/dir"));
-    }
 }
--- a/crates/kb-store-vector/src/store.rs
+++ b/crates/kb-store-vector/src/store.rs
@@ -22,8 +22,10 @@ use serde_json::json;
 use time::OffsetDateTime;
 use tokio::runtime::{Builder as RuntimeBuilder, Runtime};

+use kb_config::expand_path;
+
 use crate::arrow_batch::{build_batch, schema_for, schema_params_hash};
-use crate::paths::{expand_path, lance_table_name};
+use crate::paths::lance_table_name;

 /// Overfetch multiplier: when post-filtering Lance results against
 /// SQLite-side filters we ask for `2 * k` candidates so a moderately
--- a/fixtures/golden_queries.yaml
+++ b/fixtures/golden_queries.yaml
@@ -0,0 +1,45 @@
+# Golden query suite for `kb eval run` (P5-1 / P5-2).
+#
+# Top-level: list of queries. Required fields: `id`, `query`. All
+# others are optional and default to empty / null.
+#
+# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to
+# real rows in the active workspace's SQLite store at run time. Stale
+# references make the runner bail at start. The shipped template
+# leaves them empty so the file is loadable on any fresh workspace —
+# fill them in after a `kb ingest` to enable hit@k / MRR metrics
+# (P5-2).
+#
+# `must_contain` / `forbidden` drive the rule-based groundedness
+# metric (P5-2).
+
+- id: g001
+  query: "Cargo workspace 멤버 추가하는 법"
+  lang: ko
+  must_contain: ["[workspace]", "members"]
+  difficulty: easy
+
+- id: g002
+  query: "What is Rust ownership?"
+  lang: en
+  must_contain: ["borrow", "lifetime"]
+  difficulty: easy
+
+- id: g003
+  query: "Markdown chunking 규칙은?"
+  lang: ko
+  must_contain: ["heading"]
+  forbidden: ["embedding"]
+  difficulty: medium
+
+- id: g004
+  query: "How does FTS5 tokenization work for Korean text?"
+  lang: en
+  must_contain: ["unicode61", "tokenizer"]
+  difficulty: medium
+
+- id: g005
+  query: "RAG citation 검증은 어떻게 동작?"
+  lang: ko
+  must_contain: ["citation", "marker"]
+  difficulty: hard
--- a/tasks/p5/p5-1-golden-fixture-runner.md
+++ b/tasks/p5/p5-1-golden-fixture-runner.md
@@ -3,7 +3,7 @@ phase: P5
 component: kb-eval (runner)
 task_id: p5-1
 title: "Golden query fixture loader + per-query runner"
-status: planned
+status: completed
 depends_on: [p4-3]
 unblocks: [p5-2]
 contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md
@@ -149,6 +149,6 @@ All tests under `cargo test -p kb-eval runner`.

 ## Risks / notes

- Large RAG suites can be slow. Consider `--max-queries` for incremental runs (kept here as a flag spec; implementation is the responsibility of this task).
+- Large RAG suites can be slow. `--max-queries` flag is deferred to P5-2 / a follow-up. Rationale: (a) the runner currently runs all queries unconditionally; (b) without metrics aggregation it adds little incremental value; (c) trivial to add as a `Vec::truncate` once the eval CLI subcommand exists.
 - `expected_chunk_id` references depend on `chunker_version`. If chunker bumps, golden set must be re-curated. Fail fast in the loader.
 - Use `time::OffsetDateTime::now_utc()` for `created_at`; never local TZ.