feat(p5-1): kb-eval crate — golden-fixture runner + eval persistence

- new kb-eval crate: load_golden_set (YAML) + run_eval (per-query search/ask + persistence) - new kb-store-sqlite::eval module: record_eval_run_with_results (transactional), document_exists / chunk_exists probes - fixtures/golden_queries.yaml: 5-entry KO+EN template - tests: 13 pass (loader: parse, dup-id, missing chunk_id; runner: elapsed, snapshot, error capture, JSONL, determinism, persistence, config_snapshot) - per_query.jsonl mirror written to runs_dir/<run_id>/ - temperature=0 + fixed seed → byte-identical per_query.jsonl (lexical) deviations from spec (documented in code): - run_id uses uuid::Uuid::now_v7().simple() (timestamp-ordered hex) instead of ULID — uuid already in workspace deps - load_golden_set_validated kept #[cfg(test)] pub(crate) — production inlines validate_against_db - snapshot fixture uses normalized projection (id/query/mode/first_hit) — full byte-determinism covered by separate test - index_version in config_snapshot left null (composed per call by kb-app, not config-level) deferred to follow-up: - App reuse across queries (currently rebuilds App per query) - expand_path hoist to kb-config (3 crate clones now) - --max-queries flag (deferred to P5-2 per updated spec) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 18:01:09 +00:00
parent 2c0607ae95
commit 58a11cc2b8
14 changed files with 1443 additions and 2 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3478,6 +3478,25 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kb-eval"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "kb-app",
+ "kb-config",
+ "kb-core",
+ "kb-store-sqlite",
+ "rusqlite",
+ "serde",
+ "serde_json",
+ "serde_yaml",
+ "tempfile",
+ "time",
+ "tracing",
+ "uuid",
+]
+
 [[package]]
 name = "kb-llm"
 version = "0.1.0"
@@ -6380,6 +6399,19 @@ dependencies = [
 "syn 2.0.117",
 ]

+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap 2.14.0",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
 [[package]]
 name = "serde_yaml_ng"
 version = "0.10.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,7 @@ members = [
    "crates/kb-rag",
    "crates/kb-app",
    "crates/kb-cli",
+    "crates/kb-eval",
 ]

 [workspace.package]
@@ -32,6 +33,9 @@ anyhow       = "1"
 thiserror    = "2"
 serde        = { version = "1", features = ["derive"] }
 serde_json   = "1"
+# Golden-fixture loader (P5-1, kb-eval) parses YAML; declared once in the
+# workspace so eval crates share a major. NOTE: 0.9.x is deprecated upstream.
+serde_yaml   = "0.9"
 time         = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
 uuid         = { version = "1", features = ["v7", "serde"] }
 blake3       = "1"
--- a/crates/kb-eval/Cargo.toml
+++ b/crates/kb-eval/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name          = "kb-eval"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Golden-fixture eval runner: load YAML, drive kb-app search/ask, persist eval_runs / eval_query_results / per_query.jsonl"
+
+[dependencies]
+# Allowed deps per p5-1 spec — domain types + facade only.
+kb-core         = { path = "../kb-core" }
+kb-config       = { path = "../kb-config" }
+kb-app          = { path = "../kb-app" }
+kb-store-sqlite = { path = "../kb-store-sqlite" }
+serde           = { workspace = true }
+serde_json      = { workspace = true }
+serde_yaml      = { workspace = true }
+time            = { workspace = true }
+tracing         = { workspace = true }
+anyhow          = { workspace = true }
+# `uuid::Uuid::now_v7()` powers the `run_<ulid_lower>`-shaped run_id;
+# v7 UUIDs are timestamp-ordered (same monotonicity as ULID) and `uuid`
+# is already in workspace deps, so we avoid pulling a new ULID crate
+# just for the lower-cased timestamp prefix.
+uuid            = { workspace = true }
+
+[dev-dependencies]
+tempfile        = { workspace = true }
+rusqlite        = { workspace = true }
--- a/crates/kb-eval/src/lib.rs
+++ b/crates/kb-eval/src/lib.rs
@@ -0,0 +1,29 @@
+//! `kb-eval` — golden-fixture eval runner (P5-1).
+//!
+//! Loads `fixtures/golden_queries.yaml`, runs each entry through the
+//! [`kb_app`] facade (lexical / vector / hybrid + optional RAG), and
+//! persists results into `eval_runs` / `eval_query_results` plus
+//! `runs_dir/<run_id>/per_query.jsonl` (design §5.7, §6.3).
+//!
+//! Metric computation lives in P5-2 (`kb-eval::metrics`); this crate is
+//! the **data collector** only.
+//!
+//! ## Allowed deps (per task spec)
+//!
+//! `kb-core`, `kb-config`, `kb-app`, `kb-store-sqlite`, plus `serde`,
+//! `serde_yaml`, `serde_json`, `time`, `tracing`,
+//! `anyhow`, `uuid`. Retrieval / embedding / LLM crates are NOT
+//! reachable here — every retrieval and `ask` call must go through
+//! `kb-app`.
+//!
+//! ## `run_id` recipe
+//!
+//! `run_id` is `run_` + UUIDv7 in simple form — timestamp-ordered, lowercase hex.
+
+mod loader;
+mod runner;
+mod types;
+
+pub use loader::load_golden_set;
+pub use runner::{run_eval, run_eval_with_config};
+pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
--- a/crates/kb-eval/src/loader.rs
+++ b/crates/kb-eval/src/loader.rs
@@ -0,0 +1,229 @@
+//! Golden-set YAML loader.
+//!
+//! Two entry points:
+//!
+//! - [`load_golden_set`] — pure YAML parse + uniqueness check. Needs
+//!   no SQLite store; also the first step of the production runner.
+//! - `load_golden_set_validated` — test-only (`#[cfg(test)]`) wrapper
+//!   that additionally verifies every `expected_doc_ids` /
+//!   `expected_chunk_ids` entry exists in the SQLite DB the supplied
+//!   [`kb_config::Config`] points at. Production gets the same
+//!   fail-fast check via `validate_against_db` in the runner.
+
+use std::collections::{BTreeSet, HashSet};
+use std::path::Path;
+
+use anyhow::{Context, Result, anyhow};
+use kb_store_sqlite::SqliteStore;
+
+use crate::types::GoldenQuery;
+
+/// Parse the YAML at `path` into a `Vec<GoldenQuery>` and check that
+/// every `id` is unique.
+///
+/// The YAML is expected to be a top-level list of mappings. Required
+/// fields per entry: `id`, `query`. All other fields default to empty /
+/// `None` per [`GoldenQuery`]'s `serde(default)` annotations.
+pub fn load_golden_set(path: &Path) -> Result<Vec<GoldenQuery>> {
+    let bytes =
+        std::fs::read(path).with_context(|| format!("read golden YAML from {}", path.display()))?;
+    let queries: Vec<GoldenQuery> = serde_yaml::from_slice(&bytes)
+        .with_context(|| format!("parse golden YAML at {}", path.display()))?;
+    check_unique_ids(&queries)?;
+    Ok(queries)
+}
+
+/// Same as [`load_golden_set`] but additionally validates that every
+/// `expected_doc_id` and `expected_chunk_id` referenced by the loaded
+/// entries actually exists in the SQLite database `cfg` resolves to.
+///
+/// Missing IDs are surfaced as a single sorted error listing every
+/// offender, so curators can fix the whole set in one pass.
+///
+/// Currently used only by the in-module tests below; production code
+/// inlines `load_golden_set` + `validate_against_db` in
+/// [`crate::run_eval_with_config`] so the validation can run against
+/// an already-opened [`kb_config::Config`] without re-parsing YAML.
+#[cfg(test)]
+pub(crate) fn load_golden_set_validated(
+    yaml_path: &Path,
+    cfg: &kb_config::Config,
+) -> Result<Vec<GoldenQuery>> {
+    let queries = load_golden_set(yaml_path)?;
+    validate_against_db(&queries, cfg)?;
+    Ok(queries)
+}
+
+fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> {
+    let mut seen: HashSet<&str> = HashSet::new();
+    let mut dups: BTreeSet<String> = BTreeSet::new();
+    for q in queries {
+        if !seen.insert(q.id.as_str()) {
+            dups.insert(q.id.clone());
+        }
+    }
+    if dups.is_empty() {
+        Ok(())
+    } else {
+        let list: Vec<String> = dups.into_iter().collect();
+        Err(anyhow!("duplicate query id(s): {}", list.join(", ")))
+    }
+}
+
+/// Read every doc_id / chunk_id referenced by `queries` and confirm
+/// SQLite has rows for them. Builds a sorted, deduplicated error
+/// message listing every missing ID.
+pub(crate) fn validate_against_db(queries: &[GoldenQuery], cfg: &kb_config::Config) -> Result<()> {
+    // Short-circuit when there is nothing to validate — saves opening
+    // SQLite for golden sets that omit expected_*_ids entirely.
+    let needs_check = queries
+        .iter()
+        .any(|q| !q.expected_doc_ids.is_empty() || !q.expected_chunk_ids.is_empty());
+    if !needs_check {
+        return Ok(());
+    }
+
+    let store = SqliteStore::open(cfg).context("open SqliteStore for golden validation")?;
+    store
+        .run_migrations()
+        .context("run migrations for golden validation")?;
+
+    let mut missing_docs: BTreeSet<String> = BTreeSet::new();
+    let mut missing_chunks: BTreeSet<String> = BTreeSet::new();
+
+    for q in queries {
+        for did in &q.expected_doc_ids {
+            let exists = store
+                .document_exists(&did.0)
+                .with_context(|| format!("probe document {}", did.0))?;
+            if !exists {
+                missing_docs.insert(did.0.clone());
+            }
+        }
+        for cid in &q.expected_chunk_ids {
+            let exists = store
+                .chunk_exists(&cid.0)
+                .with_context(|| format!("probe chunk {}", cid.0))?;
+            if !exists {
+                missing_chunks.insert(cid.0.clone());
+            }
+        }
+    }
+
+    if missing_docs.is_empty() && missing_chunks.is_empty() {
+        return Ok(());
+    }
+
+    let mut parts: Vec<String> = Vec::new();
+    if !missing_docs.is_empty() {
+        parts.push(format!(
+            "missing doc_ids: {}",
+            missing_docs.into_iter().collect::<Vec<_>>().join(", ")
+        ));
+    }
+    if !missing_chunks.is_empty() {
+        parts.push(format!(
+            "missing chunk_ids: {}",
+            missing_chunks.into_iter().collect::<Vec<_>>().join(", ")
+        ));
+    }
+    Err(anyhow!(
+        "golden set references unknown IDs — {}",
+        parts.join("; ")
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    //! Tests that exercise the crate-private
+    //! [`load_golden_set_validated`]. The pure-parser cases live in
+    //! `tests/loader.rs`; only the validated-variant cases need to sit
+    //! next to the function so they can see the `pub(crate)` symbol.
+    use super::*;
+    use kb_config::Config;
+    use kb_store_sqlite::SqliteStore;
+    use rusqlite::params;
+    use std::fs;
+    use tempfile::tempdir;
+
+    #[test]
+    fn rejects_unknown_expected_chunk_id() {
+        let tmp = tempdir().unwrap();
+        let mut config = Config::defaults();
+        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();
+
+        let store = SqliteStore::open(&config).unwrap();
+        store.run_migrations().unwrap();
+        seed_one_chunk(&store, "doc_present", "chunk_present");
+
+        let yaml_path = tmp.path().join("golden.yaml");
+        fs::write(
+            &yaml_path,
+            "- id: g1\n  query: hello\n  expected_chunk_ids: [\"chunk_present\", \"chunk_missing\"]\n",
+        )
+        .unwrap();
+
+        let err = load_golden_set_validated(&yaml_path, &config).unwrap_err();
+        let msg = format!("{err:#}");
+        assert!(msg.contains("missing chunk_ids"), "msg: {msg}");
+        assert!(msg.contains("chunk_missing"), "msg: {msg}");
+        assert!(!msg.contains("chunk_present"), "msg: {msg}");
+    }
+
+    #[test]
+    fn accepts_resolved_ids() {
+        let tmp = tempdir().unwrap();
+        let mut config = Config::defaults();
+        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();
+
+        let store = SqliteStore::open(&config).unwrap();
+        store.run_migrations().unwrap();
+        seed_one_chunk(&store, "doc_present", "chunk_present");
+
+        let yaml_path = tmp.path().join("golden.yaml");
+        fs::write(
+            &yaml_path,
+            "- id: g1\n  query: hello\n  expected_doc_ids: [\"doc_present\"]\n  expected_chunk_ids: [\"chunk_present\"]\n",
+        )
+        .unwrap();
+
+        let qs = load_golden_set_validated(&yaml_path, &config).unwrap();
+        assert_eq!(qs.len(), 1);
+    }
+
+    fn seed_one_chunk(store: &SqliteStore, doc_id: &str, chunk_id: &str) {
+        let conn = store.read_conn();
+        let asset_id = format!("a_{doc_id}");
+        conn.execute(
+            "INSERT OR IGNORE INTO assets (
+                asset_id, source_uri, workspace_path, media_type, byte_len,
+                checksum, storage_kind, storage_path, discovered_at
+             ) VALUES (?, ?, ?, '\"markdown\"', 0,
+                       'deadbeefdeadbeefdeadbeefdeadbeef',
+                       'reference', ?, '1970-01-01T00:00:00Z')",
+            params![asset_id, "file:///tmp/x.md", "x.md", "x.md"],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT OR IGNORE INTO documents (
+                doc_id, asset_id, workspace_path, title, lang, source_type,
+                trust_level, parser_version, doc_version, schema_version,
+                metadata_json, provenance_json, created_at, updated_at
+             ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
+                       '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
+            params![doc_id, asset_id, "x.md"],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT OR IGNORE INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+             ) VALUES (?, ?, 'hi', '[]', NULL,
+                       '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
+                       1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
+            params![chunk_id, doc_id],
+        )
+        .unwrap();
+    }
+}
--- a/crates/kb-eval/src/runner.rs
+++ b/crates/kb-eval/src/runner.rs
@@ -0,0 +1,330 @@
+//! Per-query eval runner. See [`run_eval`] / [`run_eval_with_config`].
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::PathBuf;
+
+use anyhow::{Context, Result};
+use kb_core::{SearchFilters, SearchQuery};
+use kb_store_sqlite::{EvalRunRow, SqliteStore};
+use time::OffsetDateTime;
+
+use crate::loader::{load_golden_set, validate_against_db};
+use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
+
+/// Env var that overrides the default `fixtures/golden_queries.yaml`
+/// path. Resolved relative to the current working directory.
+const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN";
+
+/// Default golden YAML path (relative to CWD), used when `KB_EVAL_GOLDEN` is unset or empty.
+const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml";
+
+/// Run the golden suite end-to-end against the active XDG-loaded
+/// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with
+/// `Config::load(None)`.
+pub fn run_eval(opts: &EvalRunOpts) -> Result<EvalRun> {
+    let cfg = kb_config::Config::load(None).context("load Config for run_eval")?;
+    run_eval_with_config(&cfg, opts)
+}
+
+/// Run the golden suite end-to-end against an explicit
+/// [`kb_config::Config`]. Used by integration tests (TempDir-backed
+/// data_dir) and any future caller that wants to drive the runner
+/// against a non-default config.
+pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Result<EvalRun> {
+    let started = std::time::Instant::now();
+
+    // ── 1. Load golden set ────────────────────────────────────────────────
+    let golden_path = resolve_golden_path();
+    tracing::debug!(
+        target: "kb-eval",
+        path = %golden_path.display(),
+        "kb-eval: loading golden set"
+    );
+    let queries = load_golden_set(&golden_path).with_context(|| {
+        format!(
+            "load golden set from {} (override via KB_EVAL_GOLDEN)",
+            golden_path.display()
+        )
+    })?;
+    validate_against_db(&queries, cfg)?;
+
+    // ── 2. Mint identifiers + open store ──────────────────────────────────
+    let run_id = mint_run_id();
+    let created_at = OffsetDateTime::now_utc();
+    let commit_hash = std::env::var("KB_COMMIT_HASH")
+        .ok()
+        .filter(|s| !s.is_empty());
+
+    // Open the store before running queries so DB/migration errors fail
+    // fast; step 5's single transactional write reuses this handle.
+    let store = SqliteStore::open(cfg).context("open SqliteStore for run_eval")?;
+    store
+        .run_migrations()
+        .context("run migrations for run_eval")?;
+
+    // ── 3. Build config_snapshot_json ─────────────────────────────────────
+    let config_snapshot_json = build_config_snapshot(cfg)?;
+    let config_snapshot_text =
+        serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?;
+
+    // ── 4. Per-query execution ────────────────────────────────────────────
+    let mut per_query: Vec<QueryResult> = Vec::with_capacity(queries.len());
+    for gq in &queries {
+        let qr = execute_query(cfg, gq, opts);
+        per_query.push(qr);
+    }
+
+    // ── 5. Persist eval_runs + eval_query_results ────────────────────────
+    // Serialize per-query JSON up front so the SQLite transaction below
+    // never holds the connection mutex through serde failures.
+    let mut results: Vec<(String, String)> = Vec::with_capacity(per_query.len());
+    for qr in &per_query {
+        let json = serde_json::to_string(qr)
+            .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
+        results.push((qr.query_id.clone(), json));
+    }
+    let row = EvalRunRow {
+        run_id: &run_id,
+        suite: opts.suite.as_str(),
+        config_snapshot_json: &config_snapshot_text,
+        aggregate_json: "{}",
+        commit_hash: commit_hash.as_deref(),
+        created_at,
+    };
+    store
+        .record_eval_run_with_results(&row, &results)
+        .context("record eval_runs + eval_query_results (transactional)")?;
+
+    // ── 6. Mirror to runs_dir/<run_id>/per_query.jsonl ────────────────────
+    write_per_query_jsonl(cfg, &run_id, &per_query)?;
+
+    let duration_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32;
+    tracing::info!(
+        target: "kb-eval",
+        run_id = %run_id,
+        suite = %opts.suite,
+        queries = per_query.len(),
+        duration_ms,
+        "kb-eval: run complete"
+    );
+
+    Ok(EvalRun {
+        run_id,
+        created_at,
+        commit_hash,
+        config_snapshot_json,
+        per_query,
+    })
+}
+
+/// Mint a `run_<lower>` identifier. UUIDv7 stands in for ULID — same
+/// timestamp-ordered monotonicity, already in workspace deps. Lower-
+/// case simple form to match the `ulid_lower()` shape the spec asks
+/// for.
+fn mint_run_id() -> String {
+    let id = uuid::Uuid::now_v7().simple().to_string();
+    format!("run_{id}")
+}
+
+/// Resolve the golden YAML path. Honors a non-empty `KB_EVAL_GOLDEN`
+/// override; otherwise falls back to the CWD-relative default. No `~` /
+/// `${...}` placeholder expansion is applied — direct file paths only.
+fn resolve_golden_path() -> PathBuf {
+    match std::env::var(KB_EVAL_GOLDEN) {
+        Ok(s) if !s.is_empty() => PathBuf::from(s),
+        _ => PathBuf::from(DEFAULT_GOLDEN_PATH),
+    }
+}
+
+/// Run one [`GoldenQuery`] through the kb-app facade. Errors are
+/// captured into `QueryResult.error` so the run continues.
+fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult {
+    let started = std::time::Instant::now();
+
+    let search_query = SearchQuery {
+        text: gq.query.clone(),
+        mode: opts.mode,
+        k: opts.k,
+        filters: SearchFilters::default(),
+    };
+
+    let (hits_top_k, mut error) = match kb_app::search_with_config(cfg.clone(), search_query) {
+        Ok(hits) => (hits, None),
+        Err(e) => (Vec::new(), Some(format!("{e:#}"))),
+    };
+
+    // Optional RAG path: only attempted when `with_rag` and the search
+    // call did not already error out (we want one error per query, not
+    // a duplicated one).
+    let answer = if opts.with_rag && error.is_none() {
+        let ask_opts = kb_app::AskOpts {
+            k: opts.k,
+            explain: true,
+            mode: opts.mode,
+            temperature: opts.temperature,
+            seed: opts.seed,
+            stream_sink: None,
+        };
+        match kb_app::ask_with_config(cfg.clone(), &gq.query, ask_opts) {
+            Ok(ans) => Some(ans),
+            Err(e) => {
+                error = Some(format!("{e:#}"));
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    let elapsed_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32;
+
+    QueryResult {
+        query_id: gq.id.clone(),
+        query: gq.query.clone(),
+        mode: opts.mode,
+        hits_top_k,
+        answer,
+        elapsed_ms,
+        error,
+    }
+}
+
+/// Build the `config_snapshot_json` value: full Config as `config` plus
+/// the auxiliary version fields the spec calls out.
+///
+/// `index_version` is intentionally `null` here — it is composed
+/// dynamically by `kb-app` on a per-call basis from the configured
+/// embedder (e.g., `vec:<model>@<version>:<dim>`), so it is not a
+/// stable run-time property of the config alone. P5-2 may compose it
+/// from `embedding.{model,version,dimensions}` if it needs the field
+/// for compare reports.
+fn build_config_snapshot(cfg: &kb_config::Config) -> Result<serde_json::Value> {
+    let cfg_value = serde_json::to_value(cfg).context("serialize Config")?;
+    Ok(serde_json::json!({
+        "config": cfg_value,
+        "chunker_version": cfg.chunking.chunker_version,
+        "embedding": {
+            "model": cfg.models.embedding.model,
+            "version": cfg.models.embedding.version,
+            "dimensions": cfg.models.embedding.dimensions,
+            "provider": cfg.models.embedding.provider,
+        },
+        "llm": {
+            "model_id": cfg.models.llm.model,
+            "provider": cfg.models.llm.provider,
+        },
+        "prompt_template_version": cfg.rag.prompt_template_version,
+        "score_gate": cfg.rag.score_gate,
+        "rrf_k": cfg.search.rrf_k,
+        "index_version": serde_json::Value::Null,
+    }))
+}
+
+/// Write the `runs_dir/<run_id>/per_query.jsonl` mirror (design §6.3).
+/// Each `QueryResult` is one line, separator `\n`. The directory is
+/// created if it doesn't exist; an existing file is overwritten (a
+/// `run_id` collision would already have failed the `eval_runs`
+/// PRIMARY KEY upstream).
+fn write_per_query_jsonl(
+    cfg: &kb_config::Config,
+    run_id: &str,
+    per_query: &[QueryResult],
+) -> Result<()> {
+    let runs_dir = expand_path(&cfg.storage.runs_dir, &cfg.storage.data_dir);
+    let run_dir = runs_dir.join(run_id);
+    std::fs::create_dir_all(&run_dir)
+        .with_context(|| format!("create run dir {}", run_dir.display()))?;
+    let path = run_dir.join("per_query.jsonl");
+    let file = File::create(&path)
+        .with_context(|| format!("create per_query.jsonl at {}", path.display()))?;
+    let mut w = BufWriter::new(file);
+    for qr in per_query {
+        serde_json::to_writer(&mut w, qr)
+            .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
+        w.write_all(b"\n")
+            .context("write newline separator in per_query.jsonl")?;
+    }
+    w.flush().context("flush per_query.jsonl")?;
+    Ok(())
+}
+
+/// Expand `{data_dir}` / `${XDG_DATA_HOME:-…}` / leading `~`. Mirror
+/// of `kb-store-vector::paths::expand_path` and
+/// `kb-store-sqlite::expand_data_dir` — kept private here because
+/// `kb-config` does not (yet) expose a shared resolver helper.
+fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
+    let mut s = raw.to_string();
+
+    // First, resolve `data_dir` itself so any `{data_dir}` substitution
+    // points at an already-expanded base path. `data_dir` may contain
+    // `${XDG_DATA_HOME:-…}` and `~`; resolve them once and re-use the
+    // result.
+    let resolved_data_dir = expand_xdg_and_tilde(data_dir);
+    s = s.replace("{data_dir}", &resolved_data_dir);
+
+    expand_xdg_and_tilde_path(&s)
+}
+
+fn expand_xdg_and_tilde(raw: &str) -> String {
+    let s = expand_xdg(raw);
+    expand_tilde_str(&s)
+}
+
+fn expand_xdg_and_tilde_path(raw: &str) -> PathBuf {
+    let s = expand_xdg_and_tilde(raw);
+    PathBuf::from(s)
+}
+
+fn expand_xdg(raw: &str) -> String {
+    let mut s = raw.to_string();
+    if let Some(start) = s.find("${XDG_DATA_HOME") {
+        if let Some(rel_end) = s[start..].find('}') {
+            let end = start + rel_end + 1;
+            let inner = &s[start + 2..end - 1];
+            let replacement = match std::env::var("XDG_DATA_HOME") {
+                Ok(v) if !v.is_empty() => v,
+                _ => match inner.split_once(":-") {
+                    Some((_, default)) => default.to_string(),
+                    None => String::new(),
+                },
+            };
+            s.replace_range(start..end, &replacement);
+        }
+    }
+    s
+}
+
+fn expand_tilde_str(raw: &str) -> String {
+    if let Some(rest) = raw.strip_prefix("~/") {
+        if let Some(home) = std::env::var_os("HOME") {
+            let mut p = PathBuf::from(home);
+            p.push(rest);
+            return p.to_string_lossy().into_owned();
+        }
+    }
+    if raw == "~" {
+        if let Some(home) = std::env::var_os("HOME") {
+            return PathBuf::from(home).to_string_lossy().into_owned();
+        }
+    }
+    raw.to_string()
+}
+
+#[cfg(test)]
+mod expand_tests {
+    use super::*;
+    use std::path::Path;
+
+    #[test]
+    fn expand_substitutes_data_dir() {
+        let p = expand_path("{data_dir}/runs", "/tmp/kbtest");
+        assert_eq!(p, Path::new("/tmp/kbtest/runs"));
+    }
+
+    #[test]
+    fn expand_passthrough_absolute() {
+        let p = expand_path("/abs/runs", "/ignored");
+        assert_eq!(p, Path::new("/abs/runs"));
+    }
+}
--- a/crates/kb-eval/src/types.rs
+++ b/crates/kb-eval/src/types.rs
@@ -0,0 +1,87 @@
+//! Public domain types for the eval runner (signatures pinned by
+//! `tasks/p5/p5-1-golden-fixture-runner.md` "Public surface").
+
+use serde::{Deserialize, Serialize};
+use time::OffsetDateTime;
+
+use kb_core::{Answer, ChunkId, DocumentId, Lang, SearchHit, SearchMode};
+
+/// One golden query loaded from `fixtures/golden_queries.yaml`.
+///
+/// Required fields: `id`, `query`. Everything else defaults to
+/// empty / `None` per the loader contract.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct GoldenQuery {
+    pub id: String,
+    pub query: String,
+    #[serde(default = "default_lang")]
+    pub lang: Lang,
+    #[serde(default)]
+    pub expected_doc_ids: Vec<DocumentId>,
+    #[serde(default)]
+    pub expected_chunk_ids: Vec<ChunkId>,
+    #[serde(default)]
+    pub must_contain: Vec<String>,
+    #[serde(default)]
+    pub forbidden: Vec<String>,
+    #[serde(default)]
+    pub difficulty: Option<String>,
+}
+
+fn default_lang() -> Lang {
+    // `Lang` is a BCP-47 string newtype (§3.3); the empty string is
+    // the safe default for golden entries that omit `lang`. Curators
+    // may fill it in later; the runner does not branch on this field.
+    Lang(String::new())
+}
+
+/// Caller-supplied knobs for one [`crate::run_eval`] invocation.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct EvalRunOpts {
+    /// Suite label persisted into `eval_runs.suite`. The shipped
+    /// fixture is `"golden"`; other suites can reuse the same runner.
+    pub suite: String,
+    /// Retrieval mode forwarded to every `kb_app::search` /
+    /// `kb_app::ask` call inside the run.
+    pub mode: SearchMode,
+    /// When `true`, also call `kb_app::ask` per query and record the
+    /// resulting `Answer` on the `QueryResult`.
+    pub with_rag: bool,
+    /// Top-k forwarded to retrieval (and `AskOpts.k` when `with_rag`).
+    pub k: usize,
+    /// Override `config.models.llm.temperature` when `with_rag`.
+    /// Determinism contract requires `Some(0.0)` + a fixed `seed`.
+    pub temperature: Option<f32>,
+    /// Override `config.models.llm.seed` when `with_rag`.
+    pub seed: Option<u64>,
+}
+
+/// One full eval run. Persisted to `eval_runs` + `eval_query_results`
+/// (design §5.7) and mirrored to `runs_dir/<run_id>/per_query.jsonl`
+/// (design §6.3).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct EvalRun {
+    pub run_id: String,
+    #[serde(with = "time::serde::rfc3339")]
+    pub created_at: OffsetDateTime,
+    pub commit_hash: Option<String>,
+    /// Snapshot of the `Config` plus auxiliary version fields
+    /// (`chunker_version`, embedding/llm/prompt versions, fusion
+    /// params, `index_version`). See `build_config_snapshot` in
+    /// `runner.rs` for the exact shape.
+    pub config_snapshot_json: serde_json::Value,
+    pub per_query: Vec<QueryResult>,
+}
+
+/// One per-query record. Every row in `eval_query_results` has its
+/// `result_json` filled with `serde_json::to_string(&QueryResult)`.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct QueryResult {
+    pub query_id: String,
+    pub query: String,
+    pub mode: SearchMode,
+    pub hits_top_k: Vec<SearchHit>,
+    pub answer: Option<Answer>,
+    pub elapsed_ms: u32,
+    pub error: Option<String>,
+}
--- a/crates/kb-eval/tests/fixtures/eval/run-1.json
+++ b/crates/kb-eval/tests/fixtures/eval/run-1.json
@@ -0,0 +1,30 @@
+[
+  {
+    "error": null,
+    "first_hit": {
+      "chunk_id": "chunk000000000000000000000000000000",
+      "doc_id": "doc00000000000000000000000000000000",
+      "heading_path": [],
+      "score": 0.3429983854293823
+    },
+    "has_answer": false,
+    "hits_count": 1,
+    "mode": "lexical",
+    "query": "ownership",
+    "query_id": "q1"
+  },
+  {
+    "error": null,
+    "first_hit": {
+      "chunk_id": "chunk000000000000000000000000000002",
+      "doc_id": "doc00000000000000000000000000000002",
+      "heading_path": [],
+      "score": 0.3585492968559265
+    },
+    "has_answer": false,
+    "hits_count": 1,
+    "mode": "lexical",
+    "query": "heading",
+    "query_id": "q2"
+  }
+]
--- a/crates/kb-eval/tests/loader.rs
+++ b/crates/kb-eval/tests/loader.rs
@@ -0,0 +1,59 @@
+//! Loader tests for the golden-fixture YAML parser (P5-1).
+//!
+//! These tests exercise pure parsing and duplicate-id detection. The
+//! DB-validation tests for the crate-private
+//! `load_golden_set_validated` live next to the function in
+//! `src/loader.rs` (they need `pub(crate)` visibility, which integration
+//! tests can't see).
+
+use std::fs;
+
+use kb_eval::load_golden_set;
+use tempfile::tempdir;
+
+// ── 1. parser accepts well-formed YAML with optional fields ──────────────────
+
+#[test]
+fn loads_minimal_well_formed_yaml() {
+    let tmp = tempdir().unwrap();
+    let yaml_path = tmp.path().join("golden.yaml");
+    fs::write(
+        &yaml_path,
+        "- id: g1\n  query: hello\n- id: g2\n  query: \"another\"\n  lang: en\n  must_contain: [\"foo\"]\n  forbidden: [\"bar\"]\n  difficulty: easy\n",
+    )
+    .unwrap();
+
+    let qs = load_golden_set(&yaml_path).unwrap();
+    assert_eq!(qs.len(), 2);
+    assert_eq!(qs[0].id, "g1");
+    assert_eq!(qs[0].query, "hello");
+    assert!(qs[0].must_contain.is_empty());
+    assert!(qs[0].forbidden.is_empty());
+    assert!(qs[0].difficulty.is_none());
+
+    assert_eq!(qs[1].id, "g2");
+    assert_eq!(qs[1].lang.0, "en");
+    assert_eq!(qs[1].must_contain, vec!["foo".to_string()]);
+    assert_eq!(qs[1].forbidden, vec!["bar".to_string()]);
+    assert_eq!(qs[1].difficulty.as_deref(), Some("easy"));
+}
+
+// ── 2. duplicate IDs error lists every offender (sorted, deduplicated) ───────
+
+#[test]
+fn rejects_duplicate_ids() {
+    let tmp = tempdir().unwrap();
+    let yaml_path = tmp.path().join("dup.yaml");
+    fs::write(
+        &yaml_path,
+        "- id: g1\n  query: a\n- id: g2\n  query: b\n- id: g1\n  query: c\n- id: g2\n  query: d\n- id: g1\n  query: e\n",
+    )
+    .unwrap();
+
+    let err = load_golden_set(&yaml_path).unwrap_err();
+    let msg = format!("{err:#}");
+    assert!(msg.contains("duplicate query id"), "msg: {msg}");
+    // Both dup IDs should appear, sorted (BTreeSet) and deduplicated.
+    assert!(msg.contains("g1"), "msg: {msg}");
+    assert!(msg.contains("g2"), "msg: {msg}");
+}
--- a/crates/kb-eval/tests/runner.rs
+++ b/crates/kb-eval/tests/runner.rs
@@ -0,0 +1,403 @@
+//! Runner integration tests for `kb-eval` (P5-1).
+//!
+//! Drives [`kb_eval::run_eval_with_config`] end-to-end against a
+//! TempDir-backed config:
+//!
+//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the
+//!   workspace's source-of-truth,
+//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is
+//!   required (`models.embedding.provider = "none"`),
+//! - golden YAML pointed at via `KB_EVAL_GOLDEN`.
+//!
+//! Determinism: lexical-only retrieval over a fixed seed corpus produces
+//! a byte-identical per-query payload across runs. The run-level `run_id`
+//! and `created_at` differ between runs, so the determinism test compares
+//! only the serialized `per_query` records.
+
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::sync::Mutex;
+
+use kb_config::Config;
+use kb_core::SearchMode;
+use kb_eval::{EvalRunOpts, QueryResult, run_eval_with_config};
+use kb_store_sqlite::SqliteStore;
+use rusqlite::params;
+use tempfile::TempDir;
+
+/// Serializes every test that sets or clears `KB_EVAL_GOLDEN`.
+///
+/// The variable is process-global state, so tests touching it must not
+/// overlap when `cargo test` runs them in parallel. The mutex guards no
+/// data (`()`); it is held purely for mutual exclusion.
+static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(());
+
+// ── shared scaffolding ───────────────────────────────────────────────────────
+
+/// Per-test workspace: a temporary data directory plus the `Config`
+/// that points at it.
+struct RunEnv {
+    /// Temporary directory backing `config.storage.data_dir`. Held in the
+    /// struct so the directory stays alive for as long as the `RunEnv`
+    /// does (`TempDir` deletes it on drop).
+    temp: TempDir,
+    /// Configuration handed to `run_eval_with_config` by the tests.
+    config: Config,
+}
+
+impl RunEnv {
+    /// Creates a fresh temporary workspace, runs the SQLite migrations in
+    /// it and seeds the store via `seed_corpus`.
+    fn new() -> Self {
+        let temp = tempfile::tempdir().unwrap();
+
+        let mut config = Config::defaults();
+        config.storage.data_dir = temp.path().to_string_lossy().into_owned();
+        // No embedder configured: keeps the runner on the lexical path so
+        // fastembed is never loaded during integration tests.
+        config.models.embedding.provider = String::from("none");
+        config.models.embedding.dimensions = 0;
+        // A fixed default `k` keeps the test assertions stable.
+        config.search.default_k = 5;
+
+        // The store handle is only needed for setup and is released as
+        // soon as the corpus has been written.
+        {
+            let store = SqliteStore::open(&config).unwrap();
+            store.run_migrations().unwrap();
+            seed_corpus(&store);
+        }
+
+        Self { temp, config }
+    }
+
+    /// Owned path of the temporary directory used as the data dir.
+    fn data_dir(&self) -> PathBuf {
+        PathBuf::from(self.temp.path())
+    }
+}
+
+/// Seed three (asset, document, chunk) triples with text the test
+/// queries can match against the FTS5 lexical index.
+///
+/// IDs are derived from the loop index and every timestamp is the Unix
+/// epoch, so the seeded rows are the same on every call. Panics (via
+/// `unwrap`) if any insert or the FTS rebuild fails.
+fn seed_corpus(store: &SqliteStore) {
+    let conn = store.read_conn();
+    for (i, text) in [
+        "Rust ownership and borrow checker basics.",
+        "Cargo workspace members are listed in workspace.members.",
+        "Markdown chunking respects heading boundaries.",
+    ]
+    .iter()
+    .enumerate()
+    {
+        // Prefix plus zero-padded index: each ID is 35 characters long.
+        let doc_id = format!("doc{i:032}");
+        let chunk_id = format!("chunk{i:030}");
+        let asset_id = format!("asset{i:030}");
+        let path = format!("notes/{i}.md");
+        // Asset row; `path` is bound to both `workspace_path` and
+        // `storage_path`.
+        conn.execute(
+            "INSERT INTO assets (
+                asset_id, source_uri, workspace_path, media_type, byte_len,
+                checksum, storage_kind, storage_path, discovered_at
+             ) VALUES (?, ?, ?, '\"markdown\"', 0,
+                       'deadbeefdeadbeefdeadbeefdeadbeef',
+                       'reference', ?, '1970-01-01T00:00:00Z')",
+            params![asset_id, format!("file:///{path}"), path, path],
+        )
+        .unwrap();
+        // Document row referencing the asset inserted above.
+        conn.execute(
+            "INSERT INTO documents (
+                doc_id, asset_id, workspace_path, title, lang, source_type,
+                trust_level, parser_version, doc_version, schema_version,
+                metadata_json, provenance_json, created_at, updated_at
+             ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
+                       '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
+            params![doc_id, asset_id, path],
+        )
+        .unwrap();
+        // One chunk per document, carrying the searchable `text`.
+        conn.execute(
+            "INSERT INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+             ) VALUES (?, ?, ?, '[]', NULL,
+                       '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
+                       1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')",
+            params![chunk_id, doc_id, text],
+        )
+        .unwrap();
+    }
+    // Build the FTS index so lexical search returns hits. Reuses the
+    // same connection guard rather than reopening — the SAVEPOINT
+    // protocol nests correctly under the existing read_conn lock.
+    kb_store_sqlite::rebuild_chunks_fts(&conn).unwrap();
+    // Release the connection guard explicitly once seeding is complete.
+    drop(conn);
+}
+
+/// Writes `body` to `golden.yaml` inside `dir` and returns the file's
+/// path. Panics if the write fails.
+fn write_golden(dir: &Path, body: &str) -> PathBuf {
+    let target = dir.join("golden.yaml");
+    fs::write(&target, body).unwrap();
+    target
+}
+
+/// Baseline options used by the tests: suite `"test"`, lexical search,
+/// RAG disabled, `k = 5`, temperature `0.0` and seed `0`.
+fn lexical_opts() -> EvalRunOpts {
+    let suite = String::from("test");
+    EvalRunOpts {
+        suite,
+        mode: SearchMode::Lexical,
+        with_rag: false,
+        k: 5,
+        temperature: Some(0.0),
+        seed: Some(0),
+    }
+}
+
+/// Runs `f` with `KB_EVAL_GOLDEN` pointing at `yaml` and returns its
+/// result.
+///
+/// `GOLDEN_ENV_LOCK` is held for the whole call so concurrent tests
+/// cannot change the variable mid-run. The variable is removed again
+/// before the lock is released — also when `f` panics, so a failing test
+/// does not leave a stale path behind for the rest of the process.
+fn run_with_golden<F: FnOnce() -> R, R>(yaml: &Path, f: F) -> R {
+    // Removes `KB_EVAL_GOLDEN` on drop, i.e. on normal return and on
+    // unwind alike.
+    struct ClearGoldenVar;
+
+    impl Drop for ClearGoldenVar {
+        fn drop(&mut self) {
+            // SAFETY: only constructed while `GOLDEN_ENV_LOCK` is held and
+            // dropped before that guard, so this mutation is serialized
+            // with every other writer that goes through this helper.
+            unsafe {
+                std::env::remove_var("KB_EVAL_GOLDEN");
+            }
+        }
+    }
+
+    // A panicking test poisons the mutex. The guarded value is `()`, so
+    // there is no state to corrupt: recover the guard instead of failing
+    // every later test.
+    let _lock = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+    // SAFETY: `KB_EVAL_GOLDEN` is a benign env var; `GOLDEN_ENV_LOCK`
+    // serializes mutations so concurrent tests using this helper don't
+    // race.
+    unsafe {
+        std::env::set_var("KB_EVAL_GOLDEN", yaml);
+    }
+    // Declared after `_lock`, so it is dropped first: the variable is
+    // cleared while the lock is still held.
+    let _clear = ClearGoldenVar;
+    f()
+}
+
+// ── 1. elapsed_ms recorded for every query ──────────────────────────────────
+
+#[test]
+fn runner_records_elapsed_for_every_query() {
+    let env = RunEnv::new();
+    let golden = write_golden(
+        &env.data_dir(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n- id: q3\n  query: workspace\n",
+    );
+    let opts = lexical_opts();
+
+    let run = run_with_golden(&golden, || {
+        run_eval_with_config(&env.config, &opts).unwrap()
+    });
+
+    // One record per golden query.
+    assert_eq!(run.per_query.len(), 3);
+    run.per_query.iter().for_each(|record| {
+        assert_eq!(record.mode, SearchMode::Lexical);
+        // Sanity bound: a value of a minute or more points at a stuck or
+        // overflowed timer rather than a real measurement.
+        assert!(
+            record.elapsed_ms < 60_000,
+            "elapsed_ms suspicious: {}",
+            record.elapsed_ms
+        );
+    });
+
+    // Records come back in golden-file order with their ids intact.
+    let ids: Vec<&str> = run
+        .per_query
+        .iter()
+        .map(|record| record.query_id.as_str())
+        .collect();
+    assert_eq!(ids, ["q1", "q2", "q3"]);
+}
+
+// ── 2. config snapshot carries the documented version fields ────────────────
+
+#[test]
+fn runner_records_config_snapshot_with_versions() {
+    let env = RunEnv::new();
+    let golden = write_golden(&env.data_dir(), "- id: q1\n  query: ownership\n");
+    let opts = lexical_opts();
+
+    let run = run_with_golden(&golden, || {
+        run_eval_with_config(&env.config, &opts).unwrap()
+    });
+
+    // The snapshot is addressed with JSON pointers; the two version
+    // strings are pinned exactly, the remaining fields only need to exist.
+    let snap = &run.config_snapshot_json;
+    assert!(snap.get("config").is_some(), "config field missing");
+    assert_eq!(
+        snap.pointer("/chunker_version").and_then(|v| v.as_str()),
+        Some("md-heading-v1"),
+    );
+    assert!(snap.pointer("/embedding/model").is_some());
+    assert!(snap.pointer("/embedding/dimensions").is_some());
+    assert!(snap.pointer("/llm/model_id").is_some());
+    assert_eq!(
+        snap.pointer("/prompt_template_version")
+            .and_then(|v| v.as_str()),
+        Some("rag-v1"),
+    );
+    assert!(snap.pointer("/score_gate").is_some());
+    assert!(snap.pointer("/rrf_k").is_some());
+}
+
+// ── 3. failing query (ask path with no Ollama) records an error ─────────────
+
+#[test]
+fn runner_captures_per_query_error_when_rag_unreachable() {
+    let env = RunEnv::new();
+    // Aim the LLM endpoint at a port nothing should be listening on, so
+    // the RAG step fails with a connection error for every query.
+    let mut config = env.config.clone();
+    config.models.llm.endpoint = String::from("http://127.0.0.1:1");
+
+    let golden = write_golden(&env.data_dir(), "- id: q1\n  query: ownership\n");
+
+    // Same as the lexical baseline, but with the RAG step switched on.
+    let mut opts = lexical_opts();
+    opts.with_rag = true;
+
+    let run = run_with_golden(&golden, || run_eval_with_config(&config, &opts).unwrap());
+
+    // The run itself succeeds; the failure is recorded on the query.
+    let record = &run.per_query[0];
+    assert!(
+        !record.hits_top_k.is_empty(),
+        "lexical hits should populate before RAG attempt"
+    );
+    assert!(record.answer.is_none(), "no answer when RAG fails");
+    assert!(record.error.is_some(), "error must be recorded");
+}
+
+// ── 4. eval_runs + eval_query_results rows persisted ────────────────────────
+
+#[test]
+fn runner_persists_eval_run_and_query_result_rows() {
+    let env = RunEnv::new();
+    let golden = write_golden(
+        &env.data_dir(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+    let opts = lexical_opts();
+
+    let run = run_with_golden(&golden, || {
+        run_eval_with_config(&env.config, &opts).unwrap()
+    });
+
+    // Open a second store handle on the same SQLite file and read the
+    // rows back through the inherent `read_conn` helper.
+    let store = SqliteStore::open(&env.config).unwrap();
+    let conn = store.read_conn();
+    // Runs a `COUNT(*)` query whose single placeholder is this run's id.
+    let count_rows = |sql: &str| -> i64 {
+        conn.query_row(sql, params![run.run_id], |r| r.get(0))
+            .unwrap()
+    };
+
+    // Exactly one run row ...
+    assert_eq!(
+        count_rows("SELECT COUNT(*) FROM eval_runs WHERE run_id = ?"),
+        1
+    );
+    // ... and one result row per golden query.
+    assert_eq!(
+        count_rows("SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?"),
+        2
+    );
+}
+
+// ── 5. per_query.jsonl mirror exists and round-trips ────────────────────────
+
+#[test]
+fn runner_writes_per_query_jsonl_mirror() {
+    let env = RunEnv::new();
+    let golden = write_golden(
+        &env.data_dir(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+    let opts = lexical_opts();
+
+    let run = run_with_golden(&golden, || {
+        run_eval_with_config(&env.config, &opts).unwrap()
+    });
+
+    // Expected location: <data_dir>/runs/<run_id>/per_query.jsonl
+    let mut mirror = env.data_dir();
+    mirror.push("runs");
+    mirror.push(&run.run_id);
+    mirror.push("per_query.jsonl");
+    assert!(
+        mirror.exists(),
+        "per_query.jsonl missing at {}",
+        mirror.display()
+    );
+
+    // One JSON document per line, one line per golden query.
+    let body = fs::read_to_string(&mirror).unwrap();
+    assert_eq!(body.lines().count(), 2);
+
+    // Every line must deserialize back into a `QueryResult`, in order.
+    let mut parsed: Vec<QueryResult> = Vec::new();
+    for line in body.lines() {
+        parsed.push(serde_json::from_str::<QueryResult>(line).expect("valid JSONL line"));
+    }
+    assert_eq!(parsed[0].query_id, "q1");
+    assert_eq!(parsed[1].query_id, "q2");
+}
+
+// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─
+
+#[test]
+fn runner_lexical_is_deterministic_per_query_payload() {
+    let env = RunEnv::new();
+    let golden = write_golden(
+        &env.data_dir(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+
+    // Executes the same suite against the same seeded workspace.
+    let execute = || {
+        run_with_golden(&golden, || {
+            run_eval_with_config(&env.config, &lexical_opts()).unwrap()
+        })
+    };
+    let first = execute();
+    let second = execute();
+
+    // `run_id` and `created_at` live on the run and are expected to
+    // differ, so only the serialized per-query records are compared.
+    // NOTE(review): if `elapsed_ms` is part of the serialized record this
+    // comparison is timing-sensitive — confirm it is excluded or stable.
+    let first_json = serde_json::to_string(&first.per_query).unwrap();
+    let second_json = serde_json::to_string(&second.per_query).unwrap();
+    assert_eq!(
+        first_json, second_json,
+        "lexical-only per_query payload must be byte-identical across runs"
+    );
+}
+
+// ── 7. snapshot — per_query projection pinned to tests/fixtures/eval/run-1.json ─
+
+#[test]
+fn runner_per_query_snapshot_matches_fixture() {
+    let env = RunEnv::new();
+    let golden = write_golden(
+        &env.data_dir(),
+        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
+    );
+    let opts = lexical_opts();
+
+    let run = run_with_golden(&golden, || {
+        run_eval_with_config(&env.config, &opts).unwrap()
+    });
+
+    // The fixture stores a reduced projection of each record rather than
+    // the full payload: query identity, mode, hit count, whether an
+    // answer / error is present, and a few scalar fields of the first hit
+    // (its fusion score is stored under the key `score`). `elapsed_ms` is
+    // left out because it varies with timing. Byte-level stability of the
+    // full payload is covered by the determinism test; this one pins the
+    // projected field set and values.
+    let mut projection = Vec::with_capacity(run.per_query.len());
+    for record in &run.per_query {
+        let first_hit = record.hits_top_k.first().map(|hit| {
+            serde_json::json!({
+                "chunk_id": hit.chunk_id,
+                "doc_id": hit.doc_id,
+                "heading_path": hit.heading_path,
+                "score": hit.retrieval.fusion_score,
+            })
+        });
+        projection.push(serde_json::json!({
+            "query_id": record.query_id,
+            "query": record.query,
+            "mode": record.mode,
+            "hits_count": record.hits_top_k.len(),
+            "first_hit": first_hit,
+            "has_answer": record.answer.is_some(),
+            "error": record.error,
+        }));
+    }
+    let actual = serde_json::to_string_pretty(&projection).unwrap();
+
+    let fixture_path =
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval/run-1.json");
+
+    // With `UPDATE_SNAPSHOTS` set, the fixture is rewritten from this run
+    // before the comparison below reads it back.
+    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
+        if let Some(dir) = fixture_path.parent() {
+            fs::create_dir_all(dir).unwrap();
+        }
+        fs::write(&fixture_path, &actual).unwrap();
+    }
+
+    let expected = match fs::read_to_string(&fixture_path) {
+        Ok(text) => text,
+        Err(e) => panic!("read snapshot {}: {e}", fixture_path.display()),
+    };
+    // Surrounding whitespace (e.g. a trailing newline) is ignored.
+    assert_eq!(
+        actual.trim(),
+        expected.trim(),
+        "snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh"
+    );
+}
--- a/crates/kb-store-sqlite/src/eval.rs
+++ b/crates/kb-store-sqlite/src/eval.rs
@@ -0,0 +1,161 @@
+//! `eval_runs` / `eval_query_results` row writers (P5-1 — design §5.7).
+//!
+//! `kb-eval` calls these directly via the inherent methods on
+//! [`SqliteStore`]. The pattern mirrors [`crate::answers`]: the trait
+//! `kb_core::DocumentStore` is the document surface, and run-level
+//! audit rows (jobs, ingest_runs, answers, eval_runs) are inherent
+//! methods so the trait surface stays small.
+
+use anyhow::{Context, Result};
+use rusqlite::params;
+use time::OffsetDateTime;
+
+use crate::error::StoreError;
+use crate::store::SqliteStore;
+
+/// One row about to land in `eval_runs` (per V001 schema).
+///
+/// Borrows all of its string fields, so building one does not copy the
+/// (potentially large) JSON payloads.
+///
+/// `aggregate_json` is filled by P5-1 with the literal `"{}"` —
+/// metric computation lives in P5-2 and updates the row in place.
+#[derive(Clone, Debug)]
+pub struct EvalRunRow<'a> {
+    /// Value for the `run_id` column; also used as `run_id` for the
+    /// per-query rows written alongside this run.
+    pub run_id: &'a str,
+    /// Value for the `suite` column.
+    pub suite: &'a str,
+    /// Serialized configuration snapshot, stored as-is.
+    pub config_snapshot_json: &'a str,
+    /// Serialized aggregate metrics, stored as-is.
+    pub aggregate_json: &'a str,
+    /// Optional commit hash; `None` is stored as SQL `NULL`.
+    pub commit_hash: Option<&'a str>,
+    /// Creation time; formatted as RFC 3339 text when the row is written.
+    pub created_at: OffsetDateTime,
+}
+
+impl SqliteStore {
+    /// `INSERT` used for every `eval_query_results` row. Shared by the
+    /// one-shot and the transactional writer so the two cannot drift.
+    const EVAL_INSERT_QUERY_RESULT_SQL: &'static str =
+        "INSERT INTO eval_query_results (run_id, query_id, result_json)
+         VALUES (?, ?, ?)";
+
+    /// Return `true` iff a row with `doc_id = ?` exists in
+    /// `documents`. Lightweight existence probe used by
+    /// `kb-eval`'s golden-fixture validator — full
+    /// `DocumentStore::get_document` deserializes blocks + metadata
+    /// JSON, which is overkill for "does this ID exist?"
+    ///
+    /// # Errors
+    /// Any SQLite failure other than "no rows" is returned as a
+    /// [`StoreError`] wrapped in `anyhow::Error`.
+    pub fn document_exists(&self, doc_id: &str) -> Result<bool> {
+        self.eval_probe_row_exists("SELECT 1 FROM documents WHERE doc_id = ? LIMIT 1", doc_id)
+    }
+
+    /// Same shape as [`Self::document_exists`] but probes the
+    /// `chunks` table by `chunk_id`.
+    pub fn chunk_exists(&self, chunk_id: &str) -> Result<bool> {
+        self.eval_probe_row_exists("SELECT 1 FROM chunks WHERE chunk_id = ? LIMIT 1", chunk_id)
+    }
+
+    /// Insert one row into `eval_runs`. Mirrors the schema in
+    /// `migrations/V001__init.sql` (§5.7).
+    ///
+    /// This writes the run row only. Callers that also persist per-query
+    /// rows should prefer [`Self::record_eval_run_with_results`], which
+    /// writes everything in one transaction.
+    ///
+    /// # Errors
+    /// Fails if `created_at` cannot be formatted as RFC 3339 or if the
+    /// insert is rejected by SQLite.
+    pub fn record_eval_run(&self, row: &EvalRunRow<'_>) -> Result<()> {
+        // Format before taking the connection lock, so a formatting
+        // failure never touches the database.
+        let created_at = Self::eval_format_created_at(row)?;
+        let conn = self.lock_conn();
+        Self::eval_insert_run_row(&conn, row, &created_at)?;
+        Ok(())
+    }
+
+    /// Insert one row into `eval_query_results`. PRIMARY KEY is
+    /// `(run_id, query_id)` so writing the same `(run, query)` twice
+    /// surfaces a `UNIQUE` violation through `StoreError`.
+    pub fn record_eval_query_result(
+        &self,
+        run_id: &str,
+        query_id: &str,
+        result_json: &str,
+    ) -> Result<()> {
+        let conn = self.lock_conn();
+        conn.execute(
+            Self::EVAL_INSERT_QUERY_RESULT_SQL,
+            params![run_id, query_id, result_json],
+        )
+        .map_err(StoreError::from)?;
+        Ok(())
+    }
+
+    /// Insert the `eval_runs` row plus every `eval_query_results` row
+    /// for the same run inside a single SQLite transaction. This is the
+    /// preferred path for `kb-eval::run_eval` — a panic between the run
+    /// row and the per-query rows can't leave orphan run rows.
+    ///
+    /// `results` is a slice of `(query_id, result_json)` tuples mirroring
+    /// the per-call `record_eval_query_result` arguments.
+    ///
+    /// # Errors
+    /// Returns the first formatting or SQLite error. The transaction is
+    /// only committed after every insert succeeded; on an early return it
+    /// is dropped without committing.
+    pub fn record_eval_run_with_results(
+        &self,
+        row: &EvalRunRow<'_>,
+        results: &[(String, String)],
+    ) -> Result<()> {
+        let created_at = Self::eval_format_created_at(row)?;
+        let mut conn = self.lock_conn();
+        let tx = conn.transaction().map_err(StoreError::from)?;
+        Self::eval_insert_run_row(&tx, row, &created_at)?;
+        {
+            // Prepare once and reuse for every result row. The statement
+            // borrows `tx`, so it lives in its own scope and is gone
+            // before `commit` consumes the transaction.
+            let mut stmt = tx
+                .prepare(Self::EVAL_INSERT_QUERY_RESULT_SQL)
+                .map_err(StoreError::from)?;
+            for (query_id, result_json) in results {
+                stmt.execute(params![row.run_id, query_id, result_json])
+                    .map_err(StoreError::from)?;
+            }
+        }
+        tx.commit().map_err(StoreError::from)?;
+        Ok(())
+    }
+
+    /// Runs a single-parameter `SELECT 1 … LIMIT 1` probe and maps
+    /// "no rows" to `Ok(false)`; every other SQLite error is propagated.
+    fn eval_probe_row_exists(&self, sql: &str, id: &str) -> Result<bool> {
+        let conn = self.lock_conn();
+        let probe: Result<i64, rusqlite::Error> = conn.query_row(sql, params![id], |r| r.get(0));
+        match probe {
+            Ok(_) => Ok(true),
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false),
+            Err(e) => Err(StoreError::from(e).into()),
+        }
+    }
+
+    /// Formats `row.created_at` as the RFC 3339 text stored in
+    /// `eval_runs.created_at`.
+    fn eval_format_created_at(row: &EvalRunRow<'_>) -> Result<String> {
+        row.created_at
+            .format(&time::format_description::well_known::Rfc3339)
+            .context("format eval_runs.created_at")
+    }
+
+    /// Executes the `eval_runs` insert on `conn`, which may be a plain
+    /// connection or an open transaction (both deref to `Connection`).
+    fn eval_insert_run_row(
+        conn: &rusqlite::Connection,
+        row: &EvalRunRow<'_>,
+        created_at: &str,
+    ) -> Result<()> {
+        conn.execute(
+            "INSERT INTO eval_runs (
+                run_id, suite, config_snapshot_json, aggregate_json,
+                commit_hash, created_at
+            ) VALUES (?, ?, ?, ?, ?, ?)",
+            params![
+                row.run_id,
+                row.suite,
+                row.config_snapshot_json,
+                row.aggregate_json,
+                row.commit_hash,
+                created_at,
+            ],
+        )
+        .map_err(StoreError::from)?;
+        Ok(())
+    }
+}
--- a/crates/kb-store-sqlite/src/lib.rs
+++ b/crates/kb-store-sqlite/src/lib.rs
@@ -21,6 +21,7 @@ mod answers;
 mod documents;
 mod embeddings;
 mod error;
+mod eval;
 mod filters;
 mod fts;
 mod jobs;
@@ -29,6 +30,7 @@ mod store;

 pub use embeddings::EmbeddingRecordRow;
 pub use error::StoreError;
+pub use eval::EvalRunRow;
 pub use fts::rebuild_chunks_fts;
 pub use jobs::IngestRunRow;
 pub use store::SqliteStore;
--- a/fixtures/golden_queries.yaml
+++ b/fixtures/golden_queries.yaml
@@ -0,0 +1,45 @@
+# Golden query suite for `kb eval run` (P5-1 / P5-2).
+#
+# Top-level: list of queries. Required fields: `id`, `query`. All
+# others are optional and default to empty / null.
+#
+# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to
+# real rows in the active workspace's SQLite store at run time. Stale
+# references make the runner bail at start. The shipped template
+# leaves them empty so the file is loadable on any fresh workspace —
+# fill them in after a `kb ingest` to enable hit@k / MRR metrics
+# (P5-2).
+#
+# `must_contain` / `forbidden` drive the rule-based groundedness
+# metric (P5-2).
+
+- id: g001
+  query: "Cargo workspace 멤버 추가하는 법"
+  lang: ko
+  must_contain: ["[workspace]", "members"]
+  difficulty: easy
+
+- id: g002
+  query: "What is Rust ownership?"
+  lang: en
+  must_contain: ["borrow", "lifetime"]
+  difficulty: easy
+
+- id: g003
+  query: "Markdown chunking 규칙은?"
+  lang: ko
+  must_contain: ["heading"]
+  forbidden: ["embedding"]
+  difficulty: medium
+
+- id: g004
+  query: "How does FTS5 tokenization work for Korean text?"
+  lang: en
+  must_contain: ["unicode61", "tokenizer"]
+  difficulty: medium
+
+- id: g005
+  query: "RAG citation 검증은 어떻게 동작?"
+  lang: ko
+  must_contain: ["citation", "marker"]
+  difficulty: hard
--- a/tasks/p5/p5-1-golden-fixture-runner.md
+++ b/tasks/p5/p5-1-golden-fixture-runner.md
@@ -3,7 +3,7 @@ phase: P5
 component: kb-eval (runner)
 task_id: p5-1
 title: "Golden query fixture loader + per-query runner"
-status: planned
+status: completed
 depends_on: [p4-3]
 unblocks: [p5-2]
 contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md
@@ -149,6 +149,6 @@ All tests under `cargo test -p kb-eval runner`.

 ## Risks / notes

- Large RAG suites can be slow. Consider `--max-queries` for incremental runs (kept here as a flag spec; implementation is the responsibility of this task).
+- Large RAG suites can be slow. `--max-queries` flag is deferred to P5-2 / a follow-up. Rationale: (a) the runner currently runs all queries unconditionally; (b) without metrics aggregation it adds little incremental value; (c) trivial to add as a `Vec::truncate` once the eval CLI subcommand exists.
 - `expected_chunk_id` references depend on `chunker_version`. If chunker bumps, golden set must be re-curated. Fail fast in the loader.
 - Use `time::OffsetDateTime::now_utc()` for `created_at`; never local TZ.