kebab/crates/kebab-eval/tests/runner.rs

//! Runner integration tests for `kb-eval` (P5-1).
//!
//! Drives [`kebab_eval::run_eval_with_config`] end-to-end against a
//! TempDir-backed config:
//!
//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the
//!   workspace's source-of-truth,
//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is
//!   required (`models.embedding.provider = "none"`),
//! - golden YAML pointed at via `KEBAB_EVAL_GOLDEN`.
//!
//! Determinism: lexical-only retrieval over a fixed seed corpus must
//! serialize to byte-identical per-query JSON on every run. Run-level
//! fields (`run_id`, `created_at`) differ between runs by design and
//! are not part of the compared payload.

use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Mutex;

use kebab_config::Config;
use kebab_core::SearchMode;
use kebab_eval::{EvalRunOpts, QueryResult, run_eval_with_config};
use kebab_store_sqlite::SqliteStore;
use rusqlite::params;
use tempfile::TempDir;

/// Serializes every test that mutates `KEBAB_EVAL_GOLDEN`.
///
/// Environment variables are process-global state, and `cargo test`
/// runs the tests in this file in parallel, so without this lock one
/// test could overwrite or clear the variable while another test is
/// still relying on it.
static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(());

// ── shared scaffolding ───────────────────────────────────────────────────────

/// Per-test workspace: a throwaway data directory plus the config that
/// points at it.
struct RunEnv {
    /// Owns the temporary directory backing `config.storage.data_dir`;
    /// held here so the directory lives as long as the environment.
    temp: TempDir,
    /// Lexical-only configuration rooted at `temp`.
    config: Config,
}

impl RunEnv {
    /// Creates the temp directory, derives a lexical-only config that
    /// stores its data there, then migrates and seeds the SQLite store.
    ///
    /// Panics if any setup step fails — acceptable for test scaffolding.
    fn new() -> Self {
        let temp = tempfile::tempdir().unwrap();

        let config = {
            let mut cfg = Config::defaults();
            cfg.storage.data_dir = temp.path().to_string_lossy().into_owned();
            // No embedding provider: keeps the runner from trying to
            // load fastembed during integration tests.
            cfg.models.embedding.provider = String::from("none");
            cfg.models.embedding.dimensions = 0;
            // Fixed default `k` so assertions stay stable.
            cfg.search.default_k = 5;
            cfg
        };

        let store = SqliteStore::open(&config).unwrap();
        store.run_migrations().unwrap();
        seed_corpus(&store);

        Self { temp, config }
    }

    /// Returns an owned copy of the temp directory path.
    fn data_dir(&self) -> PathBuf {
        self.temp.path().to_owned()
    }
}

/// Inserts three linked asset → document → chunk rows and rebuilds the
/// FTS index over them, so the lexical queries used by the tests have
/// something to match.
///
/// Ids are the row index zero-padded behind a short prefix
/// (`asset…`, `doc…`, `chunk…`); every timestamp is the Unix epoch so
/// the seeded rows are identical on every run.
fn seed_corpus(store: &SqliteStore) {
    let texts = [
        "Rust ownership and borrow checker basics.",
        "Cargo workspace members are listed in workspace.members.",
        "Markdown chunking respects heading boundaries.",
    ];

    let db = store.read_conn();
    for (n, body) in texts.iter().enumerate() {
        let workspace_path = format!("notes/{n}.md");
        let source_uri = format!("file:///{workspace_path}");
        let asset_id = format!("asset{n:030}");
        let doc_id = format!("doc{n:032}");
        let chunk_id = format!("chunk{n:030}");

        // The workspace path doubles as the asset's storage path.
        db.execute(
            "INSERT INTO assets (
                asset_id, source_uri, workspace_path, media_type, byte_len,
                checksum, storage_kind, storage_path, discovered_at
             ) VALUES (?, ?, ?, '\"markdown\"', 0,
                       'deadbeefdeadbeefdeadbeefdeadbeef',
                       'reference', ?, '1970-01-01T00:00:00Z')",
            params![asset_id, source_uri, workspace_path, workspace_path],
        )
        .unwrap();

        db.execute(
            "INSERT INTO documents (
                doc_id, asset_id, workspace_path, title, lang, source_type,
                trust_level, parser_version, doc_version, schema_version,
                metadata_json, provenance_json, created_at, updated_at
             ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
                       '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
            params![doc_id, asset_id, workspace_path],
        )
        .unwrap();

        db.execute(
            "INSERT INTO chunks (
                chunk_id, doc_id, text, heading_path_json, section_label,
                source_spans_json, token_estimate, chunker_version,
                policy_hash, block_ids_json, created_at
             ) VALUES (?, ?, ?, '[]', NULL,
                       '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
                       1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')",
            params![chunk_id, doc_id, body],
        )
        .unwrap();
    }

    // Rebuild the FTS index on the same connection guard (no reopen)
    // so lexical search returns hits for the rows inserted above. The
    // guard is released when it goes out of scope at the end of the
    // function.
    kebab_store_sqlite::rebuild_chunks_fts(&db).unwrap();
}

/// Writes `body` to `golden.yaml` inside `dir` (replacing any existing
/// file) and returns the full path of the written file.
///
/// Panics if the write fails.
fn write_golden(dir: &Path, body: &str) -> PathBuf {
    let mut golden = dir.to_path_buf();
    golden.push("golden.yaml");
    fs::write(golden.as_path(), body).unwrap();
    golden
}

/// Returns an `http://127.0.0.1:<port>` URL whose port was bound a
/// moment ago and then released.
///
/// The port is very likely still unbound when the test connects
/// shortly afterwards, so `connect()` fails fast with `ECONNREFUSED`.
/// This is preferred over hard-coding port 1, which can time out
/// slowly on hardened hosts.
fn unreachable_endpoint() -> String {
    let port = {
        // The listener exists only to learn a free port number; it is
        // closed as soon as this scope ends.
        let probe = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
        let addr = probe.local_addr().unwrap();
        addr.port()
    };
    format!("http://127.0.0.1:{port}")
}

/// Baseline options shared by the tests: suite `"test"`, lexical
/// search, no RAG step, `k = 5`, and a pinned temperature and seed.
fn lexical_opts() -> EvalRunOpts {
    let suite = String::from("test");
    EvalRunOpts {
        suite,
        mode: SearchMode::Lexical,
        k: 5,
        with_rag: false,
        seed: Some(0),
        temperature: Some(0.0),
    }
}

/// Runs `f` with `KEBAB_EVAL_GOLDEN` pointing at `yaml`, then clears the
/// variable again and returns whatever `f` returned.
///
/// `GOLDEN_ENV_LOCK` is held for the whole call so concurrent tests
/// cannot change the variable mid-run. A poisoned lock (left behind by
/// a test that panicked while holding it) is recovered rather than
/// propagated, so one failing test does not cascade into the others.
///
/// The variable is cleared by a drop guard, so it is removed even when
/// `f` panics. Without that, a failing test would leave the variable
/// pointing at its own golden file for whichever test runs next.
fn run_with_golden<F: FnOnce() -> R, R>(yaml: &Path, f: F) -> R {
    /// Removes `KEBAB_EVAL_GOLDEN` when dropped, on normal return and
    /// on unwind alike.
    struct ClearGoldenOnDrop;

    impl Drop for ClearGoldenOnDrop {
        fn drop(&mut self) {
            // SAFETY: this guard is declared after the lock guard, so it
            // is dropped first and the removal still happens while
            // GOLDEN_ENV_LOCK is held.
            unsafe {
                std::env::remove_var("KEBAB_EVAL_GOLDEN");
            }
        }
    }

    let _lock = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
    // SAFETY: `KEBAB_EVAL_GOLDEN` is a benign env var; GOLDEN_ENV_LOCK
    // serializes mutations so concurrent tests don't race.
    unsafe {
        std::env::set_var("KEBAB_EVAL_GOLDEN", yaml);
    }
    // Locals drop in reverse declaration order: the variable is cleared
    // first, then the lock is released.
    let _clear = ClearGoldenOnDrop;

    f()
}

// ── 1. elapsed_ms recorded for every query ──────────────────────────────────

/// Every golden query yields one per-query record, in golden order,
/// tagged as lexical and carrying a plausible `elapsed_ms`.
#[test]
fn runner_records_elapsed_for_every_query() {
    let env = RunEnv::new();
    let golden_dir = env.data_dir();
    let yaml = write_golden(
        &golden_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n- id: q3\n  query: workspace\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    let results = &run.per_query;
    assert_eq!(results.len(), 3);
    results.iter().for_each(|qr| {
        assert_eq!(qr.mode, SearchMode::Lexical);
        // One minute is far more than three lexical queries over a
        // three-chunk corpus should need; anything above it points at
        // a stuck or overflowed timing path.
        assert!(
            qr.elapsed_ms < 60_000,
            "elapsed_ms suspicious: {}",
            qr.elapsed_ms
        );
    });

    // Golden ids must come back unchanged and in file order.
    let mut ids: Vec<&str> = Vec::with_capacity(results.len());
    for qr in results.iter() {
        ids.push(qr.query_id.as_str());
    }
    assert_eq!(ids, vec!["q1", "q2", "q3"]);
}

// ── 2. config snapshot carries the documented version fields ────────────────

/// The run's config snapshot exposes the documented version fields:
/// exact values for the chunker and prompt-template versions, presence
/// only for the rest.
#[test]
fn runner_records_config_snapshot_with_versions() {
    let env = RunEnv::new();
    let golden_dir = env.data_dir();
    let yaml = write_golden(&golden_dir, "- id: q1\n  query: ownership\n");

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    let snap = &run.config_snapshot_json;
    let chunker_version = serde_json::Value::from("md-heading-v1");
    let prompt_version = serde_json::Value::from("rag-v2");

    assert!(snap.get("config").is_some(), "config field missing");
    assert_eq!(snap.pointer("/chunker_version"), Some(&chunker_version));
    assert!(snap.pointer("/embedding/model").is_some());
    assert!(snap.pointer("/embedding/dimensions").is_some());
    assert!(snap.pointer("/llm/model_id").is_some());
    assert_eq!(snap.pointer("/prompt_template_version"), Some(&prompt_version));
    assert!(snap.pointer("/score_gate").is_some());
    assert!(snap.pointer("/rrf_k").is_some());
}

// ── 3. failing query (ask path with no Ollama) records an error ─────────────

/// With RAG enabled but the LLM endpoint unreachable, the run still
/// succeeds: the query keeps its lexical hits, has no answer, and
/// records an error.
#[test]
fn runner_captures_per_query_error_when_rag_unreachable() {
    let env = RunEnv::new();

    // Aim the LLM endpoint at a port that was just released, so the
    // ask path hits a connection error for each query. A hard-coded
    // `:1` is avoided because its behavior varies between hosts (fast
    // refusal on some, slow timeout on others). Another process could
    // in theory grab the port in between, but that is rare in practice.
    let config = {
        let mut cfg = env.config.clone();
        cfg.models.llm.endpoint = unreachable_endpoint();
        cfg
    };

    let golden_dir = env.data_dir();
    let yaml = write_golden(&golden_dir, "- id: q1\n  query: ownership\n");

    let base = lexical_opts();
    let opts = EvalRunOpts {
        with_rag: true,
        ..base
    };
    let run = run_with_golden(&yaml, || run_eval_with_config(&config, &opts).unwrap());

    let first = &run.per_query[0];
    // Lexical search runs before the RAG attempt, so hits are present
    // even though the answer step failed.
    assert!(
        !first.hits_top_k.is_empty(),
        "lexical hits should populate before RAG attempt"
    );
    assert!(first.answer.is_none(), "no answer when RAG fails");
    assert!(first.error.is_some(), "error must be recorded");
}

// ── 4. eval_runs + eval_query_results rows persisted ────────────────────────

/// A run writes one `eval_runs` row and one `eval_query_results` row
/// per golden query, all keyed by the run id.
#[test]
fn runner_persists_eval_run_and_query_result_rows() {
    let env = RunEnv::new();
    let golden_dir = env.data_dir();
    let yaml = write_golden(
        &golden_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    // Read the rows back through a fresh store handle on the same
    // SQLite file. The store's own `read_conn` helper is used instead
    // of opening the file with rusqlite directly; the original note
    // says a direct open would force a runtime rusqlite dependency on
    // kb-eval, which the spec forbids.
    let store = SqliteStore::open(&env.config).unwrap();
    let conn = store.read_conn();
    let count_for_run = |sql: &str| -> i64 {
        conn.query_row(sql, params![run.run_id], |row| row.get(0))
            .unwrap()
    };

    let n_runs = count_for_run("SELECT COUNT(*) FROM eval_runs WHERE run_id = ?");
    assert_eq!(n_runs, 1);

    let n_results = count_for_run("SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?");
    assert_eq!(n_results, 2);
}

// ── 5. per_query.jsonl mirror exists and round-trips ────────────────────────

/// A run mirrors its per-query records to
/// `<data_dir>/runs/<run_id>/per_query.jsonl`, one JSON object per
/// line, each of which parses back into a `QueryResult`.
#[test]
fn runner_writes_per_query_jsonl_mirror() {
    let env = RunEnv::new();
    let golden_dir = env.data_dir();
    let yaml = write_golden(
        &golden_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    let mut mirror = env.data_dir();
    mirror.push("runs");
    mirror.push(&run.run_id);
    mirror.push("per_query.jsonl");
    assert!(
        mirror.exists(),
        "per_query.jsonl missing at {}",
        mirror.display()
    );

    let body = fs::read_to_string(&mirror).unwrap();
    let lines: Vec<&str> = body.lines().collect();
    assert_eq!(lines.len(), 2);

    // Parse only after the line count is confirmed, so a wrong count
    // is reported as such rather than as a parse failure.
    let mut parsed: Vec<QueryResult> = Vec::with_capacity(lines.len());
    for line in &lines {
        parsed.push(serde_json::from_str::<QueryResult>(line).expect("valid JSONL line"));
    }
    assert_eq!(parsed[0].query_id, "q1");
    assert_eq!(parsed[1].query_id, "q2");
}

// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─

/// Two lexical-only runs over the same seeded corpus and golden file
/// must serialize to byte-identical per-query JSON, once the wall-clock
/// `elapsed_ms` measurement is normalized.
#[test]
fn runner_lexical_is_deterministic_per_query_payload() {
    let env = RunEnv::new();
    let yaml = write_golden(
        env.data_dir().as_path(),
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let mut run_a = run_with_golden(&yaml, || {
        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
    });
    let mut run_b = run_with_golden(&yaml, || {
        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
    });

    // `elapsed_ms` is a timing measurement and can legitimately differ
    // between two otherwise identical runs (e.g. 0 ms vs 1 ms). Zero it
    // on both sides so the comparison checks retrieval determinism and
    // is not sensitive to scheduler noise.
    for qr in run_a
        .per_query
        .iter_mut()
        .chain(run_b.per_query.iter_mut())
    {
        qr.elapsed_ms = 0;
    }

    // Run-level fields (`run_id`, `created_at`) intentionally diverge
    // and are not part of `per_query`; the per-query payload (which is
    // what the snapshot fixture pins) must be byte-identical.
    let a_json = serde_json::to_string(&run_a.per_query).unwrap();
    let b_json = serde_json::to_string(&run_b.per_query).unwrap();
    assert_eq!(
        a_json, b_json,
        "lexical-only per_query payload must be byte-identical across runs"
    );
}

// ── 7. snapshot — per_query JSON pinned to fixtures/eval/run-1.json ─────────

/// A stable projection of the per-query payload must match the
/// checked-in fixture `tests/fixtures/eval/run-1.json`.
///
/// Setting `UPDATE_SNAPSHOTS` (to any value) rewrites the fixture from
/// the current run before the comparison.
#[test]
fn runner_per_query_snapshot_matches_fixture() {
    let env = RunEnv::new();
    let golden_dir = env.data_dir();
    let yaml = write_golden(
        &golden_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    // Project each record down to the fields the fixture pins: the
    // query identity, mode, hit count, answer/error status, and the
    // first hit's chunk id, doc id, heading path and fusion score
    // (stored under `score`). Timing-sensitive data such as
    // `elapsed_ms` is left out. FTS scores depend on the SQLite
    // version, so the pinned score is whatever a passing run produced
    // and must stay stable for the same seeded corpus. Byte-level
    // stability across runs is covered by the determinism test; this
    // one guards the field set and ordering.
    let mut projection = Vec::with_capacity(run.per_query.len());
    for qr in run.per_query.iter() {
        let top = qr.hits_top_k.first();
        let first_hit = top.map(|hit| {
            serde_json::json!({
                "chunk_id": hit.chunk_id,
                "doc_id": hit.doc_id,
                "heading_path": hit.heading_path,
                "score": hit.retrieval.fusion_score,
            })
        });
        projection.push(serde_json::json!({
            "query_id": qr.query_id,
            "query": qr.query,
            "mode": qr.mode,
            "hits_count": qr.hits_top_k.len(),
            "first_hit": first_hit,
            "has_answer": qr.answer.is_some(),
            "error": qr.error,
        }));
    }
    let actual = serde_json::to_string_pretty(&projection).unwrap();

    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let fixture_path = manifest_dir.join("tests/fixtures/eval/run-1.json");

    // Refresh mode: overwrite the fixture, then fall through to the
    // normal read-and-compare path.
    let refresh = std::env::var("UPDATE_SNAPSHOTS").is_ok();
    if refresh {
        fs::create_dir_all(fixture_path.parent().unwrap()).unwrap();
        fs::write(&fixture_path, &actual).unwrap();
    }

    let expected = match fs::read_to_string(&fixture_path) {
        Ok(text) => text,
        Err(e) => panic!("read snapshot {}: {e}", fixture_path.display()),
    };
    // Compare trimmed text so a trailing newline in the fixture does
    // not count as drift.
    assert_eq!(
        actual.trim(),
        expected.trim(),
        "snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh"
    );
}