Merge pull request 'feat(p5-1): kb-eval crate — golden-fixture runner + eval persistence' (#27) from feat/p5-1-golden-fixture-runner into main
Reviewed-on: altair823-org/kb#27
This commit was merged in pull request #27.
This commit is contained in:
32
Cargo.lock
generated
32
Cargo.lock
generated
@@ -3478,6 +3478,25 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-eval"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kb-app",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"kb-store-sqlite",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml",
|
||||
"tempfile",
|
||||
"time",
|
||||
"tracing",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-llm"
|
||||
version = "0.1.0"
|
||||
@@ -6380,6 +6399,19 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_yaml"
|
||||
version = "0.9.34+deprecated"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
|
||||
dependencies = [
|
||||
"indexmap 2.14.0",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
"unsafe-libyaml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_yaml_ng"
|
||||
version = "0.10.0"
|
||||
|
||||
@@ -18,6 +18,7 @@ members = [
|
||||
"crates/kb-rag",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
"crates/kb-eval",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -32,6 +33,9 @@ anyhow = "1"
|
||||
thiserror = "2"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
# Golden-fixture loader (P5-1, kb-eval) parses YAML; pinned in the
|
||||
# workspace so future eval-adjacent crates share the same major.
|
||||
serde_yaml = "0.9"
|
||||
time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
|
||||
uuid = { version = "1", features = ["v7", "serde"] }
|
||||
blake3 = "1"
|
||||
|
||||
@@ -1,24 +1,22 @@
|
||||
//! `App` — internal lifecycle struct (§7).
|
||||
//! `App` — facade lifecycle struct (§7).
|
||||
//!
|
||||
//! A single `App` represents one CLI invocation's worth of state: a
|
||||
//! resolved `Config`, an open `SqliteStore`, and (when embeddings are
|
||||
//! enabled) an `Embedder` + `LanceVectorStore`. Each public free
|
||||
//! function on `kb-app` wraps `App::open(config)` once, runs the
|
||||
//! requested op, and drops everything on return.
|
||||
//!
|
||||
//! The struct is `pub(crate)` because it is an internal seam: `kb-cli`
|
||||
//! calls only the free functions on the crate root. `kb-tui` (P9) is
|
||||
//! expected to hold one `App` for the session, at which point the
|
||||
//! struct may need to be promoted to `pub`. Until then, keep it
|
||||
//! private to insulate the wiring shape from downstream callers.
|
||||
//! A single `App` represents one CLI invocation's (or one TUI
|
||||
//! session's / one eval-runner suite's) worth of state: a resolved
|
||||
//! `Config`, an open `SqliteStore`, and (when embeddings are enabled)
|
||||
//! an `Embedder` + `LanceVectorStore`. Each public free function on
|
||||
//! `kb-app` builds an `App` once, runs the requested op, and drops
|
||||
//! everything on return; long-lived callers (kb-eval, the future P9
|
||||
//! TUI session) hold onto an `App` across many calls so the per-query
|
||||
//! cost is just a method dispatch.
|
||||
//!
|
||||
//! ## Embedder + Vector store lifetime
|
||||
//!
|
||||
//! `App::open` builds the SQLite store unconditionally. The embedder
|
||||
//! and vector store are *lazy + memoized* — built on first call to
|
||||
//! [`App::embedder`] / [`App::vector`] and cached in `OnceLock`s — so
|
||||
//! a long-lived `App` (e.g., the P9 TUI session) pays the ~470 MB
|
||||
//! ONNX init plus Lance reopen cost exactly once.
|
||||
//! `App::open_with_config` builds the SQLite store unconditionally.
|
||||
//! The embedder and vector store are *lazy + memoized* — built on
|
||||
//! first call to [`App::embedder`] / [`App::vector`] and cached in
|
||||
//! `OnceLock`s — so a long-lived `App` (kb-eval driving 50 queries,
|
||||
//! the P9 TUI session) pays the ~470 MB ONNX init plus Lance reopen
|
||||
//! cost exactly once.
|
||||
//!
|
||||
//! - `kb list` / `kb inspect` never need them.
|
||||
//! - `kb search --mode lexical` never needs them.
|
||||
@@ -27,7 +25,8 @@
|
||||
//! Building eagerly would force every CLI invocation to load ~470 MB of
|
||||
//! ONNX weights, which is the dominant cold-start cost. The lazy
|
||||
//! pattern keeps the lexical-only paths instant; the memoization makes
|
||||
//! the TUI's repeated searches cheap after the first.
|
||||
//! the TUI's repeated searches and the eval runner's per-query loop
|
||||
//! cheap after the first invocation.
|
||||
//!
|
||||
//! Embeddings can also be **disabled** workspace-wide via
|
||||
//! `config.models.embedding.provider = "none"` (or `dimensions = 0`);
|
||||
@@ -36,15 +35,26 @@
|
||||
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
|
||||
use kb_core::Embedder;
|
||||
use kb_core::{
|
||||
Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode,
|
||||
SearchQuery, VectorStore,
|
||||
};
|
||||
use kb_embed_local::FastembedEmbedder;
|
||||
use kb_llm_local::OllamaLanguageModel;
|
||||
use kb_rag::{AskOpts, RagPipeline};
|
||||
use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
use kb_store_vector::LanceVectorStore;
|
||||
|
||||
/// Internal facade state. See module docs for lifetime rules.
|
||||
pub(crate) struct App {
|
||||
/// Facade state — see module docs for lifetime rules.
|
||||
///
|
||||
/// The struct is public so long-lived callers (kb-eval, the future P9
|
||||
/// TUI session) can construct one and reuse it across many search /
|
||||
/// ask calls. The OnceLock-backed `embedder` / `vector` fields ensure
|
||||
/// the cold-start cost is paid exactly once per instance.
|
||||
pub struct App {
|
||||
pub(crate) config: kb_config::Config,
|
||||
pub(crate) sqlite: Arc<SqliteStore>,
|
||||
/// Memoized embedder — built lazily on first `embedder()` call when
|
||||
@@ -54,6 +64,11 @@ pub(crate) struct App {
|
||||
/// Memoized vector store — built lazily on first `vector()` call
|
||||
/// when embeddings are enabled. Same rationale as `embedder`.
|
||||
vector: OnceLock<Arc<LanceVectorStore>>,
|
||||
/// Memoized LLM — built lazily on first `ask()` call. Sharing one
|
||||
/// across the eval runner avoids re-handshaking the Ollama HTTP
|
||||
/// client per query (cheap, but still measurable on a 50-query
|
||||
/// suite).
|
||||
llm: OnceLock<Arc<dyn LanguageModel>>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
@@ -65,7 +80,7 @@ impl App {
|
||||
/// Downstream `LanceVectorStore::new` (called by [`Self::vector`])
|
||||
/// internally drives a `tokio::Runtime::block_on`, which panics if
|
||||
/// invoked from inside another tokio runtime.
|
||||
pub(crate) fn open(config: kb_config::Config) -> Result<Self> {
|
||||
pub fn open_with_config(config: kb_config::Config) -> Result<Self> {
|
||||
let sqlite = SqliteStore::open(&config).context("kb-app: open SqliteStore")?;
|
||||
sqlite
|
||||
.run_migrations()
|
||||
@@ -75,9 +90,112 @@ impl App {
|
||||
sqlite: Arc::new(sqlite),
|
||||
embedder: OnceLock::new(),
|
||||
vector: OnceLock::new(),
|
||||
llm: OnceLock::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Run a [`SearchQuery`] through the configured retriever stack and
|
||||
/// return the top-k hits.
|
||||
///
|
||||
/// Reuses any previously-built embedder / vector store on this `App`
|
||||
/// — long-lived callers (kb-eval, future TUI) get amortized cost
|
||||
/// across calls.
|
||||
pub fn search(&self, query: SearchQuery) -> Result<Vec<SearchHit>> {
|
||||
match query.mode {
|
||||
SearchMode::Lexical => {
|
||||
let lex = LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
);
|
||||
lex.search(&query)
|
||||
}
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let retr = VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
);
|
||||
retr.search(&query)
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
|
||||
hybrid.search(&query)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run a RAG `ask` against the configured retriever + LLM. Reuses
|
||||
/// the memoized embedder / vector / LLM where applicable.
|
||||
pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
let retriever: Arc<dyn Retriever> = match opts.mode {
|
||||
SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)),
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
))
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
Arc::new(HybridRetriever::new(&self.config, lex, vec_retr))
|
||||
}
|
||||
};
|
||||
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
pipeline.ask(query, opts)
|
||||
}
|
||||
|
||||
/// Returns `true` when the workspace has embeddings turned off
|
||||
/// (`provider = "none"` or `dimensions = 0`). Lexical-only mode.
|
||||
pub(crate) fn embeddings_disabled(&self) -> bool {
|
||||
@@ -123,4 +241,64 @@ impl App {
|
||||
let _ = self.vector.set(store.clone());
|
||||
Ok(Some(self.vector.get().cloned().unwrap_or(store)))
|
||||
}
|
||||
|
||||
/// Build (or reuse) the configured LLM. Currently always Ollama;
|
||||
/// when a second provider lands this is the place to switch on
|
||||
/// `config.models.llm.provider`.
|
||||
fn llm(&self) -> Result<Arc<dyn LanguageModel>> {
|
||||
if let Some(l) = self.llm.get() {
|
||||
return Ok(l.clone());
|
||||
}
|
||||
let llm: Arc<dyn LanguageModel> = Arc::new(
|
||||
OllamaLanguageModel::new(&self.config)
|
||||
.context("kb-app::ask: build OllamaLanguageModel")?,
|
||||
);
|
||||
let _ = self.llm.set(llm.clone());
|
||||
Ok(self.llm.get().cloned().unwrap_or(llm))
|
||||
}
|
||||
|
||||
/// Resolve the embedder + vector store, surfacing the user-friendly
|
||||
/// "switch to --mode lexical" error when embeddings are disabled.
|
||||
fn require_embeddings(
|
||||
&self,
|
||||
) -> Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<LanceVectorStore>,
|
||||
)> {
|
||||
let emb = self.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
or dimensions == 0); vector / hybrid search require embeddings — \
|
||||
switch to --mode lexical or enable an embedding provider in config.toml"
|
||||
)
|
||||
})?;
|
||||
let vec_store = self.vector()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"vector store unavailable while embedder is configured — this should \
|
||||
not happen; check `kb doctor` and the data_dir permissions"
|
||||
)
|
||||
})?;
|
||||
Ok((emb, vec_store))
|
||||
}
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the lexical retriever from
|
||||
/// the active config. This token surfaces in `SearchHit.index_version`
|
||||
/// and on snapshot tests; including the chunker version pins it to
|
||||
/// the chunking policy in effect.
|
||||
fn lexical_index_version(config: &kb_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the vector retriever. Tracks
|
||||
/// `(embedding_model, embedding_version, dimensions)` so a model swap
|
||||
/// flags drift via the existing index_version mismatch warning in
|
||||
/// `HybridRetriever::new`.
|
||||
fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
|
||||
IndexVersion(format!(
|
||||
"vec:{}@{}:{}",
|
||||
embedder.model_id().0,
|
||||
embedder.model_version().0,
|
||||
embedder.dimensions(),
|
||||
))
|
||||
}
|
||||
|
||||
@@ -43,22 +43,18 @@ use kb_chunk::MdHeadingV1Chunker;
|
||||
use kb_core::{
|
||||
Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
||||
EmbeddingKind, IndexVersion, IngestReport, LanguageModel, ParserVersion, RawAsset,
|
||||
Retriever, SearchHit, SearchMode, SearchQuery, SourceConnector, SourceScope,
|
||||
SourceUri, VectorRecord, VectorStore,
|
||||
EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
|
||||
SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
|
||||
};
|
||||
use kb_llm_local::OllamaLanguageModel;
|
||||
use kb_normalize::build_canonical_document;
|
||||
use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
|
||||
use kb_rag::RagPipeline;
|
||||
use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
|
||||
use kb_source_fs::FsSourceConnector;
|
||||
|
||||
mod app;
|
||||
pub mod doctor_signal;
|
||||
pub mod logging;
|
||||
|
||||
use app::App;
|
||||
pub use app::App;
|
||||
|
||||
/// Parser-version label persisted in `documents.parser_version` for
|
||||
/// every Markdown file ingested through the `kb-parse-md` pipeline.
|
||||
@@ -168,7 +164,7 @@ pub fn ingest_with_config(
|
||||
) -> anyhow::Result<IngestReport> {
|
||||
let started_instant = std::time::Instant::now();
|
||||
|
||||
let app = App::open(config)?;
|
||||
let app = App::open_with_config(config)?;
|
||||
|
||||
// Walk the workspace.
|
||||
let connector = FsSourceConnector::new(&app.config)
|
||||
@@ -667,7 +663,7 @@ pub fn list_docs_with_config(
|
||||
config: kb_config::Config,
|
||||
filter: DocFilter,
|
||||
) -> anyhow::Result<Vec<DocSummary>> {
|
||||
let app = App::open(config)?;
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite.list_documents(&filter)
|
||||
}
|
||||
|
||||
@@ -683,7 +679,7 @@ pub fn inspect_doc_with_config(
|
||||
config: kb_config::Config,
|
||||
id: &DocumentId,
|
||||
) -> anyhow::Result<CanonicalDocument> {
|
||||
let app = App::open(config)?;
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite
|
||||
.get_document(id)?
|
||||
.ok_or_else(|| anyhow!("document not found: {} (try `kb list docs`)", id.0))
|
||||
@@ -701,7 +697,7 @@ pub fn inspect_chunk_with_config(
|
||||
config: kb_config::Config,
|
||||
id: &ChunkId,
|
||||
) -> anyhow::Result<Chunk> {
|
||||
let app = App::open(config)?;
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite
|
||||
.get_chunk(id)?
|
||||
.ok_or_else(|| anyhow!("chunk not found: {} (try `kb inspect doc <id>`)", id.0))
|
||||
@@ -715,101 +711,15 @@ pub fn search(query: SearchQuery) -> anyhow::Result<Vec<SearchHit>> {
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`search`]), not this.
|
||||
/// ([`search`]), not this. Builds a one-shot `App` and delegates to
|
||||
/// [`App::search`]; long-lived callers should hold an `App` instance
|
||||
/// directly to amortize the embedder / vector-store cold start.
|
||||
#[doc(hidden)]
|
||||
pub fn search_with_config(
|
||||
config: kb_config::Config,
|
||||
query: SearchQuery,
|
||||
) -> anyhow::Result<Vec<SearchHit>> {
|
||||
let app = App::open(config)?;
|
||||
|
||||
match query.mode {
|
||||
SearchMode::Lexical => {
|
||||
let lex = LexicalRetriever::with_settings(
|
||||
app.sqlite.clone(),
|
||||
lexical_index_version(&app.config),
|
||||
app.config.search.snippet_chars,
|
||||
);
|
||||
lex.search(&query)
|
||||
}
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = require_embeddings(&app)?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let retr = VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
app.sqlite.clone(),
|
||||
vec_iv,
|
||||
app.config.search.snippet_chars,
|
||||
);
|
||||
retr.search(&query)
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
app.sqlite.clone(),
|
||||
lexical_index_version(&app.config),
|
||||
app.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = require_embeddings(&app)?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
app.sqlite.clone(),
|
||||
vec_iv,
|
||||
app.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let hybrid = HybridRetriever::new(&app.config, lex, vec_retr);
|
||||
hybrid.search(&query)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn require_embeddings(
|
||||
app: &App,
|
||||
) -> anyhow::Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<kb_store_vector::LanceVectorStore>,
|
||||
)> {
|
||||
let emb = app.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
or dimensions == 0); vector / hybrid search require embeddings — \
|
||||
switch to --mode lexical or enable an embedding provider in config.toml"
|
||||
)
|
||||
})?;
|
||||
let vec_store = app.vector()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"vector store unavailable while embedder is configured — this should \
|
||||
not happen; check `kb doctor` and the data_dir permissions"
|
||||
)
|
||||
})?;
|
||||
Ok((emb, vec_store))
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the lexical retriever from
|
||||
/// the active config. This token surfaces in `SearchHit.index_version`
|
||||
/// and on snapshot tests; including the chunker version pins it to
|
||||
/// the chunking policy in effect.
|
||||
fn lexical_index_version(config: &kb_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the vector retriever. Tracks
|
||||
/// `(embedding_model, embedding_version, dimensions)` so a model swap
|
||||
/// flags drift via the existing index_version mismatch warning in
|
||||
/// `HybridRetriever::new`.
|
||||
fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
|
||||
IndexVersion(format!(
|
||||
"vec:{}@{}:{}",
|
||||
embedder.model_id().0,
|
||||
embedder.model_version().0,
|
||||
embedder.dimensions(),
|
||||
))
|
||||
App::open_with_config(config)?.search(query)
|
||||
}
|
||||
|
||||
// ── ask ──────────────────────────────────────────────────────────────────
|
||||
@@ -826,64 +736,15 @@ pub fn ask(query: &str, opts: AskOpts) -> anyhow::Result<Answer> {
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`ask`]), not this. Mirrors the `*_with_config` pattern documented
|
||||
/// at the top of this module.
|
||||
/// ([`ask`]), not this. Builds a one-shot `App` and delegates to
|
||||
/// [`App::ask`].
|
||||
#[doc(hidden)]
|
||||
pub fn ask_with_config(
|
||||
config: kb_config::Config,
|
||||
query: &str,
|
||||
opts: AskOpts,
|
||||
) -> anyhow::Result<Answer> {
|
||||
let app = App::open(config)?;
|
||||
|
||||
let retriever: Arc<dyn Retriever> = match opts.mode {
|
||||
SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
|
||||
app.sqlite.clone(),
|
||||
lexical_index_version(&app.config),
|
||||
app.config.search.snippet_chars,
|
||||
)),
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = require_embeddings(&app)?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
app.sqlite.clone(),
|
||||
vec_iv,
|
||||
app.config.search.snippet_chars,
|
||||
))
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
app.sqlite.clone(),
|
||||
lexical_index_version(&app.config),
|
||||
app.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = require_embeddings(&app)?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
app.sqlite.clone(),
|
||||
vec_iv,
|
||||
app.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
Arc::new(HybridRetriever::new(&app.config, lex, vec_retr))
|
||||
}
|
||||
};
|
||||
|
||||
let llm: Arc<dyn LanguageModel> = Arc::new(
|
||||
OllamaLanguageModel::new(&app.config)
|
||||
.context("kb-app::ask: build OllamaLanguageModel")?,
|
||||
);
|
||||
|
||||
let pipeline =
|
||||
RagPipeline::new(app.config.clone(), retriever, llm, app.sqlite.clone());
|
||||
pipeline.ask(query, opts)
|
||||
App::open_with_config(config)?.ask(query, opts)
|
||||
}
|
||||
|
||||
/// Run the doctor checks against the explicit config path the user
|
||||
|
||||
@@ -8,6 +8,9 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
mod paths;
|
||||
pub use paths::expand_path;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Config {
|
||||
pub schema_version: u32,
|
||||
|
||||
186
crates/kb-config/src/paths.rs
Normal file
186
crates/kb-config/src/paths.rs
Normal file
@@ -0,0 +1,186 @@
|
||||
//! Shared path expansion helper.
|
||||
//!
|
||||
//! `Config::storage.*` fields are stored as raw template strings (e.g.
|
||||
//! `${XDG_DATA_HOME:-~/.local/share}/kb`, `{data_dir}/runs`). Every
|
||||
//! crate that turns one of those strings into a real filesystem path
|
||||
//! needs to apply the same set of substitutions; this module is the
|
||||
//! single source of truth so the behavior cannot drift.
|
||||
//!
|
||||
//! Substitutions, applied in order:
|
||||
//!
|
||||
//! 1. `{data_dir}` → caller-supplied `data_dir`.
|
||||
//! - When the caller passes an empty `data_dir` (because they ARE
|
||||
//! resolving `data_dir` itself), the substitution is a no-op so
|
||||
//! a literal `{data_dir}` is left in place rather than producing
|
||||
//! a `/{data_dir}/...` artifact.
|
||||
//! 2. `${XDG_DATA_HOME:-<default>}` (or the bare `${XDG_DATA_HOME}`) →
|
||||
//! the env var if set + non-empty, else the default after `:-`.
|
||||
//! Mimics POSIX shell's `${VAR:-default}` semantics. Mid-string
|
||||
//! occurrences are supported; only the first match is replaced.
|
||||
//! 3. Leading `~` / `~/...` → `$HOME`. Any non-leading `~` is left
|
||||
//! literal (matches shell behavior — only the first segment expands).
|
||||
//!
|
||||
//! The result is a `PathBuf` regardless of whether all substitutions
|
||||
//! were applicable; relative paths are kept relative to the caller's
|
||||
//! CWD (not resolved here).
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Expand a storage-path template into a [`PathBuf`].
///
/// Substitutions are applied in this order:
///
/// 1. `{data_dir}` → `data_dir`, for every occurrence. Skipped when
///    `data_dir` is empty, which is how a caller signals that it is
///    resolving `data_dir` itself. A literal `{data_dir}` token in
///    `raw` is then left untouched in the output.
/// 2. The first `${XDG_DATA_HOME}` / `${XDG_DATA_HOME:-<default>}` →
///    the `XDG_DATA_HOME` environment variable when it is set and
///    non-empty, otherwise `<default>` (the empty string when there is
///    no `:-` clause).
/// 3. A leading `~` that is the entire first path segment (`~` or
///    `~/...`) → `$HOME`. `~user/...` and any non-leading `~` are left
///    literal. If `HOME` is unset the string is returned unchanged.
///
/// The result is not canonicalized, and relative inputs stay relative.
pub fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
    // 1. `{data_dir}` substitution (no-op when resolving data_dir itself).
    let mut expanded = if data_dir.is_empty() {
        raw.to_string()
    } else {
        raw.replace("{data_dir}", data_dir)
    };

    // 2. `${XDG_DATA_HOME:-<default>}`: env override, else the default.
    // NOTE(review): this prefix match also catches longer variable
    // names such as `${XDG_DATA_HOME_X}`; confirm no config relies on that.
    if let Some(start) = expanded.find("${XDG_DATA_HOME") {
        if let Some(rel_end) = expanded[start..].find('}') {
            let end = start + rel_end + 1; // one past the closing '}'
            let replacement = match std::env::var("XDG_DATA_HOME") {
                Ok(value) if !value.is_empty() => value,
                // Unset, empty, or non-UTF-8: use the text after `:-`.
                _ => expanded[start + 2..end - 1]
                    .split_once(":-")
                    .map(|(_, default)| default.to_string())
                    .unwrap_or_default(),
            };
            expanded.replace_range(start..end, &replacement);
        }
    }

    // 3. Leading `~` → $HOME, only when `~` is the whole first segment.
    // Without this guard `~user/x` would become `$HOME/user/x`.
    if let Some(rest) = expanded.strip_prefix('~') {
        if rest.is_empty() || rest.starts_with('/') {
            if let Some(home) = std::env::var_os("HOME") {
                return PathBuf::from(home).join(rest.trim_start_matches('/'));
            }
        }
    }

    PathBuf::from(expanded)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex as StdMutex;

    /// Serializes every test that reads or mutates `XDG_DATA_HOME` /
    /// `HOME`, so tests running in parallel never observe each other's
    /// transient values.
    static ENV_LOCK: StdMutex<()> = StdMutex::new(());

    /// RAII guard: snapshots `XDG_DATA_HOME` on construction and
    /// restores it (or removes it) on drop.
    struct XdgGuard {
        prior: Option<String>,
    }

    impl XdgGuard {
        /// Record the current value; `None` when unset or not UTF-8.
        fn capture() -> Self {
            Self {
                prior: std::env::var("XDG_DATA_HOME").ok(),
            }
        }
    }

    impl Drop for XdgGuard {
        fn drop(&mut self) {
            // SAFETY: edition 2024 marks set_var/remove_var unsafe
            // because env mutation is not thread-safe. Every test
            // declares its ENV_LOCK guard before this guard, so the
            // lock is still held when this drop runs.
            unsafe {
                match &self.prior {
                    Some(v) => std::env::set_var("XDG_DATA_HOME", v),
                    None => std::env::remove_var("XDG_DATA_HOME"),
                }
            }
        }
    }

    #[test]
    fn substitutes_data_dir_template() {
        let p = expand_path("{data_dir}/runs", "/tmp/kbtest");
        assert_eq!(p, PathBuf::from("/tmp/kbtest/runs"));
    }

    #[test]
    fn data_dir_substitution_skipped_when_empty() {
        // Empty `data_dir` is the "resolving data_dir itself" signal;
        // the literal `{data_dir}` token must survive.
        let p = expand_path("{data_dir}/runs", "");
        assert_eq!(p, PathBuf::from("{data_dir}/runs"));
    }

    #[test]
    fn passthrough_absolute_path() {
        let p = expand_path("/abs/runs", "/ignored");
        assert_eq!(p, PathBuf::from("/abs/runs"));
    }

    #[test]
    fn xdg_data_home_set_replaces_var() {
        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let _guard = XdgGuard::capture();
        // SAFETY: lock held for the duration of this test.
        unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") };

        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
        assert_eq!(p, PathBuf::from("/custom/path/kb"));
    }

    #[test]
    fn xdg_data_home_unset_uses_default() {
        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let _guard = XdgGuard::capture();
        // SAFETY: lock held for the duration of this test.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };

        let home = std::env::var("HOME").expect("HOME must be set in tests");
        let expected = PathBuf::from(home).join(".local/share/kb");
        let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
        assert_eq!(p, expected);
    }

    #[test]
    fn xdg_with_no_default_resolves_to_empty_when_unset() {
        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let _guard = XdgGuard::capture();
        // SAFETY: lock held for the duration of this test.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };

        // No `:-default` clause, no env var → empty string substitution.
        let p = expand_path("${XDG_DATA_HOME}/kb", "");
        assert_eq!(p, PathBuf::from("/kb"));
    }

    #[test]
    fn leading_tilde_expands_to_home() {
        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let home = std::env::var("HOME").expect("HOME must be set in tests");
        let p = expand_path("~/runs", "");
        assert_eq!(p, PathBuf::from(home).join("runs"));
    }

    #[test]
    fn data_dir_then_xdg_then_tilde_compose() {
        // Order matters: `{data_dir}` is substituted first with a value
        // that still contains an unexpanded `${XDG_DATA_HOME:-…}` and a
        // `~`; the later steps must then resolve the combined string.
        let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        let _guard = XdgGuard::capture();
        let unexpanded_data_dir = "${XDG_DATA_HOME:-~/.local/share}/kb";

        // Env var set: step 2 resolves the variable, step 3 is a no-op.
        // SAFETY: lock held for the duration of this test.
        unsafe { std::env::set_var("XDG_DATA_HOME", "/xdg/data") };
        let p = expand_path("{data_dir}/runs", unexpanded_data_dir);
        assert_eq!(p, PathBuf::from("/xdg/data/kb/runs"));

        // Env var unset: step 2 yields the `~/...` default and step 3
        // expands the tilde.
        // SAFETY: lock held for the duration of this test.
        unsafe { std::env::remove_var("XDG_DATA_HOME") };
        let home = std::env::var("HOME").expect("HOME must be set in tests");
        let p = expand_path("{data_dir}/runs", unexpanded_data_dir);
        assert_eq!(p, PathBuf::from(home).join(".local/share/kb/runs"));
    }
}
|
||||
@@ -22,11 +22,11 @@
|
||||
//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md`
|
||||
//! §7.2 (Embedder), §6.4 ([models.embedding]), §9 (versioning).
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
|
||||
use kb_config::expand_path;
|
||||
use kb_embed::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion};
|
||||
|
||||
/// Subdirectory under `config.storage.model_dir` where the fastembed
|
||||
@@ -60,9 +60,8 @@ impl FastembedEmbedder {
|
||||
/// first `embed`).
|
||||
pub fn new(config: &kb_config::Config) -> Result<Self> {
|
||||
// 1. Resolve `{data_dir}/models/fastembed/` from the config
|
||||
// templates. `kb-config` does not expose a public path
|
||||
// resolver yet, so we hand-roll a tiny one mirroring
|
||||
// kb-store-sqlite's `expand_data_dir`.
|
||||
// templates. Goes through the shared `kb_config::expand_path`
|
||||
// so every crate resolves storage paths identically.
|
||||
let data_dir = expand_path(&config.storage.data_dir, "");
|
||||
let model_dir = expand_path(&config.storage.model_dir, &data_dir.to_string_lossy());
|
||||
let cache_dir = model_dir.join(FASTEMBED_CACHE_SUBDIR);
|
||||
@@ -222,58 +221,6 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Expand the limited template language `kb-config` uses for storage
/// paths.
///
/// Supported substitutions, applied in order:
/// 1. `{data_dir}` → `data_dir` (caller-supplied resolved string). This
///    step is skipped when `data_dir` is empty (used by the call that
///    resolves `data_dir` itself).
/// 2. The first `${XDG_DATA_HOME...}` placeholder → the value of the
///    `XDG_DATA_HOME` env var when it is set and non-empty; otherwise
///    the text after `:-` inside the placeholder, or the empty string
///    when the placeholder has no `:-default` clause.
/// 3. A leading `~` that is the whole string or is followed by `/` →
///    `$HOME`. `~user/...` forms are left untouched: they name another
///    account's home directory and must not be rewritten to
///    `$HOME/user/...`.
///
/// When `HOME` is not set, the string is returned without tilde
/// expansion.
///
/// Modeled on `kb-store-sqlite::store::expand_data_dir` (per the
/// original note). NOTE(review): that sibling is not visible here —
/// confirm whether it needs the same `~user` guard.
fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
    let mut s = raw.to_string();

    if !data_dir.is_empty() {
        s = s.replace("{data_dir}", data_dir);
    }

    // ${XDG_DATA_HOME:-~/.local/share}: respect the env override, else
    // fall back to the suffix after `:-`.
    if let Some(start) = s.find("${XDG_DATA_HOME") {
        if let Some(rel_end) = s[start..].find('}') {
            let end = start + rel_end + 1; // include trailing '}'
            let inner = &s[start + 2..end - 1]; // strip `${` and `}`
            let replacement = match std::env::var("XDG_DATA_HOME") {
                Ok(v) if !v.is_empty() => v,
                _ => inner
                    .split_once(":-")
                    .map(|(_, default)| default.to_string())
                    .unwrap_or_default(),
            };
            s.replace_range(start..end, &replacement);
        }
    }

    // Leading `~` → $HOME, but only for `~` and `~/...`.
    if let Some(rest) = s.strip_prefix('~') {
        if rest.is_empty() || rest.starts_with('/') {
            if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
                return home.join(rest.trim_start_matches('/'));
            }
        }
    }

    PathBuf::from(s)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -354,80 +301,6 @@ mod tests {
|
||||
assert!(msg.contains("unsupported embedding model"), "msg={msg}");
|
||||
}
|
||||
|
||||
// ── expand_path ──────────────────────────────────────────────────
|
||||
|
||||
/// `{data_dir}` in the raw template is replaced by the supplied value.
#[test]
fn expand_path_substitutes_data_dir_template() {
    let expanded = expand_path("{data_dir}/models", "/tmp/kbtest");
    let expected = PathBuf::from("/tmp/kbtest/models");
    assert_eq!(expanded, expected);
}
|
||||
|
||||
/// A plain absolute path with no placeholders comes back unchanged.
#[test]
fn expand_path_no_op_without_template() {
    let expanded = expand_path("/abs/path", "/tmp/kbtest");
    let expected = PathBuf::from("/abs/path");
    assert_eq!(expanded, expected);
}
|
||||
|
||||
// ── expand_path: XDG_DATA_HOME fallback ──────────────────────────
|
||||
//
|
||||
// These two tests mutate the process-wide `XDG_DATA_HOME` env var,
|
||||
// which is unsafe under edition 2024 and racy under cargo's default
|
||||
// parallel test runner. The shared `ENV_LOCK` serializes them; each
|
||||
// test snapshots the prior value and restores it on exit.
|
||||
|
||||
use std::sync::Mutex as StdMutex;
|
||||
static ENV_LOCK: StdMutex<()> = StdMutex::new(());
|
||||
|
||||
/// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores
/// it on drop. Pair with the `ENV_LOCK` guard for serial access.
struct XdgGuard {
    /// Value of `XDG_DATA_HOME` at capture time; `None` when it was
    /// unset. Stored as an `OsString` so a non-Unicode value is
    /// restored verbatim instead of being treated as "unset".
    prior: Option<std::ffi::OsString>,
}

impl XdgGuard {
    /// Record the current `XDG_DATA_HOME` (set or unset).
    fn capture() -> Self {
        Self {
            prior: std::env::var_os("XDG_DATA_HOME"),
        }
    }
}

impl Drop for XdgGuard {
    /// Put `XDG_DATA_HOME` back to the captured state.
    fn drop(&mut self) {
        // SAFETY: edition 2024 marks `set_var`/`remove_var` unsafe
        // because env mutation is not thread-safe. Callers hold
        // `ENV_LOCK` for the duration of the test, so no other
        // thread observes the mutation.
        unsafe {
            match &self.prior {
                Some(v) => std::env::set_var("XDG_DATA_HOME", v),
                None => std::env::remove_var("XDG_DATA_HOME"),
            }
        }
    }
}
|
||||
|
||||
/// With `XDG_DATA_HOME` set, the placeholder takes the env value and
/// the `:-default` clause is ignored.
#[test]
fn expand_path_xdg_data_home_set() {
    let _env = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    // Declared after the lock so the variable is restored before the
    // lock is released.
    let _restore = XdgGuard::capture();
    // SAFETY: the env lock is held for the duration of this test.
    unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") };

    let expanded = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", "");
    let expected = PathBuf::from("/custom/path/kb");
    assert_eq!(expanded, expected);
}
|
||||
|
||||
/// With `XDG_DATA_HOME` unset, the `:-~/.local/share` default is used
/// and its leading `~` then resolves against `$HOME`.
#[test]
fn expand_path_xdg_data_home_unset_falls_back_to_home() {
    let _env = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let _restore = XdgGuard::capture();
    // SAFETY: the env lock is held for the duration of this test.
    unsafe { std::env::remove_var("XDG_DATA_HOME") };

    let expected = {
        let home_dir = std::env::var("HOME").expect("HOME must be set in tests");
        PathBuf::from(home_dir).join(".local/share/kb")
    };

    assert_eq!(
        expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", ""),
        expected
    );
}
|
||||
// expand_path tests live in `kb-config::paths`. The adapter imports
|
||||
// it and trusts the upstream coverage rather than duplicating it.
|
||||
}
|
||||
|
||||
30
crates/kb-eval/Cargo.toml
Normal file
30
crates/kb-eval/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kb-eval"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Golden-fixture eval runner: load YAML, drive kb-app search/ask, persist eval_runs / eval_query_results / per_query.jsonl"
|
||||
|
||||
[dependencies]
|
||||
# Allowed deps per p5-1 spec — domain types + facade only.
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
kb-app = { path = "../kb-app" }
|
||||
kb-store-sqlite = { path = "../kb-store-sqlite" }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
serde_yaml = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
# `uuid::Uuid::now_v7()` powers the `run_<ulid_lower>`-shaped run_id;
|
||||
# v7 UUIDs are timestamp-ordered (same monotonicity as ULID) and `uuid`
|
||||
# is already in workspace deps, so we avoid pulling a new ULID crate
|
||||
# just for the lower-cased timestamp prefix.
|
||||
uuid = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
rusqlite = { workspace = true }
|
||||
29
crates/kb-eval/src/lib.rs
Normal file
29
crates/kb-eval/src/lib.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
//! `kb-eval` — golden-fixture eval runner (P5-1).
|
||||
//!
|
||||
//! Loads `fixtures/golden_queries.yaml`, runs each entry through the
|
||||
//! [`kb_app`] facade (lexical / vector / hybrid + optional RAG), and
|
||||
//! persists results into `eval_runs` / `eval_query_results` plus
|
||||
//! `runs_dir/<run_id>/per_query.jsonl` (design §5.7, §6.3).
|
||||
//!
|
||||
//! Metric computation lives in P5-2 (`kb-eval::metrics`); this crate is
|
||||
//! the **data collector** only.
|
||||
//!
|
||||
//! ## Allowed deps (per task spec)
|
||||
//!
|
||||
//! `kb-core`, `kb-config`, `kb-app`, `kb-store-sqlite`, plus `serde`,
|
||||
//! `serde_yaml`, `serde_json`, `time`, `tracing`,
|
||||
//! `anyhow`, `uuid`. Retrieval / embedding / LLM crates are NOT
|
||||
//! reachable here — every retrieval and `ask` call must go through
|
||||
//! `kb-app`.
|
||||
//!
|
||||
//! ## `run_id` recipe
|
||||
//!
|
||||
//! `run_id` uses UUIDv7 simple — timestamp-ordered, lowercase hex.
|
||||
|
||||
mod loader;
|
||||
mod runner;
|
||||
mod types;
|
||||
|
||||
pub use loader::load_golden_set;
|
||||
pub use runner::{run_eval, run_eval_with_config};
|
||||
pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
|
||||
229
crates/kb-eval/src/loader.rs
Normal file
229
crates/kb-eval/src/loader.rs
Normal file
@@ -0,0 +1,229 @@
|
||||
//! Golden-set YAML loader.
|
||||
//!
|
||||
//! Two entry points:
|
||||
//!
|
||||
//! - [`load_golden_set`] — pure YAML parse + uniqueness check. Used by
|
||||
//! tests that don't have a SQLite store handy.
|
||||
//! - [`load_golden_set_validated`] — test-only (`#[cfg(test)]`) wrapper
//!   that additionally verifies every `expected_doc_id` /
//!   `expected_chunk_id` exists in the SQLite DB the supplied
//!   [`kb_config::Config`] points at. Production code
//!   ([`crate::run_eval_with_config`]) calls [`load_golden_set`] and
//!   `validate_against_db` directly, so a stale golden set still fails
//!   fast at run start.
|
||||
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
|
||||
use crate::types::GoldenQuery;
|
||||
|
||||
/// Parse the YAML at `path` into a `Vec<GoldenQuery>` and check that
|
||||
/// every `id` is unique.
|
||||
///
|
||||
/// The YAML is expected to be a top-level list of mappings. Required
|
||||
/// fields per entry: `id`, `query`. All other fields default to empty /
|
||||
/// `None` per [`GoldenQuery`]'s `serde(default)` annotations.
|
||||
pub fn load_golden_set(path: &Path) -> Result<Vec<GoldenQuery>> {
|
||||
let bytes =
|
||||
std::fs::read(path).with_context(|| format!("read golden YAML from {}", path.display()))?;
|
||||
let queries: Vec<GoldenQuery> = serde_yaml::from_slice(&bytes)
|
||||
.with_context(|| format!("parse golden YAML at {}", path.display()))?;
|
||||
check_unique_ids(&queries)?;
|
||||
Ok(queries)
|
||||
}
|
||||
|
||||
/// [`load_golden_set`] followed by [`validate_against_db`]: the parsed
/// entries are returned only if every `expected_doc_id` /
/// `expected_chunk_id` they mention exists in the SQLite database that
/// `cfg` resolves to.
///
/// Unknown IDs are reported together in one sorted error so a curator
/// can repair the whole set in a single pass.
///
/// Compiled for tests only. Production code calls the two steps
/// directly from [`crate::run_eval_with_config`].
#[cfg(test)]
pub(crate) fn load_golden_set_validated(
    yaml_path: &Path,
    cfg: &kb_config::Config,
) -> Result<Vec<GoldenQuery>> {
    load_golden_set(yaml_path).and_then(|loaded| {
        validate_against_db(&loaded, cfg)?;
        Ok(loaded)
    })
}
|
||||
|
||||
fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> {
|
||||
let mut seen: HashSet<&str> = HashSet::new();
|
||||
let mut dups: BTreeSet<String> = BTreeSet::new();
|
||||
for q in queries {
|
||||
if !seen.insert(q.id.as_str()) {
|
||||
dups.insert(q.id.clone());
|
||||
}
|
||||
}
|
||||
if dups.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
let list: Vec<String> = dups.into_iter().collect();
|
||||
Err(anyhow!("duplicate query id(s): {}", list.join(", ")))
|
||||
}
|
||||
}
|
||||
|
||||
/// Read every doc_id / chunk_id referenced by `queries` and confirm
|
||||
/// SQLite has rows for them. Builds a sorted, deduplicated error
|
||||
/// message listing every missing ID.
|
||||
pub(crate) fn validate_against_db(queries: &[GoldenQuery], cfg: &kb_config::Config) -> Result<()> {
|
||||
// Short-circuit when there is nothing to validate — saves opening
|
||||
// SQLite for golden sets that omit expected_*_ids entirely.
|
||||
let needs_check = queries
|
||||
.iter()
|
||||
.any(|q| !q.expected_doc_ids.is_empty() || !q.expected_chunk_ids.is_empty());
|
||||
if !needs_check {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let store = SqliteStore::open(cfg).context("open SqliteStore for golden validation")?;
|
||||
store
|
||||
.run_migrations()
|
||||
.context("run migrations for golden validation")?;
|
||||
|
||||
let mut missing_docs: BTreeSet<String> = BTreeSet::new();
|
||||
let mut missing_chunks: BTreeSet<String> = BTreeSet::new();
|
||||
|
||||
for q in queries {
|
||||
for did in &q.expected_doc_ids {
|
||||
let exists = store
|
||||
.document_exists(&did.0)
|
||||
.with_context(|| format!("probe document {}", did.0))?;
|
||||
if !exists {
|
||||
missing_docs.insert(did.0.clone());
|
||||
}
|
||||
}
|
||||
for cid in &q.expected_chunk_ids {
|
||||
let exists = store
|
||||
.chunk_exists(&cid.0)
|
||||
.with_context(|| format!("probe chunk {}", cid.0))?;
|
||||
if !exists {
|
||||
missing_chunks.insert(cid.0.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if missing_docs.is_empty() && missing_chunks.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut parts: Vec<String> = Vec::new();
|
||||
if !missing_docs.is_empty() {
|
||||
parts.push(format!(
|
||||
"missing doc_ids: {}",
|
||||
missing_docs.into_iter().collect::<Vec<_>>().join(", ")
|
||||
));
|
||||
}
|
||||
if !missing_chunks.is_empty() {
|
||||
parts.push(format!(
|
||||
"missing chunk_ids: {}",
|
||||
missing_chunks.into_iter().collect::<Vec<_>>().join(", ")
|
||||
));
|
||||
}
|
||||
Err(anyhow!(
|
||||
"golden set references unknown IDs — {}",
|
||||
parts.join("; ")
|
||||
))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Tests that exercise the crate-private
    //! [`load_golden_set_validated`]. The pure-parser cases live in
    //! `tests/loader.rs`; only the validated-variant cases need to sit
    //! next to the function so they can see the `pub(crate)` symbol.
    use super::*;
    use kb_config::Config;
    use kb_store_sqlite::SqliteStore;
    use rusqlite::params;
    use std::fs;
    use tempfile::tempdir;

    /// A golden entry naming one seeded chunk and one unknown chunk is
    /// rejected, and only the unknown id appears in the error.
    #[test]
    fn rejects_unknown_expected_chunk_id() {
        let tmp = tempdir().unwrap();
        let mut config = Config::defaults();
        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();

        let store = SqliteStore::open(&config).unwrap();
        store.run_migrations().unwrap();
        seed_one_chunk(&store, "doc_present", "chunk_present");

        // Continuation keys are indented two spaces so they align with
        // `id` inside the `- ` sequence entry; YAML requires that.
        let yaml_path = tmp.path().join("golden.yaml");
        fs::write(
            &yaml_path,
            "- id: g1\n  query: hello\n  expected_chunk_ids: [\"chunk_present\", \"chunk_missing\"]\n",
        )
        .unwrap();

        let err = load_golden_set_validated(&yaml_path, &config).unwrap_err();
        let msg = format!("{err:#}");
        assert!(msg.contains("missing chunk_ids"), "msg: {msg}");
        assert!(msg.contains("chunk_missing"), "msg: {msg}");
        assert!(!msg.contains("chunk_present"), "msg: {msg}");
    }

    /// A golden entry whose doc and chunk ids are both seeded loads
    /// successfully.
    #[test]
    fn accepts_resolved_ids() {
        let tmp = tempdir().unwrap();
        let mut config = Config::defaults();
        config.storage.data_dir = tmp.path().to_string_lossy().into_owned();

        let store = SqliteStore::open(&config).unwrap();
        store.run_migrations().unwrap();
        seed_one_chunk(&store, "doc_present", "chunk_present");

        let yaml_path = tmp.path().join("golden.yaml");
        fs::write(
            &yaml_path,
            "- id: g1\n  query: hello\n  expected_doc_ids: [\"doc_present\"]\n  expected_chunk_ids: [\"chunk_present\"]\n",
        )
        .unwrap();

        let qs = load_golden_set_validated(&yaml_path, &config).unwrap();
        assert_eq!(qs.len(), 1);
    }

    /// Insert one asset → document → chunk chain with placeholder
    /// column values so the existence probes have a row to find.
    /// `INSERT OR IGNORE` makes repeated seeding harmless.
    fn seed_one_chunk(store: &SqliteStore, doc_id: &str, chunk_id: &str) {
        let conn = store.read_conn();
        let asset_id = format!("a_{doc_id}");
        conn.execute(
            "INSERT OR IGNORE INTO assets (
                asset_id, source_uri, workspace_path, media_type, byte_len,
                checksum, storage_kind, storage_path, discovered_at
            ) VALUES (?, ?, ?, '\"markdown\"', 0,
                'deadbeefdeadbeefdeadbeefdeadbeef',
                'reference', ?, '1970-01-01T00:00:00Z')",
            params![asset_id, "file:///tmp/x.md", "x.md", "x.md"],
        )
        .unwrap();
        conn.execute(
            "INSERT OR IGNORE INTO documents (
                doc_id, asset_id, workspace_path, title, lang, source_type,
                trust_level, parser_version, doc_version, schema_version,
                metadata_json, provenance_json, created_at, updated_at
            ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
                '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
            params![doc_id, asset_id, "x.md"],
        )
        .unwrap();
        conn.execute(
            "INSERT OR IGNORE INTO chunks (
                chunk_id, doc_id, text, heading_path_json, section_label,
                source_spans_json, token_estimate, chunker_version,
                policy_hash, block_ids_json, created_at
            ) VALUES (?, ?, 'hi', '[]', NULL,
                '[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
                1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
            params![chunk_id, doc_id],
        )
        .unwrap();
    }
}
|
||||
270
crates/kb-eval/src/runner.rs
Normal file
270
crates/kb-eval/src/runner.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
//! Per-query eval runner. See [`run_eval`] / [`run_eval_with_config`].
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kb_app::App;
|
||||
use kb_config::expand_path;
|
||||
use kb_core::{SearchFilters, SearchQuery};
|
||||
use kb_store_sqlite::{EvalRunRow, SqliteStore};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::loader::{load_golden_set, validate_against_db};
|
||||
use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
|
||||
|
||||
/// Milliseconds elapsed since `start`, saturating at `u32::MAX`.
///
/// Feeds `QueryResult.elapsed_ms` and the run-level `duration_ms` log
/// field, both `u32`. Saturating (instead of wrapping) means a run
/// that hangs for a very long time never reports a deceptively small
/// duration.
fn elapsed_ms_u32(start: Instant) -> u32 {
    let millis: u128 = start.elapsed().as_millis();
    if millis > u128::from(u32::MAX) {
        u32::MAX
    } else {
        // In range by the check above, so the cast cannot truncate.
        millis as u32
    }
}
|
||||
|
||||
/// Env var that overrides the default `fixtures/golden_queries.yaml`
|
||||
/// path. Resolved relative to the current working directory.
|
||||
const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN";
|
||||
|
||||
/// Default golden YAML path (relative to CWD when set).
|
||||
const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml";
|
||||
|
||||
/// Run the golden suite end-to-end against the active XDG-loaded
|
||||
/// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with
|
||||
/// `Config::load(None)`.
|
||||
pub fn run_eval(opts: &EvalRunOpts) -> Result<EvalRun> {
|
||||
let cfg = kb_config::Config::load(None).context("load Config for run_eval")?;
|
||||
run_eval_with_config(&cfg, opts)
|
||||
}
|
||||
|
||||
/// Run the golden suite end-to-end against an explicit
|
||||
/// [`kb_config::Config`]. Used by integration tests (TempDir-backed
|
||||
/// data_dir) and any future caller that wants to drive the runner
|
||||
/// against a non-default config.
|
||||
pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Result<EvalRun> {
|
||||
let started = Instant::now();
|
||||
|
||||
// ── 1. Load golden set ────────────────────────────────────────────────
|
||||
//
|
||||
// `with_context` already names the path on error, so a separate
|
||||
// `tracing::debug!` here would just be noise.
|
||||
let golden_path = resolve_golden_path();
|
||||
let queries = load_golden_set(&golden_path).with_context(|| {
|
||||
format!(
|
||||
"load golden set from {} (override via KB_EVAL_GOLDEN)",
|
||||
golden_path.display()
|
||||
)
|
||||
})?;
|
||||
validate_against_db(&queries, cfg)?;
|
||||
|
||||
// ── 2. Mint identifiers + open store ──────────────────────────────────
|
||||
let run_id = mint_run_id();
|
||||
let created_at = OffsetDateTime::now_utc();
|
||||
let commit_hash = std::env::var("KB_COMMIT_HASH")
|
||||
.ok()
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
// Open the store once so every per-query write reuses the same
|
||||
// connection-mutex lifetime.
|
||||
let store = SqliteStore::open(cfg).context("open SqliteStore for run_eval")?;
|
||||
store
|
||||
.run_migrations()
|
||||
.context("run migrations for run_eval")?;
|
||||
|
||||
// ── 3. Build config_snapshot_json ─────────────────────────────────────
|
||||
let config_snapshot_json = build_config_snapshot(cfg)?;
|
||||
let config_snapshot_text =
|
||||
serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?;
|
||||
|
||||
// ── 4. Per-query execution ────────────────────────────────────────────
|
||||
//
|
||||
// Open one `App` for the whole suite. The embedder / vector store /
|
||||
// LLM are memoized on the App, so a 50-query run pays the ~470 MB
|
||||
// ONNX init + Lance reopen + Ollama handshake exactly once.
|
||||
let app = App::open_with_config(cfg.clone()).context("open App for run_eval")?;
|
||||
|
||||
let mut per_query: Vec<QueryResult> = Vec::with_capacity(queries.len());
|
||||
for gq in &queries {
|
||||
let qr = execute_query(&app, gq, opts);
|
||||
per_query.push(qr);
|
||||
}
|
||||
|
||||
// ── 5. Persist eval_runs + eval_query_results ────────────────────────
|
||||
// Serialize per-query JSON up front so the SQLite transaction below
|
||||
// never holds the connection mutex through serde failures.
|
||||
let mut results: Vec<(String, String)> = Vec::with_capacity(per_query.len());
|
||||
for qr in &per_query {
|
||||
let json = serde_json::to_string(qr)
|
||||
.with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
|
||||
results.push((qr.query_id.clone(), json));
|
||||
}
|
||||
let row = EvalRunRow {
|
||||
run_id: &run_id,
|
||||
suite: opts.suite.as_str(),
|
||||
config_snapshot_json: &config_snapshot_text,
|
||||
aggregate_json: "{}",
|
||||
commit_hash: commit_hash.as_deref(),
|
||||
created_at,
|
||||
};
|
||||
store
|
||||
.record_eval_run_with_results(&row, &results)
|
||||
.context("record eval_runs + eval_query_results (transactional)")?;
|
||||
|
||||
// ── 6. Mirror to runs_dir/<run_id>/per_query.jsonl ────────────────────
|
||||
write_per_query_jsonl(cfg, &run_id, &per_query)?;
|
||||
|
||||
let duration_ms = elapsed_ms_u32(started);
|
||||
tracing::info!(
|
||||
target: "kb-eval",
|
||||
run_id = %run_id,
|
||||
suite = %opts.suite,
|
||||
queries = per_query.len(),
|
||||
duration_ms,
|
||||
"kb-eval: run complete"
|
||||
);
|
||||
|
||||
Ok(EvalRun {
|
||||
run_id,
|
||||
created_at,
|
||||
commit_hash,
|
||||
config_snapshot_json,
|
||||
per_query,
|
||||
})
|
||||
}
|
||||
|
||||
/// Mint a `run_<lower>` identifier. UUIDv7 stands in for ULID — same
|
||||
/// timestamp-ordered monotonicity, already in workspace deps. Lower-
|
||||
/// case simple form to match the `ulid_lower()` shape the spec asks
|
||||
/// for.
|
||||
fn mint_run_id() -> String {
|
||||
let id = uuid::Uuid::now_v7().simple().to_string();
|
||||
format!("run_{id}")
|
||||
}
|
||||
|
||||
/// Resolve the golden YAML path. Honors the `KB_EVAL_GOLDEN` env
|
||||
/// override; otherwise relative to CWD. The path is NOT expanded for
|
||||
/// `~` / `${...}` placeholders — direct file paths only.
|
||||
fn resolve_golden_path() -> PathBuf {
|
||||
match std::env::var(KB_EVAL_GOLDEN) {
|
||||
Ok(s) if !s.is_empty() => PathBuf::from(s),
|
||||
_ => PathBuf::from(DEFAULT_GOLDEN_PATH),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run one [`GoldenQuery`] through the kb-app facade. Errors are
|
||||
/// captured into `QueryResult.error` so the run continues.
|
||||
fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult {
|
||||
let started = Instant::now();
|
||||
|
||||
let search_query = SearchQuery {
|
||||
text: gq.query.clone(),
|
||||
mode: opts.mode,
|
||||
k: opts.k,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
|
||||
let (hits_top_k, mut error) = match app.search(search_query) {
|
||||
Ok(hits) => (hits, None),
|
||||
Err(e) => (Vec::new(), Some(format!("{e:#}"))),
|
||||
};
|
||||
|
||||
// Optional RAG path: only attempted when `with_rag` and the search
|
||||
// call did not already error out (we want one error per query, not
|
||||
// a duplicated one).
|
||||
let answer = if opts.with_rag && error.is_none() {
|
||||
let ask_opts = kb_app::AskOpts {
|
||||
k: opts.k,
|
||||
explain: true,
|
||||
mode: opts.mode,
|
||||
temperature: opts.temperature,
|
||||
seed: opts.seed,
|
||||
stream_sink: None,
|
||||
};
|
||||
match app.ask(&gq.query, ask_opts) {
|
||||
Ok(ans) => Some(ans),
|
||||
Err(e) => {
|
||||
error = Some(format!("{e:#}"));
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
QueryResult {
|
||||
query_id: gq.id.clone(),
|
||||
query: gq.query.clone(),
|
||||
mode: opts.mode,
|
||||
hits_top_k,
|
||||
answer,
|
||||
elapsed_ms: elapsed_ms_u32(started),
|
||||
error,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the `config_snapshot_json` value: full Config as `config` plus
|
||||
/// the auxiliary version fields the spec calls out.
|
||||
///
|
||||
/// `index_version` is intentionally `None` here — it is composed
|
||||
/// dynamically by `kb-app` on a per-call basis from the configured
|
||||
/// embedder (e.g., `vec:<model>@<version>:<dim>`), so it is not a
|
||||
/// stable run-time property of the config alone. P5-2 may compose it
|
||||
/// from `embedding.{model,version,dimensions}` if it needs the field
|
||||
/// for compare reports.
|
||||
fn build_config_snapshot(cfg: &kb_config::Config) -> Result<serde_json::Value> {
|
||||
let cfg_value = serde_json::to_value(cfg).context("serialize Config")?;
|
||||
Ok(serde_json::json!({
|
||||
"config": cfg_value,
|
||||
"chunker_version": cfg.chunking.chunker_version,
|
||||
"embedding": {
|
||||
"model": cfg.models.embedding.model,
|
||||
"version": cfg.models.embedding.version,
|
||||
"dimensions": cfg.models.embedding.dimensions,
|
||||
"provider": cfg.models.embedding.provider,
|
||||
},
|
||||
"llm": {
|
||||
"model_id": cfg.models.llm.model,
|
||||
"provider": cfg.models.llm.provider,
|
||||
},
|
||||
"prompt_template_version": cfg.rag.prompt_template_version,
|
||||
"score_gate": cfg.rag.score_gate,
|
||||
"rrf_k": cfg.search.rrf_k,
|
||||
"index_version": serde_json::Value::Null,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Write the `runs_dir/<run_id>/per_query.jsonl` mirror (design §6.3).
|
||||
/// Each `QueryResult` is one line, separator `\n`. The directory is
|
||||
/// created if it doesn't exist; an existing file is overwritten (a
|
||||
/// `run_id` collision would already have failed the `eval_runs`
|
||||
/// PRIMARY KEY upstream).
|
||||
fn write_per_query_jsonl(
|
||||
cfg: &kb_config::Config,
|
||||
run_id: &str,
|
||||
per_query: &[QueryResult],
|
||||
) -> Result<()> {
|
||||
// `data_dir` may itself contain `${XDG_DATA_HOME:-…}` / `~` (the
|
||||
// workspace-default does); resolve it before threading it into the
|
||||
// `{data_dir}` substitution of `runs_dir`.
|
||||
let resolved_data_dir = expand_path(&cfg.storage.data_dir, "");
|
||||
let runs_dir = expand_path(
|
||||
&cfg.storage.runs_dir,
|
||||
&resolved_data_dir.to_string_lossy(),
|
||||
);
|
||||
let run_dir = runs_dir.join(run_id);
|
||||
std::fs::create_dir_all(&run_dir)
|
||||
.with_context(|| format!("create run dir {}", run_dir.display()))?;
|
||||
let path = run_dir.join("per_query.jsonl");
|
||||
let file = File::create(&path)
|
||||
.with_context(|| format!("create per_query.jsonl at {}", path.display()))?;
|
||||
let mut w = BufWriter::new(file);
|
||||
for qr in per_query {
|
||||
serde_json::to_writer(&mut w, qr)
|
||||
.with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
|
||||
w.write_all(b"\n")
|
||||
.context("write newline separator in per_query.jsonl")?;
|
||||
}
|
||||
w.flush().context("flush per_query.jsonl")?;
|
||||
Ok(())
|
||||
}
|
||||
87
crates/kb-eval/src/types.rs
Normal file
87
crates/kb-eval/src/types.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Public domain types for the eval runner (signatures pinned by
|
||||
//! `tasks/p5/p5-1-golden-fixture-runner.md` "Public surface").
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use kb_core::{Answer, ChunkId, DocumentId, Lang, SearchHit, SearchMode};
|
||||
|
||||
/// One golden query loaded from `fixtures/golden_queries.yaml`.
|
||||
///
|
||||
/// Required fields: `id`, `query`. Everything else defaults to
|
||||
/// empty / `None` per the loader contract.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct GoldenQuery {
|
||||
pub id: String,
|
||||
pub query: String,
|
||||
#[serde(default = "default_lang")]
|
||||
pub lang: Lang,
|
||||
#[serde(default)]
|
||||
pub expected_doc_ids: Vec<DocumentId>,
|
||||
#[serde(default)]
|
||||
pub expected_chunk_ids: Vec<ChunkId>,
|
||||
#[serde(default)]
|
||||
pub must_contain: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub forbidden: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub difficulty: Option<String>,
|
||||
}
|
||||
|
||||
fn default_lang() -> Lang {
|
||||
// `Lang` is a BCP-47 string newtype (§3.3); the empty string is
|
||||
// the safe default for golden entries that omit `lang`. Curators
|
||||
// may fill it in later; the runner does not branch on this field.
|
||||
Lang(String::new())
|
||||
}
|
||||
|
||||
/// Caller-supplied knobs for one [`crate::run_eval`] invocation.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EvalRunOpts {
|
||||
/// Suite label persisted into `eval_runs.suite`. The shipped
|
||||
/// fixture is `"golden"`; other suites can reuse the same runner.
|
||||
pub suite: String,
|
||||
/// Retrieval mode forwarded to every `kb_app::search` /
|
||||
/// `kb_app::ask` call inside the run.
|
||||
pub mode: SearchMode,
|
||||
/// When `true`, also call `kb_app::ask` per query and record the
|
||||
/// resulting `Answer` on the `QueryResult`.
|
||||
pub with_rag: bool,
|
||||
/// Top-k forwarded to retrieval (and `AskOpts.k` when `with_rag`).
|
||||
pub k: usize,
|
||||
/// Override `config.models.llm.temperature` when `with_rag`.
|
||||
/// Determinism contract requires `Some(0.0)` + a fixed `seed`.
|
||||
pub temperature: Option<f32>,
|
||||
/// Override `config.models.llm.seed` when `with_rag`.
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
/// One full eval run. Persisted to `eval_runs` + `eval_query_results`
|
||||
/// (design §5.7) and mirrored to `runs_dir/<run_id>/per_query.jsonl`
|
||||
/// (design §6.3).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EvalRun {
|
||||
pub run_id: String,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
pub commit_hash: Option<String>,
|
||||
/// Snapshot of the `Config` plus auxiliary version fields
|
||||
/// (`chunker_version`, embedding/llm/prompt versions, fusion
|
||||
/// params, `index_version`). See [`crate::run_eval`] for the
|
||||
/// exact shape.
|
||||
pub config_snapshot_json: serde_json::Value,
|
||||
pub per_query: Vec<QueryResult>,
|
||||
}
|
||||
|
||||
/// One per-query record. Every row in `eval_query_results` has its
|
||||
/// `result_json` filled with `serde_json::to_string(&QueryResult)`.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct QueryResult {
|
||||
pub query_id: String,
|
||||
pub query: String,
|
||||
pub mode: SearchMode,
|
||||
pub hits_top_k: Vec<SearchHit>,
|
||||
pub answer: Option<Answer>,
|
||||
pub elapsed_ms: u32,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
30
crates/kb-eval/tests/fixtures/eval/run-1.json
vendored
Normal file
30
crates/kb-eval/tests/fixtures/eval/run-1.json
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
[
|
||||
{
|
||||
"error": null,
|
||||
"first_hit": {
|
||||
"chunk_id": "chunk000000000000000000000000000000",
|
||||
"doc_id": "doc00000000000000000000000000000000",
|
||||
"heading_path": [],
|
||||
"score": 0.3429983854293823
|
||||
},
|
||||
"has_answer": false,
|
||||
"hits_count": 1,
|
||||
"mode": "lexical",
|
||||
"query": "ownership",
|
||||
"query_id": "q1"
|
||||
},
|
||||
{
|
||||
"error": null,
|
||||
"first_hit": {
|
||||
"chunk_id": "chunk000000000000000000000000000002",
|
||||
"doc_id": "doc00000000000000000000000000000002",
|
||||
"heading_path": [],
|
||||
"score": 0.3585492968559265
|
||||
},
|
||||
"has_answer": false,
|
||||
"hits_count": 1,
|
||||
"mode": "lexical",
|
||||
"query": "heading",
|
||||
"query_id": "q2"
|
||||
}
|
||||
]
|
||||
59
crates/kb-eval/tests/loader.rs
Normal file
59
crates/kb-eval/tests/loader.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
//! Loader tests for the golden-fixture YAML parser (P5-1).
|
||||
//!
|
||||
//! These tests exercise pure parsing and duplicate-id detection. The
|
||||
//! DB-validation tests for the crate-private
|
||||
//! `load_golden_set_validated` live next to the function in
|
||||
//! `src/loader.rs` (they need `pub(crate)` visibility, which integration
|
||||
//! tests can't see).
|
||||
|
||||
use std::fs;
|
||||
|
||||
use kb_eval::load_golden_set;
|
||||
use tempfile::tempdir;
|
||||
|
||||
// ── 1. parser accepts well-formed YAML with optional fields ──────────────────
|
||||
|
||||
/// A two-entry YAML file parses; omitted optional fields come back
/// empty / `None`, and supplied ones come back verbatim.
#[test]
fn loads_minimal_well_formed_yaml() {
    let yaml = "- id: g1\n  query: hello\n- id: g2\n  query: \"another\"\n  lang: en\n  must_contain: [\"foo\"]\n  forbidden: [\"bar\"]\n  difficulty: easy\n";

    let scratch = tempdir().unwrap();
    let yaml_path = scratch.path().join("golden.yaml");
    fs::write(&yaml_path, yaml).unwrap();

    let queries = load_golden_set(&yaml_path).unwrap();
    assert_eq!(queries.len(), 2);

    // Entry 1 carries only the required fields.
    let bare = &queries[0];
    assert_eq!(bare.id, "g1");
    assert_eq!(bare.query, "hello");
    assert!(bare.must_contain.is_empty());
    assert!(bare.forbidden.is_empty());
    assert!(bare.difficulty.is_none());

    // Entry 2 fills in every optional field.
    let full = &queries[1];
    assert_eq!(full.id, "g2");
    assert_eq!(full.lang.0, "en");
    assert_eq!(full.must_contain, vec![String::from("foo")]);
    assert_eq!(full.forbidden, vec![String::from("bar")]);
    assert_eq!(full.difficulty.as_deref(), Some("easy"));
}
|
||||
|
||||
// ── 2. duplicate IDs error lists every offender (sorted, deduplicated) ───────
|
||||
|
||||
/// A file that repeats `g1` and `g2` is rejected, and the error text
/// names both offending ids.
#[test]
fn rejects_duplicate_ids() {
    let yaml = "- id: g1\n  query: a\n- id: g2\n  query: b\n- id: g1\n  query: c\n- id: g2\n  query: d\n- id: g1\n  query: e\n";

    let scratch = tempdir().unwrap();
    let yaml_path = scratch.path().join("dup.yaml");
    fs::write(&yaml_path, yaml).unwrap();

    let failure = load_golden_set(&yaml_path).unwrap_err();
    // `{:#}` renders the whole anyhow context chain on one line.
    let msg = format!("{failure:#}");

    assert!(msg.contains("duplicate query id"), "msg: {msg}");
    // Both dup IDs should appear, sorted (BTreeSet) and deduplicated.
    for dup_id in ["g1", "g2"] {
        assert!(msg.contains(dup_id), "msg: {msg}");
    }
}
|
||||
419
crates/kb-eval/tests/runner.rs
Normal file
419
crates/kb-eval/tests/runner.rs
Normal file
@@ -0,0 +1,419 @@
|
||||
//! Runner integration tests for `kb-eval` (P5-1).
|
||||
//!
|
||||
//! Drives [`kb_eval::run_eval_with_config`] end-to-end against a
|
||||
//! TempDir-backed config:
|
||||
//!
|
||||
//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the
|
||||
//! workspace's source-of-truth,
|
||||
//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is
|
||||
//! required (`models.embedding.provider = "none"`),
|
||||
//! - golden YAML pointed at via `KB_EVAL_GOLDEN`.
|
||||
//!
|
||||
//! Determinism: lexical-only with a fixed seed corpus produces
|
||||
//! byte-identical `per_query.jsonl` content (modulo `run_id` /
|
||||
//! `created_at`, which we strip when comparing).
|
||||
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::SearchMode;
|
||||
use kb_eval::{EvalRunOpts, QueryResult, run_eval_with_config};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
use rusqlite::params;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// `KB_EVAL_GOLDEN` is process-global state. Tests touching it must
|
||||
/// serialize so they don't trample each other when `cargo test`
|
||||
/// runs them in parallel.
|
||||
static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
|
||||
// ── shared scaffolding ───────────────────────────────────────────────────────
|
||||
|
||||
struct RunEnv {
|
||||
temp: TempDir,
|
||||
config: Config,
|
||||
}
|
||||
|
||||
impl RunEnv {
|
||||
fn new() -> Self {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
|
||||
// Force lexical-only behavior so the runner never tries to
|
||||
// load fastembed during integration tests.
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
// Pin search defaults so test asserts are stable.
|
||||
config.search.default_k = 5;
|
||||
|
||||
let store = SqliteStore::open(&config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
seed_corpus(&store);
|
||||
Self { temp, config }
|
||||
}
|
||||
|
||||
fn data_dir(&self) -> PathBuf {
|
||||
self.temp.path().to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// Seed three (asset, document, chunk) triples with text the test
|
||||
/// queries can match against the FTS5 lexical index.
|
||||
fn seed_corpus(store: &SqliteStore) {
|
||||
let conn = store.read_conn();
|
||||
for (i, text) in [
|
||||
"Rust ownership and borrow checker basics.",
|
||||
"Cargo workspace members are listed in workspace.members.",
|
||||
"Markdown chunking respects heading boundaries.",
|
||||
]
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
let doc_id = format!("doc{i:032}");
|
||||
let chunk_id = format!("chunk{i:030}");
|
||||
let asset_id = format!("asset{i:030}");
|
||||
let path = format!("notes/{i}.md");
|
||||
conn.execute(
|
||||
"INSERT INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, '\"markdown\"', 0,
|
||||
'deadbeefdeadbeefdeadbeefdeadbeef',
|
||||
'reference', ?, '1970-01-01T00:00:00Z')",
|
||||
params![asset_id, format!("file:///{path}"), path, path],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, parser_version, doc_version, schema_version,
|
||||
metadata_json, provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
|
||||
'{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, path],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, '[]', NULL,
|
||||
'[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
|
||||
1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id, text],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
// Build the FTS index so lexical search returns hits. Reuses the
|
||||
// same connection guard rather than reopening — the SAVEPOINT
|
||||
// protocol nests correctly under the existing read_conn lock.
|
||||
kb_store_sqlite::rebuild_chunks_fts(&conn).unwrap();
|
||||
drop(conn);
|
||||
}
|
||||
|
||||
/// Write `body` to `<dir>/golden.yaml` and return that path.
///
/// Panics if the file cannot be written.
fn write_golden(dir: &Path, body: &str) -> PathBuf {
    let mut target = dir.to_path_buf();
    target.push("golden.yaml");
    fs::write(&target, body).unwrap();
    target
}
|
||||
|
||||
/// Produce an `http://127.0.0.1:<port>` URL for a port that was bound
/// and immediately released.
///
/// The port is very likely still free when the test connects a moment
/// later, so `connect()` fails fast with `ECONNREFUSED`. This beats
/// hard-coding port 1, which can time out slowly on hardened hosts.
fn unreachable_endpoint() -> String {
    let port = {
        let probe = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
        probe.local_addr().unwrap().port()
        // `probe` is dropped at the end of this scope, freeing the port.
    };
    format!("http://127.0.0.1:{port}")
}
|
||||
|
||||
fn lexical_opts() -> EvalRunOpts {
|
||||
EvalRunOpts {
|
||||
suite: "test".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
with_rag: false,
|
||||
k: 5,
|
||||
temperature: Some(0.0),
|
||||
seed: Some(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the eval after pointing `KB_EVAL_GOLDEN` at `yaml`. The env
|
||||
/// guard must outlive the call so concurrent tests don't reset the
|
||||
/// var mid-run.
|
||||
fn run_with_golden<F: FnOnce() -> R, R>(yaml: &Path, f: F) -> R {
|
||||
let _g = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
|
||||
// SAFETY: `KB_EVAL_GOLDEN` is a benign env var; the GOLDEN_ENV_LOCK
|
||||
// serializes mutations so concurrent tests don't race.
|
||||
unsafe {
|
||||
std::env::set_var("KB_EVAL_GOLDEN", yaml);
|
||||
}
|
||||
let out = f();
|
||||
unsafe {
|
||||
std::env::remove_var("KB_EVAL_GOLDEN");
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
// ── 1. elapsed_ms recorded for every query ──────────────────────────────────
|
||||
|
||||
/// Three golden queries yield three per-query records, in file order,
/// each tagged lexical and each with a plausible `elapsed_ms`.
#[test]
fn runner_records_elapsed_for_every_query() {
    let env = RunEnv::new();
    let data_dir = env.data_dir();
    let yaml = write_golden(
        &data_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n- id: q3\n  query: workspace\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    assert_eq!(run.per_query.len(), 3);
    run.per_query.iter().for_each(|record| {
        assert_eq!(record.mode, SearchMode::Lexical);
        // `elapsed_ms` is `u32`, so "is a valid unsigned value" holds by
        // construction. Bounding it far below the 4G ceiling catches a
        // stuck or overflowing timing path.
        assert!(
            record.elapsed_ms < 60_000,
            "elapsed_ms suspicious: {}",
            record.elapsed_ms
        );
    });

    // The ids from the YAML round-trip into the per-query records.
    let observed_ids: Vec<&str> = run
        .per_query
        .iter()
        .map(|record| record.query_id.as_str())
        .collect();
    assert_eq!(observed_ids, vec!["q1", "q2", "q3"]);
}
|
||||
|
||||
// ── 2. config snapshot carries the documented version fields ────────────────
|
||||
|
||||
/// The config snapshot embeds the `Config` itself plus the documented
/// version / tuning fields.
#[test]
fn runner_records_config_snapshot_with_versions() {
    let env = RunEnv::new();
    let data_dir = env.data_dir();
    let yaml = write_golden(&data_dir, "- id: q1\n  query: ownership\n");

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());
    let snap = &run.config_snapshot_json;

    // Looks up a JSON pointer and yields it as `&str` when it is a string.
    let text_at = |pointer: &str| snap.pointer(pointer).and_then(serde_json::Value::as_str);

    assert!(snap.get("config").is_some(), "config field missing");
    assert_eq!(text_at("/chunker_version"), Some("md-heading-v1"));
    assert!(snap.pointer("/embedding/model").is_some());
    assert!(snap.pointer("/embedding/dimensions").is_some());
    assert!(snap.pointer("/llm/model_id").is_some());
    assert_eq!(text_at("/prompt_template_version"), Some("rag-v1"));
    assert!(snap.pointer("/score_gate").is_some());
    assert!(snap.pointer("/rrf_k").is_some());
}
|
||||
|
||||
// ── 3. failing query (ask path with no Ollama) records an error ─────────────
|
||||
|
||||
/// With RAG enabled and the LLM endpoint unreachable, the run still
/// succeeds: the query keeps its lexical hits, has no answer, and
/// carries an error.
#[test]
fn runner_captures_per_query_error_when_rag_unreachable() {
    let env = RunEnv::new();

    // Point Ollama at an unbound port so `ask_with_config` surfaces a
    // connection error per query. Bind-then-release is used instead of
    // a hard-coded `:1` because port 1 is reserved-but-not-guaranteed-
    // unbound (some hardened systems answer with ICMP unreachable
    // instantly, others time out slowly). A TOCTOU race is possible in
    // theory but rare in practice, and this fails faster than `:1`.
    let config = {
        let mut cfg = env.config.clone();
        cfg.models.llm.endpoint = unreachable_endpoint();
        cfg
    };

    let data_dir = env.data_dir();
    let yaml = write_golden(&data_dir, "- id: q1\n  query: ownership\n");

    let mut opts = lexical_opts();
    opts.with_rag = true;

    let run = run_with_golden(&yaml, || run_eval_with_config(&config, &opts).unwrap());

    let record = &run.per_query[0];
    // Lexical search runs before the RAG attempt, so hits are present.
    assert!(
        !record.hits_top_k.is_empty(),
        "lexical hits should populate before RAG attempt"
    );
    assert!(record.answer.is_none(), "no answer when RAG fails");
    assert!(record.error.is_some(), "error must be recorded");
}
|
||||
|
||||
// ── 4. eval_runs + eval_query_results rows persisted ────────────────────────
|
||||
|
||||
/// After a two-query run the store holds exactly one `eval_runs` row
/// and two `eval_query_results` rows for that `run_id`.
#[test]
fn runner_persists_eval_run_and_query_result_rows() {
    let env = RunEnv::new();
    let data_dir = env.data_dir();
    let yaml = write_golden(
        &data_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    // Reopen the same SQLite file through a fresh store handle and read
    // the rows back. The inherent `read_conn` helper is used rather
    // than rusqlite directly because the latter would require kb-eval
    // to add a runtime rusqlite dep (forbidden by the spec).
    let store = SqliteStore::open(&env.config).unwrap();
    let conn = store.read_conn();

    // Runs a `COUNT(*) ... WHERE run_id = ?` query for this run.
    let rows_for_run = |sql: &str| -> i64 {
        conn.query_row(sql, params![run.run_id], |r| r.get(0))
            .unwrap()
    };

    assert_eq!(
        rows_for_run("SELECT COUNT(*) FROM eval_runs WHERE run_id = ?"),
        1
    );
    assert_eq!(
        rows_for_run("SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?"),
        2
    );
}
|
||||
|
||||
// ── 5. per_query.jsonl mirror exists and round-trips ────────────────────────
|
||||
|
||||
/// The run writes `<data_dir>/runs/<run_id>/per_query.jsonl`, one JSON
/// `QueryResult` per line, in query order.
#[test]
fn runner_writes_per_query_jsonl_mirror() {
    let env = RunEnv::new();
    let data_dir = env.data_dir();
    let yaml = write_golden(
        &data_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    let mut mirror = data_dir.join("runs");
    mirror.push(&run.run_id);
    mirror.push("per_query.jsonl");
    assert!(
        mirror.exists(),
        "per_query.jsonl missing at {}",
        mirror.display()
    );

    let contents = fs::read_to_string(&mirror).unwrap();
    let rows: Vec<&str> = contents.lines().collect();
    assert_eq!(rows.len(), 2);

    let mut records: Vec<QueryResult> = Vec::with_capacity(rows.len());
    for row in &rows {
        records.push(serde_json::from_str::<QueryResult>(row).expect("valid JSONL line"));
    }
    assert_eq!(records[0].query_id, "q1");
    assert_eq!(records[1].query_id, "q2");
}
|
||||
|
||||
// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─
|
||||
|
||||
/// Two lexical-only runs over the same seeded corpus must produce a
/// byte-identical per-query payload.
///
/// `elapsed_ms` is wall-clock timing and can legitimately differ
/// between the two runs, so it is zeroed before the payloads are
/// serialized. Comparing it verbatim would make this test flaky on a
/// loaded machine without saying anything about retrieval determinism.
#[test]
fn runner_lexical_is_deterministic_per_query_payload() {
    /// Serialize the per-query records with the timing field neutralized.
    fn stable_payload(per_query: &[QueryResult]) -> String {
        let normalized: Vec<QueryResult> = per_query
            .iter()
            .cloned()
            .map(|mut record| {
                record.elapsed_ms = 0;
                record
            })
            .collect();
        serde_json::to_string(&normalized).unwrap()
    }

    let env = RunEnv::new();
    let yaml = write_golden(
        env.data_dir().as_path(),
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let run_a = run_with_golden(&yaml, || {
        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
    });
    let run_b = run_with_golden(&yaml, || {
        run_eval_with_config(&env.config, &lexical_opts()).unwrap()
    });

    // Run-level fields (`run_id`, `created_at`) intentionally diverge
    // and are not part of `per_query`; everything else in the
    // per-query payload must match byte for byte.
    assert_eq!(
        stable_payload(&run_a.per_query),
        stable_payload(&run_b.per_query),
        "lexical-only per_query payload (timing excluded) must be byte-identical across runs"
    );
}
|
||||
|
||||
// ── 7. snapshot — per_query JSON pinned to fixtures/eval/run-1.json ─────────
|
||||
|
||||
/// A projection of the per-query payload must match the checked-in
/// snapshot `tests/fixtures/eval/run-1.json`.
///
/// Setting `UPDATE_SNAPSHOTS` rewrites the fixture from the current
/// run before the comparison is made.
#[test]
fn runner_per_query_snapshot_matches_fixture() {
    let env = RunEnv::new();
    let data_dir = env.data_dir();
    let yaml = write_golden(
        &data_dir,
        "- id: q1\n  query: ownership\n- id: q2\n  query: heading\n",
    );

    let opts = lexical_opts();
    let run = run_with_golden(&yaml, || run_eval_with_config(&env.config, &opts).unwrap());

    // The fixture pins the *shape* of the per-query payload, including
    // the first hit's stable scalar fields (chunk_id, doc_id,
    // heading_path, fusion_score). FTS scores depend on the SQLite
    // version, so the fusion_score was captured from one passing run
    // and must stay stable across re-runs against the same seeded
    // corpus. Timing-sensitive fields (`elapsed_ms`, raw `Instant`
    // byproducts) are left out. Byte stability is covered by the
    // determinism test (#6); this one checks that the field set and
    // ordering are stable.
    let mut projection = Vec::with_capacity(run.per_query.len());
    for record in &run.per_query {
        let first_hit = record.hits_top_k.first().map(|hit| {
            serde_json::json!({
                "chunk_id": hit.chunk_id,
                "doc_id": hit.doc_id,
                "heading_path": hit.heading_path,
                "score": hit.retrieval.fusion_score,
            })
        });
        projection.push(serde_json::json!({
            "query_id": record.query_id,
            "query": record.query,
            "mode": record.mode,
            "hits_count": record.hits_top_k.len(),
            "first_hit": first_hit,
            "has_answer": record.answer.is_some(),
            "error": record.error,
        }));
    }
    let actual = serde_json::to_string_pretty(&projection).unwrap();

    let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval/run-1.json");

    let refresh_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();
    if refresh_requested {
        fs::create_dir_all(fixture_path.parent().unwrap()).unwrap();
        fs::write(&fixture_path, &actual).unwrap();
    }

    let expected = match fs::read_to_string(&fixture_path) {
        Ok(text) => text,
        Err(e) => panic!("read snapshot {}: {e}", fixture_path.display()),
    };
    // Trim both sides so a trailing newline in the fixture is not drift.
    assert_eq!(
        actual.trim(),
        expected.trim(),
        "snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh"
    );
}
|
||||
161
crates/kb-store-sqlite/src/eval.rs
Normal file
161
crates/kb-store-sqlite/src/eval.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
//! `eval_runs` / `eval_query_results` row writers (P5-1 — design §5.7).
|
||||
//!
|
||||
//! `kb-eval` calls these directly via the inherent methods on
|
||||
//! [`SqliteStore`]. The pattern mirrors [`crate::answers`]: the trait
|
||||
//! `kb_core::DocumentStore` is the document surface, and run-level
|
||||
//! audit rows (jobs, ingest_runs, answers, eval_runs) are inherent
|
||||
//! methods so the trait surface stays small.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use rusqlite::params;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::error::StoreError;
|
||||
use crate::store::SqliteStore;
|
||||
|
||||
/// One row about to land in `eval_runs` (per V001 schema).
|
||||
///
|
||||
/// `aggregate_json` is filled by P5-1 with the literal `"{}"` —
|
||||
/// metric computation lives in P5-2 and updates the row in place.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct EvalRunRow<'a> {
|
||||
pub run_id: &'a str,
|
||||
pub suite: &'a str,
|
||||
pub config_snapshot_json: &'a str,
|
||||
pub aggregate_json: &'a str,
|
||||
pub commit_hash: Option<&'a str>,
|
||||
pub created_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
impl SqliteStore {
|
||||
/// Return `true` iff a row with `doc_id = ?` exists in
|
||||
/// `documents`. Lightweight existence probe used by
|
||||
/// `kb-eval`'s golden-fixture validator — full
|
||||
/// `DocumentStore::get_document` deserializes blocks + metadata
|
||||
/// JSON, which is overkill for "does this ID exist?"
|
||||
pub fn document_exists(&self, doc_id: &str) -> Result<bool> {
|
||||
let conn = self.lock_conn();
|
||||
let row: Result<i64, rusqlite::Error> = conn.query_row(
|
||||
"SELECT 1 FROM documents WHERE doc_id = ? LIMIT 1",
|
||||
params![doc_id],
|
||||
|r| r.get(0),
|
||||
);
|
||||
match row {
|
||||
Ok(_) => Ok(true),
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false),
|
||||
Err(e) => Err(StoreError::from(e).into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Same shape as [`Self::document_exists`] but probes the
|
||||
/// `chunks` table by `chunk_id`.
|
||||
pub fn chunk_exists(&self, chunk_id: &str) -> Result<bool> {
|
||||
let conn = self.lock_conn();
|
||||
let row: Result<i64, rusqlite::Error> = conn.query_row(
|
||||
"SELECT 1 FROM chunks WHERE chunk_id = ? LIMIT 1",
|
||||
params![chunk_id],
|
||||
|r| r.get(0),
|
||||
);
|
||||
match row {
|
||||
Ok(_) => Ok(true),
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false),
|
||||
Err(e) => Err(StoreError::from(e).into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert one row into `eval_runs`. Mirrors the schema in
|
||||
/// `migrations/V001__init.sql` (§5.7). Called by
|
||||
/// `kb-eval::run_eval` once per run, after every per-query result
|
||||
/// row has been written.
|
||||
pub fn record_eval_run(&self, row: &EvalRunRow<'_>) -> Result<()> {
|
||||
let created_at = row
|
||||
.created_at
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
.context("format eval_runs.created_at")?;
|
||||
let conn = self.lock_conn();
|
||||
conn.execute(
|
||||
"INSERT INTO eval_runs (
|
||||
run_id, suite, config_snapshot_json, aggregate_json,
|
||||
commit_hash, created_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
params![
|
||||
row.run_id,
|
||||
row.suite,
|
||||
row.config_snapshot_json,
|
||||
row.aggregate_json,
|
||||
row.commit_hash,
|
||||
created_at,
|
||||
],
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Insert one row into `eval_query_results`. PRIMARY KEY is
|
||||
/// `(run_id, query_id)` so writing the same `(run, query)` twice
|
||||
/// surfaces a `UNIQUE` violation through `StoreError`.
|
||||
pub fn record_eval_query_result(
|
||||
&self,
|
||||
run_id: &str,
|
||||
query_id: &str,
|
||||
result_json: &str,
|
||||
) -> Result<()> {
|
||||
let conn = self.lock_conn();
|
||||
conn.execute(
|
||||
"INSERT INTO eval_query_results (run_id, query_id, result_json)
|
||||
VALUES (?, ?, ?)",
|
||||
params![run_id, query_id, result_json],
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Insert the `eval_runs` row plus every `eval_query_results` row
|
||||
/// for the same run inside a single SQLite transaction. This is the
|
||||
/// preferred path for `kb-eval::run_eval` — a panic between the run
|
||||
/// row and the per-query rows can't leave orphan run rows.
|
||||
///
|
||||
/// `results` is a slice of `(query_id, result_json)` tuples mirroring
|
||||
/// the per-call `record_eval_query_result` arguments.
|
||||
pub fn record_eval_run_with_results(
|
||||
&self,
|
||||
row: &EvalRunRow<'_>,
|
||||
results: &[(String, String)],
|
||||
) -> Result<()> {
|
||||
let created_at = row
|
||||
.created_at
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
.context("format eval_runs.created_at")?;
|
||||
let mut conn = self.lock_conn();
|
||||
let tx = conn.transaction().map_err(StoreError::from)?;
|
||||
tx.execute(
|
||||
"INSERT INTO eval_runs (
|
||||
run_id, suite, config_snapshot_json, aggregate_json,
|
||||
commit_hash, created_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
params![
|
||||
row.run_id,
|
||||
row.suite,
|
||||
row.config_snapshot_json,
|
||||
row.aggregate_json,
|
||||
row.commit_hash,
|
||||
created_at,
|
||||
],
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
{
|
||||
let mut stmt = tx
|
||||
.prepare(
|
||||
"INSERT INTO eval_query_results (run_id, query_id, result_json)
|
||||
VALUES (?, ?, ?)",
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
for (query_id, result_json) in results {
|
||||
stmt.execute(params![row.run_id, query_id, result_json])
|
||||
.map_err(StoreError::from)?;
|
||||
}
|
||||
}
|
||||
tx.commit().map_err(StoreError::from)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,7 @@ mod answers;
|
||||
mod documents;
|
||||
mod embeddings;
|
||||
mod error;
|
||||
mod eval;
|
||||
mod filters;
|
||||
mod fts;
|
||||
mod jobs;
|
||||
@@ -29,6 +30,7 @@ mod store;
|
||||
|
||||
pub use embeddings::EmbeddingRecordRow;
|
||||
pub use error::StoreError;
|
||||
pub use eval::EvalRunRow;
|
||||
pub use fts::rebuild_chunks_fts;
|
||||
pub use jobs::IngestRunRow;
|
||||
pub use store::SqliteStore;
|
||||
|
||||
@@ -64,7 +64,7 @@ impl SqliteStore {
|
||||
/// temp_store=MEMORY), and create parent directories as needed.
|
||||
/// **Does not run migrations** — call [`Self::run_migrations`] next.
|
||||
pub fn open(config: &kb_config::Config) -> Result<Self> {
|
||||
let data_dir = expand_data_dir(&config.storage.data_dir);
|
||||
let data_dir = kb_config::expand_path(&config.storage.data_dir, "");
|
||||
std::fs::create_dir_all(&data_dir)
|
||||
.with_context(|| format!("create data_dir {}", data_dir.display()))?;
|
||||
let db_path = data_dir.join(SQLITE_FILE);
|
||||
@@ -363,53 +363,3 @@ fn apply_pragmas(conn: &Connection) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Expand the placeholders / `~` / env-vars used by `Config::storage.data_dir`.
///
/// Supported substitutions, in order:
/// - `${XDG_DATA_HOME:-~/.local/share}` (and the bare `${XDG_DATA_HOME}`):
///   replaced by the env value when set and non-empty, otherwise by the
///   text after `:-` (or the empty string when no default is given).
///   Only the first occurrence is substituted.
/// - a leading `~` that is the whole string or is followed by `/` →
///   `$HOME`.
///
/// A leading `~` followed by anything else (e.g. `~cache/kb`) is an
/// ordinary file name and is left untouched; expanding it would silently
/// redirect the path into `$HOME`.
///
/// If neither produces an absolute path, the input is returned as-is
/// (relative paths are kept relative to the caller's CWD).
fn expand_data_dir(raw: &str) -> PathBuf {
    let mut s = raw.to_string();

    if let Some(start) = s.find("${XDG_DATA_HOME") {
        if let Some(rel_end) = s[start..].find('}') {
            let end = start + rel_end + 1; // include trailing '}'
            let inner = &s[start + 2..end - 1]; // strip `${` and `}`
            let replacement = std::env::var("XDG_DATA_HOME")
                .ok()
                .filter(|v| !v.is_empty())
                .unwrap_or_else(|| {
                    // `inner` is e.g. `XDG_DATA_HOME:-~/.local/share`;
                    // with no `:-default`, mimic Bash and yield "".
                    inner
                        .split_once(":-")
                        .map(|(_, default)| default.to_string())
                        .unwrap_or_default()
                });
            s.replace_range(start..end, &replacement);
        }
    }

    if let Some(rest) = s.strip_prefix('~') {
        // Only `~` and `~/…` refer to the current user's home.
        if rest.is_empty() || rest.starts_with('/') {
            if let Some(home) = std::env::var_os("HOME")
                .map(PathBuf::from)
                .or_else(dirs_home_fallback)
            {
                return home.join(rest.trim_start_matches('/'));
            }
        }
    }

    PathBuf::from(s)
}

/// Tiny shim to avoid pulling in the `dirs` crate as a direct dep — we
/// only fall back when `$HOME` is unset, which is exotic on the platforms
/// we target. Returns `None` so the caller keeps the literal `~`.
fn dirs_home_fallback() -> Option<PathBuf> {
    None
}
|
||||
|
||||
@@ -1,51 +1,10 @@
|
||||
//! Path expansion + table-name sanitization.
|
||||
//!
|
||||
//! Mirrors `kb-store-sqlite::store::expand_data_dir` and
|
||||
//! `kb-embed-local::expand_path` so the three crates resolve
|
||||
//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. A
|
||||
//! shared helper would live in `kb-config`, but the task spec forbids
|
||||
//! adding new types to `kb-config`, so we keep a private clone.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Expand `{data_dir}` → `data_dir`, `${XDG_DATA_HOME:-…}` → env or
/// default, and a leading `~` → `$HOME`.
///
/// Pass an empty `data_dir` when resolving `data_dir` itself (the
/// `{data_dir}` substitution is skipped in that case).
///
/// Details:
/// - `${XDG_DATA_HOME…}` is replaced by the env value when it is set and
///   non-empty, otherwise by the text after `:-` (empty when there is no
///   default). Only the first occurrence is substituted.
/// - `~` is expanded only when it is the whole string or is followed by
///   `/`. A name that merely starts with `~` (e.g. `~cache/kb`) is an
///   ordinary relative path and is returned unchanged instead of being
///   redirected into `$HOME`.
/// - If `$HOME` is unset the literal `~` is kept.
pub(crate) fn expand_path(raw: &str, data_dir: &str) -> PathBuf {
    let mut s = raw.to_string();

    if !data_dir.is_empty() {
        s = s.replace("{data_dir}", data_dir);
    }

    if let Some(start) = s.find("${XDG_DATA_HOME") {
        if let Some(rel_end) = s[start..].find('}') {
            let end = start + rel_end + 1; // one past the closing '}'
            let inner = &s[start + 2..end - 1]; // text between `${` and `}`
            let replacement = std::env::var("XDG_DATA_HOME")
                .ok()
                .filter(|v| !v.is_empty())
                .unwrap_or_else(|| {
                    inner
                        .split_once(":-")
                        .map(|(_, default)| default.to_string())
                        .unwrap_or_default()
                });
            s.replace_range(start..end, &replacement);
        }
    }

    if let Some(rest) = s.strip_prefix('~') {
        // Only `~` and `~/…` refer to the current user's home.
        if rest.is_empty() || rest.starts_with('/') {
            if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
                return home.join(rest.trim_start_matches('/'));
            }
        }
    }

    PathBuf::from(s)
}
|
||||
//! `expand_path` lives in `kb-config` so `kb-store-vector`,
|
||||
//! `kb-store-sqlite`, `kb-embed-local`, and `kb-eval` all resolve
|
||||
//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. This
|
||||
//! module re-exports nothing; consumers within the crate `use
|
||||
//! kb_config::expand_path` directly.
|
||||
|
||||
/// Build the per-model Lance table name. Per design §6.3:
|
||||
/// `chunk_embeddings_<model>_<dim>.lance`. Model IDs may contain
|
||||
@@ -104,16 +63,4 @@ mod tests {
|
||||
"chunk_embeddings_BAAI_bge-small-en_384"
|
||||
);
|
||||
}
|
||||
|
||||
/// `{data_dir}` in the raw path is replaced by the supplied data dir.
#[test]
fn expand_path_substitutes_data_dir() {
    let expected = PathBuf::from("/tmp/kbtest/lancedb");
    assert_eq!(expand_path("{data_dir}/lancedb", "/tmp/kbtest"), expected);
}
|
||||
|
||||
/// An absolute path with no placeholders is returned unchanged, whatever
/// data dir is supplied.
#[test]
fn expand_path_passthrough_absolute() {
    let expected = PathBuf::from("/abs/dir");
    assert_eq!(expand_path("/abs/dir", "/ignored"), expected);
}
|
||||
}
|
||||
|
||||
@@ -22,8 +22,10 @@ use serde_json::json;
|
||||
use time::OffsetDateTime;
|
||||
use tokio::runtime::{Builder as RuntimeBuilder, Runtime};
|
||||
|
||||
use kb_config::expand_path;
|
||||
|
||||
use crate::arrow_batch::{build_batch, schema_for, schema_params_hash};
|
||||
use crate::paths::{expand_path, lance_table_name};
|
||||
use crate::paths::lance_table_name;
|
||||
|
||||
/// Overfetch multiplier: when post-filtering Lance results against
|
||||
/// SQLite-side filters we ask for `2 * k` candidates so a moderately
|
||||
|
||||
45
fixtures/golden_queries.yaml
Normal file
45
fixtures/golden_queries.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
# Golden query suite for `kb eval run` (P5-1 / P5-2).
|
||||
#
|
||||
# Top-level: list of queries. Required fields: `id`, `query`. All
|
||||
# others are optional and default to empty / null.
|
||||
#
|
||||
# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to
|
||||
# real rows in the active workspace's SQLite store at run time. Stale
|
||||
# references make the runner bail at start. The shipped template
|
||||
# leaves them empty so the file is loadable on any fresh workspace —
|
||||
# fill them in after a `kb ingest` to enable hit@k / MRR metrics
|
||||
# (P5-2).
|
||||
#
|
||||
# `must_contain` / `forbidden` drive the rule-based groundedness
|
||||
# metric (P5-2).
|
||||
|
||||
- id: g001
|
||||
query: "Cargo workspace 멤버 추가하는 법"
|
||||
lang: ko
|
||||
must_contain: ["[workspace]", "members"]
|
||||
difficulty: easy
|
||||
|
||||
- id: g002
|
||||
query: "What is Rust ownership?"
|
||||
lang: en
|
||||
must_contain: ["borrow", "lifetime"]
|
||||
difficulty: easy
|
||||
|
||||
- id: g003
|
||||
query: "Markdown chunking 규칙은?"
|
||||
lang: ko
|
||||
must_contain: ["heading"]
|
||||
forbidden: ["embedding"]
|
||||
difficulty: medium
|
||||
|
||||
- id: g004
|
||||
query: "How does FTS5 tokenization work for Korean text?"
|
||||
lang: en
|
||||
must_contain: ["unicode61", "tokenizer"]
|
||||
difficulty: medium
|
||||
|
||||
- id: g005
|
||||
query: "RAG citation 검증은 어떻게 동작?"
|
||||
lang: ko
|
||||
must_contain: ["citation", "marker"]
|
||||
difficulty: hard
|
||||
@@ -3,7 +3,7 @@ phase: P5
|
||||
component: kb-eval (runner)
|
||||
task_id: p5-1
|
||||
title: "Golden query fixture loader + per-query runner"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p4-3]
|
||||
unblocks: [p5-2]
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md
|
||||
@@ -149,6 +149,6 @@ All tests under `cargo test -p kb-eval runner`.
|
||||
|
||||
## Risks / notes
|
||||
|
||||
- Large RAG suites can be slow. Consider `--max-queries` for incremental runs (kept here as a flag spec; implementation is the responsibility of this task).
|
||||
- Large RAG suites can be slow. The `--max-queries` flag is deferred to P5-2 or a follow-up. Rationale: (a) the runner currently runs all queries unconditionally; (b) without metrics aggregation the flag adds little incremental value; (c) it is trivial to add as a `Vec::truncate` once the eval CLI subcommand exists.
|
||||
- `expected_chunk_id` references depend on `chunker_version`. If the chunker version is bumped, the golden set must be re-curated. Fail fast in the loader.
|
||||
- Use `time::OffsetDateTime::now_utc()` for `created_at`; never local TZ.
|
||||
|
||||
Reference in New Issue
Block a user