refactor(rename): kb crates → kebab — Cargo packages, folders, Rust modules
프로젝트 이름 `kb` → `kebab` rename 의 첫 단계. - workspace `Cargo.toml`: members `crates/kb-*` → `crates/kebab-*`, repository URL `altair823/kb` → `altair823/kebab`. - 18 crate 폴더 rename via `git mv` (history 보존). - 각 crate `Cargo.toml`: `name = "kb-*"` → `"kebab-*"`, path deps `../kb-*` → `../kebab-*`. - 모든 `.rs`: `kb_<id>` snake-case 모듈 path 18 개 (`kb_core`, `kb_config`, `kb_app`, `kb_cli`, `kb_eval`, `kb_search`, `kb_chunk`, `kb_normalize`, `kb_source_fs`, `kb_parse_md`, `kb_parse_types`, `kb_store_sqlite`, `kb_store_vector`, `kb_embed`, `kb_embed_local`, `kb_llm`, `kb_llm_local`, `kb_rag`) → `kebab_<id>` 일괄 sed (단어 경계 \\b 사용해 영어 문장 안의 "kb" 약어 미오염). CLI binary 이름 (`[[bin]] name = "kb"`), 환경변수 `KB_*`, XDG paths, tracing target, 그리고 docs sweep 은 다음 commit 에서. ## 검증 - `cargo check --workspace` clean — 모든 crate 빌드 통과 후 commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
39
crates/kebab-app/Cargo.toml
Normal file
39
crates/kebab-app/Cargo.toml
Normal file
@@ -0,0 +1,39 @@
|
||||
[package]
|
||||
name = "kebab-app"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Facade — orchestrates components for kb-cli/tui/desktop"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-source-fs = { path = "../kebab-source-fs" }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-types = { path = "../kebab-parse-types" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
kebab-chunk = { path = "../kebab-chunk" }
|
||||
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
|
||||
kebab-store-vector = { path = "../kebab-store-vector" }
|
||||
kebab-search = { path = "../kebab-search" }
|
||||
kebab-embed = { path = "../kebab-embed" }
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
kebab-rag = { path = "../kebab-rag" }
|
||||
anyhow = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
toml = "0.8"
|
||||
dirs = "5"
|
||||
|
||||
[dev-dependencies]
|
||||
rusqlite = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
304
crates/kebab-app/src/app.rs
Normal file
304
crates/kebab-app/src/app.rs
Normal file
@@ -0,0 +1,304 @@
|
||||
//! `App` — facade lifecycle struct (§7).
|
||||
//!
|
||||
//! A single `App` represents one CLI invocation's (or one TUI
|
||||
//! session's / one eval-runner suite's) worth of state: a resolved
|
||||
//! `Config`, an open `SqliteStore`, and (when embeddings are enabled)
|
||||
//! an `Embedder` + `LanceVectorStore`. Each public free function on
|
||||
//! `kb-app` builds an `App` once, runs the requested op, and drops
|
||||
//! everything on return; long-lived callers (kb-eval, the future P9
|
||||
//! TUI session) hold onto an `App` across many calls so the per-query
|
||||
//! cost is just a method dispatch.
|
||||
//!
|
||||
//! ## Embedder + Vector store lifetime
|
||||
//!
|
||||
//! `App::open_with_config` builds the SQLite store unconditionally.
|
||||
//! The embedder and vector store are *lazy + memoized* — built on
|
||||
//! first call to [`App::embedder`] / [`App::vector`] and cached in
|
||||
//! `OnceLock`s — so a long-lived `App` (kb-eval driving 50 queries,
|
||||
//! the P9 TUI session) pays the ~470 MB ONNX init plus Lance reopen
|
||||
//! cost exactly once.
|
||||
//!
|
||||
//! - `kb list` / `kb inspect` never need them.
|
||||
//! - `kb search --mode lexical` never needs them.
|
||||
//! - `kb ingest` and `kb search --mode {vector,hybrid}` always do.
|
||||
//!
|
||||
//! Building eagerly would force every CLI invocation to load ~470 MB of
|
||||
//! ONNX weights, which is the dominant cold-start cost. The lazy
|
||||
//! pattern keeps the lexical-only paths instant; the memoization makes
|
||||
//! the TUI's repeated searches and the eval runner's per-query loop
|
||||
//! cheap after the first invocation.
|
||||
//!
|
||||
//! Embeddings can also be **disabled** workspace-wide via
|
||||
//! `config.models.embedding.provider = "none"` (or `dimensions = 0`);
|
||||
//! in that mode [`App::embedder`] returns `None` and callers must fall
|
||||
//! back to lexical-only search.
|
||||
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
|
||||
use kebab_core::{
|
||||
Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode,
|
||||
SearchQuery, VectorStore,
|
||||
};
|
||||
use kebab_embed_local::FastembedEmbedder;
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_rag::{AskOpts, RagPipeline};
|
||||
use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
use kebab_store_vector::LanceVectorStore;
|
||||
|
||||
/// Facade state — see module docs for lifetime rules.
|
||||
///
|
||||
/// The struct is public so long-lived callers (kb-eval, the future P9
|
||||
/// TUI session) can construct one and reuse it across many search /
|
||||
/// ask calls. The OnceLock-backed `embedder` / `vector` fields ensure
|
||||
/// the cold-start cost is paid exactly once per instance.
|
||||
pub struct App {
|
||||
pub(crate) config: kebab_config::Config,
|
||||
pub(crate) sqlite: Arc<SqliteStore>,
|
||||
/// Memoized embedder — built lazily on first `embedder()` call when
|
||||
/// embeddings are enabled. `OnceLock` keeps the struct `Sync` and
|
||||
/// the build path cold-only-once.
|
||||
embedder: OnceLock<Arc<dyn Embedder + Send + Sync>>,
|
||||
/// Memoized vector store — built lazily on first `vector()` call
|
||||
/// when embeddings are enabled. Same rationale as `embedder`.
|
||||
vector: OnceLock<Arc<LanceVectorStore>>,
|
||||
/// Memoized LLM — built lazily on first `ask()` call. Sharing one
|
||||
/// across the eval runner avoids re-handshaking the Ollama HTTP
|
||||
/// client per query (cheap, but still measurable on a 50-query
|
||||
/// suite).
|
||||
llm: OnceLock<Arc<dyn LanguageModel>>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
/// Open the SQLite store and run migrations. Does NOT load the
|
||||
/// embedder or vector store — those are lazy via
|
||||
/// [`Self::embedder`] / [`Self::vector`].
|
||||
///
|
||||
/// **Caveat:** must be called from a synchronous context.
|
||||
/// Downstream `LanceVectorStore::new` (called by [`Self::vector`])
|
||||
/// internally drives a `tokio::Runtime::block_on`, which panics if
|
||||
/// invoked from inside another tokio runtime.
|
||||
pub fn open_with_config(config: kebab_config::Config) -> Result<Self> {
|
||||
let sqlite = SqliteStore::open(&config).context("kb-app: open SqliteStore")?;
|
||||
sqlite
|
||||
.run_migrations()
|
||||
.context("kb-app: run SqliteStore migrations")?;
|
||||
Ok(Self {
|
||||
config,
|
||||
sqlite: Arc::new(sqlite),
|
||||
embedder: OnceLock::new(),
|
||||
vector: OnceLock::new(),
|
||||
llm: OnceLock::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Run a [`SearchQuery`] through the configured retriever stack and
|
||||
/// return the top-k hits.
|
||||
///
|
||||
/// Reuses any previously-built embedder / vector store on this `App`
|
||||
/// — long-lived callers (kb-eval, future TUI) get amortized cost
|
||||
/// across calls.
|
||||
pub fn search(&self, query: SearchQuery) -> Result<Vec<SearchHit>> {
|
||||
match query.mode {
|
||||
SearchMode::Lexical => {
|
||||
let lex = LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
);
|
||||
lex.search(&query)
|
||||
}
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let retr = VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
);
|
||||
retr.search(&query)
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
|
||||
hybrid.search(&query)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run a RAG `ask` against the configured retriever + LLM. Reuses
|
||||
/// the memoized embedder / vector / LLM where applicable.
|
||||
pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
let retriever: Arc<dyn Retriever> = match opts.mode {
|
||||
SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)),
|
||||
SearchMode::Vector => {
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
))
|
||||
}
|
||||
SearchMode::Hybrid => {
|
||||
let lex = Arc::new(LexicalRetriever::with_settings(
|
||||
self.sqlite.clone(),
|
||||
lexical_index_version(&self.config),
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
let (emb, vec_store) = self.require_embeddings()?;
|
||||
let vec_iv = vector_index_version(emb.as_ref());
|
||||
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
|
||||
let emb_dyn: Arc<dyn Embedder> = emb;
|
||||
let vec_retr = Arc::new(VectorRetriever::with_settings(
|
||||
vec_dyn,
|
||||
emb_dyn,
|
||||
self.sqlite.clone(),
|
||||
vec_iv,
|
||||
self.config.search.snippet_chars,
|
||||
)) as Arc<dyn Retriever>;
|
||||
Arc::new(HybridRetriever::new(&self.config, lex, vec_retr))
|
||||
}
|
||||
};
|
||||
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
pipeline.ask(query, opts)
|
||||
}
|
||||
|
||||
/// Returns `true` when the workspace has embeddings turned off
|
||||
/// (`provider = "none"` or `dimensions = 0`). Lexical-only mode.
|
||||
pub(crate) fn embeddings_disabled(&self) -> bool {
|
||||
let cfg = &self.config.models.embedding;
|
||||
cfg.provider == "none" || cfg.dimensions == 0
|
||||
}
|
||||
|
||||
/// Build (or reuse) the fastembed embedder. Returns `None` when the
|
||||
/// workspace is in lexical-only mode (see
|
||||
/// [`Self::embeddings_disabled`]). The first call pays the ~470 MB
|
||||
/// ONNX load; subsequent calls are a single `OnceLock` read.
|
||||
pub(crate) fn embedder(&self) -> Result<Option<Arc<dyn Embedder + Send + Sync>>> {
|
||||
if self.embeddings_disabled() {
|
||||
return Ok(None);
|
||||
}
|
||||
if let Some(e) = self.embedder.get() {
|
||||
return Ok(Some(e.clone()));
|
||||
}
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
|
||||
FastembedEmbedder::new(&self.config)
|
||||
.context("kb-app: load FastembedEmbedder")?,
|
||||
);
|
||||
// `set` returns Err if another thread won the race; in that case
|
||||
// the loser still returns the (now-cached) winner via `get()`.
|
||||
let _ = self.embedder.set(emb.clone());
|
||||
Ok(Some(self.embedder.get().cloned().unwrap_or(emb)))
|
||||
}
|
||||
|
||||
/// Build (or reuse) the LanceDB-backed vector store. Returns `None`
|
||||
/// when embeddings are disabled. Memoized via `OnceLock` for the
|
||||
/// same reasons as [`Self::embedder`].
|
||||
pub(crate) fn vector(&self) -> Result<Option<Arc<LanceVectorStore>>> {
|
||||
if self.embeddings_disabled() {
|
||||
return Ok(None);
|
||||
}
|
||||
if let Some(v) = self.vector.get() {
|
||||
return Ok(Some(v.clone()));
|
||||
}
|
||||
let store = Arc::new(
|
||||
LanceVectorStore::new(&self.config, self.sqlite.clone())
|
||||
.context("kb-app: open LanceVectorStore")?,
|
||||
);
|
||||
let _ = self.vector.set(store.clone());
|
||||
Ok(Some(self.vector.get().cloned().unwrap_or(store)))
|
||||
}
|
||||
|
||||
/// Build (or reuse) the configured LLM. Currently always Ollama;
|
||||
/// when a second provider lands this is the place to switch on
|
||||
/// `config.models.llm.provider`.
|
||||
fn llm(&self) -> Result<Arc<dyn LanguageModel>> {
|
||||
if let Some(l) = self.llm.get() {
|
||||
return Ok(l.clone());
|
||||
}
|
||||
let llm: Arc<dyn LanguageModel> = Arc::new(
|
||||
OllamaLanguageModel::new(&self.config)
|
||||
.context("kb-app::ask: build OllamaLanguageModel")?,
|
||||
);
|
||||
let _ = self.llm.set(llm.clone());
|
||||
Ok(self.llm.get().cloned().unwrap_or(llm))
|
||||
}
|
||||
|
||||
/// Resolve the embedder + vector store, surfacing the user-friendly
|
||||
/// "switch to --mode lexical" error when embeddings are disabled.
|
||||
fn require_embeddings(
|
||||
&self,
|
||||
) -> Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<LanceVectorStore>,
|
||||
)> {
|
||||
let emb = self.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
or dimensions == 0); vector / hybrid search require embeddings — \
|
||||
switch to --mode lexical or enable an embedding provider in config.toml"
|
||||
)
|
||||
})?;
|
||||
let vec_store = self.vector()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"vector store unavailable while embedder is configured — this should \
|
||||
not happen; check `kb doctor` and the data_dir permissions"
|
||||
)
|
||||
})?;
|
||||
Ok((emb, vec_store))
|
||||
}
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the lexical retriever from
|
||||
/// the active config. This token surfaces in `SearchHit.index_version`
|
||||
/// and on snapshot tests; including the chunker version pins it to
|
||||
/// the chunking policy in effect.
|
||||
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
|
||||
}
|
||||
|
||||
/// Compose a stable `IndexVersion` for the vector retriever. Tracks
|
||||
/// `(embedding_model, embedding_version, dimensions)` so a model swap
|
||||
/// flags drift via the existing index_version mismatch warning in
|
||||
/// `HybridRetriever::new`.
|
||||
fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
|
||||
IndexVersion(format!(
|
||||
"vec:{}@{}:{}",
|
||||
embedder.model_id().0,
|
||||
embedder.model_version().0,
|
||||
embedder.dimensions(),
|
||||
))
|
||||
}
|
||||
39
crates/kebab-app/src/doctor_signal.rs
Normal file
39
crates/kebab-app/src/doctor_signal.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
//! Signal types used by `kb-cli`'s `exit_code` mapping (§10).
|
||||
//!
|
||||
//! These are *not* errors per se: a doctor failure is normal output, just
|
||||
//! signalled out-of-band so the CLI can exit with the right status.
|
||||
|
||||
use std::fmt;
|
||||
|
||||
/// Out-of-band marker meaning "the doctor run reported an unhealthy
/// state".
///
/// Carries no data; its `Display` text is the fixed string
/// `"doctor unhealthy"`. It implements [`std::error::Error`] so it can
/// travel through an error channel and be recognised by type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DoctorUnhealthy;

impl fmt::Display for DoctorUnhealthy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("doctor unhealthy")
    }
}

impl std::error::Error for DoctorUnhealthy {}
|
||||
|
||||
/// Out-of-band marker whose `Display` text is the fixed string
/// `"refusal"`.
///
/// Carries no data and implements [`std::error::Error`] so it can
/// travel through an error channel and be recognised by type.
// NOTE(review): presumably raised when an answer is a refusal — confirm
// against the call sites, which are not in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RefusalSignal;

impl fmt::Display for RefusalSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("refusal")
    }
}

impl std::error::Error for RefusalSignal {}
|
||||
|
||||
/// Out-of-band marker whose `Display` text is the fixed string
/// `"no hit"`.
///
/// Carries no data and implements [`std::error::Error`] so it can
/// travel through an error channel and be recognised by type.
// NOTE(review): presumably raised when a query yields no results —
// confirm against the call sites, which are not in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NoHitSignal;

impl fmt::Display for NoHitSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("no hit")
    }
}

impl std::error::Error for NoHitSignal {}
|
||||
845
crates/kebab-app/src/lib.rs
Normal file
845
crates/kebab-app/src/lib.rs
Normal file
@@ -0,0 +1,845 @@
|
||||
//! `kb-app` — facade that downstream `kb-cli` / `kb-tui` / `kb-desktop`
|
||||
//! depend on (§7, §8).
|
||||
//!
|
||||
//! P3-5 swapped the `bail!("not yet wired")` stubs for real bodies that
|
||||
//! compose the libraries shipped through P3-4. After this task, `kb
|
||||
//! ingest` actually walks a workspace and persists chunks, and `kb
|
||||
//! search --mode {lexical,vector,hybrid}` returns real `SearchHit`s.
|
||||
//! `kb-app::ask` stays stubbed (P4-3 owns it).
|
||||
//!
|
||||
//! ## Wire-schema convention
|
||||
//!
|
||||
//! `kb-app` returns pure domain types (`IngestReport`, `DocSummary`,
|
||||
//! `Chunk`, `SearchHit`, `Answer`, …) re-exported from `kb-core`. These do
|
||||
//! NOT carry a `schema_version` field. The CLI (`kb-cli/src/wire.rs`) is
|
||||
//! responsible for wrapping each Ok-path return value with the matching
|
||||
//! `*.v1` envelope before emitting JSON on stdout in `--json` mode. The
|
||||
//! sole exception is [`DoctorReport`], whose `schema_version` is part of
|
||||
//! the struct because the doctor wire object IS its own structured
|
||||
//! surface (no domain-side equivalent in `kb-core`). When adding a new
|
||||
//! facade function in a later phase, remember: keep the return type pure,
|
||||
//! and add a matching `wire_*` helper in `kb-cli/src/wire.rs`.
|
||||
//!
|
||||
//! ## Config seam (`*_with_config`)
|
||||
//!
|
||||
//! Each public free function has a `#[doc(hidden)] pub fn *_with_config`
|
||||
//! companion that takes a fully-resolved [`kebab_config::Config`] directly.
|
||||
//! Three callers go through it: (1) the top-level free functions
|
||||
//! themselves, after `load_config()`; (2) `kb-cli` when the user passes
|
||||
//! `--config <path>` (CLI builds the Config via
|
||||
//! `Config::load(cli.config.as_deref())` and threads it in directly so
|
||||
//! the flag is honored); (3) integration tests, which mutate a Config
|
||||
//! to point at a `TempDir` to avoid polluting the user's real
|
||||
//! `data_dir` / `model_dir`. `#[doc(hidden)]` keeps rustdoc clean while
|
||||
//! still allowing the cross-crate calls.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_chunk::MdHeadingV1Chunker;
|
||||
use kebab_core::{
|
||||
Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
||||
EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
|
||||
SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
|
||||
};
|
||||
use kebab_normalize::build_canonical_document;
|
||||
use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
mod app;
|
||||
pub mod doctor_signal;
|
||||
pub mod logging;
|
||||
|
||||
pub use app::App;
|
||||
|
||||
/// Parser-version label persisted in `documents.parser_version` for
|
||||
/// every Markdown file ingested through the `kb-parse-md` pipeline.
|
||||
/// Kept in lock-step with the literal used in the `kb-store-sqlite`
|
||||
/// idempotency / round-trip tests so the version label written by the
|
||||
/// app and the one used in cross-crate fixtures match.
|
||||
const KB_PARSE_MD_VERSION: &str = "pulldown-cmark-0.x";
|
||||
|
||||
/// Caller-supplied knobs for one [`ask`] invocation.
|
||||
///
|
||||
/// Re-exported from [`kebab_rag::AskOpts`] (P4-3 owns the type) so kb-cli's
|
||||
/// `use kebab_app::AskOpts` keeps working without churn. The struct gained
|
||||
/// a `stream_sink` field in P4-3; non-streaming callers (kb-cli today)
|
||||
/// pass `stream_sink: None`.
|
||||
pub use kebab_rag::AskOpts;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DoctorReport {
|
||||
/// Wire schema version label (`"doctor.v1"`).
|
||||
pub schema_version: String,
|
||||
pub ok: bool,
|
||||
pub checks: Vec<DoctorCheck>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DoctorCheck {
|
||||
pub name: String,
|
||||
pub ok: bool,
|
||||
pub detail: String,
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// Create XDG dirs and write a starter `config.toml`. Idempotent unless
|
||||
/// `force=true` (which overwrites an existing config).
|
||||
pub fn init_workspace(force: bool) -> anyhow::Result<()> {
|
||||
let cfg_path = kebab_config::Config::xdg_config_path();
|
||||
let data_dir = kebab_config::Config::xdg_data_dir();
|
||||
let cache_dir = kebab_config::Config::xdg_cache_dir();
|
||||
let state_dir = kebab_config::Config::xdg_state_dir();
|
||||
|
||||
for d in [
|
||||
cfg_path.parent().map(PathBuf::from).unwrap_or_default(),
|
||||
data_dir.clone(),
|
||||
cache_dir,
|
||||
state_dir.clone(),
|
||||
state_dir.join("logs"),
|
||||
] {
|
||||
if !d.as_os_str().is_empty() {
|
||||
std::fs::create_dir_all(&d)?;
|
||||
}
|
||||
}
|
||||
|
||||
let workspace_root = expand_tilde(&kebab_config::Config::defaults().workspace.root);
|
||||
std::fs::create_dir_all(&workspace_root)?;
|
||||
|
||||
if !cfg_path.exists() || force {
|
||||
let cfg = kebab_config::Config::defaults();
|
||||
let toml_text = toml::to_string_pretty(&cfg)?;
|
||||
std::fs::write(&cfg_path, toml_text)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn expand_tilde(s: &str) -> PathBuf {
|
||||
if let Some(rest) = s.strip_prefix("~/") {
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home.join(rest);
|
||||
}
|
||||
}
|
||||
if s == "~" {
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home;
|
||||
}
|
||||
}
|
||||
PathBuf::from(s)
|
||||
}
|
||||
|
||||
/// Load the active Config from XDG (or fall back to defaults). Mirrors
|
||||
/// what `kb-cli` does at the top of every subcommand path; we re-do
|
||||
/// the load inside each facade entry so callers don't have to thread
|
||||
/// a Config through.
|
||||
///
|
||||
/// Callers that already have a Config in hand (CLI honoring `--config`,
|
||||
/// integration tests, TUI session) should bypass this and call the
|
||||
/// matching `*_with_config` helper directly.
|
||||
fn load_config() -> anyhow::Result<kebab_config::Config> {
|
||||
kebab_config::Config::load(None)
|
||||
}
|
||||
|
||||
// ── ingest ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn ingest(scope: SourceScope, summary_only: bool) -> anyhow::Result<IngestReport> {
|
||||
let config = load_config()?;
|
||||
ingest_with_config(config, scope, summary_only)
|
||||
}
|
||||
|
||||
/// Config-explicit variant — bypasses [`load_config`] when the
|
||||
/// caller (kb-cli with `--config`, integration tests, TUI session)
|
||||
/// already has a [`kebab_config::Config`] in hand. The public free
|
||||
/// function [`ingest`] wraps this with the XDG-default load.
|
||||
#[doc(hidden)]
|
||||
pub fn ingest_with_config(
|
||||
config: kebab_config::Config,
|
||||
scope: SourceScope,
|
||||
summary_only: bool,
|
||||
) -> anyhow::Result<IngestReport> {
|
||||
let started_instant = std::time::Instant::now();
|
||||
|
||||
let app = App::open_with_config(config)?;
|
||||
|
||||
// Walk the workspace.
|
||||
let connector = FsSourceConnector::new(&app.config)
|
||||
.context("kb-app::ingest: build FsSourceConnector")?;
|
||||
let assets = connector
|
||||
.scan(&scope)
|
||||
.context("kb-app::ingest: scan workspace")?;
|
||||
|
||||
// Embedder + vector store: build once at the top so the cold-start
|
||||
// cost is paid once even when the workspace has 1000 markdown files.
|
||||
let embedder = app.embedder()?;
|
||||
let vector_store = app.vector()?;
|
||||
|
||||
// If both are present, ensure the table exists for the (model, dim)
|
||||
// pair so the first per-doc upsert doesn't pay the create-table
|
||||
// round-trip.
|
||||
if let (Some(emb), Some(vec)) = (embedder.as_ref(), vector_store.as_ref()) {
|
||||
let mid = emb.model_id();
|
||||
vec.ensure_table(&mid, emb.dimensions())
|
||||
.context("kb-app::ingest: ensure Lance table")?;
|
||||
}
|
||||
|
||||
let parser_version = ParserVersion(KB_PARSE_MD_VERSION.to_string());
|
||||
let chunk_policy = chunk_policy_from_config(&app.config);
|
||||
|
||||
// Pre-load every existing doc_id so we can label `IngestItem.kind`
|
||||
// as `New` vs `Updated` correctly. `list_documents` returns one
|
||||
// row per `(workspace_path, asset_id)` — index by the deterministic
|
||||
// `doc_id` recipe input so the first ingest of an unseen file is
|
||||
// labelled `New`.
|
||||
let existing_doc_ids: std::collections::HashSet<String> = app
|
||||
.sqlite
|
||||
.list_documents(&DocFilter::default())
|
||||
.context("kb-app::ingest: list existing documents")?
|
||||
.into_iter()
|
||||
.map(|d| d.doc_id.0)
|
||||
.collect();
|
||||
|
||||
let started_at = time::OffsetDateTime::now_utc();
|
||||
|
||||
let mut items: Vec<kebab_core::IngestItem> = Vec::new();
|
||||
let mut new_count: u32 = 0;
|
||||
let mut updated_count: u32 = 0;
|
||||
let mut skipped_count: u32 = 0;
|
||||
let mut error_count: u32 = 0;
|
||||
// Aggregate counts surfaced into `ingest_runs` (and tracing). Not
|
||||
// exposed on `IngestReport` today — `kebab_core::IngestReport` is a
|
||||
// wire-stable struct without these fields — but persisting them
|
||||
// means audit tooling and `kb jobs` (P+) can recover the totals
|
||||
// without re-walking the DB.
|
||||
let mut chunks_indexed: u32 = 0;
|
||||
let mut embeddings_indexed: u32 = 0;
|
||||
let scanned_count: u32 = u32::try_from(assets.len()).unwrap_or(u32::MAX);
|
||||
|
||||
let embed_active = embedder.is_some() && vector_store.is_some();
|
||||
|
||||
for asset in assets {
|
||||
let item = ingest_one_asset(
|
||||
&app,
|
||||
&asset,
|
||||
&parser_version,
|
||||
&chunk_policy,
|
||||
embedder.as_ref(),
|
||||
vector_store.as_ref(),
|
||||
&existing_doc_ids,
|
||||
);
|
||||
|
||||
let item = match item {
|
||||
Ok(i) => i,
|
||||
Err(e) => {
|
||||
tracing::error!(
|
||||
target: "kb-app",
|
||||
path = %asset.workspace_path.0,
|
||||
error = %e,
|
||||
"kb-app::ingest: per-file fatal"
|
||||
);
|
||||
error_count = error_count.saturating_add(1);
|
||||
kebab_core::IngestItem {
|
||||
kind: kebab_core::IngestItemKind::Error,
|
||||
doc_id: None,
|
||||
doc_path: asset.workspace_path.clone(),
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
byte_len: Some(asset.byte_len),
|
||||
block_count: None,
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: Vec::new(),
|
||||
error: Some(format!("{e:#}")),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match item.kind {
|
||||
kebab_core::IngestItemKind::New => {
|
||||
new_count = new_count.saturating_add(1);
|
||||
let n = item.chunk_count.unwrap_or(0);
|
||||
chunks_indexed = chunks_indexed.saturating_add(n);
|
||||
if embed_active {
|
||||
embeddings_indexed = embeddings_indexed.saturating_add(n);
|
||||
}
|
||||
}
|
||||
kebab_core::IngestItemKind::Updated => {
|
||||
updated_count = updated_count.saturating_add(1);
|
||||
let n = item.chunk_count.unwrap_or(0);
|
||||
chunks_indexed = chunks_indexed.saturating_add(n);
|
||||
if embed_active {
|
||||
embeddings_indexed = embeddings_indexed.saturating_add(n);
|
||||
}
|
||||
}
|
||||
kebab_core::IngestItemKind::Skipped => {
|
||||
skipped_count = skipped_count.saturating_add(1)
|
||||
}
|
||||
kebab_core::IngestItemKind::Error => {
|
||||
error_count = error_count.saturating_add(1)
|
||||
}
|
||||
}
|
||||
items.push(item);
|
||||
}
|
||||
|
||||
// Record a row in `jobs` so `kb jobs` (P+) can list the run. Distinct
|
||||
// from the `ingest_runs` row written below — the `jobs` table is the
|
||||
// generic job-lifecycle surface (`kind=ingest`), `ingest_runs` is the
|
||||
// ingest-specific aggregate counts row.
|
||||
let payload = serde_json::json!({
|
||||
"scope": scope,
|
||||
"summary_only": summary_only,
|
||||
});
|
||||
let job_id_res = <SqliteStoreAlias as kebab_core::JobRepo>::create(
|
||||
&app.sqlite,
|
||||
kebab_core::JobKind::Ingest,
|
||||
payload,
|
||||
);
|
||||
match job_id_res {
|
||||
Ok(jid) => {
|
||||
// Stash the aggregate counts as the job's `progress_json`
|
||||
// so a future `kb jobs show` can surface them without
|
||||
// joining `ingest_runs`.
|
||||
let progress = serde_json::json!({
|
||||
"scanned": scanned_count,
|
||||
"new": new_count,
|
||||
"updated": updated_count,
|
||||
"skipped": skipped_count,
|
||||
"errors": error_count,
|
||||
"chunks_indexed": chunks_indexed,
|
||||
"embeddings_indexed": embeddings_indexed,
|
||||
});
|
||||
if let Err(e) = <SqliteStoreAlias as kebab_core::JobRepo>::update_progress(
|
||||
&app.sqlite,
|
||||
&jid,
|
||||
progress,
|
||||
) {
|
||||
tracing::warn!(
|
||||
target: "kb-app",
|
||||
error = %e,
|
||||
"kb-app::ingest: JobRepo::update_progress failed"
|
||||
);
|
||||
}
|
||||
if let Err(e) = <SqliteStoreAlias as kebab_core::JobRepo>::finish(
|
||||
&app.sqlite,
|
||||
&jid,
|
||||
kebab_core::JobStatus::Succeeded,
|
||||
None,
|
||||
) {
|
||||
tracing::warn!(
|
||||
target: "kb-app",
|
||||
error = %e,
|
||||
"kb-app::ingest: JobRepo::finish failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kb-app",
|
||||
error = %e,
|
||||
"kb-app::ingest: JobRepo::create failed; run not recorded in `jobs`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let duration_ms = u32::try_from(started_instant.elapsed().as_millis())
|
||||
.unwrap_or(u32::MAX);
|
||||
let finished_at = time::OffsetDateTime::now_utc();
|
||||
|
||||
// Record the ingest_runs row with aggregate counts.
|
||||
// `summary_only=true` writes `items_json=NULL` (per design §5.7);
|
||||
// the count columns are populated either way.
|
||||
let scope_json = serde_json::to_string(&scope)
|
||||
.context("kb-app::ingest: serialize scope for ingest_runs.scope_json")?;
|
||||
let items_json: Option<String> = if summary_only {
|
||||
None
|
||||
} else {
|
||||
match serde_json::to_string(&items) {
|
||||
Ok(s) => Some(s),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kb-app",
|
||||
error = %e,
|
||||
"kb-app::ingest: failed to serialize items_json; storing NULL"
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
let run_id = mint_ingest_run_id(&scope_json, started_at);
|
||||
let row = kebab_store_sqlite::IngestRunRow {
|
||||
run_id: &run_id,
|
||||
scope_json: &scope_json,
|
||||
scanned: scanned_count,
|
||||
new_count,
|
||||
updated_count,
|
||||
skipped_count,
|
||||
error_count,
|
||||
duration_ms,
|
||||
started_at,
|
||||
finished_at,
|
||||
items_json: items_json.as_deref(),
|
||||
};
|
||||
if let Err(e) = app.sqlite.record_ingest_run(&row) {
|
||||
tracing::warn!(
|
||||
target: "kb-app",
|
||||
error = %e,
|
||||
"kb-app::ingest: record_ingest_run failed"
|
||||
);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
target: "kb-app",
|
||||
scanned = scanned_count,
|
||||
new = new_count,
|
||||
updated = updated_count,
|
||||
skipped = skipped_count,
|
||||
errors = error_count,
|
||||
chunks_indexed,
|
||||
embeddings_indexed,
|
||||
duration_ms,
|
||||
"kb-app::ingest: run complete"
|
||||
);
|
||||
|
||||
Ok(IngestReport {
|
||||
scope,
|
||||
scanned: scanned_count,
|
||||
new: new_count,
|
||||
updated: updated_count,
|
||||
skipped: skipped_count,
|
||||
errors: error_count,
|
||||
duration_ms,
|
||||
items: if summary_only { None } else { Some(items) },
|
||||
})
|
||||
}
|
||||
|
||||
/// Mint a stable 32-hex-char `run_id` for an `ingest_runs` row.
|
||||
/// `(scope, started_at_nanos)` is enough to make two runs with the
|
||||
/// same scope started a nanosecond apart distinguish — same shape as
|
||||
/// the JobId recipe in `kb-store-sqlite::jobs`.
|
||||
fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
hasher.update(scope_json.as_bytes());
|
||||
hasher.update(&at.unix_timestamp_nanos().to_be_bytes());
|
||||
let hex = hasher.finalize().to_hex().to_string();
|
||||
hex[..32].to_string()
|
||||
}
|
||||
|
||||
/// Trait alias type used to disambiguate the two impls (`DocumentStore`
|
||||
/// vs `JobRepo`) on the same store. Plain `app.sqlite.create(...)`
|
||||
/// would pick one based on inherent vs trait methods; we go through
|
||||
/// `<… as JobRepo>` to be explicit.
|
||||
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// Process a single asset: read bytes, parse, normalize, chunk,
|
||||
/// persist, embed. Per-asset failures bubble up to the caller for
|
||||
/// labelling as `IngestItemKind::Error` — they do NOT abort the
|
||||
/// whole run.
|
||||
fn ingest_one_asset(
|
||||
app: &App,
|
||||
asset: &RawAsset,
|
||||
parser_version: &ParserVersion,
|
||||
chunk_policy: &ChunkPolicy,
|
||||
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
tracing::debug!(
|
||||
target: "kb-app::ingest",
|
||||
path = %asset.workspace_path.0,
|
||||
"processing asset"
|
||||
);
|
||||
// Only handle Markdown for now; other media types are P6+ work.
|
||||
if asset.media_type != kebab_core::MediaType::Markdown {
|
||||
return Ok(kebab_core::IngestItem {
|
||||
kind: kebab_core::IngestItemKind::Skipped,
|
||||
doc_id: None,
|
||||
doc_path: asset.workspace_path.clone(),
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
byte_len: Some(asset.byte_len),
|
||||
block_count: None,
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: Vec::new(),
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
|
||||
let path = match &asset.source_uri {
|
||||
SourceUri::File(p) => p.clone(),
|
||||
SourceUri::Kb(_) => {
|
||||
return Ok(kebab_core::IngestItem {
|
||||
kind: kebab_core::IngestItemKind::Skipped,
|
||||
doc_id: None,
|
||||
doc_path: asset.workspace_path.clone(),
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
byte_len: Some(asset.byte_len),
|
||||
block_count: None,
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// source URIs are not supported by the fs ingester".into(),
|
||||
],
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let bytes = std::fs::read(&path)
|
||||
.with_context(|| format!("read asset bytes from {}", path.display()))?;
|
||||
|
||||
let body_hints = build_body_hints(asset);
|
||||
|
||||
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
|
||||
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
|
||||
let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints)
|
||||
.context("kb-parse-md::parse_frontmatter")?;
|
||||
|
||||
let body_offset_lines = match fm_span {
|
||||
Some(span) => count_lines_in(&bytes[..span.end]),
|
||||
None => 0,
|
||||
};
|
||||
|
||||
let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
|
||||
.context("kb-parse-md::parse_blocks")?;
|
||||
|
||||
let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len());
|
||||
all_warnings.extend(fm_warns);
|
||||
all_warnings.extend(blk_warns);
|
||||
|
||||
// Snapshot warning notes for the IngestItem before the vec is
|
||||
// consumed by `build_canonical_document`.
|
||||
let warning_notes: Vec<String> = all_warnings
|
||||
.iter()
|
||||
.map(|w| format!("{:?}: {}", w.kind, w.note))
|
||||
.collect();
|
||||
|
||||
let canonical = build_canonical_document(
|
||||
asset,
|
||||
metadata,
|
||||
parsed_blocks,
|
||||
parser_version,
|
||||
all_warnings,
|
||||
)
|
||||
.context("kb-normalize::build_canonical_document")?;
|
||||
|
||||
let chunks = MdHeadingV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::MdHeadingV1Chunker::chunk")?;
|
||||
|
||||
// Persist. Each `put_*` call wraps its own short transaction
|
||||
// (per-document tx semantics per design §5.8); composing them is
|
||||
// the kb-app job. A failure mid-way leaves the DB in a state the
|
||||
// next ingest run can re-converge (UPSERT + DELETE-then-INSERT).
|
||||
app.sqlite
|
||||
.put_asset_with_bytes(asset, &bytes)
|
||||
.context("DocumentStore::put_asset_with_bytes")?;
|
||||
app.sqlite
|
||||
.put_document(&canonical)
|
||||
.context("DocumentStore::put_document")?;
|
||||
app.sqlite
|
||||
.put_blocks(&canonical.doc_id, &canonical.blocks)
|
||||
.context("DocumentStore::put_blocks")?;
|
||||
app.sqlite
|
||||
.put_chunks(&canonical.doc_id, &chunks)
|
||||
.context("DocumentStore::put_chunks")?;
|
||||
|
||||
// Embed + vector upsert (only when both sides are configured).
|
||||
if let (Some(emb), Some(vec_store)) = (embedder, vector_store) {
|
||||
if !chunks.is_empty() {
|
||||
let inputs: Vec<EmbeddingInput<'_>> = chunks
|
||||
.iter()
|
||||
.map(|c| EmbeddingInput {
|
||||
text: c.text.as_str(),
|
||||
kind: EmbeddingKind::Document,
|
||||
})
|
||||
.collect();
|
||||
let vectors = emb
|
||||
.embed(&inputs)
|
||||
.context("Embedder::embed (document chunks)")?;
|
||||
let model_id = emb.model_id();
|
||||
let model_version = emb.model_version();
|
||||
let dimensions = emb.dimensions();
|
||||
let records: Vec<VectorRecord> = chunks
|
||||
.iter()
|
||||
.zip(vectors)
|
||||
.map(|(c, v)| VectorRecord {
|
||||
embedding_id: kebab_core::id_for_embedding(
|
||||
&c.chunk_id,
|
||||
&model_id,
|
||||
&model_version,
|
||||
dimensions,
|
||||
),
|
||||
chunk_id: c.chunk_id.clone(),
|
||||
vector: v,
|
||||
doc_id: canonical.doc_id.clone(),
|
||||
text: c.text.clone(),
|
||||
heading_path: c.heading_path.clone(),
|
||||
model_id: model_id.clone(),
|
||||
model_version: model_version.clone(),
|
||||
dimensions,
|
||||
})
|
||||
.collect();
|
||||
vec_store
|
||||
.upsert(&records)
|
||||
.context("VectorStore::upsert")?;
|
||||
}
|
||||
}
|
||||
|
||||
let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
|
||||
kebab_core::IngestItemKind::Updated
|
||||
} else {
|
||||
kebab_core::IngestItemKind::New
|
||||
};
|
||||
|
||||
Ok(kebab_core::IngestItem {
|
||||
kind,
|
||||
doc_id: Some(canonical.doc_id.clone()),
|
||||
doc_path: asset.workspace_path.clone(),
|
||||
asset_id: Some(asset.asset_id.clone()),
|
||||
byte_len: Some(asset.byte_len),
|
||||
block_count: u32::try_from(canonical.blocks.len()).ok(),
|
||||
chunk_count: u32::try_from(chunks.len()).ok(),
|
||||
parser_version: Some(parser_version.clone()),
|
||||
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
|
||||
warnings: warning_notes,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenience: end byte of the frontmatter region (or 0 when absent).
|
||||
fn fm_span_end(span: Option<kebab_parse_md::FrontmatterSpan>) -> usize {
|
||||
span.map(|s| s.end).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Number of `\n` bytes in `bytes`, saturating at `u32::MAX`.
///
/// Used to turn the frontmatter's byte span into the line offset that
/// `parse_blocks` takes.
fn count_lines_in(bytes: &[u8]) -> u32 {
    let mut newlines: usize = 0;
    for &byte in bytes {
        if byte == b'\n' {
            newlines += 1;
        }
    }
    match u32::try_from(newlines) {
        Ok(count) => count,
        Err(_) => u32::MAX,
    }
}
|
||||
|
||||
/// Build `BodyHints` from the asset alone. We use the asset's
|
||||
/// `discovered_at` for both `fs_ctime` and `fs_mtime` because going
|
||||
/// through the FS metadata API for every file would be a noticeable
|
||||
/// overhead for large workspaces and the source-of-truth timestamps
|
||||
/// are written into the document's frontmatter when the user wants
|
||||
/// authoritative values.
|
||||
fn build_body_hints(asset: &RawAsset) -> BodyHints {
|
||||
BodyHints {
|
||||
first_h1: None,
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a `ChunkPolicy` from the active config.
|
||||
fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: config.chunking.target_tokens,
|
||||
overlap_tokens: config.chunking.overlap_tokens,
|
||||
respect_markdown_headings: config.chunking.respect_markdown_headings,
|
||||
chunker_version: ChunkerVersion(config.chunking.chunker_version.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── list_docs / inspect_doc / inspect_chunk ───────────────────────────────
|
||||
|
||||
pub fn list_docs(filter: DocFilter) -> anyhow::Result<Vec<DocSummary>> {
|
||||
let config = load_config()?;
|
||||
list_docs_with_config(config, filter)
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`list_docs`]), not this.
|
||||
#[doc(hidden)]
|
||||
pub fn list_docs_with_config(
|
||||
config: kebab_config::Config,
|
||||
filter: DocFilter,
|
||||
) -> anyhow::Result<Vec<DocSummary>> {
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite.list_documents(&filter)
|
||||
}
|
||||
|
||||
pub fn inspect_doc(id: &DocumentId) -> anyhow::Result<CanonicalDocument> {
|
||||
let config = load_config()?;
|
||||
inspect_doc_with_config(config, id)
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`inspect_doc`]), not this.
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_doc_with_config(
|
||||
config: kebab_config::Config,
|
||||
id: &DocumentId,
|
||||
) -> anyhow::Result<CanonicalDocument> {
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite
|
||||
.get_document(id)?
|
||||
.ok_or_else(|| anyhow!("document not found: {} (try `kb list docs`)", id.0))
|
||||
}
|
||||
|
||||
pub fn inspect_chunk(id: &ChunkId) -> anyhow::Result<Chunk> {
|
||||
let config = load_config()?;
|
||||
inspect_chunk_with_config(config, id)
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`inspect_chunk`]), not this.
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_chunk_with_config(
|
||||
config: kebab_config::Config,
|
||||
id: &ChunkId,
|
||||
) -> anyhow::Result<Chunk> {
|
||||
let app = App::open_with_config(config)?;
|
||||
app.sqlite
|
||||
.get_chunk(id)?
|
||||
.ok_or_else(|| anyhow!("chunk not found: {} (try `kb inspect doc <id>`)", id.0))
|
||||
}
|
||||
|
||||
// ── search ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn search(query: SearchQuery) -> anyhow::Result<Vec<SearchHit>> {
|
||||
let config = load_config()?;
|
||||
search_with_config(config, query)
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`search`]), not this. Builds a one-shot `App` and delegates to
|
||||
/// [`App::search`]; long-lived callers should hold an `App` instance
|
||||
/// directly to amortize the embedder / vector-store cold start.
|
||||
#[doc(hidden)]
|
||||
pub fn search_with_config(
|
||||
config: kebab_config::Config,
|
||||
query: SearchQuery,
|
||||
) -> anyhow::Result<Vec<SearchHit>> {
|
||||
App::open_with_config(config)?.search(query)
|
||||
}
|
||||
|
||||
// ── ask ──────────────────────────────────────────────────────────────────
|
||||
//
|
||||
// P4-3 wires `ask` end-to-end. The retriever is built per `opts.mode`;
|
||||
// vector / hybrid require an enabled embedding provider (else we surface
|
||||
// the same "switch to --mode lexical" error as `search`). The LLM is
|
||||
// always Ollama for now — when we grow a second provider (llama.cpp,
|
||||
// candle, etc.) this is the place to switch on `config.models.llm.provider`.
|
||||
|
||||
pub fn ask(query: &str, opts: AskOpts) -> anyhow::Result<Answer> {
|
||||
let config = load_config()?;
|
||||
ask_with_config(config, query, opts)
|
||||
}
|
||||
|
||||
/// Test-only seam — kb-cli must call the public free function
|
||||
/// ([`ask`]), not this. Builds a one-shot `App` and delegates to
|
||||
/// [`App::ask`].
|
||||
#[doc(hidden)]
|
||||
pub fn ask_with_config(
|
||||
config: kebab_config::Config,
|
||||
query: &str,
|
||||
opts: AskOpts,
|
||||
) -> anyhow::Result<Answer> {
|
||||
App::open_with_config(config)?.ask(query, opts)
|
||||
}
|
||||
|
||||
/// Run the doctor checks against the explicit config path the user
|
||||
/// requested via `--config` (or the XDG default if `None`). The
|
||||
/// `config_loaded` check reports the actual path probed and the
|
||||
/// `data_dir_writable` check probes the resolved `storage.data_dir`
|
||||
/// from that config (so `--config` users see their custom paths
|
||||
/// reflected in the report rather than the XDG defaults).
|
||||
pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow::Result<DoctorReport> {
|
||||
tracing::debug!("doctor() invoked");
|
||||
let mut checks = Vec::new();
|
||||
|
||||
// Resolve the config path the same way `Config::load` does: explicit
|
||||
// override first, else XDG default. Report whichever was probed.
|
||||
let cfg_path: PathBuf = match config_path {
|
||||
Some(p) => p.to_path_buf(),
|
||||
None => kebab_config::Config::xdg_config_path(),
|
||||
};
|
||||
let (config_ok, config_detail, loaded_cfg) = if cfg_path.exists() {
|
||||
match kebab_config::Config::from_file(&cfg_path) {
|
||||
Ok(c) => (true, cfg_path.display().to_string(), Some(c)),
|
||||
Err(e) => (false, format!("{} ({e})", cfg_path.display()), None),
|
||||
}
|
||||
} else if config_path.is_some() {
|
||||
// Explicit `--config <path>` that doesn't exist is a hard error
|
||||
// — defaults would silently mask the user's intent.
|
||||
(
|
||||
false,
|
||||
format!("{} (not found)", cfg_path.display()),
|
||||
None,
|
||||
)
|
||||
} else {
|
||||
// No `--config` and no XDG file: defaults are always loadable.
|
||||
(true, format!("{} (defaults)", cfg_path.display()), None)
|
||||
};
|
||||
checks.push(DoctorCheck {
|
||||
name: "config_loaded".to_string(),
|
||||
ok: config_ok,
|
||||
detail: config_detail,
|
||||
hint: if config_ok {
|
||||
None
|
||||
} else if config_path.is_some() {
|
||||
Some("--config path does not exist or is malformed".to_string())
|
||||
} else {
|
||||
Some("run `kb init` to seed config".to_string())
|
||||
},
|
||||
});
|
||||
|
||||
// data_dir_writable — probe the resolved storage.data_dir from the
|
||||
// loaded config when present, else the XDG default. Apply env
|
||||
// overrides so KB_STORAGE_DATA_DIR is respected too.
|
||||
let data_dir = match loaded_cfg.as_ref() {
|
||||
Some(c) => {
|
||||
// Re-apply env overrides on top so the same precedence as
|
||||
// Config::load is preserved here.
|
||||
let env: std::collections::HashMap<String, String> = std::env::vars().collect();
|
||||
let merged = c.clone().apply_env(&env);
|
||||
expand_tilde(&merged.storage.data_dir)
|
||||
}
|
||||
None => kebab_config::Config::xdg_data_dir(),
|
||||
};
|
||||
let writable = (|| -> anyhow::Result<()> {
|
||||
std::fs::create_dir_all(&data_dir)?;
|
||||
let probe = data_dir.join(".kb-doctor-probe");
|
||||
std::fs::write(&probe, b"ok")?;
|
||||
std::fs::remove_file(&probe).ok();
|
||||
Ok(())
|
||||
})();
|
||||
let (data_ok, data_detail, data_hint) = match writable {
|
||||
Ok(()) => (true, data_dir.display().to_string(), None),
|
||||
Err(e) => (
|
||||
false,
|
||||
format!("{} ({e})", data_dir.display()),
|
||||
Some("ensure the configured data_dir is writable".to_string()),
|
||||
),
|
||||
};
|
||||
checks.push(DoctorCheck {
|
||||
name: "data_dir_writable".to_string(),
|
||||
ok: data_ok,
|
||||
detail: data_detail,
|
||||
hint: data_hint,
|
||||
});
|
||||
|
||||
let ok = checks.iter().all(|c| c.ok);
|
||||
Ok(DoctorReport {
|
||||
schema_version: "doctor.v1".to_string(),
|
||||
ok,
|
||||
checks,
|
||||
})
|
||||
}
|
||||
|
||||
/// Run the doctor checks against the XDG-default config. Convenience
|
||||
/// wrapper that mirrors the historical `kb-app::doctor()` signature
|
||||
/// for callers that don't honor `--config` (e.g., legacy code paths
|
||||
/// or smoke harnesses).
|
||||
pub fn doctor() -> anyhow::Result<DoctorReport> {
|
||||
doctor_with_config_path(None)
|
||||
}
|
||||
43
crates/kebab-app/src/logging.rs
Normal file
43
crates/kebab-app/src/logging.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! Tracing initialization helper for `kb-cli`.
|
||||
//!
|
||||
//! Daily-rolling file appender at `~/.local/state/kb/logs/` per task spec.
|
||||
//! Returns a `WorkerGuard` that the caller must keep alive until program
|
||||
//! exit (so buffered log lines flush).
|
||||
|
||||
use anyhow::Result;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{EnvFilter, fmt, prelude::*};
|
||||
|
||||
/// Verbosity requested by the caller of [`init`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogLevel {
    /// Use the filter from the default environment variable, falling
    /// back to `warn` when it is unset or invalid.
    Default,
    /// Log at `info`.
    Verbose,
    /// Log at `debug`.
    Debug,
}
|
||||
|
||||
/// Initialize tracing. Returns a guard to keep alive until exit. Idempotent
|
||||
/// — a second call is a no-op (the second `try_init` is dropped silently
|
||||
/// but the guard is still returned so the caller can keep it alive).
|
||||
pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let log_dir = kebab_config::Config::xdg_state_dir().join("logs");
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
|
||||
let file_appender = tracing_appender::rolling::daily(&log_dir, "kb.log");
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
let registry = tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(fmt::layer().with_writer(nb).with_ansi(false));
|
||||
|
||||
// `try_init` rather than `init` so a second call (e.g. in tests) is a
|
||||
// no-op.
|
||||
let _ = registry.try_init();
|
||||
|
||||
Ok(guard)
|
||||
}
|
||||
43
crates/kebab-app/tests/ask_smoke.rs
Normal file
43
crates/kebab-app/tests/ask_smoke.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! `kb-app::ask` smoke tests.
|
||||
//!
|
||||
//! The pipeline's behavior is exhaustively covered by `kb-rag` tests
|
||||
//! (which inject `MockLanguageModel` + `MockRetriever`). The kb-app
|
||||
//! facade is a thin component wirer: it picks the retriever per
|
||||
//! `opts.mode` and constructs an `OllamaLanguageModel`. Exercising
|
||||
//! that wiring requires a real Ollama on `127.0.0.1:11434`, so this
|
||||
//! test is `#[ignore]` by default — run with `cargo test -p kb-app
|
||||
//! --test ask_smoke -- --ignored` against a live Ollama.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// End-to-end `ask` in lexical mode through the facade.
///
/// Needs a live Ollama at `config.models.llm.endpoint` (default
/// `127.0.0.1:11434`) serving the configured model, hence `#[ignore]`.
/// Only the wiring is checked here: the call must succeed and the
/// retrieval summary must be populated.
#[test]
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
fn ask_lexical_smoke() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    // The fixture workspace contains "ownership" content; the model's
    // citation behavior depends on its training, so we don't assert on
    // grounded — only that the call returns a structurally-valid Answer.
    let answer = kebab_app::ask_with_config(
        env.config.clone(),
        "ownership",
        kebab_app::AskOpts {
            k: 5,
            explain: false,
            mode: kebab_core::SearchMode::Lexical,
            temperature: Some(0.0),
            seed: Some(0),
            stream_sink: None,
        },
    )
    .expect("ask returns Ok with a real Ollama backend");

    // retrieval summary always populated, regardless of grounded path.
    let retrieval = &answer.retrieval;
    assert_eq!(retrieval.mode, kebab_core::SearchMode::Lexical);
    assert!(retrieval.k >= 5);
    assert!(retrieval.trace_id.0.starts_with("ret_"));
}
|
||||
104
crates/kebab-app/tests/common/mod.rs
Normal file
104
crates/kebab-app/tests/common/mod.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! Shared test scaffolding for `kb-app` integration tests.
|
||||
//!
|
||||
//! Each test gets a fresh `TempDir` and a `Config` whose storage paths
|
||||
//! all point inside it, so the user's real `data_dir` / `model_dir`
|
||||
//! is never touched. The fixture workspace at
|
||||
//! `tests/fixtures/workspace/` is *copied* into the temp dir for each
|
||||
//! test so a write-side ingest can't trip on a read-only fixture
|
||||
//! tree. The default lane (no `--ignored`) opts out of embeddings via
|
||||
//! `provider = "none"` so AVX is not required.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use kebab_config::Config;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Test environment: owns a `TempDir` and exposes a `Config` whose
|
||||
/// storage paths live inside it.
|
||||
pub struct TestEnv {
|
||||
pub temp: TempDir,
|
||||
pub workspace_root: PathBuf,
|
||||
pub config: Config,
|
||||
}
|
||||
|
||||
impl TestEnv {
|
||||
/// Build an env with embeddings disabled (lexical-only). Default
|
||||
/// lane — no AVX, no fastembed download.
|
||||
pub fn lexical_only() -> Self {
|
||||
let env = Self::new_inner();
|
||||
let mut e = env;
|
||||
e.config.models.embedding.provider = "none".to_string();
|
||||
e.config.models.embedding.dimensions = 0;
|
||||
e
|
||||
}
|
||||
|
||||
/// Build an env with the default fastembed embedding provider.
|
||||
/// Used by AVX-gated `#[ignore]` tests.
|
||||
pub fn with_embeddings() -> Self {
|
||||
Self::new_inner()
|
||||
}
|
||||
|
||||
fn new_inner() -> Self {
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let workspace_root = temp.path().join("workspace");
|
||||
copy_fixture_workspace(&workspace_root);
|
||||
|
||||
let data_dir = temp.path().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = temp.path().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut config = Config::defaults();
|
||||
config.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
// Drop the ".obsidian" / "node_modules" excludes — they bring
|
||||
// in nothing useful for fixtures and just hide debugging.
|
||||
config.workspace.exclude.clear();
|
||||
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
// Pin model_dir to the TempDir so a future fastembed-touching
|
||||
// test can't accidentally write to the user's `~/.local/share`.
|
||||
config.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
// Drop in a small chunk policy so the fixture's small files
|
||||
// emit at least a couple of chunks even with overlap_tokens
|
||||
// honored.
|
||||
config.chunking.target_tokens = 80;
|
||||
config.chunking.overlap_tokens = 20;
|
||||
|
||||
Self {
|
||||
temp,
|
||||
workspace_root,
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn scope(&self) -> kebab_core::SourceScope {
|
||||
kebab_core::SourceScope {
|
||||
root: self.workspace_root.clone(),
|
||||
include: self.config.workspace.include.clone(),
|
||||
exclude: self.config.workspace.exclude.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn copy_fixture_workspace(dest: &Path) {
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
.join("workspace");
|
||||
copy_dir_recursive(&src, dest);
|
||||
}
|
||||
|
||||
/// Recursively copy the directory tree at `src` into `dest`, creating
/// `dest` (and any missing parents) first.
///
/// Panics on any I/O failure — this is test scaffolding, so a broken
/// fixture should fail loudly.
fn copy_dir_recursive(src: &Path, dest: &Path) {
    std::fs::create_dir_all(dest).unwrap();
    let entries = std::fs::read_dir(src).expect("read fixture dir");
    for dir_entry in entries.map(Result::unwrap) {
        let from = dir_entry.path();
        let to = dest.join(dir_entry.file_name());
        if !from.is_dir() {
            std::fs::copy(&from, &to).expect("copy fixture file");
            continue;
        }
        copy_dir_recursive(&from, &to);
    }
}
|
||||
24
crates/kebab-app/tests/fixtures/workspace/intro.md
vendored
Normal file
24
crates/kebab-app/tests/fixtures/workspace/intro.md
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
---
|
||||
title: Introduction to Rust
|
||||
tags: [rust, language]
|
||||
lang: en
|
||||
created_at: 2024-03-01T00:00:00Z
|
||||
updated_at: 2024-03-02T00:00:00Z
|
||||
source_type: note
|
||||
trust_level: primary
|
||||
---
|
||||
|
||||
# Introduction to Rust
|
||||
|
||||
Rust is a systems programming language focused on safety, speed, and concurrency.
|
||||
The compiler enforces memory safety without a garbage collector.
|
||||
|
||||
## Ownership
|
||||
|
||||
Each value has a single owner. When the owner goes out of scope the value is
|
||||
dropped. References are borrows that the compiler tracks at compile time.
|
||||
|
||||
## Concurrency
|
||||
|
||||
Threads in Rust use the ownership system to prevent data races. The Send and
|
||||
Sync traits codify which types can move between threads.
|
||||
23
crates/kebab-app/tests/fixtures/workspace/notes/cargo.md
vendored
Normal file
23
crates/kebab-app/tests/fixtures/workspace/notes/cargo.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
title: Cargo Notes
|
||||
tags: [rust, cargo, tools]
|
||||
lang: en
|
||||
created_at: 2024-04-01T00:00:00Z
|
||||
updated_at: 2024-04-02T00:00:00Z
|
||||
source_type: note
|
||||
trust_level: primary
|
||||
---
|
||||
|
||||
# Cargo Notes
|
||||
|
||||
Cargo is the Rust package manager and build tool.
|
||||
|
||||
## Workspaces
|
||||
|
||||
A workspace is a set of packages that share the same `Cargo.lock` and output
|
||||
directory. Member crates are listed under `[workspace.members]`.
|
||||
|
||||
## Features
|
||||
|
||||
Cargo features let crates expose optional functionality behind a feature flag.
|
||||
Default features are enabled unless `default-features = false` is set.
|
||||
23
crates/kebab-app/tests/fixtures/workspace/notes/python.md
vendored
Normal file
23
crates/kebab-app/tests/fixtures/workspace/notes/python.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
title: Python Snippets
|
||||
tags: [python, language]
|
||||
lang: en
|
||||
created_at: 2024-05-01T00:00:00Z
|
||||
updated_at: 2024-05-02T00:00:00Z
|
||||
source_type: note
|
||||
trust_level: primary
|
||||
---
|
||||
|
||||
# Python Snippets
|
||||
|
||||
Quick reference for everyday Python tasks.
|
||||
|
||||
## List comprehensions
|
||||
|
||||
Filter and transform in one pass: `[x*2 for x in xs if x > 0]`. Cleaner than
|
||||
the map+filter pair when the predicate is simple.
|
||||
|
||||
## Decorators
|
||||
|
||||
Wrap a function in another function. `functools.wraps` preserves the
|
||||
docstring and `__name__` of the inner function on the outer wrapper.
|
||||
220
crates/kebab-app/tests/ingest_lexical.rs
Normal file
220
crates/kebab-app/tests/ingest_lexical.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
//! Integration tests for `kb-app::ingest` + `list_docs` + `inspect_*`
|
||||
//! along the lexical-only path (no embeddings → no AVX requirement).
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// First ingest of the fixture workspace labels every file `New`, and
/// the ingested documents can then be listed and inspected.
#[test]
fn ingest_then_list_inspects_round_trip() {
    let env = TestEnv::lexical_only();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    // The fixture has 3 markdown files; first ingest should label them
    // all as New.
    assert_eq!(report.scanned, 3, "scanned: {report:?}");
    assert_eq!(report.new, 3, "new: {report:?}");
    assert_eq!(report.updated, 0, "updated: {report:?}");
    assert_eq!(report.errors, 0, "errors: {report:?}");

    let items = report.items.as_ref().expect("items present");
    assert_eq!(items.len(), 3);
    items.iter().for_each(|it| {
        assert!(it.error.is_none(), "per-item error: {it:?}");
        assert!(it.doc_id.is_some());
        // Each fixture file emits ≥1 chunk.
        assert!(it.chunk_count.unwrap_or(0) >= 1, "chunks: {it:?}");
    });

    // list_docs returns the 3 docs.
    let filter = kebab_core::DocFilter::default();
    let docs = kebab_app::list_docs_with_config(env.config.clone(), filter).unwrap();
    assert_eq!(docs.len(), 3, "docs: {docs:?}");

    // inspect_doc round-trips one of them.
    let wanted = docs[0].doc_id.clone();
    let canonical =
        kebab_app::inspect_doc_with_config(env.config.clone(), &wanted).unwrap();
    assert_eq!(canonical.doc_id, wanted);
    assert!(!canonical.blocks.is_empty(), "blocks empty");
}
|
||||
|
||||
/// Re-ingesting an unchanged workspace reports every file as `Updated`
/// and does not create duplicate documents.
#[test]
fn ingest_idempotent_on_second_run() {
    let env = TestEnv::lexical_only();
    let run = || kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    let r1 = run();
    assert_eq!(r1.new, 3);

    // Same files re-ingested — labelled Updated, not duplicated.
    let r2 = run();
    assert_eq!(r2.scanned, 3, "second scan: {r2:?}");
    assert_eq!(r2.new, 0, "second run new should be 0: {r2:?}");
    assert_eq!(r2.updated, 3, "second run updated: {r2:?}");

    // list_docs still has 3 docs (no duplicates).
    let filter = kebab_core::DocFilter::default();
    let docs = kebab_app::list_docs_with_config(env.config.clone(), filter).unwrap();
    assert_eq!(docs.len(), 3);
}
|
||||
|
||||
/// With `summary_only = true` the report keeps its counts but carries
/// no per-item list.
#[test]
fn ingest_summary_only_drops_items() {
    let env = TestEnv::lexical_only();
    let summary_only = true;
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), summary_only).unwrap();

    assert_eq!(report.scanned, 3);
    assert!(report.items.is_none(), "summary-only should null items");
}
|
||||
|
||||
/// Every ingest run writes an `ingest_runs` row whose count columns are
/// populated; with `summary_only = true` its `items_json` is NULL.
#[test]
fn ingest_records_ingest_runs_row_with_aggregate_counts() {
    // The ingest_runs table is the §5.7 sibling of `jobs`: dedicated
    // count columns (`scanned`, `new_count`, …) populated at the end
    // of every run. `summary_only=true` writes `items_json=NULL`; the
    // counts MUST still be present.
    let env = TestEnv::lexical_only();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.scanned, 3);

    let mut db_path = std::path::PathBuf::from(&env.config.storage.data_dir);
    db_path.push("kb.sqlite");
    let conn = rusqlite::Connection::open(&db_path).expect("open kb.sqlite");

    // Read back the most recent run.
    let (scanned, new_c, updated, skipped, errors, items_json) = conn
        .query_row(
            "SELECT scanned, new_count, updated_count, skipped_count,
                    error_count, items_json
             FROM ingest_runs
             ORDER BY started_at DESC
             LIMIT 1",
            [],
            |row| {
                Ok((
                    row.get::<_, i64>(0)?,
                    row.get::<_, i64>(1)?,
                    row.get::<_, i64>(2)?,
                    row.get::<_, i64>(3)?,
                    row.get::<_, i64>(4)?,
                    row.get::<_, Option<String>>(5)?,
                ))
            },
        )
        .expect("ingest_runs row present");

    assert_eq!(scanned, 3);
    assert_eq!(new_c, 3);
    assert_eq!(updated, 0);
    assert_eq!(skipped, 0);
    assert_eq!(errors, 0);
    assert!(
        items_json.is_none(),
        "summary_only=true must store items_json=NULL: {items_json:?}"
    );
}
|
||||
|
||||
/// A lexical-only ingest must leave no `*.lance` table on disk.
#[test]
fn ingest_provider_none_skips_lance() {
    // `provider="none"` must short-circuit the embedder + vector store
    // build entirely, so ingest MUST NOT create the LanceDB directory.
    // `IngestReport` currently has no `embeddings_indexed` field, so the
    // check is made against the on-disk tree instead: either there is no
    // `<data_dir>/lancedb` directory, or it holds no `*.lance` tables.
    let env = TestEnv::lexical_only();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(report.errors, 0, "lexical-only run must not error");
    assert_eq!(report.new, 3);

    let mut lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir);
    lance_dir.push("lancedb");

    // No directory at all is the ideal outcome; nothing more to check.
    if !lance_dir.exists() {
        return;
    }

    // The directory may exist (e.g., an earlier consumer touched the
    // path), but it MUST NOT contain any `.lance` table.
    let had_lance_table = std::fs::read_dir(&lance_dir)
        .expect("read lance_dir")
        .any(|entry| {
            let path = entry.unwrap().path();
            path.extension().and_then(|ext| ext.to_str()) == Some("lance")
        });
    assert!(
        !had_lance_table,
        "provider=none must not produce any *.lance table under {}",
        lance_dir.display()
    );
}
|
||||
|
||||
/// `DocFilter::tags_any` must narrow the listing to documents carrying
/// the requested tag.
#[test]
fn list_docs_filters_by_tags_any() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    // List documents whose tags include `tag`; every other filter field
    // stays at its default.
    let docs_tagged = |tag: &str| {
        let filter = kebab_core::DocFilter {
            tags_any: vec![tag.to_string()],
            ..Default::default()
        };
        kebab_app::list_docs_with_config(env.config.clone(), filter).unwrap()
    };

    let docs = docs_tagged("python");
    assert_eq!(docs.len(), 1, "expected only the python doc: {docs:?}");
    assert!(docs[0].tags.iter().any(|tag| tag == "python"));

    // intro.md and notes/cargo.md both tag "rust".
    let rust_docs = docs_tagged("rust");
    assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
}
|
||||
|
||||
/// Inspecting an unknown document id must fail with an error that says
/// "not found" and points the user at the listing command.
#[test]
fn inspect_doc_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();

    // An all-zero id; nothing is ingested in this test, so no document
    // can match it.
    let zero_hex =
        "0000000000000000000000000000000000000000000000000000000000000000".to_string();
    let bogus = kebab_core::DocumentId(zero_hex);

    let lookup = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus);
    let err = lookup.unwrap_err();
    let msg = format!("{err:#}");

    let mentions_not_found = msg.contains("not found");
    assert!(mentions_not_found, "error must mention not-found: {msg}");

    let hints_at_listing = msg.contains("kb list docs") || msg.contains("list");
    assert!(hints_at_listing, "error must hint at `kb list docs`: {msg}");
}
|
||||
|
||||
/// Inspecting an unknown chunk id must fail with a "not found" error.
#[test]
fn inspect_chunk_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();

    // An all-zero id; nothing is ingested in this test, so no chunk can
    // match it.
    let zero_hex =
        "0000000000000000000000000000000000000000000000000000000000000000".to_string();
    let bogus = kebab_core::ChunkId(zero_hex);

    let lookup = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus);
    let err = lookup.unwrap_err();
    let msg = format!("{err:#}");

    assert!(msg.contains("not found"), "got: {msg}");
}
|
||||
69
crates/kebab-app/tests/search_lexical.rs
Normal file
69
crates/kebab-app/tests/search_lexical.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
//! Lexical search integration tests. The vector / hybrid lanes are
|
||||
//! AVX-gated and live in `search_vector.rs` (`#[ignore]`).
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Build a lexical-mode [`kebab_core::SearchQuery`] for `text`, asking for
/// ten results with default filters.
fn lexical_query(text: &str) -> kebab_core::SearchQuery {
    let mode = kebab_core::SearchMode::Lexical;
    let filters = kebab_core::SearchFilters::default();
    kebab_core::SearchQuery {
        text: String::from(text),
        mode,
        k: 10,
        filters,
    }
}
|
||||
|
||||
/// After ingest, a lexical query returns at least one hit, and every hit
/// is labelled Lexical with no embedding model attached.
#[test]
fn lexical_search_returns_hits_after_ingest() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    // "Ownership" appears as a heading + paragraph in intro.md and
    // matches FTS5 default tokenizer easily.
    let query = lexical_query("ownership");
    let hits = kebab_app::search_with_config(env.config.clone(), query).unwrap();
    assert!(!hits.is_empty(), "expected ≥1 hit for 'ownership'");

    hits.iter().for_each(|h| {
        // Lexical retriever sets embedding_model=None per spec.
        assert!(
            h.embedding_model.is_none(),
            "lexical-mode hit must have None embedding_model: {h:?}"
        );
        assert_eq!(
            h.retrieval.method,
            kebab_core::SearchMode::Lexical,
            "method label should be Lexical"
        );
    });
}
|
||||
|
||||
/// A whitespace-only lexical query must succeed and return no hits.
#[test]
fn lexical_search_empty_query_returns_empty() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let blank = lexical_query(" ");
    let hits = kebab_app::search_with_config(env.config.clone(), blank).unwrap();

    assert!(hits.is_empty(), "blank query must short-circuit empty");
}
|
||||
|
||||
/// Requesting vector search in a lexical-only environment must return an
/// error whose text says embeddings are disabled.
#[test]
fn vector_mode_with_provider_none_errors_clearly() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let mode = kebab_core::SearchMode::Vector;
    let filters = kebab_core::SearchFilters::default();
    let q = kebab_core::SearchQuery {
        text: "ownership".to_string(),
        mode,
        k: 10,
        filters,
    };

    let outcome = kebab_app::search_with_config(env.config.clone(), q);
    let err = outcome.unwrap_err();
    let msg = format!("{err:#}");

    let mentions_disabled = msg.contains("embeddings disabled") || msg.contains("disabled");
    assert!(
        mentions_disabled,
        "error must mention embeddings disabled: {msg}"
    );
}
|
||||
89
crates/kebab-app/tests/search_vector.rs
Normal file
89
crates/kebab-app/tests/search_vector.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
//! Vector / Hybrid lane — AVX-gated. Marked `#[ignore]` because Lance
|
||||
//! crashes with `SIGILL` on hosts without AVX, and CI lanes that are
|
||||
//! AVX-less should not run these. Local hosts run them via
|
||||
//! `cargo test -p kebab-app -- --ignored`.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Panic if the host CPU lacks AVX. Mirrors the helper in
/// `kebab-store-vector/tests/common/mod.rs` and `kebab-search` so a
/// `--ignored` invocation on a non-AVX host fails loudly with a
/// clear message instead of crashing inside Lance's SIMD kernel.
///
/// The check is compiled only for `x86_64`; on every other target this
/// function does nothing.
///
/// # Panics
///
/// Panics on `x86_64` when runtime feature detection reports no AVX.
fn require_avx_or_panic() {
    #[cfg(target_arch = "x86_64")]
    {
        assert!(
            std::is_x86_feature_detected!("avx"),
            "kebab-app vector integration test requires AVX-capable hardware; \
             host CPU lacks AVX. Run on an AVX-capable machine."
        );
    }
}
|
||||
|
||||
// First run downloads ~470MB; expect ~30-60s warm, several minutes cold.
/// With embeddings enabled, a hybrid query after ingest returns hits and
/// every hit reports `method=Hybrid`.
#[test]
#[ignore = "AVX-required (Lance SIMD kernels)"]
fn ingest_then_hybrid_search_returns_hits() {
    require_avx_or_panic();

    let env = TestEnv::with_embeddings();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

    let mode = kebab_core::SearchMode::Hybrid;
    let filters = kebab_core::SearchFilters::default();
    let q = kebab_core::SearchQuery {
        text: "ownership".to_string(),
        mode,
        k: 10,
        filters,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), q).unwrap();
    assert!(!hits.is_empty(), "expected hybrid hits for 'ownership'");

    // Collect the labels first so a failure can print all of them.
    let methods: Vec<_> = hits.iter().map(|h| h.retrieval.method).collect();
    let all_hybrid = !methods
        .iter()
        .any(|m| *m != kebab_core::SearchMode::Hybrid);
    assert!(
        all_hybrid,
        "every hit must report method=Hybrid: {methods:?}"
    );
}
|
||||
|
||||
// First run downloads ~470MB; expect ~30-60s warm, several minutes cold.
/// With embeddings enabled, a vector query after ingest returns hits that
/// each carry the configured embedding model id and `method=Vector`.
#[test]
#[ignore = "AVX-required (Lance SIMD kernels)"]
fn ingest_then_vector_search_carries_embedding_model() {
    require_avx_or_panic();

    let env = TestEnv::with_embeddings();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

    let mode = kebab_core::SearchMode::Vector;
    let filters = kebab_core::SearchFilters::default();
    let q = kebab_core::SearchQuery {
        text: "ownership".to_string(),
        mode,
        k: 10,
        filters,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), q).unwrap();
    assert!(!hits.is_empty(), "expected vector hits for 'ownership'");

    // Vector mode dispatches through `VectorRetriever` and MUST stamp
    // each hit with the configured embedding_model id.
    let configured_model = env.config.models.embedding.model.clone();
    let expected = kebab_core::EmbeddingModelId(configured_model);
    hits.iter().for_each(|h| {
        assert_eq!(
            h.embedding_model,
            Some(expected.clone()),
            "vector-mode hit must carry embedding_model={expected:?}: {h:?}"
        );
        assert_eq!(
            h.retrieval.method,
            kebab_core::SearchMode::Vector,
            "vector-mode hit must report method=Vector"
        );
    });
}
|
||||
Reference in New Issue
Block a user