refactor(rename): kb crates → kebab — Cargo packages, folders, Rust modules

프로젝트 이름 `kb` → `kebab` rename 의 첫 단계.

- workspace `Cargo.toml`: members `crates/kb-*` → `crates/kebab-*`,
  repository URL `altair823/kb` → `altair823/kebab`.
- 18 crate 폴더 rename via `git mv` (history 보존).
- 각 crate `Cargo.toml`: `name = "kb-*"` → `"kebab-*"`, path deps
  `../kb-*` → `../kebab-*`.
- 모든 `.rs`: `kb_<id>` snake-case 모듈 path 18 개 (`kb_core`,
  `kb_config`, `kb_app`, `kb_cli`, `kb_eval`, `kb_search`, `kb_chunk`,
  `kb_normalize`, `kb_source_fs`, `kb_parse_md`, `kb_parse_types`,
  `kb_store_sqlite`, `kb_store_vector`, `kb_embed`, `kb_embed_local`,
  `kb_llm`, `kb_llm_local`, `kb_rag`) → `kebab_<id>` 일괄 sed (단어
  경계 \\b 사용해 영어 문장 안의 "kb" 약어 미오염).

CLI binary 이름 (`[[bin]] name = "kb"`), 환경변수 `KB_*`, XDG paths,
tracing target, 그리고 docs sweep 은 다음 commit 에서.

## 검증

- `cargo check --workspace` clean — 모든 crate 빌드 통과 후 commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-02 03:28:08 +00:00
parent 2aecbf3d9f
commit 911fb49550
143 changed files with 727 additions and 727 deletions

View File

@@ -0,0 +1,39 @@
[package]
name = "kebab-app"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Facade — orchestrates components for kb-cli/tui/desktop"
[dependencies]
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
kebab-source-fs = { path = "../kebab-source-fs" }
kebab-parse-md = { path = "../kebab-parse-md" }
kebab-parse-types = { path = "../kebab-parse-types" }
kebab-normalize = { path = "../kebab-normalize" }
kebab-chunk = { path = "../kebab-chunk" }
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
kebab-store-vector = { path = "../kebab-store-vector" }
kebab-search = { path = "../kebab-search" }
kebab-embed = { path = "../kebab-embed" }
kebab-embed-local = { path = "../kebab-embed-local" }
kebab-llm = { path = "../kebab-llm" }
kebab-llm-local = { path = "../kebab-llm-local" }
kebab-rag = { path = "../kebab-rag" }
anyhow = { workspace = true }
blake3 = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
tracing-appender = "0.2"
toml = "0.8"
dirs = "5"
[dev-dependencies]
rusqlite = { workspace = true }
tempfile = { workspace = true }

304
crates/kebab-app/src/app.rs Normal file
View File

@@ -0,0 +1,304 @@
//! `App` — facade lifecycle struct (§7).
//!
//! A single `App` represents one CLI invocation's (or one TUI
//! session's / one eval-runner suite's) worth of state: a resolved
//! `Config`, an open `SqliteStore`, and (when embeddings are enabled)
//! an `Embedder` + `LanceVectorStore`. Each public free function on
//! `kb-app` builds an `App` once, runs the requested op, and drops
//! everything on return; long-lived callers (kb-eval, the future P9
//! TUI session) hold onto an `App` across many calls so the per-query
//! cost is just a method dispatch.
//!
//! ## Embedder + Vector store lifetime
//!
//! `App::open_with_config` builds the SQLite store unconditionally.
//! The embedder and vector store are *lazy + memoized* — built on
//! first call to [`App::embedder`] / [`App::vector`] and cached in
//! `OnceLock`s — so a long-lived `App` (kb-eval driving 50 queries,
//! the P9 TUI session) pays the ~470 MB ONNX init plus Lance reopen
//! cost exactly once.
//!
//! - `kb list` / `kb inspect` never need them.
//! - `kb search --mode lexical` never needs them.
//! - `kb ingest` and `kb search --mode {vector,hybrid}` always do.
//!
//! Building eagerly would force every CLI invocation to load ~470 MB of
//! ONNX weights, which is the dominant cold-start cost. The lazy
//! pattern keeps the lexical-only paths instant; the memoization makes
//! the TUI's repeated searches and the eval runner's per-query loop
//! cheap after the first invocation.
//!
//! Embeddings can also be **disabled** workspace-wide via
//! `config.models.embedding.provider = "none"` (or `dimensions = 0`);
//! in that mode [`App::embedder`] returns `None` and callers must fall
//! back to lexical-only search.
use std::sync::{Arc, OnceLock};
use anyhow::{Context, Result, anyhow};
use kebab_core::{
Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode,
SearchQuery, VectorStore,
};
use kebab_embed_local::FastembedEmbedder;
use kebab_llm_local::OllamaLanguageModel;
use kebab_rag::{AskOpts, RagPipeline};
use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
use kebab_store_sqlite::SqliteStore;
use kebab_store_vector::LanceVectorStore;
/// Shared facade state for one CLI invocation, TUI session, or eval run.
///
/// The type is public so that long-lived callers (kb-eval, the future
/// P9 TUI session) can build one instance and reuse it for many search /
/// ask calls. The three `OnceLock` slots start out empty and are filled
/// by the matching accessor on first use, after which every later call
/// on the same instance reuses the cached handle.
pub struct App {
    /// Fully-resolved configuration this instance was opened with.
    pub(crate) config: kebab_config::Config,
    /// Handle to the SQLite store opened by `App::open_with_config`.
    pub(crate) sqlite: Arc<SqliteStore>,
    /// Cached embedder; populated by the first `embedder()` call when
    /// embeddings are enabled. Using `OnceLock` lets the slot be filled
    /// through `&self`.
    embedder: OnceLock<Arc<dyn Embedder + Send + Sync>>,
    /// Cached vector store; populated by the first `vector()` call when
    /// embeddings are enabled. Same caching scheme as `embedder`.
    vector: OnceLock<Arc<LanceVectorStore>>,
    /// Cached language model; populated the first time `ask()` needs
    /// it so a multi-query caller (e.g. a 50-query eval suite) does not
    /// rebuild the client for every question.
    llm: OnceLock<Arc<dyn LanguageModel>>,
}
impl App {
/// Open the SQLite store and run migrations. Does NOT load the
/// embedder or vector store — those are lazy via
/// [`Self::embedder`] / [`Self::vector`].
///
/// **Caveat:** must be called from a synchronous context.
/// Downstream `LanceVectorStore::new` (called by [`Self::vector`])
/// internally drives a `tokio::Runtime::block_on`, which panics if
/// invoked from inside another tokio runtime.
pub fn open_with_config(config: kebab_config::Config) -> Result<Self> {
let sqlite = SqliteStore::open(&config).context("kb-app: open SqliteStore")?;
sqlite
.run_migrations()
.context("kb-app: run SqliteStore migrations")?;
Ok(Self {
config,
sqlite: Arc::new(sqlite),
embedder: OnceLock::new(),
vector: OnceLock::new(),
llm: OnceLock::new(),
})
}
/// Run a [`SearchQuery`] through the configured retriever stack and
/// return the top-k hits.
///
/// Reuses any previously-built embedder / vector store on this `App`
/// — long-lived callers (kb-eval, future TUI) get amortized cost
/// across calls.
pub fn search(&self, query: SearchQuery) -> Result<Vec<SearchHit>> {
match query.mode {
SearchMode::Lexical => {
let lex = LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
);
lex.search(&query)
}
SearchMode::Vector => {
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
let retr = VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
);
retr.search(&query)
}
SearchMode::Hybrid => {
let lex = Arc::new(LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
let vec_retr = Arc::new(VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
hybrid.search(&query)
}
}
}
/// Run a RAG `ask` against the configured retriever + LLM. Reuses
/// the memoized embedder / vector / LLM where applicable.
pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
let retriever: Arc<dyn Retriever> = match opts.mode {
SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
)),
SearchMode::Vector => {
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
Arc::new(VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
))
}
SearchMode::Hybrid => {
let lex = Arc::new(LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
let vec_retr = Arc::new(VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
Arc::new(HybridRetriever::new(&self.config, lex, vec_retr))
}
};
let llm = self.llm()?;
let pipeline =
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
pipeline.ask(query, opts)
}
/// Returns `true` when the workspace has embeddings turned off
/// (`provider = "none"` or `dimensions = 0`). Lexical-only mode.
pub(crate) fn embeddings_disabled(&self) -> bool {
let cfg = &self.config.models.embedding;
cfg.provider == "none" || cfg.dimensions == 0
}
/// Build (or reuse) the fastembed embedder. Returns `None` when the
/// workspace is in lexical-only mode (see
/// [`Self::embeddings_disabled`]). The first call pays the ~470 MB
/// ONNX load; subsequent calls are a single `OnceLock` read.
pub(crate) fn embedder(&self) -> Result<Option<Arc<dyn Embedder + Send + Sync>>> {
if self.embeddings_disabled() {
return Ok(None);
}
if let Some(e) = self.embedder.get() {
return Ok(Some(e.clone()));
}
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
FastembedEmbedder::new(&self.config)
.context("kb-app: load FastembedEmbedder")?,
);
// `set` returns Err if another thread won the race; in that case
// the loser still returns the (now-cached) winner via `get()`.
let _ = self.embedder.set(emb.clone());
Ok(Some(self.embedder.get().cloned().unwrap_or(emb)))
}
/// Build (or reuse) the LanceDB-backed vector store. Returns `None`
/// when embeddings are disabled. Memoized via `OnceLock` for the
/// same reasons as [`Self::embedder`].
pub(crate) fn vector(&self) -> Result<Option<Arc<LanceVectorStore>>> {
if self.embeddings_disabled() {
return Ok(None);
}
if let Some(v) = self.vector.get() {
return Ok(Some(v.clone()));
}
let store = Arc::new(
LanceVectorStore::new(&self.config, self.sqlite.clone())
.context("kb-app: open LanceVectorStore")?,
);
let _ = self.vector.set(store.clone());
Ok(Some(self.vector.get().cloned().unwrap_or(store)))
}
/// Build (or reuse) the configured LLM. Currently always Ollama;
/// when a second provider lands this is the place to switch on
/// `config.models.llm.provider`.
fn llm(&self) -> Result<Arc<dyn LanguageModel>> {
if let Some(l) = self.llm.get() {
return Ok(l.clone());
}
let llm: Arc<dyn LanguageModel> = Arc::new(
OllamaLanguageModel::new(&self.config)
.context("kb-app::ask: build OllamaLanguageModel")?,
);
let _ = self.llm.set(llm.clone());
Ok(self.llm.get().cloned().unwrap_or(llm))
}
/// Resolve the embedder + vector store, surfacing the user-friendly
/// "switch to --mode lexical" error when embeddings are disabled.
fn require_embeddings(
&self,
) -> Result<(
Arc<dyn Embedder + Send + Sync>,
Arc<LanceVectorStore>,
)> {
let emb = self.embedder()?.ok_or_else(|| {
anyhow!(
"embeddings disabled (config.models.embedding.provider == \"none\" \
or dimensions == 0); vector / hybrid search require embeddings — \
switch to --mode lexical or enable an embedding provider in config.toml"
)
})?;
let vec_store = self.vector()?.ok_or_else(|| {
anyhow!(
"vector store unavailable while embedder is configured — this should \
not happen; check `kb doctor` and the data_dir permissions"
)
})?;
Ok((emb, vec_store))
}
}
/// Derive the lexical retriever's `IndexVersion` token from the active
/// config: the literal prefix `lex:` followed by the configured chunker
/// version. Per the original notes this token surfaces in
/// `SearchHit.index_version` and in snapshot tests, so tying it to the
/// chunker version pins it to the chunking policy in effect.
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
    let chunker_version = &config.chunking.chunker_version;
    IndexVersion(format!("lex:{chunker_version}"))
}
/// Derive the vector retriever's `IndexVersion` token, formatted as
/// `vec:<model id>@<model version>:<dimensions>`. Because all three
/// embedder properties are part of the token, swapping the model
/// changes the token (the original notes say `HybridRetriever::new`
/// warns on such an index_version mismatch).
fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion {
    let model = embedder.model_id();
    let version = embedder.model_version();
    let dims = embedder.dimensions();
    IndexVersion(format!("vec:{}@{}:{}", model.0, version.0, dims))
}

View File

@@ -0,0 +1,39 @@
//! Signal types used by `kb-cli`'s `exit_code` mapping (§10).
//!
//! These are *not* errors per se: a doctor failure is normal output, just
//! signalled out-of-band so the CLI can exit with the right status.
use std::fmt;
/// Marker error meaning "the doctor run was unhealthy". It carries no
/// data; its `Display` text is the fixed string `doctor unhealthy`.
#[derive(Debug)]
pub struct DoctorUnhealthy;

impl fmt::Display for DoctorUnhealthy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "doctor unhealthy")
    }
}

impl std::error::Error for DoctorUnhealthy {}
/// Marker error whose `Display` text is the fixed string `refusal`.
/// NOTE(review): presumably raised when an answer is refused so the CLI
/// can map it to a dedicated exit code — confirm against the caller.
#[derive(Debug)]
pub struct RefusalSignal;

impl fmt::Display for RefusalSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "refusal")
    }
}

impl std::error::Error for RefusalSignal {}
/// Marker error whose `Display` text is the fixed string `no hit`.
/// NOTE(review): presumably raised when a query yields no results so
/// the CLI can map it to a dedicated exit code — confirm against the
/// caller.
#[derive(Debug)]
pub struct NoHitSignal;

impl fmt::Display for NoHitSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "no hit")
    }
}

impl std::error::Error for NoHitSignal {}

845
crates/kebab-app/src/lib.rs Normal file
View File

@@ -0,0 +1,845 @@
//! `kb-app` — facade that downstream `kb-cli` / `kb-tui` / `kb-desktop`
//! depend on (§7, §8).
//!
//! P3-5 swapped the `bail!("not yet wired")` stubs for real bodies that
//! compose the libraries shipped through P3-4. After this task, `kb
//! ingest` actually walks a workspace and persists chunks, and `kb
//! search --mode {lexical,vector,hybrid}` returns real `SearchHit`s.
//! `kb-app::ask` stays stubbed (P4-3 owns it).
//!
//! ## Wire-schema convention
//!
//! `kb-app` returns pure domain types (`IngestReport`, `DocSummary`,
//! `Chunk`, `SearchHit`, `Answer`, …) re-exported from `kb-core`. These do
//! NOT carry a `schema_version` field. The CLI (`kb-cli/src/wire.rs`) is
//! responsible for wrapping each Ok-path return value with the matching
//! `*.v1` envelope before emitting JSON on stdout in `--json` mode. The
//! sole exception is [`DoctorReport`], whose `schema_version` is part of
//! the struct because the doctor wire object IS its own structured
//! surface (no domain-side equivalent in `kb-core`). When adding a new
//! facade function in a later phase, remember: keep the return type pure,
//! and add a matching `wire_*` helper in `kb-cli/src/wire.rs`.
//!
//! ## Config seam (`*_with_config`)
//!
//! Each public free function has a `#[doc(hidden)] pub fn *_with_config`
//! companion that takes a fully-resolved [`kebab_config::Config`] directly.
//! Three callers go through it: (1) the top-level free functions
//! themselves, after `load_config()`; (2) `kb-cli` when the user passes
//! `--config <path>` (CLI builds the Config via
//! `Config::load(cli.config.as_deref())` and threads it in directly so
//! the flag is honored); (3) integration tests, which mutate a Config
//! to point at a `TempDir` to avoid polluting the user's real
//! `data_dir` / `model_dir`. `#[doc(hidden)]` keeps rustdoc clean while
//! still allowing the cross-crate calls.
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::{Context, anyhow};
use serde::{Deserialize, Serialize};
use kebab_chunk::MdHeadingV1Chunker;
use kebab_core::{
Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
};
use kebab_normalize::build_canonical_document;
use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
mod app;
pub mod doctor_signal;
pub mod logging;
pub use app::App;
/// Parser-version label persisted in `documents.parser_version` for
/// every Markdown file ingested through the `kb-parse-md` pipeline.
/// Kept in lock-step with the literal used in the `kb-store-sqlite`
/// idempotency / round-trip tests so the version label written by the
/// app and the one used in cross-crate fixtures match.
///
/// The ingest path wraps this string in a `ParserVersion` once per run
/// and passes it to every per-asset call, so changing the literal
/// changes the label recorded for all documents ingested afterwards.
const KB_PARSE_MD_VERSION: &str = "pulldown-cmark-0.x";
/// Caller-supplied knobs for one [`ask`] invocation.
///
/// Re-exported from [`kebab_rag::AskOpts`] (P4-3 owns the type) so kb-cli's
/// `use kebab_app::AskOpts` keeps working without churn. The struct gained
/// a `stream_sink` field in P4-3; non-streaming callers (kb-cli today)
/// pass `stream_sink: None`.
pub use kebab_rag::AskOpts;
/// Result of a doctor run: an overall verdict plus the individual
/// checks behind it. Unlike the other facade return types, this struct
/// carries its own `schema_version` because it is serialized directly
/// as the doctor wire object.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DoctorReport {
    /// Wire schema version label (`"doctor.v1"`).
    pub schema_version: String,
    /// Overall verdict for the run.
    pub ok: bool,
    /// Individual check results, one entry per check performed.
    pub checks: Vec<DoctorCheck>,
}
/// Outcome of a single doctor check, as listed in
/// `DoctorReport::checks`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DoctorCheck {
    /// Identifier of the check.
    pub name: String,
    /// Whether this check passed.
    pub ok: bool,
    /// Free-form description of what the check observed.
    pub detail: String,
    /// Optional remediation hint; `None` when there is nothing to
    /// suggest.
    pub hint: Option<String>,
}
/// Create XDG dirs and write a starter `config.toml`. Idempotent unless
/// `force=true` (which overwrites an existing config).
pub fn init_workspace(force: bool) -> anyhow::Result<()> {
let cfg_path = kebab_config::Config::xdg_config_path();
let data_dir = kebab_config::Config::xdg_data_dir();
let cache_dir = kebab_config::Config::xdg_cache_dir();
let state_dir = kebab_config::Config::xdg_state_dir();
for d in [
cfg_path.parent().map(PathBuf::from).unwrap_or_default(),
data_dir.clone(),
cache_dir,
state_dir.clone(),
state_dir.join("logs"),
] {
if !d.as_os_str().is_empty() {
std::fs::create_dir_all(&d)?;
}
}
let workspace_root = expand_tilde(&kebab_config::Config::defaults().workspace.root);
std::fs::create_dir_all(&workspace_root)?;
if !cfg_path.exists() || force {
let cfg = kebab_config::Config::defaults();
let toml_text = toml::to_string_pretty(&cfg)?;
std::fs::write(&cfg_path, toml_text)?;
}
Ok(())
}
/// Expand a leading `~` in `s` to the current user's home directory.
///
/// Only the exact string `~` and the `~/...` prefix are expanded; any
/// other input (including `~user/...`) is returned unchanged. When the
/// home directory cannot be determined the input is also returned
/// unchanged.
fn expand_tilde(s: &str) -> PathBuf {
    if s == "~" {
        return dirs::home_dir().unwrap_or_else(|| PathBuf::from(s));
    }
    match s.strip_prefix("~/") {
        Some(rest) => match dirs::home_dir() {
            Some(home) => home.join(rest),
            None => PathBuf::from(s),
        },
        None => PathBuf::from(s),
    }
}
/// Load the active Config with no explicit path, i.e. from the XDG
/// location (the original notes say the loader falls back to defaults
/// when nothing is found there). This mirrors what `kb-cli` does at the
/// top of every subcommand; each facade entry point repeats the load so
/// callers do not have to thread a Config through.
///
/// Callers that already hold a Config (CLI honoring `--config`,
/// integration tests, TUI session) should skip this and call the
/// matching `*_with_config` helper directly.
fn load_config() -> anyhow::Result<kebab_config::Config> {
    // `None` = no caller-supplied config path.
    let explicit_path = None;
    kebab_config::Config::load(explicit_path)
}
// ── ingest ────────────────────────────────────────────────────────────────
/// Ingest `scope` using the default-loaded configuration.
///
/// Loads the Config via [`load_config`] and delegates to
/// [`ingest_with_config`]; a config-load failure is returned before any
/// ingest work starts.
pub fn ingest(scope: SourceScope, summary_only: bool) -> anyhow::Result<IngestReport> {
    load_config().and_then(|config| ingest_with_config(config, scope, summary_only))
}
/// Config-explicit variant — bypasses [`load_config`] when the
/// caller (kb-cli with `--config`, integration tests, TUI session)
/// already has a [`kebab_config::Config`] in hand. The public free
/// function [`ingest`] wraps this with the XDG-default load.
///
/// Steps, in order: open the `App`, scan `scope` for assets, build the
/// embedder / vector store once (when enabled), ingest each asset,
/// record the run in `jobs` and `ingest_runs`, and return the report.
///
/// * `scope` — what to scan; echoed back in the report.
/// * `summary_only` — when `true`, per-item details are omitted from
///   both the returned report (`items: None`) and the persisted
///   `ingest_runs.items_json` (NULL). Counts are always populated.
///
/// # Errors
///
/// Setup failures (opening the store, scanning, building the embedder
/// or vector store, listing existing documents, serializing the scope)
/// abort the run. A failure on an individual asset does NOT: it becomes
/// an `IngestItemKind::Error` item and is counted exactly once in
/// `errors`. Failures while recording bookkeeping rows are logged as
/// warnings and otherwise ignored.
#[doc(hidden)]
pub fn ingest_with_config(
    config: kebab_config::Config,
    scope: SourceScope,
    summary_only: bool,
) -> anyhow::Result<IngestReport> {
    let started_instant = std::time::Instant::now();
    let app = App::open_with_config(config)?;

    // Walk the workspace.
    let connector = FsSourceConnector::new(&app.config)
        .context("kb-app::ingest: build FsSourceConnector")?;
    let assets = connector
        .scan(&scope)
        .context("kb-app::ingest: scan workspace")?;

    // Embedder + vector store: build once at the top so the cold-start
    // cost is paid once even when the workspace has 1000 markdown files.
    let embedder = app.embedder()?;
    let vector_store = app.vector()?;

    // If both are present, ensure the table exists for the (model, dim)
    // pair so the first per-doc upsert doesn't pay the create-table
    // round-trip.
    if let (Some(emb), Some(vec)) = (embedder.as_ref(), vector_store.as_ref()) {
        let mid = emb.model_id();
        vec.ensure_table(&mid, emb.dimensions())
            .context("kb-app::ingest: ensure Lance table")?;
    }

    let parser_version = ParserVersion(KB_PARSE_MD_VERSION.to_string());
    let chunk_policy = chunk_policy_from_config(&app.config);

    // Snapshot the doc_ids already in the store BEFORE ingesting, so
    // each processed document can be labelled `New` (unseen id) or
    // `Updated` (id was already present).
    let existing_doc_ids: std::collections::HashSet<String> = app
        .sqlite
        .list_documents(&DocFilter::default())
        .context("kb-app::ingest: list existing documents")?
        .into_iter()
        .map(|d| d.doc_id.0)
        .collect();

    let started_at = time::OffsetDateTime::now_utc();
    let mut items: Vec<kebab_core::IngestItem> = Vec::new();
    let mut new_count: u32 = 0;
    let mut updated_count: u32 = 0;
    let mut skipped_count: u32 = 0;
    let mut error_count: u32 = 0;
    // Aggregate counts surfaced into the job progress payload (and
    // tracing). Not exposed on `IngestReport` today —
    // `kebab_core::IngestReport` is a wire-stable struct without these
    // fields — but persisting them means audit tooling and `kb jobs`
    // (P+) can recover the totals without re-walking the DB.
    let mut chunks_indexed: u32 = 0;
    let mut embeddings_indexed: u32 = 0;
    let scanned_count: u32 = u32::try_from(assets.len()).unwrap_or(u32::MAX);
    let embed_active = embedder.is_some() && vector_store.is_some();

    for asset in assets {
        let outcome = ingest_one_asset(
            &app,
            &asset,
            &parser_version,
            &chunk_policy,
            embedder.as_ref(),
            vector_store.as_ref(),
            &existing_doc_ids,
        );
        let item = match outcome {
            Ok(i) => i,
            Err(e) => {
                tracing::error!(
                    target: "kb-app",
                    path = %asset.workspace_path.0,
                    error = %e,
                    "kb-app::ingest: per-file fatal"
                );
                // Do NOT bump `error_count` here: the tally `match`
                // below counts every `Error` item, and incrementing in
                // both places reported each failed file twice.
                kebab_core::IngestItem {
                    kind: kebab_core::IngestItemKind::Error,
                    doc_id: None,
                    doc_path: asset.workspace_path.clone(),
                    asset_id: Some(asset.asset_id.clone()),
                    byte_len: Some(asset.byte_len),
                    block_count: None,
                    chunk_count: None,
                    parser_version: None,
                    chunker_version: None,
                    warnings: Vec::new(),
                    error: Some(format!("{e:#}")),
                }
            }
        };

        // Single accounting point: every item contributes to exactly
        // one counter, chosen by its kind.
        match item.kind {
            kebab_core::IngestItemKind::New => {
                new_count = new_count.saturating_add(1);
                let n = item.chunk_count.unwrap_or(0);
                chunks_indexed = chunks_indexed.saturating_add(n);
                if embed_active {
                    embeddings_indexed = embeddings_indexed.saturating_add(n);
                }
            }
            kebab_core::IngestItemKind::Updated => {
                updated_count = updated_count.saturating_add(1);
                let n = item.chunk_count.unwrap_or(0);
                chunks_indexed = chunks_indexed.saturating_add(n);
                if embed_active {
                    embeddings_indexed = embeddings_indexed.saturating_add(n);
                }
            }
            kebab_core::IngestItemKind::Skipped => {
                skipped_count = skipped_count.saturating_add(1);
            }
            kebab_core::IngestItemKind::Error => {
                error_count = error_count.saturating_add(1);
            }
        }
        items.push(item);
    }

    // Record a row in `jobs` so `kb jobs` (P+) can list the run. Distinct
    // from the `ingest_runs` row written below — the `jobs` table is the
    // generic job-lifecycle surface (`kind=ingest`), `ingest_runs` is the
    // ingest-specific aggregate counts row.
    let payload = serde_json::json!({
        "scope": scope,
        "summary_only": summary_only,
    });
    let job_id_res = <SqliteStoreAlias as kebab_core::JobRepo>::create(
        &app.sqlite,
        kebab_core::JobKind::Ingest,
        payload,
    );
    match job_id_res {
        Ok(jid) => {
            // Stash the aggregate counts as the job's progress payload
            // so a future `kb jobs show` can surface them without
            // joining `ingest_runs`.
            let progress = serde_json::json!({
                "scanned": scanned_count,
                "new": new_count,
                "updated": updated_count,
                "skipped": skipped_count,
                "errors": error_count,
                "chunks_indexed": chunks_indexed,
                "embeddings_indexed": embeddings_indexed,
            });
            if let Err(e) = <SqliteStoreAlias as kebab_core::JobRepo>::update_progress(
                &app.sqlite,
                &jid,
                progress,
            ) {
                tracing::warn!(
                    target: "kb-app",
                    error = %e,
                    "kb-app::ingest: JobRepo::update_progress failed"
                );
            }
            if let Err(e) = <SqliteStoreAlias as kebab_core::JobRepo>::finish(
                &app.sqlite,
                &jid,
                kebab_core::JobStatus::Succeeded,
                None,
            ) {
                tracing::warn!(
                    target: "kb-app",
                    error = %e,
                    "kb-app::ingest: JobRepo::finish failed"
                );
            }
        }
        Err(e) => {
            tracing::warn!(
                target: "kb-app",
                error = %e,
                "kb-app::ingest: JobRepo::create failed; run not recorded in `jobs`"
            );
        }
    }

    let duration_ms = u32::try_from(started_instant.elapsed().as_millis())
        .unwrap_or(u32::MAX);
    let finished_at = time::OffsetDateTime::now_utc();

    // Record the ingest_runs row with aggregate counts.
    // `summary_only=true` writes `items_json=NULL` (per design §5.7);
    // the count columns are populated either way.
    let scope_json = serde_json::to_string(&scope)
        .context("kb-app::ingest: serialize scope for ingest_runs.scope_json")?;
    let items_json: Option<String> = if summary_only {
        None
    } else {
        match serde_json::to_string(&items) {
            Ok(s) => Some(s),
            Err(e) => {
                tracing::warn!(
                    target: "kb-app",
                    error = %e,
                    "kb-app::ingest: failed to serialize items_json; storing NULL"
                );
                None
            }
        }
    };
    let run_id = mint_ingest_run_id(&scope_json, started_at);
    let row = kebab_store_sqlite::IngestRunRow {
        run_id: &run_id,
        scope_json: &scope_json,
        scanned: scanned_count,
        new_count,
        updated_count,
        skipped_count,
        error_count,
        duration_ms,
        started_at,
        finished_at,
        items_json: items_json.as_deref(),
    };
    if let Err(e) = app.sqlite.record_ingest_run(&row) {
        tracing::warn!(
            target: "kb-app",
            error = %e,
            "kb-app::ingest: record_ingest_run failed"
        );
    }

    tracing::info!(
        target: "kb-app",
        scanned = scanned_count,
        new = new_count,
        updated = updated_count,
        skipped = skipped_count,
        errors = error_count,
        chunks_indexed,
        embeddings_indexed,
        duration_ms,
        "kb-app::ingest: run complete"
    );

    Ok(IngestReport {
        scope,
        scanned: scanned_count,
        new: new_count,
        updated: updated_count,
        skipped: skipped_count,
        errors: error_count,
        duration_ms,
        items: if summary_only { None } else { Some(items) },
    })
}
/// Derive the 32-hex-character `run_id` for an `ingest_runs` row.
///
/// The id is the first 32 hex characters of a BLAKE3 hash over the
/// serialized scope followed by the big-endian bytes of the start
/// timestamp in nanoseconds, so two runs over the same scope that start
/// at different nanoseconds hash different inputs. The original notes
/// describe this as the same shape as the JobId recipe in
/// `kb-store-sqlite::jobs`.
fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
    let started_nanos = at.unix_timestamp_nanos().to_be_bytes();
    let mut hasher = blake3::Hasher::new();
    hasher.update(scope_json.as_bytes()).update(&started_nanos);
    let mut run_id = hasher.finalize().to_hex().to_string();
    // Keep only the leading 32 hex characters of the digest.
    run_id.truncate(32);
    run_id
}
/// Plain type alias (not a trait alias) for the concrete SQLite store,
/// used as the `Self` type in fully-qualified calls such as
/// `<SqliteStoreAlias as kebab_core::JobRepo>::create(...)`.
///
/// The store implements both `DocumentStore` and `JobRepo`; going
/// through `<… as JobRepo>` states explicitly which trait's method is
/// meant instead of relying on method resolution for a plain
/// `app.sqlite.create(...)` call.
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
fn ingest_one_asset(
app: &App,
asset: &RawAsset,
parser_version: &ParserVersion,
chunk_policy: &ChunkPolicy,
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
) -> anyhow::Result<kebab_core::IngestItem> {
tracing::debug!(
target: "kb-app::ingest",
path = %asset.workspace_path.0,
"processing asset"
);
// Only handle Markdown for now; other media types are P6+ work.
if asset.media_type != kebab_core::MediaType::Markdown {
return Ok(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Skipped,
doc_id: None,
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: None,
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: Vec::new(),
error: None,
});
}
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
SourceUri::Kb(_) => {
return Ok(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Skipped,
doc_id: None,
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: None,
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: vec![
"kb:// source URIs are not supported by the fs ingester".into(),
],
error: None,
});
}
};
let bytes = std::fs::read(&path)
.with_context(|| format!("read asset bytes from {}", path.display()))?;
let body_hints = build_body_hints(asset);
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints)
.context("kb-parse-md::parse_frontmatter")?;
let body_offset_lines = match fm_span {
Some(span) => count_lines_in(&bytes[..span.end]),
None => 0,
};
let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
.context("kb-parse-md::parse_blocks")?;
let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len());
all_warnings.extend(fm_warns);
all_warnings.extend(blk_warns);
// Snapshot warning notes for the IngestItem before the vec is
// consumed by `build_canonical_document`.
let warning_notes: Vec<String> = all_warnings
.iter()
.map(|w| format!("{:?}: {}", w.kind, w.note))
.collect();
let canonical = build_canonical_document(
asset,
metadata,
parsed_blocks,
parser_version,
all_warnings,
)
.context("kb-normalize::build_canonical_document")?;
let chunks = MdHeadingV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::MdHeadingV1Chunker::chunk")?;
// Persist. Each `put_*` call wraps its own short transaction
// (per-document tx semantics per design §5.8); composing them is
// the kb-app job. A failure mid-way leaves the DB in a state the
// next ingest run can re-converge (UPSERT + DELETE-then-INSERT).
app.sqlite
.put_asset_with_bytes(asset, &bytes)
.context("DocumentStore::put_asset_with_bytes")?;
app.sqlite
.put_document(&canonical)
.context("DocumentStore::put_document")?;
app.sqlite
.put_blocks(&canonical.doc_id, &canonical.blocks)
.context("DocumentStore::put_blocks")?;
app.sqlite
.put_chunks(&canonical.doc_id, &chunks)
.context("DocumentStore::put_chunks")?;
// Embed + vector upsert (only when both sides are configured).
if let (Some(emb), Some(vec_store)) = (embedder, vector_store) {
if !chunks.is_empty() {
let inputs: Vec<EmbeddingInput<'_>> = chunks
.iter()
.map(|c| EmbeddingInput {
text: c.text.as_str(),
kind: EmbeddingKind::Document,
})
.collect();
let vectors = emb
.embed(&inputs)
.context("Embedder::embed (document chunks)")?;
let model_id = emb.model_id();
let model_version = emb.model_version();
let dimensions = emb.dimensions();
let records: Vec<VectorRecord> = chunks
.iter()
.zip(vectors)
.map(|(c, v)| VectorRecord {
embedding_id: kebab_core::id_for_embedding(
&c.chunk_id,
&model_id,
&model_version,
dimensions,
),
chunk_id: c.chunk_id.clone(),
vector: v,
doc_id: canonical.doc_id.clone(),
text: c.text.clone(),
heading_path: c.heading_path.clone(),
model_id: model_id.clone(),
model_version: model_version.clone(),
dimensions,
})
.collect();
vec_store
.upsert(&records)
.context("VectorStore::upsert")?;
}
}
let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
kebab_core::IngestItemKind::Updated
} else {
kebab_core::IngestItemKind::New
};
Ok(kebab_core::IngestItem {
kind,
doc_id: Some(canonical.doc_id.clone()),
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: u32::try_from(canonical.blocks.len()).ok(),
chunk_count: u32::try_from(chunks.len()).ok(),
parser_version: Some(parser_version.clone()),
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
warnings: warning_notes,
error: None,
})
}
/// Byte offset at which the frontmatter region ends.
///
/// Returns `0` when the document carries no frontmatter, so the result
/// can be used directly as the start index of the body slice.
fn fm_span_end(span: Option<kebab_parse_md::FrontmatterSpan>) -> usize {
    match span {
        Some(region) => region.end,
        None => 0,
    }
}
/// Number of `\n` bytes in `bytes`.
///
/// Used to turn a frontmatter byte span into the line offset that
/// `parse_blocks` expects. Saturates at `u32::MAX` if the count does not
/// fit in a `u32`.
fn count_lines_in(bytes: &[u8]) -> u32 {
    let mut newlines: usize = 0;
    for &byte in bytes {
        if byte == b'\n' {
            newlines += 1;
        }
    }
    match u32::try_from(newlines) {
        Ok(lines) => lines,
        Err(_) => u32::MAX,
    }
}
/// Derive `BodyHints` from the asset alone.
///
/// Both `fs_ctime` and `fs_mtime` are filled with the asset's
/// `discovered_at` value rather than real filesystem timestamps: the
/// intent is to avoid a per-file FS metadata lookup on large workspaces,
/// and authoritative timestamps are expected to come from the document's
/// frontmatter when the user wants them. `first_h1` and `fallback_lang`
/// are left unset.
fn build_body_hints(asset: &RawAsset) -> BodyHints {
    let discovered = asset.discovered_at;
    BodyHints {
        fs_mtime: discovered,
        fs_ctime: discovered,
        fallback_lang: None,
        first_h1: None,
    }
}
/// Translate the `chunking` section of the active config into a
/// `ChunkPolicy`, copying each setting across unchanged.
fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
    let chunking = &config.chunking;
    let chunker_version = ChunkerVersion(chunking.chunker_version.clone());
    ChunkPolicy {
        target_tokens: chunking.target_tokens,
        overlap_tokens: chunking.overlap_tokens,
        respect_markdown_headings: chunking.respect_markdown_headings,
        chunker_version,
    }
}
// ── list_docs / inspect_doc / inspect_chunk ───────────────────────────────
/// List the documents matching `filter`, using the configuration
/// returned by `load_config`.
///
/// # Errors
/// Propagates any error from loading the config or from
/// [`list_docs_with_config`].
pub fn list_docs(filter: DocFilter) -> anyhow::Result<Vec<DocSummary>> {
    list_docs_with_config(load_config()?, filter)
}
/// Test-only seam: same as [`list_docs`] but with an explicit config.
/// kb-cli must go through the public free function ([`list_docs`])
/// instead of calling this.
///
/// Opens a one-shot `App` for `config` and queries its SQLite store.
#[doc(hidden)]
pub fn list_docs_with_config(
    config: kebab_config::Config,
    filter: DocFilter,
) -> anyhow::Result<Vec<DocSummary>> {
    App::open_with_config(config)?.sqlite.list_documents(&filter)
}
/// Fetch the canonical document identified by `id`, using the
/// configuration returned by `load_config`.
///
/// # Errors
/// Propagates any error from loading the config or from
/// [`inspect_doc_with_config`] (including the "not found" case).
pub fn inspect_doc(id: &DocumentId) -> anyhow::Result<CanonicalDocument> {
    inspect_doc_with_config(load_config()?, id)
}
/// Test-only seam: same as [`inspect_doc`] but with an explicit config.
/// kb-cli must go through the public free function ([`inspect_doc`])
/// instead of calling this.
///
/// # Errors
/// Fails if the app cannot be opened, if the store lookup fails, or —
/// with an actionable hint — if no document has the given id.
#[doc(hidden)]
pub fn inspect_doc_with_config(
    config: kebab_config::Config,
    id: &DocumentId,
) -> anyhow::Result<CanonicalDocument> {
    let app = App::open_with_config(config)?;
    match app.sqlite.get_document(id)? {
        Some(document) => Ok(document),
        None => Err(anyhow!("document not found: {} (try `kb list docs`)", id.0)),
    }
}
/// Fetch the chunk identified by `id`, using the configuration
/// returned by `load_config`.
///
/// # Errors
/// Propagates any error from loading the config or from
/// [`inspect_chunk_with_config`] (including the "not found" case).
pub fn inspect_chunk(id: &ChunkId) -> anyhow::Result<Chunk> {
    inspect_chunk_with_config(load_config()?, id)
}
/// Test-only seam: same as [`inspect_chunk`] but with an explicit
/// config. kb-cli must go through the public free function
/// ([`inspect_chunk`]) instead of calling this.
///
/// # Errors
/// Fails if the app cannot be opened, if the store lookup fails, or —
/// with an actionable hint — if no chunk has the given id.
#[doc(hidden)]
pub fn inspect_chunk_with_config(
    config: kebab_config::Config,
    id: &ChunkId,
) -> anyhow::Result<Chunk> {
    let app = App::open_with_config(config)?;
    match app.sqlite.get_chunk(id)? {
        Some(chunk) => Ok(chunk),
        None => Err(anyhow!("chunk not found: {} (try `kb inspect doc <id>`)", id.0)),
    }
}
// ── search ────────────────────────────────────────────────────────────────
/// Run `query` using the configuration returned by `load_config`.
///
/// # Errors
/// Propagates any error from loading the config or from
/// [`search_with_config`].
pub fn search(query: SearchQuery) -> anyhow::Result<Vec<SearchHit>> {
    search_with_config(load_config()?, query)
}
/// Test-only seam: same as [`search`] but with an explicit config.
/// kb-cli must go through the public free function ([`search`])
/// instead of calling this.
///
/// A fresh `App` is opened for every call and the query is handed to
/// [`App::search`]. Long-lived callers should keep their own `App`
/// around so the embedder / vector-store cold start is paid only once.
#[doc(hidden)]
pub fn search_with_config(
    config: kebab_config::Config,
    query: SearchQuery,
) -> anyhow::Result<Vec<SearchHit>> {
    let app = App::open_with_config(config)?;
    app.search(query)
}
// ── ask ──────────────────────────────────────────────────────────────────
//
// P4-3 wires `ask` end-to-end. The retriever is built per `opts.mode`;
// vector / hybrid require an enabled embedding provider (else we surface
// the same "switch to --mode lexical" error as `search`). The LLM is
// always Ollama for now — when we grow a second provider (llama.cpp,
// candle, etc.) this is the place to switch on `config.models.llm.provider`.
/// Answer `query` with the given options, using the configuration
/// returned by `load_config`.
///
/// # Errors
/// Propagates any error from loading the config or from
/// [`ask_with_config`].
pub fn ask(query: &str, opts: AskOpts) -> anyhow::Result<Answer> {
    ask_with_config(load_config()?, query, opts)
}
/// Test-only seam: same as [`ask`] but with an explicit config.
/// kb-cli must go through the public free function ([`ask`]) instead
/// of calling this.
///
/// A fresh `App` is opened for every call and the question is handed
/// to [`App::ask`].
#[doc(hidden)]
pub fn ask_with_config(
    config: kebab_config::Config,
    query: &str,
    opts: AskOpts,
) -> anyhow::Result<Answer> {
    let app = App::open_with_config(config)?;
    app.ask(query, opts)
}
/// Run the doctor checks against the explicit config path the user
/// requested via `--config` (or the XDG default if `None`).
///
/// Two checks are recorded, in this order:
///
/// - `config_loaded` — reports the path that was actually probed. An
///   existing file must parse; a missing explicit `--config` path is a
///   failure; a missing XDG default file is fine (defaults apply).
/// - `data_dir_writable` — creates the resolved `storage.data_dir` if
///   needed and writes/removes a small probe file in it, so `--config`
///   users see their custom paths reflected rather than the XDG defaults.
///
/// Environment variables whose name or value is not valid Unicode are
/// skipped when building the override map. (`std::env::vars()` would
/// panic on them, which is not acceptable for a diagnostic command.)
///
/// # Errors
/// As written, every failure is captured as a failed `DoctorCheck`; the
/// function itself returns `Ok` with `ok == false` in that case.
pub fn doctor_with_config_path(
    config_path: Option<&std::path::Path>,
) -> anyhow::Result<DoctorReport> {
    tracing::debug!("doctor() invoked");
    let mut checks = Vec::new();

    // Resolve the config path the same way `Config::load` does: explicit
    // override first, else XDG default. Report whichever was probed.
    let cfg_path: PathBuf = match config_path {
        Some(p) => p.to_path_buf(),
        None => kebab_config::Config::xdg_config_path(),
    };
    let (config_ok, config_detail, loaded_cfg) = if cfg_path.exists() {
        match kebab_config::Config::from_file(&cfg_path) {
            Ok(c) => (true, cfg_path.display().to_string(), Some(c)),
            Err(e) => (false, format!("{} ({e})", cfg_path.display()), None),
        }
    } else if config_path.is_some() {
        // Explicit `--config <path>` that doesn't exist is a hard error
        // — defaults would silently mask the user's intent.
        (
            false,
            format!("{} (not found)", cfg_path.display()),
            None,
        )
    } else {
        // No `--config` and no XDG file: defaults are always loadable.
        (true, format!("{} (defaults)", cfg_path.display()), None)
    };
    checks.push(DoctorCheck {
        name: "config_loaded".to_string(),
        ok: config_ok,
        detail: config_detail,
        hint: if config_ok {
            None
        } else if config_path.is_some() {
            Some("--config path does not exist or is malformed".to_string())
        } else {
            Some("run `kb init` to seed config".to_string())
        },
    });

    // data_dir_writable — probe the resolved storage.data_dir from the
    // loaded config when present, else the XDG default.
    let data_dir = match loaded_cfg.as_ref() {
        Some(c) => {
            // Re-apply env overrides on top so the same precedence as
            // Config::load is preserved here. `vars_os` + lossless
            // conversion is used instead of `vars`, which panics on any
            // non-Unicode variable; such variables are simply ignored.
            let env: std::collections::HashMap<String, String> = std::env::vars_os()
                .filter_map(|(key, value)| {
                    Some((key.into_string().ok()?, value.into_string().ok()?))
                })
                .collect();
            let merged = c.clone().apply_env(&env);
            expand_tilde(&merged.storage.data_dir)
        }
        // NOTE(review): env overrides are not applied on this branch
        // unless `xdg_data_dir` handles them itself — confirm.
        None => kebab_config::Config::xdg_data_dir(),
    };
    let writable = (|| -> anyhow::Result<()> {
        std::fs::create_dir_all(&data_dir)?;
        let probe = data_dir.join(".kb-doctor-probe");
        std::fs::write(&probe, b"ok")?;
        // Best-effort cleanup: a leftover probe file does not make the
        // directory any less writable.
        std::fs::remove_file(&probe).ok();
        Ok(())
    })();
    let (data_ok, data_detail, data_hint) = match writable {
        Ok(()) => (true, data_dir.display().to_string(), None),
        Err(e) => (
            false,
            format!("{} ({e})", data_dir.display()),
            Some("ensure the configured data_dir is writable".to_string()),
        ),
    };
    checks.push(DoctorCheck {
        name: "data_dir_writable".to_string(),
        ok: data_ok,
        detail: data_detail,
        hint: data_hint,
    });

    // The report as a whole is healthy only if every check passed.
    let ok = checks.iter().all(|c| c.ok);
    Ok(DoctorReport {
        schema_version: "doctor.v1".to_string(),
        ok,
        checks,
    })
}
/// Run the doctor checks against the XDG-default config.
///
/// Thin wrapper over [`doctor_with_config_path`] that keeps the
/// historical zero-argument `kb-app::doctor()` signature for callers
/// that do not honor `--config` (e.g., legacy code paths or smoke
/// harnesses).
pub fn doctor() -> anyhow::Result<DoctorReport> {
    let no_override: Option<&std::path::Path> = None;
    doctor_with_config_path(no_override)
}

View File

@@ -0,0 +1,43 @@
//! Tracing initialization helper for `kb-cli`.
//!
//! Daily-rolling file appender at `~/.local/state/kb/logs/` per task spec.
//! Returns a `WorkerGuard` that the caller must keep alive until program
//! exit (so buffered log lines flush).
use anyhow::Result;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{EnvFilter, fmt, prelude::*};
/// Verbosity selector passed to `init`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogLevel {
    /// Filter from the default environment variable, else `warn`.
    Default,
    /// `info` and above.
    Verbose,
    /// `debug` and above.
    Debug,
}
/// Initialize tracing and return the guard for the non-blocking writer.
///
/// Log lines go to a daily-rolling `kb.log` file under the `logs`
/// subdirectory of the XDG state dir, without ANSI colors. The caller
/// must keep the returned guard alive until exit.
///
/// Calling this more than once is tolerated: the later `try_init`
/// result is discarded, but the log directory and appender are still
/// created and a guard is still returned.
///
/// # Errors
/// Fails if the log directory cannot be created.
pub fn init(level: LogLevel) -> Result<WorkerGuard> {
    let logs = kebab_config::Config::xdg_state_dir().join("logs");
    std::fs::create_dir_all(&logs)?;

    let (writer, guard) =
        tracing_appender::non_blocking(tracing_appender::rolling::daily(&logs, "kb.log"));

    let filter = match level {
        LogLevel::Debug => EnvFilter::new("debug"),
        LogLevel::Verbose => EnvFilter::new("info"),
        LogLevel::Default => match EnvFilter::try_from_default_env() {
            Ok(from_env) => from_env,
            Err(_) => EnvFilter::new("warn"),
        },
    };

    // `try_init` rather than `init`: when a global subscriber is already
    // installed (e.g. in tests) the error is deliberately ignored.
    let _ = tracing_subscriber::registry()
        .with(filter)
        .with(fmt::layer().with_writer(writer).with_ansi(false))
        .try_init();

    Ok(guard)
}

View File

@@ -0,0 +1,43 @@
//! `kb-app::ask` smoke tests.
//!
//! The pipeline's behavior is exhaustively covered by `kb-rag` tests
//! (which inject `MockLanguageModel` + `MockRetriever`). The kb-app
//! facade is a thin component wirer: it picks the retriever per
//! `opts.mode` and constructs an `OllamaLanguageModel`. Exercising
//! that wiring requires a real Ollama on `127.0.0.1:11434`, so this
//! test is `#[ignore]` by default — run with `cargo test -p kb-app
//! --test ask_smoke -- --ignored` against a live Ollama.
mod common;
use common::TestEnv;
/// End-to-end `ask` in lexical mode.
///
/// Needs a real Ollama at `config.models.llm.endpoint` (default
/// `127.0.0.1:11434`) serving the configured model, hence `#[ignore]`.
/// The pipeline body is covered by kb-rag's integration tests; this
/// only checks that the facade wires the components together.
#[test]
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
fn ask_lexical_smoke() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let opts = kebab_app::AskOpts {
        mode: kebab_core::SearchMode::Lexical,
        k: 5,
        temperature: Some(0.0),
        seed: Some(0),
        explain: false,
        stream_sink: None,
    };

    // The fixture workspace mentions "ownership", but whether the model
    // cites it depends on the model, so `grounded` is not asserted —
    // only that a structurally valid Answer comes back.
    let answer = kebab_app::ask_with_config(env.config.clone(), "ownership", opts)
        .expect("ask returns Ok with a real Ollama backend");

    // The retrieval summary is populated regardless of the grounded path.
    let retrieval = &answer.retrieval;
    assert_eq!(retrieval.mode, kebab_core::SearchMode::Lexical);
    assert!(retrieval.k >= 5);
    assert!(retrieval.trace_id.0.starts_with("ret_"));
}

View File

@@ -0,0 +1,104 @@
//! Shared test scaffolding for `kb-app` integration tests.
//!
//! Each test gets a fresh `TempDir` and a `Config` whose storage paths
//! all point inside it, so the user's real `data_dir` / `model_dir`
//! is never touched. The fixture workspace at
//! `tests/fixtures/workspace/` is *copied* into the temp dir for each
//! test so a write-side ingest can't trip on a read-only fixture
//! tree. The default lane (no `--ignored`) opts out of embeddings via
//! `provider = "none"` so AVX is not required.
#![allow(dead_code)]
use std::path::{Path, PathBuf};
use kebab_config::Config;
use tempfile::TempDir;
/// Test environment: owns a `TempDir` and exposes a `Config` whose
/// storage paths live inside it.
///
/// Built via `TestEnv::lexical_only` or `TestEnv::with_embeddings`.
pub struct TestEnv {
/// Backing temporary directory; held here so it stays alive for as
/// long as the environment does.
pub temp: TempDir,
/// Root of the per-test copy of the fixture workspace (a `workspace`
/// subdirectory of `temp`).
pub workspace_root: PathBuf,
/// Config whose workspace root, `data_dir` and `model_dir` point
/// inside `temp`.
pub config: Config,
}
impl TestEnv {
    /// Environment with embeddings switched off (lexical-only): the
    /// embedding provider is `"none"` and its dimensions are `0`. This is
    /// the default lane — no AVX, no fastembed download.
    pub fn lexical_only() -> Self {
        let mut env = Self::new_inner();
        let embedding = &mut env.config.models.embedding;
        embedding.provider = "none".to_string();
        embedding.dimensions = 0;
        env
    }

    /// Environment that keeps the default (fastembed) embedding provider.
    /// Used by the AVX-gated `#[ignore]` tests.
    pub fn with_embeddings() -> Self {
        Self::new_inner()
    }

    /// Shared constructor: creates the temp dir, copies the fixture
    /// workspace into it, and builds a default config redirected at it.
    fn new_inner() -> Self {
        /// Lossy path → `String` conversion used for the config fields.
        fn lossy(path: &Path) -> String {
            path.to_string_lossy().into_owned()
        }

        let temp = tempfile::tempdir().expect("tempdir");
        let base = temp.path();

        let workspace_root = base.join("workspace");
        copy_fixture_workspace(&workspace_root);

        let data_dir = base.join("data");
        std::fs::create_dir_all(&data_dir).unwrap();
        let model_dir = base.join("models");
        std::fs::create_dir_all(&model_dir).unwrap();

        let mut config = Config::defaults();

        // Workspace: point at the copied fixtures and drop the default
        // excludes (".obsidian" / "node_modules") — they add nothing for
        // fixtures and only get in the way when debugging.
        config.workspace.root = lossy(&workspace_root);
        config.workspace.exclude.clear();

        // Storage: keep both the data and the model directory inside the
        // TempDir so a future fastembed-touching test cannot write to
        // the user's `~/.local/share` by accident.
        config.storage.data_dir = lossy(&data_dir);
        config.storage.model_dir = lossy(&model_dir);

        // Chunking: a small policy so even the short fixture files yield
        // at least a couple of chunks with `overlap_tokens` honored.
        config.chunking.target_tokens = 80;
        config.chunking.overlap_tokens = 20;

        Self {
            temp,
            workspace_root,
            config,
        }
    }

    /// Source scope covering the copied workspace, using the include /
    /// exclude patterns from this environment's config.
    pub fn scope(&self) -> kebab_core::SourceScope {
        let workspace = &self.config.workspace;
        kebab_core::SourceScope {
            root: self.workspace_root.clone(),
            include: workspace.include.clone(),
            exclude: workspace.exclude.clone(),
        }
    }
}
/// Copy the crate's `tests/fixtures/workspace` tree (located relative to
/// `CARGO_MANIFEST_DIR`) into `dest`.
fn copy_fixture_workspace(dest: &Path) {
    let mut fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixtures.extend(["tests", "fixtures", "workspace"]);
    copy_dir_recursive(&fixtures, dest);
}
/// Recursively copy the directory `src` into `dest`, creating `dest`
/// (and any missing parents) first.
///
/// # Panics
/// Panics on any I/O failure; this is test scaffolding and a broken
/// fixture copy should abort the test immediately.
fn copy_dir_recursive(src: &Path, dest: &Path) {
    std::fs::create_dir_all(dest).unwrap();
    let listing = std::fs::read_dir(src).expect("read fixture dir");
    for item in listing.map(|entry| entry.unwrap()) {
        let from = item.path();
        let to = dest.join(item.file_name());
        if !from.is_dir() {
            std::fs::copy(&from, &to).expect("copy fixture file");
            continue;
        }
        copy_dir_recursive(&from, &to);
    }
}

View File

@@ -0,0 +1,24 @@
---
title: Introduction to Rust
tags: [rust, language]
lang: en
created_at: 2024-03-01T00:00:00Z
updated_at: 2024-03-02T00:00:00Z
source_type: note
trust_level: primary
---
# Introduction to Rust
Rust is a systems programming language focused on safety, speed, and concurrency.
The compiler enforces memory safety without a garbage collector.
## Ownership
Each value has a single owner. When the owner goes out of scope the value is
dropped. References are borrows that the compiler tracks at compile time.
## Concurrency
Threads in Rust use the ownership system to prevent data races. The Send and
Sync traits codify which types can move between threads.

View File

@@ -0,0 +1,23 @@
---
title: Cargo Notes
tags: [rust, cargo, tools]
lang: en
created_at: 2024-04-01T00:00:00Z
updated_at: 2024-04-02T00:00:00Z
source_type: note
trust_level: primary
---
# Cargo Notes
Cargo is the Rust package manager and build tool.
## Workspaces
A workspace is a set of packages that share the same `Cargo.lock` and output
directory. Member crates are listed in the `members` key of the `[workspace]` table.
## Features
Cargo features let crates expose optional functionality behind a feature flag.
Default features are enabled unless `default-features = false` is set.

View File

@@ -0,0 +1,23 @@
---
title: Python Snippets
tags: [python, language]
lang: en
created_at: 2024-05-01T00:00:00Z
updated_at: 2024-05-02T00:00:00Z
source_type: note
trust_level: primary
---
# Python Snippets
Quick reference for everyday Python tasks.
## List comprehensions
Filter and transform in one pass: `[x*2 for x in xs if x > 0]`. Cleaner than
the map+filter pair when the predicate is simple.
## Decorators
Wrap a function in another function. `functools.wraps` preserves the
docstring and `__name__` of the inner function on the outer wrapper.

View File

@@ -0,0 +1,220 @@
//! Integration tests for `kb-app::ingest` + `list_docs` + `inspect_*`
//! along the lexical-only path (no embeddings → no AVX requirement).
mod common;
use common::TestEnv;
/// First ingest of the fixture workspace, followed by `list_docs` and
/// `inspect_doc` on the result.
#[test]
fn ingest_then_list_inspects_round_trip() {
    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    // Three markdown fixtures, all labelled New on a first ingest.
    assert_eq!(report.scanned, 3, "scanned: {report:?}");
    assert_eq!(report.new, 3, "new: {report:?}");
    assert_eq!(report.updated, 0, "updated: {report:?}");
    assert_eq!(report.errors, 0, "errors: {report:?}");

    let items = report.items.as_ref().expect("items present");
    assert_eq!(items.len(), 3);
    for item in items.iter() {
        assert!(item.error.is_none(), "per-item error: {item:?}");
        assert!(item.doc_id.is_some());
        // Every fixture file must produce at least one chunk.
        assert!(
            matches!(item.chunk_count, Some(n) if n >= 1),
            "chunks: {item:?}"
        );
    }

    // The listing sees the same three documents.
    let listed =
        kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
            .unwrap();
    assert_eq!(listed.len(), 3, "docs: {listed:?}");

    // Any one of them round-trips through inspect_doc.
    let wanted = listed[0].doc_id.clone();
    let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &wanted).unwrap();
    assert_eq!(canonical.doc_id, wanted);
    assert!(!canonical.blocks.is_empty(), "blocks empty");
}
/// Re-ingesting an unchanged workspace relabels documents as Updated
/// and does not duplicate them.
#[test]
fn ingest_idempotent_on_second_run() {
    let env = TestEnv::lexical_only();
    let run_ingest =
        || kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    let first = run_ingest();
    assert_eq!(first.new, 3);

    // Same files again: nothing new, everything Updated.
    let second = run_ingest();
    assert_eq!(second.scanned, 3, "second scan: {second:?}");
    assert_eq!(second.new, 0, "second run new should be 0: {second:?}");
    assert_eq!(second.updated, 3, "second run updated: {second:?}");

    // The store still holds exactly three documents.
    let listed =
        kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
            .unwrap();
    assert_eq!(listed.len(), 3);
}
/// With `summary_only = true` the report keeps its counts but carries
/// no per-item list.
#[test]
fn ingest_summary_only_drops_items() {
    let env = TestEnv::lexical_only();
    let summary = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(summary.scanned, 3);
    assert!(summary.items.is_none(), "summary-only should null items");
}
/// Every ingest run must leave a row in `ingest_runs` with the
/// aggregate counts filled in.
#[test]
fn ingest_records_ingest_runs_row_with_aggregate_counts() {
    // The ingest_runs table is the §5.7 sibling of `jobs`: dedicated
    // count columns (`scanned`, `new_count`, …) written at the end of
    // each run. With `summary_only=true` `items_json` is NULL, but the
    // counts MUST still be there.
    type RunRow = (i64, i64, i64, i64, i64, Option<String>);

    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.scanned, 3);

    let mut db_path = std::path::PathBuf::from(&env.config.storage.data_dir);
    db_path.push("kb.sqlite");
    let conn = rusqlite::Connection::open(&db_path).expect("open kb.sqlite");

    // Most recent run only.
    let latest_run_sql = "SELECT scanned, new_count, updated_count, skipped_count,\n\
                          error_count, items_json\n\
                          FROM ingest_runs\n\
                          ORDER BY started_at DESC\n\
                          LIMIT 1";
    let row: RunRow = conn
        .query_row(latest_run_sql, [], |r| {
            Ok((
                r.get(0)?,
                r.get(1)?,
                r.get(2)?,
                r.get(3)?,
                r.get(4)?,
                r.get(5)?,
            ))
        })
        .expect("ingest_runs row present");
    let (scanned, new_c, updated, skipped, errors, items_json) = row;

    assert_eq!(scanned, 3);
    assert_eq!(new_c, 3);
    assert_eq!(updated, 0);
    assert_eq!(skipped, 0);
    assert_eq!(errors, 0);
    assert!(
        items_json.is_none(),
        "summary_only=true must store items_json=NULL: {items_json:?}"
    );
}
/// With `provider="none"` ingest must not materialize any LanceDB table.
#[test]
fn ingest_provider_none_skips_lance() {
    // `provider="none"` is expected to bypass the embedder and the
    // vector store altogether. `IngestReport` has no
    // `embeddings_indexed` field to check, so the assertion is made on
    // the on-disk layout instead: either `<data_dir>/lancedb` does not
    // exist, or it holds no `*.lance` tables.
    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(report.errors, 0, "lexical-only run must not error");
    assert_eq!(report.new, 3);

    let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
    if !lance_dir.exists() {
        return;
    }

    // The directory may have been created by some earlier consumer
    // touching the path; even then it must not contain a table.
    let had_lance_table = std::fs::read_dir(&lance_dir)
        .expect("read lance_dir")
        .map(|entry| entry.unwrap().path())
        .any(|path| path.extension().and_then(|ext| ext.to_str()) == Some("lance"));
    assert!(
        !had_lance_table,
        "provider=none must not produce any *.lance table under {}",
        lance_dir.display()
    );
}
/// `tags_any` narrows the listing to documents carrying the tag.
#[test]
fn list_docs_filters_by_tags_any() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let tagged = |tag: &str| kebab_core::DocFilter {
        tags_any: vec![tag.to_string()],
        ..Default::default()
    };

    let docs = kebab_app::list_docs_with_config(env.config.clone(), tagged("python")).unwrap();
    assert_eq!(docs.len(), 1, "expected only the python doc: {docs:?}");
    assert!(docs[0].tags.iter().any(|tag| tag == "python"));

    // intro.md and notes/cargo.md both tag "rust".
    let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), tagged("rust")).unwrap();
    assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
}
/// Looking up an unknown document id must fail with a message that
/// both says "not found" and points the user at the `list docs`
/// command.
#[test]
fn inspect_doc_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();
    let bogus = kebab_core::DocumentId(
        "0000000000000000000000000000000000000000000000000000000000000000".to_string(),
    );
    let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
    let msg = format!("{err:#}");
    assert!(
        msg.contains("not found"),
        "error must mention not-found: {msg}"
    );
    // Match on the subcommand rather than the binary name: the previous
    // `contains("kb list docs") || contains("list")` accepted any message
    // with the word "list" in it, so the hint was never really checked.
    assert!(
        msg.contains("list docs"),
        "error must hint at `kb list docs`: {msg}"
    );
}
/// Looking up an unknown chunk id must fail with a "not found" message.
#[test]
fn inspect_chunk_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();
    let missing_id = kebab_core::ChunkId(String::from(
        "0000000000000000000000000000000000000000000000000000000000000000",
    ));
    let failure =
        kebab_app::inspect_chunk_with_config(env.config.clone(), &missing_id).unwrap_err();
    let msg = format!("{failure:#}");
    assert!(msg.contains("not found"), "got: {msg}");
}

View File

@@ -0,0 +1,69 @@
//! Lexical search integration tests. The vector / hybrid lanes are
//! AVX-gated and live in `search_vector.rs` (`#[ignore]`).
mod common;
use common::TestEnv;
/// Build a lexical-mode query for `text` with `k = 10` and default
/// filters.
fn lexical_query(text: &str) -> kebab_core::SearchQuery {
    let filters = kebab_core::SearchFilters::default();
    kebab_core::SearchQuery {
        mode: kebab_core::SearchMode::Lexical,
        text: String::from(text),
        filters,
        k: 10,
    }
}
/// After ingesting the fixtures, a lexical search for a word that
/// occurs in them returns hits labelled as lexical.
#[test]
fn lexical_search_returns_hits_after_ingest() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    // "Ownership" is both a heading and paragraph text in intro.md, so
    // the FTS5 default tokenizer should match it easily.
    let query = lexical_query("ownership");
    let hits = kebab_app::search_with_config(env.config.clone(), query).unwrap();
    assert!(!hits.is_empty(), "expected ≥1 hit for 'ownership'");

    for hit in hits.iter() {
        // Per spec the lexical retriever leaves embedding_model unset.
        assert!(
            hit.embedding_model.is_none(),
            "lexical-mode hit must have None embedding_model: {hit:?}"
        );
        assert_eq!(
            hit.retrieval.method,
            kebab_core::SearchMode::Lexical,
            "method label should be Lexical"
        );
    }
}
/// A whitespace-only query yields no hits rather than an error.
#[test]
fn lexical_search_empty_query_returns_empty() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let blank = lexical_query("   ");
    let hits = kebab_app::search_with_config(env.config.clone(), blank).unwrap();
    assert!(hits.is_empty(), "blank query must short-circuit empty");
}
/// Asking for vector search while embeddings are disabled must fail
/// with a message that says so.
#[test]
fn vector_mode_with_provider_none_errors_clearly() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    let vector_query = kebab_core::SearchQuery {
        mode: kebab_core::SearchMode::Vector,
        text: "ownership".to_string(),
        filters: kebab_core::SearchFilters::default(),
        k: 10,
    };
    let failure = kebab_app::search_with_config(env.config.clone(), vector_query).unwrap_err();
    let msg = format!("{failure:#}");

    let mentions_disabled = ["embeddings disabled", "disabled"]
        .iter()
        .any(|needle| msg.contains(*needle));
    assert!(
        mentions_disabled,
        "error must mention embeddings disabled: {msg}"
    );
}

View File

@@ -0,0 +1,89 @@
//! Vector / Hybrid lane — AVX-gated. Marked `#[ignore]` because Lance
//! crashes with `SIGILL` on hosts without AVX, and CI lanes that are
//! AVX-less should not run these. Local hosts run them via
//! `cargo test -p kb-app -- --ignored`.
mod common;
use common::TestEnv;
/// Abort the test with a clear message when the host CPU has no AVX.
///
/// Same idea as the helpers in `kb-store-vector/tests/common/mod.rs`
/// and `kb-search`: a `--ignored` run on a non-AVX host should fail
/// loudly here rather than crash inside Lance's SIMD kernel. The check
/// is compiled only on `x86_64`; elsewhere this is a no-op.
fn require_avx_or_panic() {
    #[cfg(target_arch = "x86_64")]
    assert!(
        std::is_x86_feature_detected!("avx"),
        "kb-app vector integration test requires AVX-capable hardware; \
         host CPU lacks AVX. Run on an AVX-capable machine."
    );
}
// First run downloads ~470MB; expect ~30-60s warm, several minutes cold.
/// Ingest with embeddings enabled, then check that a hybrid search
/// returns hits that are all labelled Hybrid.
#[test]
#[ignore = "AVX-required (Lance SIMD kernels)"]
fn ingest_then_hybrid_search_returns_hits() {
    require_avx_or_panic();
    let env = TestEnv::with_embeddings();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

    let hybrid_query = kebab_core::SearchQuery {
        mode: kebab_core::SearchMode::Hybrid,
        text: "ownership".to_string(),
        filters: kebab_core::SearchFilters::default(),
        k: 10,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), hybrid_query).unwrap();
    assert!(!hits.is_empty(), "expected hybrid hits for 'ownership'");

    let methods: Vec<_> = hits.iter().map(|hit| hit.retrieval.method).collect();
    let all_hybrid = methods
        .iter()
        .all(|method| *method == kebab_core::SearchMode::Hybrid);
    assert!(all_hybrid, "every hit must report method=Hybrid: {methods:?}");
}
// First run downloads ~470MB; expect ~30-60s warm, several minutes cold.
/// Ingest with embeddings enabled, then check that every vector-mode
/// hit is labelled Vector and carries the configured embedding model id.
#[test]
#[ignore = "AVX-required (Lance SIMD kernels)"]
fn ingest_then_vector_search_carries_embedding_model() {
    require_avx_or_panic();
    let env = TestEnv::with_embeddings();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
    assert_eq!(report.new, 3);

    let vector_query = kebab_core::SearchQuery {
        mode: kebab_core::SearchMode::Vector,
        text: "ownership".to_string(),
        filters: kebab_core::SearchFilters::default(),
        k: 10,
    };
    let hits = kebab_app::search_with_config(env.config.clone(), vector_query).unwrap();
    assert!(!hits.is_empty(), "expected vector hits for 'ownership'");

    // Vector mode goes through `VectorRetriever`, which MUST stamp each
    // hit with the embedding_model id taken from the config.
    let expected = kebab_core::EmbeddingModelId(env.config.models.embedding.model.clone());
    for hit in hits.iter() {
        assert_eq!(
            hit.embedding_model.as_ref(),
            Some(&expected),
            "vector-mode hit must carry embedding_model={expected:?}: {hit:?}"
        );
        assert_eq!(
            hit.retrieval.method,
            kebab_core::SearchMode::Vector,
            "vector-mode hit must report method=Vector"
        );
    }
}