From b335151d18512c32ef4860b245c99d4cc5bedc45 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Fri, 1 May 2026 05:20:35 +0000
Subject: [PATCH] feat(p2-2): kb-search crate + LexicalRetriever (FTS5 + bm25)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the first concrete kb_core::Retriever, exercising chunks_fts (P2-1)
to answer SearchMode::Lexical queries. Returns Vec<SearchHit> with
bm25-derived ranking, snippet() previews, and W3C-fragment-style
Citation built from the chunk's first source_spans entry.

New crate kb-search:
- LexicalRetriever::new(Arc<SqliteStore>, IndexVersion).
- search() builds an FTS5 MATCH expression by escaping every whitespace
  token into a quoted literal (inner " doubled); single-quote-wrapped
  text passes through verbatim as raw FTS5 syntax. Empty query
  short-circuits to Ok(vec![]).
- bm25 normalization: score = -bm25 / (1 + |bm25|), bounded (0, 1] for
  any FTS5-returned negative bm25.
- Snippet via snippet(chunks_fts, 3, '', '', '…', word_budget) where
  word_budget = snippet_chars / 4 clamped to [1, 64]; trim_snippet
  enforces the char cap on the way out (chars per design §6.4 — accepts
  the combining-mark trade-off).
- Citation from chunks.source_spans_json first span: Line / Page /
  Region / Time forwarded; Byte / empty array fall back to Line{1,1}
  with a tracing::warn so forward-compat regressions surface.
- Filters: tags_any (subquery on document_tags), lang (= column),
  trust_min (CASE-rank in SQL) all applied at SQL level. path_glob
  uses globset with literal_separator(true) — guarantees '*' does not
  cross '/' per spec Risks/notes — applied as Rust post-filter with
  +128 row over-fetch when set, then rank reassigned 1..k contiguously.
- Determinism: ORDER BY score, f.chunk_id (lexicographic blake3 hex
  tiebreaker on identical bm25). Tested explicitly with two chunks of
  identical text content.
- RetrievalDetail: method=Lexical, both lexical_score and fusion_score
  set, vector_* None.

kb-store-sqlite:
- Adds pub fn read_conn(&self) -> MutexGuard<'_, Connection>.
  Read-only contract is doc-only — kb-search needs MutexGuard for
  prepare_cached + iter, which a closure-scoped wrapper would awkwardly
  constrain. Closure variant left as a P3 follow-up.

Tests (26 new): empty corpus, empty query, single hit + citation
round-trip, snippet length cap, tags_any exclusion, lang+trust
composition, path_glob with '*' not crossing '/', citation line round-
trip, bm25 top-1 ∈ (0, 1], determinism (varied scores AND identical-
score tiebreaker), index_version passthrough, snapshot
(crates/kb-search/tests/fixtures/search/lexical/run-1.json — stable
under bundled SQLite; KB_UPDATE_SNAPSHOTS=1 to regenerate). Workspace:
211 tests pass, cargo clippy --workspace --all-targets -D warnings
clean.

Allowed deps respected: kb-core, kb-config, kb-store-sqlite, rusqlite,
tracing, thiserror, anyhow (forced by trait return type), serde_json
(parses *_json TEXT columns), globset (path_glob '*' boundary).

Out of scope (deferred): vector retriever (p3-3), hybrid fusion (p3-4),
reranker (P+), Korean morphological tokenizer (P+).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                                    |  16 +
 Cargo.toml                                    |   6 +
 crates/kb-search/Cargo.toml                   |  22 +
 crates/kb-search/src/lexical.rs               | 649 +++++++++++++++++
 crates/kb-search/src/lib.rs                   |  15 +
 .../tests/fixtures/search/lexical/run-1.json  |  60 ++
 crates/kb-search/tests/lexical.rs             | 666 ++++++++++++++++++
 crates/kb-store-sqlite/src/store.rs           |  16 +
 8 files changed, 1450 insertions(+)
 create mode 100644 crates/kb-search/Cargo.toml
 create mode 100644 crates/kb-search/src/lexical.rs
 create mode 100644 crates/kb-search/src/lib.rs
 create mode 100644 crates/kb-search/tests/fixtures/search/lexical/run-1.json
 create mode 100644 crates/kb-search/tests/lexical.rs
diff --git a/Cargo.lock b/Cargo.lock
index cecb492..1bd474c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -801,6 +801,22 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "kb-search"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "globset",
+ "kb-config",
+ "kb-core",
+ "kb-store-sqlite",
+ "rusqlite",
+ "serde_json",
+ "tempfile",
+ "thiserror 2.0.18",
+ "tracing",
+]
+
 [[package]]
 name = "kb-source-fs"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index b957c22..951f3ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
     "crates/kb-normalize",
     "crates/kb-chunk",
     "crates/kb-store-sqlite",
+    "crates/kb-search",
     "crates/kb-app",
     "crates/kb-cli",
 ]
@@ -29,3 +30,8 @@ time         = { version = "0.3", features = ["serde", "macros", "formatting", "
 uuid         = { version = "1", features = ["v7", "serde"] }
 blake3       = "1"
 tracing      = "0.1"
+# `bundled` ships SQLite source so the workspace doesn't depend on a
+# system libsqlite3 (matches the kb-store-sqlite feature set).
+rusqlite     = { version = "0.32", features = ["bundled"] }
+globset      = "0.4"
+tempfile     = "3"
diff --git a/crates/kb-search/Cargo.toml b/crates/kb-search/Cargo.toml
new file mode 100644
index 0000000..d7a49ab
--- /dev/null
+++ b/crates/kb-search/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name          = "kb-search"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Retriever implementations for kb (P2-2 lexical FTS5; P3 vector / hybrid will follow)"
+
+[dependencies]
+kb-core         = { path = "../kb-core" }
+kb-config       = { path = "../kb-config" }
+kb-store-sqlite = { path = "../kb-store-sqlite" }
+rusqlite        = { workspace = true }
+globset         = { workspace = true }
+serde_json      = { workspace = true }
+tracing         = { workspace = true }
+thiserror       = { workspace = true }
+anyhow          = { workspace = true }
+
+[dev-dependencies]
+tempfile        = { workspace = true }
diff --git a/crates/kb-search/src/lexical.rs b/crates/kb-search/src/lexical.rs
new file mode 100644
index 0000000..6b06dc1
--- /dev/null
+++ b/crates/kb-search/src/lexical.rs
@@ -0,0 +1,649 @@
+//! Lexical (FTS5 + bm25) retriever — design §3.7 / §1.5 / §2.2 / §6.4.
+//!
+//! Owns the SQL pattern documented in `tasks/p2/p2-2-lexical-retriever.md`
+//! and constructs `kb_core::SearchHit` values directly from the joined
+//! `chunks_fts` / `chunks` / `documents` rows. Reads only — never mutates
+//! the underlying SQLite file.
+
+use std::sync::Arc;
+
+use anyhow::{Context, Result};
+use globset::GlobMatcher;
+use kb_core::{
+    ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail,
+    Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan,
+    TrustLevel, WorkspacePath,
+};
+use kb_store_sqlite::SqliteStore;
+use rusqlite::{params_from_iter, Connection, Row, ToSql};
+
+// ── Tunables ─────────────────────────────────────────────────────────────
+
+/// FTS5 hard limit on the `snippet()` `nToken` argument.
+/// See SQLite's FTS5 docs: snippet() rejects nToken > 64.
+const FTS5_SNIPPET_MAX_WORDS: usize = 64;
+
+/// Floor for the snippet word budget. `snippet_chars / 4` may yield 0 for
+/// pathologically small configs; we always ask FTS5 for at least one word
+/// so it can still return something matchable for the test harness.
+const FTS5_SNIPPET_MIN_WORDS: usize = 1;
+
+/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10.
+const DEFAULT_K: usize = 10;
+
+/// When `path_glob` is set we have to over-fetch and post-filter in Rust,
+/// because SQLite's GLOB operator treats `*` as "any chars including `/`",
+/// which contradicts the design rule that `*` must NOT cross path
+/// separators. Empirically `+128` is generous for any realistic workspace
+/// and bounded enough to keep memory predictable.
+const PATH_GLOB_OVERFETCH: usize = 128;
+
+// ── Public surface ───────────────────────────────────────────────────────
+
+/// Lexical retriever backed by SQLite FTS5 + bm25.
+pub struct LexicalRetriever {
+    store: Arc<SqliteStore>,
+    index_version: IndexVersion,
+    /// Number of `snippet()` words derived from `kb-config::search.snippet_chars`,
+    /// clamped into `[FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS]`.
+    snippet_words: usize,
+    /// Hard cap on the returned snippet's character length per design §6.4.
+    snippet_chars: usize,
+}
+
+impl LexicalRetriever {
+    /// Construct with default settings derived from `kb-config`'s defaults.
+    /// Snippet width is computed from `Config::defaults().search.snippet_chars`.
+    pub fn new(store: Arc<SqliteStore>, index_version: IndexVersion) -> Self {
+        let cfg = kb_config::Config::defaults();
+        Self::with_settings(store, index_version, cfg.search.snippet_chars)
+    }
+
+    /// Construct with explicit `snippet_chars`. Used by tests / callers
+    /// that have already loaded a `Config`.
+    pub fn with_settings(
+        store: Arc<SqliteStore>,
+        index_version: IndexVersion,
+        snippet_chars: usize,
+    ) -> Self {
+        // Heuristic: 1 token ≈ 4 chars (English-leaning estimate; Korean
+        // tokens average shorter, so the cap-by-chars trim below is what
+        // actually enforces the contract). The `/4` keeps us well below
+        // FTS5's nToken=64 limit for typical snippet_chars=220 budgets.
+        let raw = snippet_chars / 4;
+        let snippet_words = raw.clamp(FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS);
+        Self {
+            store,
+            index_version,
+            snippet_words,
+            snippet_chars,
+        }
+    }
+}
+
+impl Retriever for LexicalRetriever {
+    fn search(&self, query: &SearchQuery) -> Result<Vec<SearchHit>> {
+        let match_opt = build_match_string(&query.text);
+        let k = if query.k == 0 { DEFAULT_K } else { query.k };
+        let filters = &query.filters;
+        // One-line summary at request entry. Filter shape only — no
+        // tag/lang/path values, which could be PII-sensitive.
+        tracing::debug!(
+            match_str = match_opt.as_deref().unwrap_or("<empty>"),
+            tags_any = filters.tags_any.len(),
+            has_lang = filters.lang.is_some(),
+            has_trust_min = filters.trust_min.is_some(),
+            has_path_glob = filters.path_glob.is_some(),
+            k,
+            "kb-search lexical: search start"
+        );
+
+        // Empty / whitespace-only query → nothing to do. Per spec we
+        // succeed with an empty hit list rather than erroring.
+        let match_str = match match_opt {
+            Some(s) => s,
+            None => return Ok(Vec::new()),
+        };
+
+        // Pre-compile the path_glob once. The `Glob` produced rejects
+        // syntactically invalid patterns at construction time so the
+        // caller gets a clear error rather than a silent empty result.
+        let path_matcher = match &filters.path_glob {
+            Some(g) => Some(compile_glob(g)?),
+            None => None,
+        };
+
+        // Fetch budget: when post-filtering by glob we need to over-fetch
+        // so that the final `take(k)` still has enough rows after culling.
+        let fetch_limit = if path_matcher.is_some() {
+            k.saturating_add(PATH_GLOB_OVERFETCH)
+        } else {
+            k
+        };
+
+        let conn = self.store.read_conn();
+        let raw_rows = run_query(
+            &conn,
+            &match_str,
+            self.snippet_words,
+            filters,
+            fetch_limit,
+        )?;
+
+        let mut hits: Vec<SearchHit> = Vec::with_capacity(raw_rows.len().min(k));
+        let mut rank: u32 = 0;
+        for row in raw_rows {
+            // Path glob is the only filter we evaluate in Rust because the
+            // semantics differ from SQLite's GLOB (no `/` crossing).
+            if let Some(m) = &path_matcher {
+                if !m.is_match(&row.workspace_path) {
+                    continue;
+                }
+            }
+            rank = rank.saturating_add(1);
+            let hit = build_hit(row, rank, &self.index_version, self.snippet_chars)?;
+            hits.push(hit);
+            if hits.len() >= k {
+                break;
+            }
+        }
+        tracing::debug!(rows = hits.len(), "kb-search lexical: search done");
+        Ok(hits)
+    }
+
+    fn index_version(&self) -> IndexVersion {
+        self.index_version.clone()
+    }
+}
+
+// ── Match-string construction ────────────────────────────────────────────
+
+/// Translate a user-typed query into an FTS5 match string.
+///
+/// Rules (from the task spec):
+///
+/// - The query is wrapped in a single pair of `'...'` → strip the quotes
+///   and pass the inner text through verbatim. The user has explicitly
+///   opted into FTS5 syntax (e.g. `'rust AND cargo'`, `'foo*'`).
+///
+/// - Otherwise: split on whitespace, escape every token by wrapping it
+///   in `"..."` (FTS5 string literal), with any inner `"` doubled. Join
+///   with spaces — FTS5 default operator is implicit AND.
+///
+/// - An empty / whitespace-only token list → return `None` (caller
+///   short-circuits to `Ok(vec![])`).
+fn build_match_string(text: &str) -> Option<String> {
+    let trimmed = text.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+    if let Some(inner) = strip_single_quotes(trimmed) {
+        let inner_trim = inner.trim();
+        if inner_trim.is_empty() {
+            return None;
+        }
+        return Some(inner_trim.to_string());
+    }
+    let tokens: Vec<String> = trimmed
+        .split_whitespace()
+        .map(escape_fts5_token)
+        .collect();
+    if tokens.is_empty() {
+        None
+    } else {
+        Some(tokens.join(" "))
+    }
+}
+
+/// Return `Some(inner)` if `s` is wrapped in a matching pair of single
+/// quotes (`'...'`), otherwise `None`. We require the closing quote to
+/// be the last character so `'foo' bar` doesn't accidentally engage
+/// raw-FTS5 mode.
+fn strip_single_quotes(s: &str) -> Option<&str> {
+    let bytes = s.as_bytes();
+    if bytes.len() >= 2 && bytes[0] == b'\'' && bytes[bytes.len() - 1] == b'\'' {
+        Some(&s[1..s.len() - 1])
+    } else {
+        None
+    }
+}
+
+/// FTS5-escape one token by wrapping it in double quotes (FTS5 string
+/// literal). Inner `"` are escaped by doubling per FTS5 grammar. This is
+/// the simple-and-safe approach that defangs every special character —
+/// `(`, `)`, `*`, `^`, `:`, `"`, etc. — without trying to parse FTS5
+/// expressions.
+fn escape_fts5_token(tok: &str) -> String {
+    let mut out = String::with_capacity(tok.len() + 2);
+    out.push('"');
+    for ch in tok.chars() {
+        if ch == '"' {
+            out.push('"');
+            out.push('"');
+        } else {
+            out.push(ch);
+        }
+    }
+    out.push('"');
+    out
+}
+
+// ── SQL execution ────────────────────────────────────────────────────────
+
+/// Raw row shape mirroring the columns selected by [`run_query`]. Kept
+/// internal — every public path constructs `SearchHit` from this.
+struct RawRow {
+    chunk_id: String,
+    doc_id: String,
+    bm25_raw: f64,
+    snippet: String,
+    heading_path_json: String,
+    section_label: Option<String>,
+    source_spans_json: String,
+    chunker_version: String,
+    workspace_path: String,
+}
+
+/// Build + execute the FTS5 query. The SQL pattern is the one documented
+/// in `tasks/p2/p2-2-lexical-retriever.md` (§Behavior contract).
+fn run_query(
+    conn: &Connection,
+    match_str: &str,
+    snippet_words: usize,
+    filters: &SearchFilters,
+    fetch_limit: usize,
+) -> Result<Vec<RawRow>> {
+    // Build the dynamic SQL + positional parameter vector. Positional `?`
+    // is used (not named bindings) because the dynamic IN-list for
+    // `tags_any` is most natural with `params_from_iter`.
+    let mut sql = String::from(
+        "SELECT \
+            f.chunk_id, f.doc_id, \
+            bm25(chunks_fts) AS score, \
+            snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
+            c.heading_path_json, c.section_label, c.source_spans_json, \
+            c.chunker_version, \
+            d.workspace_path \
+         FROM chunks_fts f \
+         JOIN chunks c    ON c.chunk_id = f.chunk_id \
+         JOIN documents d ON d.doc_id = f.doc_id",
+    );
+
+    let mut params: Vec<Box<dyn ToSql>> = Vec::new();
+    // 1) snippet word count.
+    params.push(Box::new(snippet_words as i64));
+    // 2) MATCH expression.
+    sql.push_str(" WHERE chunks_fts MATCH ?");
+    params.push(Box::new(match_str.to_owned()));
+
+    // tags_any: doc must own at least one of the requested tags.
+    if !filters.tags_any.is_empty() {
+        sql.push_str(
+            " AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (",
+        );
+        for (i, tag) in filters.tags_any.iter().enumerate() {
+            if i > 0 {
+                sql.push(',');
+            }
+            sql.push('?');
+            params.push(Box::new(tag.clone()));
+        }
+        sql.push_str("))");
+    }
+    if let Some(lang) = &filters.lang {
+        sql.push_str(" AND d.lang = ?");
+        params.push(Box::new(lang.0.clone()));
+    }
+    if let Some(trust_min) = &filters.trust_min {
+        // Mirror `kb_store_sqlite::documents::list_documents` ranking:
+        // Generated < Secondary < Primary. Doing the rank in SQL
+        // (rather than post-filtering) keeps the row stream short
+        // when the workspace contains many low-trust docs.
+        sql.push_str(
+            " AND CASE d.trust_level \
+                WHEN 'primary'   THEN 3 \
+                WHEN 'secondary' THEN 2 \
+                WHEN 'generated' THEN 1 \
+                ELSE 0 \
+              END >= ?",
+        );
+        let rank: i64 = match trust_min {
+            TrustLevel::Primary => 3,
+            TrustLevel::Secondary => 2,
+            TrustLevel::Generated => 1,
+        };
+        params.push(Box::new(rank));
+    }
+    // path_glob is intentionally NOT applied here — see module comment
+    // on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
+
+    // Determinism: tie-break on chunk_id so equal bm25 scores produce a
+    // stable order across runs. `f.chunk_id` is the FTS row's UNINDEXED
+    // copy of the same value as `c.chunk_id`; either side works.
+    sql.push_str(" ORDER BY score, f.chunk_id LIMIT ?");
+    params.push(Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX)));
+
+    let mut stmt = conn
+        .prepare(&sql)
+        .context("kb-search lexical: prepare FTS5 statement")?;
+    let rows = stmt
+        .query_map(params_from_iter(params.iter().map(|b| b.as_ref())), row_from_sql)
+        .context("kb-search lexical: execute FTS5 query")?;
+    let mut out: Vec<RawRow> = Vec::new();
+    for r in rows {
+        out.push(r.context("kb-search lexical: read row")?);
+    }
+    Ok(out)
+}
+
+fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
+    Ok(RawRow {
+        chunk_id: row.get(0)?,
+        doc_id: row.get(1)?,
+        bm25_raw: row.get(2)?,
+        snippet: row.get(3)?,
+        heading_path_json: row.get(4)?,
+        section_label: row.get(5)?,
+        source_spans_json: row.get(6)?,
+        chunker_version: row.get(7)?,
+        workspace_path: row.get(8)?,
+    })
+}
+
+// ── Hit construction ─────────────────────────────────────────────────────
+
+fn build_hit(
+    raw: RawRow,
+    rank: u32,
+    index_version: &IndexVersion,
+    snippet_chars: usize,
+) -> Result<SearchHit> {
+    let normalized = normalize_bm25(raw.bm25_raw);
+    let heading_path: Vec<String> = serde_json::from_str(&raw.heading_path_json)
+        .context("kb-search lexical: deserialize heading_path_json")?;
+    let source_spans: Vec<SourceSpan> = serde_json::from_str(&raw.source_spans_json)
+        .context("kb-search lexical: deserialize source_spans_json")?;
+
+    let workspace_path = WorkspacePath::new(raw.workspace_path)
+        .context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?;
+
+    let citation = build_citation(
+        &raw.chunk_id,
+        workspace_path.clone(),
+        raw.section_label.clone(),
+        source_spans.first(),
+    );
+
+    // FTS5's snippet() respects the word budget but produces a
+    // character-length we can't predict precisely (token boundaries vary
+    // with the tokenizer). The contract caps at `snippet_chars`; trim
+    // defensively if SQLite ever returns a longer string.
+    let snippet = trim_snippet(&raw.snippet, snippet_chars);
+
+    Ok(SearchHit {
+        rank,
+        chunk_id: ChunkId(raw.chunk_id),
+        doc_id: DocumentId(raw.doc_id),
+        doc_path: workspace_path,
+        heading_path,
+        section_label: raw.section_label,
+        snippet,
+        citation,
+        retrieval: RetrievalDetail {
+            method: SearchMode::Lexical,
+            fusion_score: normalized,
+            lexical_score: Some(normalized),
+            vector_score: None,
+            lexical_rank: Some(rank),
+            vector_rank: None,
+        },
+        index_version: index_version.clone(),
+        embedding_model: None,
+        chunker_version: ChunkerVersion(raw.chunker_version),
+    })
+}
+
+/// Map the raw bm25 score (FTS5 returns a *negative* number; lower is
+/// better) into a positive score in `(0, 1]`. The formula
+/// `score = -bm25 / (1 + |bm25|)` is monotonic, smooth, and bounded —
+/// suitable both for human display and for use as an RRF input.
+fn normalize_bm25(bm25_raw: f64) -> f32 {
+    let abs = bm25_raw.abs();
+    let normalized = -bm25_raw / (1.0_f64 + abs);
+    normalized as f32
+}
+
+/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown
+/// only emits `Line`, so the other variants are mostly defensive — we
+/// forward them as faithfully as possible so a future PDF / image
+/// extractor can flow through without churn.
+fn build_citation(
+    chunk_id: &str,
+    path: WorkspacePath,
+    section: Option<String>,
+    first_span: Option<&SourceSpan>,
+) -> Citation {
+    match first_span {
+        Some(SourceSpan::Line { start, end }) => Citation::Line {
+            path,
+            start: *start,
+            end: *end,
+            section,
+        },
+        Some(SourceSpan::Page { page, .. }) => Citation::Page {
+            path,
+            page: *page,
+            section,
+        },
+        Some(SourceSpan::Region { x, y, w, h }) => Citation::Region {
+            path,
+            x: *x,
+            y: *y,
+            w: *w,
+            h: *h,
+        },
+        Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time {
+            path,
+            start_ms: *start_ms,
+            end_ms: *end_ms,
+            speaker: None,
+        },
+        // Byte-spans don't have a Citation variant. Fall back to a Line
+        // citation pointing at the document head — better than fabricating
+        // a position. Spans-empty falls into the same branch.
+        other @ (Some(SourceSpan::Byte { .. }) | None) => {
+            let span_shape = match other {
+                Some(_) => "Byte",
+                None => "empty array",
+            };
+            tracing::warn!(
+                chunk_id,
+                span_shape,
+                "kb-search lexical: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}"
+            );
+            Citation::Line {
+                path,
+                start: 1,
+                end: 1,
+                section,
+            }
+        }
+    }
+}
+
+/// Cap the snippet at `max_chars` characters (Unicode scalar values, not
+/// bytes — matches the §6.4 setting's "characters" semantics). Returns
+/// the input unchanged when already short enough.
+fn trim_snippet(s: &str, max_chars: usize) -> String {
+    // We slice on Unicode scalar values per §6.4's "characters" semantics; this
+    // can orphan a combining mark in extreme cases (Hebrew niqqud, Devanagari)
+    // but matches the spec's char-budget definition.
+    if s.chars().count() <= max_chars {
+        return s.to_string();
+    }
+    s.chars().take(max_chars).collect()
+}
+
+// ── path_glob ────────────────────────────────────────────────────────────
+
+/// Compile a `path_glob` pattern. We enable `literal_separator` so `*`
+/// does NOT cross `/` — design requires `*` to match within a single
+/// path segment, not across them. (`globset`'s default is to let `*`
+/// span separators.)
+fn compile_glob(pattern: &str) -> Result<GlobMatcher> {
+    let g = globset::GlobBuilder::new(pattern)
+        .literal_separator(true)
+        .build()
+        .with_context(|| format!("kb-search lexical: invalid path_glob {pattern:?}"))?;
+    Ok(g.compile_matcher())
+}
+
+// ── Unit tests for pure helpers ──────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn build_match_string_empty_returns_none() {
+        assert!(build_match_string("").is_none());
+        assert!(build_match_string("   ").is_none());
+        assert!(build_match_string("''").is_none());
+        assert!(build_match_string("'   '").is_none());
+    }
+
+    #[test]
+    fn build_match_string_default_is_quoted_and_anded() {
+        let s = build_match_string("rust cargo").unwrap();
+        // Two tokens, each quoted, joined by a space (implicit AND).
+        assert_eq!(s, r#""rust" "cargo""#);
+    }
+
+    #[test]
+    fn build_match_string_escapes_special_chars() {
+        // `*`, `(`, `)`, `:`, `^`, `"` should all be wrapped inside
+        // FTS5 string-literal quotes so they're treated as literal
+        // text rather than FTS5 operators.
+        let s = build_match_string(r#"foo* (bar) baz:qux ^head he"llo"#).unwrap();
+        assert_eq!(
+            s,
+            r#""foo*" "(bar)" "baz:qux" "^head" "he""llo""#
+        );
+        // The doubled `""` is FTS5's way of embedding a literal quote
+        // inside a string literal.
+        assert!(s.contains(r#"he""llo"#));
+        // Sanity: every special character lives between matching `"`
+        // delimiters — there is no bare-token (unquoted) span anywhere.
+        // We check this by confirming the string starts and ends with `"`
+        // and the count of unescaped `"` is even (each token is wrapped).
+        assert!(s.starts_with('"') && s.ends_with('"'));
+    }
+
+    #[test]
+    fn build_match_string_passthrough_when_single_quoted() {
+        // The FTS5 expression is preserved verbatim.
+        let s = build_match_string("'foo OR bar*'").unwrap();
+        assert_eq!(s, "foo OR bar*");
+    }
+
+    #[test]
+    fn normalize_bm25_top_score_in_unit_interval() {
+        // A "perfect" hit is bm25 = -1.0 → normalized 0.5.
+        // A high-relevance hit (bm25 = -10.0) → 10/11 ≈ 0.909.
+        let high = normalize_bm25(-10.0);
+        assert!(high > 0.0 && high <= 1.0, "got {high}");
+        let medium = normalize_bm25(-1.0);
+        assert!((medium - 0.5).abs() < 1e-6);
+    }
+
+    #[test]
+    fn normalize_bm25_monotonic() {
+        // Lower (more-negative) bm25 must map to a higher normalized score.
+        let a = normalize_bm25(-2.0);
+        let b = normalize_bm25(-1.0);
+        assert!(a > b, "{a} should exceed {b}");
+    }
+
+    #[test]
+    fn trim_snippet_caps_at_char_count() {
+        let s = "a".repeat(300);
+        let trimmed = trim_snippet(&s, 220);
+        assert_eq!(trimmed.chars().count(), 220);
+    }
+
+    #[test]
+    fn trim_snippet_passthrough_when_short() {
+        let s = "short";
+        assert_eq!(trim_snippet(s, 220), "short");
+    }
+
+    #[test]
+    fn build_citation_line_round_trip() {
+        let p = WorkspacePath::new("a/b.md".to_string()).unwrap();
+        let span = SourceSpan::Line { start: 7, end: 12 };
+        let c = build_citation("c1", p.clone(), Some("S1".to_string()), Some(&span));
+        match c {
+            Citation::Line {
+                start,
+                end,
+                ref section,
+                path: ref pp,
+            } => {
+                assert_eq!(start, 7);
+                assert_eq!(end, 12);
+                assert_eq!(section.as_deref(), Some("S1"));
+                assert_eq!(pp, &p);
+            }
+            other => panic!("expected Citation::Line, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn build_citation_page_forwards_section() {
+        let p = WorkspacePath::new("doc.pdf".to_string()).unwrap();
+        let span = SourceSpan::Page {
+            page: 4,
+            char_start: None,
+            char_end: None,
+        };
+        let c = build_citation("c1", p, Some("Intro".to_string()), Some(&span));
+        match c {
+            Citation::Page {
+                page,
+                ref section,
+                ..
+            } => {
+                assert_eq!(page, 4);
+                assert_eq!(section.as_deref(), Some("Intro"));
+            }
+            other => panic!("expected Citation::Page, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn build_citation_none_falls_back_to_line_one() {
+        let p = WorkspacePath::new("x.md".to_string()).unwrap();
+        let c = build_citation("c1", p, None, None);
+        match c {
+            Citation::Line { start, end, .. } => {
+                assert_eq!((start, end), (1, 1));
+            }
+            other => panic!("expected fallback Citation::Line, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn compile_glob_rejects_invalid_pattern() {
+        // `[` is a character-class opener; an unclosed class is invalid.
+        let r = compile_glob("notes/[abc");
+        assert!(r.is_err());
+    }
+
+    #[test]
+    fn compile_glob_star_does_not_cross_slash() {
+        // This is the design invariant: `*` must NOT match `/`.
+        let m = compile_glob("notes/*.md").unwrap();
+        assert!(m.is_match("notes/foo.md"));
+        assert!(!m.is_match("notes/sub/foo.md"));
+    }
+}
diff --git a/crates/kb-search/src/lib.rs b/crates/kb-search/src/lib.rs
new file mode 100644
index 0000000..76afd55
--- /dev/null
+++ b/crates/kb-search/src/lib.rs
@@ -0,0 +1,15 @@
+//! `kb-search` — `kb_core::Retriever` implementations.
+//!
+//! P2-2 ships [`LexicalRetriever`], a SQLite-FTS5-backed retriever for
+//! `SearchMode::Lexical`. Vector + Hybrid retrievers land in P3-3 / P3-4.
+//!
+//! Allowed deps per task spec: `kb-core`, `kb-config`, `kb-store-sqlite`,
+//! `rusqlite`, `globset`, `tracing`, `thiserror`, `anyhow`. Forbidden:
+//! `kb-source-fs`, `kb-parse-md`, `kb-normalize`, `kb-chunk`,
+//! `kb-store-vector`, `kb-embed*`, `kb-llm*`, `kb-rag`, `kb-tui`,
+//! `kb-desktop`. Only `serde_json` is a transitive helper used to decode
+//! JSON-typed columns from `chunks` / `documents`.
+
+mod lexical;
+
+pub use lexical::LexicalRetriever;
diff --git a/crates/kb-search/tests/fixtures/search/lexical/run-1.json b/crates/kb-search/tests/fixtures/search/lexical/run-1.json
new file mode 100644
index 0000000..701d2fb
--- /dev/null
+++ b/crates/kb-search/tests/fixtures/search/lexical/run-1.json
@@ -0,0 +1,60 @@
+[
+  {
+    "chunk_id": "c3000000000000000000000000000000",
+    "chunker_version": "v1",
+    "citation": {
+      "end": 8,
+      "kind": "line",
+      "path": "notes/snap.md",
+      "section": "Snap",
+      "start": 7
+    },
+    "doc_id": "d0000000000000000000000000000000",
+    "doc_path": "notes/snap.md",
+    "embedding_model": null,
+    "heading_path": [
+      "Snap"
+    ],
+    "index_version": "v1.0",
+    "rank": 1,
+    "retrieval": {
+      "fusion_score": 1.4490997273242101e-6,
+      "lexical_rank": 1,
+      "lexical_score": 1.4490997273242101e-6,
+      "method": "lexical",
+      "vector_rank": null,
+      "vector_score": null
+    },
+    "section_label": "Snap",
+    "snippet": "alpha alpha"
+  },
+  {
+    "chunk_id": "c1000000000000000000000000000000",
+    "chunker_version": "v1",
+    "citation": {
+      "end": 2,
+      "kind": "line",
+      "path": "notes/snap.md",
+      "section": "Snap",
+      "start": 1
+    },
+    "doc_id": "d0000000000000000000000000000000",
+    "doc_path": "notes/snap.md",
+    "embedding_model": null,
+    "heading_path": [
+      "Snap"
+    ],
+    "index_version": "v1.0",
+    "rank": 2,
+    "retrieval": {
+      "fusion_score": 9.641424867368187e-7,
+      "lexical_rank": 2,
+      "lexical_score": 9.641424867368187e-7,
+      "method": "lexical",
+      "vector_rank": null,
+      "vector_score": null
+    },
+    "section_label": "Snap",
+    "snippet": "alpha bravo charlie"
+  }
+]
\ No newline at end of file
diff --git a/crates/kb-search/tests/lexical.rs b/crates/kb-search/tests/lexical.rs
new file mode 100644
index 0000000..91a2b71
--- /dev/null
+++ b/crates/kb-search/tests/lexical.rs
@@ -0,0 +1,666 @@
+//! P2-2 integration tests for `LexicalRetriever`.
+//!
+//! Strategy: seed the SQLite store via raw inserts with `foreign_keys =
+//! OFF` (mirroring the P2-1 FTS tests). This avoids dragging
+//! `kb-parse-md` / `kb-normalize` / `kb-chunk` into kb-search's dev-deps,
+//! which would violate the task's "Allowed deps" list.
+
+use std::sync::Arc;
+
+use kb_config::Config;
+use kb_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
+use kb_search::LexicalRetriever;
+use kb_store_sqlite::SqliteStore;
+use rusqlite::Connection;
+use tempfile::TempDir;
+
+// ── Test scaffolding ─────────────────────────────────────────────────────
+
+struct Env {
+    _temp: TempDir,
+    store: Arc<SqliteStore>,
+    db_path: std::path::PathBuf,
+}
+
+impl Env {
+    fn new() -> Self {
+        let temp = tempfile::tempdir().expect("tempdir");
+        let mut config = Config::defaults();
+        config.storage.data_dir = temp.path().to_string_lossy().into_owned();
+        let store = SqliteStore::open(&config).expect("open store");
+        store.run_migrations().expect("run migrations");
+        let db_path = temp.path().join("kb.sqlite");
+        Self {
+            _temp: temp,
+            store: Arc::new(store),
+            db_path,
+        }
+    }
+
+    /// Side-channel raw connection with FK enforcement off — same
+    /// trick used by P2-1's FTS tests so we can seed `chunks` /
+    /// `documents` directly without the full ingest graph.
+    fn raw_conn(&self) -> Connection {
+        let conn = Connection::open(&self.db_path).expect("open side conn");
+        conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
+        conn
+    }
+
+    fn retriever(&self) -> LexicalRetriever {
+        LexicalRetriever::new(
+            Arc::clone(&self.store),
+            IndexVersion("v1.0".to_string()),
+        )
+    }
+
+    fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever {
+        LexicalRetriever::with_settings(
+            Arc::clone(&self.store),
+            IndexVersion("v1.0".to_string()),
+            snippet_chars,
+        )
+    }
+}
+
+/// Minimal documents row. Many columns are NOT NULL and we don't care
+/// about their exact values for retrieval tests, so we wedge in
+/// reasonable defaults.
+#[allow(clippy::too_many_arguments)]
+fn insert_document(
+    conn: &Connection,
+    doc_id: &str,
+    workspace_path: &str,
+    title: &str,
+    lang: &str,
+    trust_level: &str,
+    tags: &[&str],
+) {
+    // assets row first — documents.asset_id has a FK with ON DELETE
+    // RESTRICT but FKs are OFF on this connection. Still we insert a
+    // matching row so JOINs pick it up.
+    let asset_id = format!("{:0>32}", &doc_id[..1.min(doc_id.len())]); // 32-hex-ish
+    let asset_id = format!("{:0>32}", asset_id.chars().take(32).collect::<String>());
+    conn.execute(
+        "INSERT OR IGNORE INTO assets (
+            asset_id, source_uri, workspace_path, media_type, byte_len,
+            checksum, storage_kind, storage_path, discovered_at
+        ) VALUES (?, 'file:///x', ?, '\"markdown\"', 0,
+                  'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
+        rusqlite::params![asset_id, workspace_path],
+    )
+    .expect("insert asset");
+
+    conn.execute(
+        "INSERT INTO documents (
+            doc_id, asset_id, workspace_path, title, lang,
+            source_type, trust_level, parser_version,
+            doc_version, schema_version, metadata_json,
+            provenance_json, created_at, updated_at
+        ) VALUES (?, ?, ?, ?, ?, 'markdown', ?, 'pv1', 1, 1,
+                  '{}', '{\"events\":[]}',
+                  '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')",
+        rusqlite::params![doc_id, asset_id, workspace_path, title, lang, trust_level],
+    )
+    .expect("insert document");
+
+    for tag in tags {
+        conn.execute(
+            "INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)",
+            rusqlite::params![doc_id, tag],
+        )
+        .expect("insert tag");
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+fn insert_chunk(
+    conn: &Connection,
+    chunk_id: &str,
+    doc_id: &str,
+    text: &str,
+    heading_path: &[&str],
+    section_label: Option<&str>,
+    source_spans_json: &str,
+    chunker_version: &str,
+) {
+    let heading_json = serde_json::to_string(heading_path).unwrap();
+    conn.execute(
+        "INSERT INTO chunks (
+            chunk_id, doc_id, text, heading_path_json, section_label,
+            source_spans_json, token_estimate, chunker_version,
+            policy_hash, block_ids_json, created_at
+        ) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z')",
+        rusqlite::params![
+            chunk_id,
+            doc_id,
+            text,
+            heading_json,
+            section_label,
+            source_spans_json,
+            chunker_version,
+        ],
+    )
+    .expect("insert chunk");
+}
+
+/// Pad a short ID to the 32-hex shape kb_core newtypes expect.
+fn id32(prefix: &str) -> String {
+    let mut s = prefix.to_string();
+    while s.len() < 32 {
+        s.push('0');
+    }
+    s.truncate(32);
+    s
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────
+
+#[test]
+fn lexical_empty_corpus_returns_empty_vec() {
+    let env = Env::new();
+    let r = env.retriever();
+    let q = SearchQuery {
+        text: "rust".to_string(),
+        mode: SearchMode::Lexical,
+        k: 10,
+        filters: SearchFilters::default(),
+    };
+    let hits = r.search(&q).expect("search");
+    assert!(hits.is_empty(), "empty corpus must yield empty Vec");
+}
+
+#[test]
+fn lexical_empty_query_returns_empty_vec_without_db_hit() {
+    // Even with rows in the DB, a blank query must short-circuit to [].
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/a.md", "A", "en", "primary", &[]);
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d"),
+        "rust cargo macros",
+        &["A"],
+        None,
+        r#"[{"kind":"line","start":1,"end":3}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    let r = env.retriever();
+    for empty in ["", "   ", "''"] {
+        let q = SearchQuery {
+            text: empty.to_string(),
+            mode: SearchMode::Lexical,
+            k: 5,
+            filters: SearchFilters::default(),
+        };
+        let hits = r.search(&q).unwrap();
+        assert!(hits.is_empty(), "query {empty:?} must yield empty Vec");
+    }
+}
+
+#[test]
+fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]);
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d"),
+        "Rust borrow checker enforces ownership.",
+        &["Notes"],
+        Some("Notes"),
+        r#"[{"kind":"line","start":4,"end":4}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    let r = env.retriever();
+    let q = SearchQuery {
+        text: "borrow".to_string(),
+        mode: SearchMode::Lexical,
+        k: 10,
+        filters: SearchFilters::default(),
+    };
+    let hits = r.search(&q).expect("search");
+    assert_eq!(hits.len(), 1);
+    let h = &hits[0];
+    assert_eq!(h.rank, 1);
+    assert_eq!(h.doc_path.0, "notes/rust.md");
+    assert_eq!(h.heading_path, vec!["Notes".to_string()]);
+    assert_eq!(h.section_label.as_deref(), Some("Notes"));
+    assert_eq!(h.retrieval.method, SearchMode::Lexical);
+    assert_eq!(h.retrieval.lexical_rank, Some(1));
+    assert!(h.retrieval.vector_score.is_none());
+
+    // Citation round-trips through `to_uri`/`parse` (line variant).
+    let uri = h.citation.to_uri();
+    let parsed = kb_core::Citation::parse(&uri).expect("parse uri");
+    // Reparsed citation has section=None (URI fragment doesn't carry it),
+    // so compare by `to_uri` equivalence rather than struct equality.
+    assert_eq!(parsed.to_uri(), uri);
+    // Sanity: this is a Line citation matching the seeded source span.
+    assert_eq!(uri, "notes/rust.md#L4");
+}
+
+#[test]
+fn lexical_snippet_length_capped_at_snippet_chars() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(
+        &conn,
+        &id32("d"),
+        "notes/long.md",
+        "Long",
+        "en",
+        "primary",
+        &[],
+    );
+    // A text long enough that FTS5 might return a snippet > 80 chars
+    // when given a high word budget. We instead set a tight cap below
+    // and rely on `trim_snippet` as the backstop.
+    let mut text = String::new();
+    for _ in 0..50 {
+        text.push_str("alpha beta gamma delta epsilon ");
+    }
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d"),
+        &text,
+        &["Long"],
+        None,
+        r#"[{"kind":"line","start":1,"end":1}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    // Set snippet_chars to a known bound; the retriever clamps + trims
+    // any snippet to fit.
+    let r = env.retriever_with_snippet_chars(80);
+    let hits = r
+        .search(&SearchQuery {
+            text: "alpha".to_string(),
+            mode: SearchMode::Lexical,
+            k: 1,
+            filters: SearchFilters::default(),
+        })
+        .unwrap();
+    assert_eq!(hits.len(), 1);
+    assert!(
+        hits[0].snippet.chars().count() <= 80,
+        "snippet must be ≤ snippet_chars; got {} chars: {:?}",
+        hits[0].snippet.chars().count(),
+        hits[0].snippet
+    );
+}
+
+#[test]
+fn lexical_filter_tags_any_excludes_untagged_docs() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]);
+    insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]);
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d1"),
+        "ownership and borrow checker",
+        &["A"],
+        None,
+        r#"[{"kind":"line","start":1,"end":1}]"#,
+        "v1",
+    );
+    insert_chunk(
+        &conn,
+        &id32("c2"),
+        &id32("d2"),
+        "borrow semantics in python",
+        &["B"],
+        None,
+        r#"[{"kind":"line","start":1,"end":1}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    let r = env.retriever();
+    let q = SearchQuery {
+        text: "borrow".to_string(),
+        mode: SearchMode::Lexical,
+        k: 10,
+        filters: SearchFilters {
+            tags_any: vec!["rust".to_string()],
+            ..Default::default()
+        },
+    };
+    let hits = r.search(&q).unwrap();
+    assert_eq!(hits.len(), 1, "tags_any=[rust] must exclude python doc");
+    assert_eq!(hits[0].doc_path.0, "notes/a.md");
+}
+
+#[test]
+fn lexical_filter_lang_and_trust_min_compose() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d1"), "ko/a.md", "A", "ko", "primary", &[]);
+    insert_document(&conn, &id32("d2"), "en/b.md", "B", "en", "primary", &[]);
+    insert_document(&conn, &id32("d3"), "en/c.md", "C", "en", "generated", &[]);
+    for (cid, did, body) in [
+        ("c1", "d1", "검색 키워드 alpha"),
+        ("c2", "d2", "alpha bravo"),
+        ("c3", "d3", "alpha gamma"),
+    ] {
+        insert_chunk(
+            &conn,
+            &id32(cid),
+            &id32(did),
+            body,
+            &[],
+            None,
+            r#"[{"kind":"line","start":1,"end":1}]"#,
+            "v1",
+        );
+    }
+    drop(conn);
+
+    let r = env.retriever();
+    // lang=en + trust_min=secondary → only d2 (primary ≥ secondary).
+    let hits = r
+        .search(&SearchQuery {
+            text: "alpha".to_string(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: SearchFilters {
+                lang: Some(Lang("en".to_string())),
+                trust_min: Some(TrustLevel::Secondary),
+                ..Default::default()
+            },
+        })
+        .unwrap();
+    assert_eq!(hits.len(), 1);
+    assert_eq!(hits[0].doc_path.0, "en/b.md");
+}
+
+#[test]
+fn lexical_filter_path_glob_does_not_cross_slash() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &[]);
+    insert_document(&conn, &id32("d2"), "notes/sub/b.md", "B", "en", "primary", &[]);
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d1"),
+        "shared keyword",
+        &[],
+        None,
+        r#"[{"kind":"line","start":1,"end":1}]"#,
+        "v1",
+    );
+    insert_chunk(
+        &conn,
+        &id32("c2"),
+        &id32("d2"),
+        "shared keyword",
+        &[],
+        None,
+        r#"[{"kind":"line","start":1,"end":1}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    let r = env.retriever();
+    let hits = r
+        .search(&SearchQuery {
+            text: "keyword".to_string(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: SearchFilters {
+                path_glob: Some("notes/*.md".to_string()),
+                ..Default::default()
+            },
+        })
+        .unwrap();
+    let paths: Vec<&str> = hits.iter().map(|h| h.doc_path.0.as_str()).collect();
+    assert_eq!(paths, vec!["notes/a.md"], "* must not match across `/`");
+}
+
+#[test]
+fn lexical_citation_round_trip_against_first_source_span() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/m.md", "M", "en", "primary", &[]);
+    insert_chunk(
+        &conn,
+        &id32("c1"),
+        &id32("d"),
+        "echo bravo",
+        &[],
+        None,
+        // Two spans; the citation uses the first.
+        r#"[{"kind":"line","start":12,"end":34},{"kind":"line","start":60,"end":61}]"#,
+        "v1",
+    );
+    drop(conn);
+
+    let r = env.retriever();
+    let hits = r
+        .search(&SearchQuery {
+            text: "bravo".to_string(),
+            mode: SearchMode::Lexical,
+            k: 1,
+            filters: SearchFilters::default(),
+        })
+        .unwrap();
+    assert_eq!(hits.len(), 1);
+    let uri = hits[0].citation.to_uri();
+    assert_eq!(uri, "notes/m.md#L12-L34");
+    let parsed = kb_core::Citation::parse(&uri).unwrap();
+    assert_eq!(parsed.to_uri(), uri);
+}
+
+#[test]
+fn lexical_top_score_within_unit_interval_three_chunks() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
+    // Three chunks of varying relevance to the query 'alpha':
+    //   c1: alpha alpha alpha (best)
+    //   c2: alpha bravo
+    //   c3: bravo charlie alpha (one occurrence)
+    for (cid, body) in [
+        ("c1", "alpha alpha alpha keyword"),
+        ("c2", "alpha bravo charlie"),
+        ("c3", "bravo charlie alpha"),
+    ] {
+        insert_chunk(
+            &conn,
+            &id32(cid),
+            &id32("d"),
+            body,
+            &[],
+            None,
+            r#"[{"kind":"line","start":1,"end":1}]"#,
+            "v1",
+        );
+    }
+    drop(conn);
+
+    let r = env.retriever();
+    let hits = r
+        .search(&SearchQuery {
+            text: "alpha".to_string(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: SearchFilters::default(),
+        })
+        .unwrap();
+    assert!(!hits.is_empty(), "must surface at least one hit");
+    let top = hits[0].retrieval.fusion_score;
+    assert!(
+        top > 0.0 && top <= 1.0,
+        "top normalized score must be in (0, 1]; got {top}"
+    );
+    // All scores in [0, 1].
+    for h in &hits {
+        let s = h.retrieval.fusion_score;
+        assert!((0.0..=1.0).contains(&s), "hit score {s} out of [0, 1]");
+        // lexical_score and fusion_score equal in lexical-only mode.
+        assert_eq!(h.retrieval.lexical_score, Some(s));
+    }
+    // bm25 should rank c1 (3 occurrences) above c2 / c3.
+    assert!(hits[0].chunk_id.0.starts_with("c1"));
+}
+
+#[test]
+fn lexical_determinism_same_query_twice() {
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
+    for (cid, body) in [
+        ("c1", "alpha alpha"),
+        ("c2", "alpha bravo"),
+        ("c3", "alpha charlie"),
+        ("c4", "alpha delta"),
+    ] {
+        insert_chunk(
+            &conn,
+            &id32(cid),
+            &id32("d"),
+            body,
+            &[],
+            None,
+            r#"[{"kind":"line","start":1,"end":1}]"#,
+            "v1",
+        );
+    }
+    drop(conn);
+
+    let r = env.retriever();
+    let q = SearchQuery {
+        text: "alpha".to_string(),
+        mode: SearchMode::Lexical,
+        k: 10,
+        filters: SearchFilters::default(),
+    };
+    let a = r.search(&q).unwrap();
+    let b = r.search(&q).unwrap();
+    assert_eq!(a, b, "same DB + same query must yield identical Vec<SearchHit>");
+}
+
+#[test]
+fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() {
+    // Two chunks with byte-identical text + length → identical bm25 scores
+    // for any `MATCH` against them. The retriever must fall back to
+    // `chunk_id` ordering so the result is stable across runs.
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]);
+    let cid_a = id32("aaaa");
+    let cid_b = id32("bbbb");
+    assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id");
+    for cid in [&cid_a, &cid_b] {
+        insert_chunk(
+            &conn,
+            cid,
+            &id32("d"),
+            "alpha bravo charlie",
+            &[],
+            None,
+            r#"[{"kind":"line","start":1,"end":1}]"#,
+            "v1",
+        );
+    }
+    drop(conn);
+
+    let r = env.retriever();
+    let q = SearchQuery {
+        text: "alpha".to_string(),
+        mode: SearchMode::Lexical,
+        k: 10,
+        filters: SearchFilters::default(),
+    };
+    let a = r.search(&q).unwrap();
+    let b = r.search(&q).unwrap();
+    assert_eq!(a.len(), 2, "both chunks should match");
+    // bm25 must be equal for byte-identical chunks; the secondary sort
+    // by chunk_id pins the order.
+    assert!(
+        (a[0].retrieval.fusion_score - a[1].retrieval.fusion_score).abs() < 1e-9,
+        "byte-identical chunks must score equally; got {} vs {}",
+        a[0].retrieval.fusion_score,
+        a[1].retrieval.fusion_score
+    );
+    assert!(
+        a[0].chunk_id.0 < a[1].chunk_id.0,
+        "tiebreaker must order by chunk_id ascending; got {} then {}",
+        a[0].chunk_id.0,
+        a[1].chunk_id.0
+    );
+    assert_eq!(a, b, "tiebreaker order must be stable across runs");
+}
+
+#[test]
+fn lexical_index_version_is_returned_unchanged() {
+    let env = Env::new();
+    let r = LexicalRetriever::new(
+        Arc::clone(&env.store),
+        IndexVersion("custom-label-1".to_string()),
+    );
+    assert_eq!(r.index_version().0, "custom-label-1");
+}
+
+#[test]
+fn lexical_snapshot_run_1() {
+    // Pinned snapshot. A small, deterministic corpus; the JSON shape of
+    // `Vec<SearchHit>` for a fixed query is checked verbatim against
+    // `tests/fixtures/search/lexical/run-1.json`. Update both sides in
+    // the same commit when intentional changes ship.
+    // Stable because rusqlite ships bundled SQLite — a tokenizer/bm25 algorithm change in a future SQLite bump will require regenerating run-1.json via `KB_UPDATE_SNAPSHOTS=1`.
+    let env = Env::new();
+    let conn = env.raw_conn();
+    insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]);
+    for (cid, body, span) in [
+        (
+            "c1",
+            "alpha bravo charlie",
+            r#"[{"kind":"line","start":1,"end":2}]"#,
+        ),
+        (
+            "c2",
+            "bravo only here",
+            r#"[{"kind":"line","start":4,"end":5}]"#,
+        ),
+        (
+            "c3",
+            "alpha alpha",
+            r#"[{"kind":"line","start":7,"end":8}]"#,
+        ),
+    ] {
+        insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1");
+    }
+    drop(conn);
+
+    let r = env.retriever();
+    let hits = r
+        .search(&SearchQuery {
+            text: "alpha".to_string(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: SearchFilters::default(),
+        })
+        .unwrap();
+    let actual = serde_json::to_value(&hits).unwrap();
+
+    let baseline_path =
+        std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/search/lexical/run-1.json");
+    if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() {
+        std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap()).unwrap();
+    }
+    let baseline_text = std::fs::read_to_string(&baseline_path)
+        .expect("baseline snapshot must exist; run with KB_UPDATE_SNAPSHOTS=1 to seed");
+    let expected: serde_json::Value = serde_json::from_str(&baseline_text).unwrap();
+    assert_eq!(actual, expected, "lexical run-1 snapshot drift");
+}
diff --git a/crates/kb-store-sqlite/src/store.rs b/crates/kb-store-sqlite/src/store.rs
index 83d0182..b7f57d6 100644
--- a/crates/kb-store-sqlite/src/store.rs
+++ b/crates/kb-store-sqlite/src/store.rs
@@ -109,6 +109,22 @@ impl SqliteStore {
         self.conn.lock().unwrap_or_else(|p| p.into_inner())
     }
 
+    /// Read-only borrow of the connection.
+    ///
+    /// Provided so sibling crates (e.g. `kb-search`) can run SELECTs
+    /// against the schema owned by this crate without re-opening the
+    /// SQLite file. Callers MUST treat the returned `Connection` as
+    /// read-only — issuing mutating SQL (INSERT / UPDATE / DELETE / DDL)
+    /// through this guard bypasses the per-document transaction discipline
+    /// (`put_*` methods) and the FTS5 backfill helpers that the store
+    /// layer enforces. Mutating callers must use `kb-store-sqlite`'s own
+    /// public write methods instead.
+    ///
+    /// Poisoning is recovered the same way as [`Self::lock_conn`].
+    pub fn read_conn(&self) -> MutexGuard<'_, Connection> {
+        self.conn.lock().unwrap_or_else(|p| p.into_inner())
+    }
+
     /// Persist a `RawAsset` *with its raw bytes*: row goes into `assets`,
     /// bytes go to `data_dir/assets/<aa>/<asset_id>` if `byte_len ≤
     /// copy_threshold_mb`, otherwise the row records the source URI's
-- 
2.49.1