diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 8101e5b..3b2ef9e 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -123,7 +123,29 @@ impl Retriever for LexicalRetriever {
         };
 
         let conn = self.store.read_conn();
-        let raw_rows = run_query(&conn, &match_str, self.snippet_words, filters, fetch_limit)?;
+        let body_rows = run_query(&conn, &match_str, self.snippet_words, filters, fetch_limit)?;
+        // doc-side expansion (V010): re-run the same query against the
+        // `aliases` column of `chunk_aliases_fts`. Empty table → 0 rows →
+        // `body_rows` unchanged (regression-safe). body wins; alias-only
+        // chunks are appended so a term present only in a chunk's aliases
+        // still enters the pool.
+        //
+        // Raw mode (`'...'`) is a body-FTS5 escape hatch and may reference
+        // body-only columns (e.g. `heading_path : ...`) that don't exist on
+        // `chunk_aliases_fts`. Running such an expression against the alias
+        // table is a hard FTS5 error, so we skip the alias channel for raw
+        // queries — they target the body intentionally.
+        let alias_rows = if strip_single_quotes(query.text.trim()).is_some() {
+            Vec::new()
+        } else {
+            match build_match_string_for_column(&query.text, "aliases") {
+                Some(alias_match) => {
+                    run_alias_query(&conn, &alias_match, self.snippet_chars, fetch_limit)?
+                }
+                None => Vec::new(),
+            }
+        };
+        let raw_rows = merge_body_alias(body_rows, alias_rows, fetch_limit);
 
         let mut hits: Vec<SearchHit> = Vec::with_capacity(raw_rows.len().min(k));
         let mut rank: u32 = 0;
@@ -206,6 +228,16 @@ impl Retriever for LexicalRetriever {
 /// match is scoped to the body column. FTS5's column-filter syntax
 /// accepts an arbitrary OR/AND sub-expression inside the parens.
 fn build_match_string(text: &str) -> Option<String> {
+    build_match_string_for_column(text, "text")
+}
+
+/// Column-parameterized variant of [`build_match_string`]. `column` is the
+/// FTS5 column-filter prefix the combined expression is scoped to — `"text"`
+/// for the body channel (`chunks_fts`) or `"aliases"` for the doc-side
+/// expansion channel (`chunk_aliases_fts`, V010). Raw mode (`'...'`) is still
+/// passed through verbatim without any column scoping, so an explicit
+/// user-supplied column filter is honored unchanged.
+fn build_match_string_for_column(text: &str, column: &str) -> Option<String> {
     let trimmed = text.trim();
     if trimmed.is_empty() {
         return None;
@@ -242,7 +274,7 @@ fn build_match_string(text: &str) -> Option<String> {
         (Some(w), Some(a)) if w == a => w,
         (Some(w), Some(a)) => format!("({w}) OR ({a})"),
     };
-    Some(format!("text : ({expression})"))
+    Some(format!("{column} : ({expression})"))
 }
 
 /// Return `Some(inner)` if `s` is wrapped in a matching pair of single
@@ -480,6 +512,77 @@ fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
     })
 }
 
+/// Search the doc-side expansion channel (`chunk_aliases_fts`, V010) and
+/// build [`RawRow`]s with the **same 10-column shape** as [`run_query`] so
+/// `row_from_sql` / `build_hit` can be reused verbatim. The snippet is taken
+/// from the body (`substr(c.text, 1, ?)`) rather than the alias text so the
+/// rendered hit stays consistent with the body channel. When
+/// `chunk_aliases_fts` is empty (no chunk carries aliases) this returns 0
+/// rows, making the merge a no-op (regression-safe).
+///
+/// NOTE: `filters` are not applied on this channel yet (the body channel applies them), so
+/// alias-only rows can bypass them; the alias path targets recall — share filters if needed.
+/// Rows are ordered by ascending `bm25(chunk_aliases_fts)`, tie-broken by `af.chunk_id` (deterministic).
+fn run_alias_query(
+    conn: &Connection,
+    match_str: &str,
+    snippet_chars: usize,
+    fetch_limit: usize,
+) -> Result<Vec<RawRow>> {
+    let sql = "SELECT \
+        af.chunk_id, af.doc_id, \
+        bm25(chunk_aliases_fts) AS score, \
+        substr(c.text, 1, ?) AS snippet, \
+        c.heading_path_json, c.section_label, c.source_spans_json, \
+        c.chunker_version, \
+        d.workspace_path, d.updated_at \
+        FROM chunk_aliases_fts af \
+        JOIN chunks c ON c.chunk_id = af.chunk_id \
+        JOIN documents d ON d.doc_id = af.doc_id \
+        WHERE chunk_aliases_fts MATCH ? \
+        ORDER BY score, af.chunk_id LIMIT ?";
+    let params: Vec<Box<dyn rusqlite::ToSql>> = vec![
+        Box::new(i64::try_from(snippet_chars).unwrap_or(i64::MAX)),
+        Box::new(match_str.to_owned()),
+        Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX)),
+    ];
+    let mut stmt = conn
+        .prepare(sql)
+        .context("kb-search lexical: prepare alias FTS5 statement")?;
+    let rows = stmt
+        .query_map(
+            params_from_iter(params.iter().map(std::convert::AsRef::as_ref)),
+            row_from_sql,
+        )
+        .context("kb-search lexical: execute alias FTS5 query")?;
+    let mut out: Vec<RawRow> = Vec::new();
+    for r in rows {
+        out.push(r.context("kb-search lexical: read alias row")?);
+    }
+    Ok(out)
+}
+
+/// Merge body + alias rows: body rows first (already bm25-ordered), then
+/// any alias-only chunk (not already present in the body result) appended in
+/// alias-relevance order. Capped at `limit`. An empty `alias` slice leaves
+/// `body` unchanged, so an empty `chunk_aliases_fts` reproduces the
+/// pre-expansion behavior exactly.
+fn merge_body_alias(body: Vec<RawRow>, alias: Vec<RawRow>, limit: usize) -> Vec<RawRow> {
+    use std::collections::HashSet;
+    let mut seen: HashSet<_> = body.iter().map(|r| r.chunk_id.clone()).collect();
+    let mut out = body;
+    for r in alias {
+        if out.len() >= limit {
+            break;
+        }
+        if seen.insert(r.chunk_id.clone()) {
+            out.push(r);
+        }
+    }
+    out.truncate(limit);
+    out
+}
+
 // ── Hit construction ─────────────────────────────────────────────────────
 
 fn build_hit(
diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs
index beb8151..9e9efbc 100644
--- a/crates/kebab-search/tests/lexical.rs
+++ b/crates/kebab-search/tests/lexical.rs
@@ -144,6 +144,42 @@ fn insert_chunk(
     .expect("insert chunk");
 }
 
+/// Like [`insert_chunk`] but also writes the `chunks.aliases` column so the
+/// `chunk_aliases_ai` trigger (V010) mirrors the row into `chunk_aliases_fts`.
+/// `aliases=None` leaves the column NULL (trigger skips → no alias row).
+#[allow(clippy::too_many_arguments)]
+fn insert_chunk_with_aliases(
+    conn: &Connection,
+    chunk_id: &str,
+    doc_id: &str,
+    text: &str,
+    heading_path: &[&str],
+    section_label: Option<&str>,
+    source_spans_json: &str,
+    chunker_version: &str,
+    aliases: Option<&str>,
+) {
+    let heading_json = serde_json::to_string(heading_path).unwrap();
+    conn.execute(
+        "INSERT INTO chunks (
+            chunk_id, doc_id, text, heading_path_json, section_label,
+            source_spans_json, token_estimate, chunker_version,
+            policy_hash, block_ids_json, created_at, aliases
+        ) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z', ?)",
+        rusqlite::params![
+            chunk_id,
+            doc_id,
+            text,
+            heading_json,
+            section_label,
+            source_spans_json,
+            chunker_version,
+            aliases,
+        ],
+    )
+    .expect("insert chunk with aliases");
+}
+
 /// Pad a short ID to the 32-hex shape kebab_core newtypes expect.
fn id32(prefix: &str) -> String { let mut s = prefix.to_string(); @@ -1253,3 +1289,87 @@ fn lexical_raw_mode_can_opt_into_heading_path_filter() { "raw-mode heading_path filter must hit the seeded chunk" ); } + +// ── doc-side expansion (V010) — body+alias merged search ────────────────── + +/// pool-rescue core: a term present ONLY in `chunks.aliases` (not in the +/// body) must still recall the chunk via the `chunk_aliases_fts` channel. +/// Body is English ("backpropagation…"); the Korean term "역전파" lives only +/// in the alias text, so the body `chunks_fts` MATCH alone would miss it. +#[test] +fn alias_only_term_recalls_chunk() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/nn.md", "NN", "en", "primary", &[]); + insert_chunk_with_aliases( + &conn, + &id32("c1"), + &id32("d"), + "backpropagation computes gradients", + &["NN"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + Some("역전파\n신경망 오차 역전달"), + ); + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "역전파".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }) + .unwrap(); + assert!( + hits.iter().any(|h| h.chunk_id.0 == id32("c1")), + "별칭에만 있는 term 으로도 청크가 회수돼야 한다 (pool-rescue); got {:?}", + hits.iter().map(|h| h.chunk_id.0.clone()).collect::>() + ); +} + +/// Regression-safety: with every chunk's `aliases=NULL` the +/// `chunk_aliases_fts` table is empty, so the alias channel yields 0 rows +/// and the body search result is identical to the pre-expansion behavior. +#[test] +fn empty_aliases_table_matches_baseline() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document( + &conn, + &id32("d"), + "notes/own.md", + "Own", + "en", + "primary", + &[], + ); + // aliases=None → no chunk_aliases_fts row; body channel only. 
+ insert_chunk( + &conn, + &id32("c1"), + &id32("d"), + "rust ownership and borrow checker", + &["Own"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "ownership".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }) + .unwrap(); + assert!( + hits.iter().any(|h| h.chunk_id.0 == id32("c1")), + "aliases 빈 상태에서 본문 매칭 청크가 정상 회수돼야 한다 (회귀 안전)" + ); +}