Files
kebab/crates/kebab-core/src/search.rs
th-kim0823 fa4eeb5a87 feat(p10-1a-1): add SearchHit.repo / code_lang + SearchFilters.repo / code_lang
Wire two new optional fields onto SearchHit (skip_serializing_if = None)
and two Vec<String> filter fields onto SearchFilters (serde default).
Add RetrievalDetail::Default impl (manual, uses SearchMode::Hybrid as
sentinel). Patch all downstream SearchHit / SearchFilters literal
constructors with repo: None / code_lang: None / vec![] as appropriate.
Also covers Citation::Code arm in kebab-eval metrics match.
2026-05-15 15:04:23 +09:00

535 lines
18 KiB
Rust

//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5).
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::asset::WorkspacePath;
use crate::citation::Citation;
use crate::ids::{ChunkId, DocumentId};
use crate::media::Lang;
use crate::metadata::{SourceType, TrustLevel};
use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion};
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SearchMode {
Lexical,
Vector,
Hybrid,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchQuery {
pub text: String,
pub mode: SearchMode,
pub k: usize,
pub filters: SearchFilters,
}
/// p9-fb-36: canonical kind labels for `SearchFilters.media`. Mirrors
/// `MediaType` variant tags; CLI / MCP normalize aliases (`md` → `markdown`)
/// before populating this Vec.
pub const MEDIA_KINDS: &[&str] = &["markdown", "pdf", "image", "audio", "other"];
/// p9-fb-38: top-level `SearchHit.score` declaration.
/// `Rrf` (hybrid) / `Bm25` (lexical-only) / `Cosine` (vector-only).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ScoreKind {
#[default]
Rrf,
Bm25,
Cosine,
}
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchFilters {
pub tags_any: Vec<String>,
pub lang: Option<Lang>,
pub path_glob: Option<String>,
pub trust_min: Option<TrustLevel>,
/// p9-fb-36: media_type filter — IN-list of `MediaType.kind`
/// strings (`"markdown"`, `"pdf"`, `"image"`, `"audio"`, `"other"`).
/// Empty Vec = no filter. Match is on the variant tag only;
/// e.g. `["image"]` matches `Image(Png)` and `Image(Jpeg)`.
#[serde(default)]
pub media: Vec<String>,
/// p9-fb-36: hits whose source doc's `documents.updated_at` is at
/// or after this timestamp. None = no filter. RFC3339 / UTC.
#[serde(default, with = "time::serde::rfc3339::option")]
pub ingested_after: Option<OffsetDateTime>,
/// p9-fb-36: restrict hits to a single document. None = no filter.
#[serde(default)]
pub doc_id: Option<DocumentId>,
/// p10-1A-1: filter by `metadata.repo`. Empty = no filter; multi-value = OR.
#[serde(default)]
pub repo: Vec<String>,
/// p10-1A-1: filter by `metadata.code_lang`. Empty = no filter; multi-value = OR.
/// Identifiers are lowercase canonical names (`rust`, `python`, `typescript`, ...).
/// Unknown values produce empty hits (consistent with `media` policy).
#[serde(default)]
pub code_lang: Vec<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchHit {
pub rank: u32,
pub chunk_id: ChunkId,
pub doc_id: DocumentId,
pub doc_path: WorkspacePath,
pub heading_path: Vec<String>,
pub section_label: Option<String>,
pub snippet: String,
pub citation: Citation,
pub retrieval: RetrievalDetail,
pub index_version: IndexVersion,
pub embedding_model: Option<EmbeddingModelId>,
pub chunker_version: ChunkerVersion,
/// p9-fb-32: source doc's `documents.updated_at` (last actual re-process).
/// fb-23 incremental ingest skip path leaves this unchanged.
#[serde(with = "time::serde::rfc3339")]
pub indexed_at: OffsetDateTime,
/// p9-fb-32: server-computed `now - indexed_at > threshold` per
/// `config.search.stale_threshold_days`. `false` when threshold = 0.
pub stale: bool,
/// p9-fb-38: declares the meaning of the top-level `score`.
/// `Rrf` (hybrid mode), `Bm25` (lexical-only), `Cosine` (vector-only).
/// 옛 wire (fb-38 미만) 부재 시 `Rrf` default — hybrid 가 기본 mode.
#[serde(default)]
pub score_kind: ScoreKind,
/// p10-1A-1: optional. Filled when the source file lives in a git repo
/// (`.git/` walk-up). null for markdown / pdf / image hits and for code
/// hits ingested via `kebab ingest-file` outside a repo boundary.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub repo: Option<String>,
/// p10-1A-1: optional. Programming language identifier (lowercase). Set for
/// every code/manifest/k8s chunk; null for markdown / pdf / image hits.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub code_lang: Option<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RetrievalDetail {
pub method: SearchMode,
pub fusion_score: f32,
pub lexical_score: Option<f32>,
pub vector_score: Option<f32>,
pub lexical_rank: Option<u32>,
pub vector_rank: Option<u32>,
}
impl Default for RetrievalDetail {
fn default() -> Self {
Self {
method: SearchMode::Hybrid,
fusion_score: 0.0,
lexical_score: None,
vector_score: None,
lexical_rank: None,
vector_rank: None,
}
}
}
/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents).
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct DocFilter {
pub tags_any: Vec<String>,
pub lang: Option<Lang>,
pub path_glob: Option<String>,
pub trust_min: Option<TrustLevel>,
}
/// Internal mirror of wire `doc_summary.v1` (§2.5).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DocSummary {
pub doc_id: DocumentId,
pub doc_path: WorkspacePath,
pub title: String,
pub lang: Lang,
pub tags: Vec<String>,
pub trust_level: TrustLevel,
pub source_type: SourceType,
pub byte_len: u64,
pub chunk_count: u32,
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
#[serde(with = "time::serde::rfc3339")]
pub updated_at: OffsetDateTime,
pub parser_version: ParserVersion,
pub chunker_version: ChunkerVersion,
}
/// p9-fb-34: caller-supplied output budget knobs for `App::search_with_opts`.
/// All `None` = no enforcement (existing behavior).
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchOpts {
/// chars/4 approximation of wire JSON token cost. None = no cap.
pub max_tokens: Option<usize>,
/// Per-hit snippet character cap. None = use config default.
pub snippet_chars: Option<usize>,
/// Opaque base64 cursor from a previous response. None = first page.
pub cursor: Option<String>,
/// p9-fb-37: when true, capture pipeline trace (cache bypassed,
/// lex / vec pre-fusion lists + timing populated on the response).
#[serde(default)]
pub trace: bool,
}
/// p9-fb-37: search retrieval pipeline trace. Populated only when
/// `SearchOpts.trace = true`; `None` on the wrapping `SearchResponse`
/// otherwise. `lexical` / `vector` are pre-fusion candidate lists
/// (each retriever's full output for the fanout query). `rrf_inputs`
/// is the union (chunk_id) used by RRF, with each side's rank
/// captured. `timing` is wall-clock per stage.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchTrace {
pub lexical: Vec<TraceCandidate>,
pub vector: Vec<TraceCandidate>,
pub rrf_inputs: Vec<TraceFusionInput>,
pub timing: TraceTiming,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TraceCandidate {
pub chunk_id: ChunkId,
pub doc_id: DocumentId,
pub doc_path: WorkspacePath,
pub rank: u32,
pub score: f32,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TraceFusionInput {
pub chunk_id: ChunkId,
pub lexical_rank: Option<u32>,
pub vector_rank: Option<u32>,
/// Hybrid mode: normalized RRF score in `[0, 1]`.
/// Lexical / Vector mode: equals the underlying retriever's score
/// (no fusion ran). 0.0 for chunks dropped past `target_k`.
pub fusion_score: f32,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct TraceTiming {
pub lexical_ms: u64,
pub vector_ms: u64,
pub fusion_ms: u64,
pub total_ms: u64,
}
/// p9-fb-37: on-disk index size breakdown. Mirrored on the
/// wire `schema.v1.stats.index_bytes` block.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IndexBytes {
pub sqlite: u64,
pub lancedb: u64,
}
/// p9-fb-42: per-query result in bulk search. `response` XOR `error` —
/// exactly one is `Some`. `query` is the input echo (raw JSON value)
/// so consumers can correlate input to output without index tracking.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BulkSearchItem {
pub query: serde_json::Value,
pub response: Option<serde_json::Value>,
pub error: Option<serde_json::Value>,
}
/// p9-fb-42: bulk summary counts. Invariant: total == succeeded + failed.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct BulkSearchSummary {
pub total: u32,
pub succeeded: u32,
pub failed: u32,
}
/// p9-fb-42: MCP-only envelope. CLI emits raw ndjson without envelope.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BulkSearchResponse {
pub schema_version: String,
pub results: Vec<BulkSearchItem>,
pub summary: BulkSearchSummary,
}
#[cfg(test)]
mod tests {
use super::*;
use time::macros::datetime;
#[test]
fn search_hit_serializes_indexed_at_and_stale() {
let hit = SearchHit {
rank: 1,
chunk_id: ChunkId("c".to_string()),
doc_id: DocumentId("d".to_string()),
doc_path: WorkspacePath::new("a/b.md".to_string()).unwrap(),
heading_path: vec!["H".to_string()],
section_label: None,
snippet: "s".to_string(),
citation: Citation::Line {
path: WorkspacePath::new("a/b.md".to_string()).unwrap(),
start: 1,
end: 1,
section: None,
},
retrieval: RetrievalDetail {
method: SearchMode::Lexical,
fusion_score: 0.5,
lexical_score: Some(0.5),
vector_score: None,
lexical_rank: Some(1),
vector_rank: None,
},
index_version: IndexVersion("v1".to_string()),
embedding_model: None,
chunker_version: ChunkerVersion("c1".to_string()),
indexed_at: datetime!(2026-05-09 12:00:00 UTC),
stale: true,
score_kind: ScoreKind::Rrf,
repo: None,
code_lang: None,
};
let v = serde_json::to_value(&hit).unwrap();
assert_eq!(v["indexed_at"], "2026-05-09T12:00:00Z");
assert_eq!(v["stale"], true);
}
#[test]
fn search_opts_default_is_all_none() {
let opts = SearchOpts::default();
assert!(opts.max_tokens.is_none());
assert!(opts.snippet_chars.is_none());
assert!(opts.cursor.is_none());
}
#[test]
fn search_filters_default_includes_new_fb36_fields() {
let f = SearchFilters::default();
assert!(f.media.is_empty(), "media default empty");
assert!(f.ingested_after.is_none(), "ingested_after default None");
assert!(f.doc_id.is_none(), "doc_id default None");
assert!(f.tags_any.is_empty());
assert!(f.lang.is_none());
assert!(f.path_glob.is_none());
assert!(f.trust_min.is_none());
}
#[test]
fn search_filters_serialize_with_serde_default_compat() {
let old: SearchFilters = serde_json::from_str(r#"{"tags_any":[],"lang":null,"path_glob":null,"trust_min":null}"#).unwrap();
assert!(old.media.is_empty());
assert!(old.ingested_after.is_none());
assert!(old.doc_id.is_none());
}
#[test]
fn search_trace_serde_roundtrip() {
let t = SearchTrace {
lexical: vec![TraceCandidate {
chunk_id: ChunkId("c1".into()),
doc_id: DocumentId("d1".into()),
doc_path: WorkspacePath::new("a.md".into()).unwrap(),
rank: 1,
score: 0.42,
}],
vector: vec![],
rrf_inputs: vec![TraceFusionInput {
chunk_id: ChunkId("c1".into()),
lexical_rank: Some(1),
vector_rank: None,
fusion_score: 0.0234,
}],
timing: TraceTiming {
lexical_ms: 12,
vector_ms: 0,
fusion_ms: 1,
total_ms: 14,
},
};
let v = serde_json::to_value(&t).unwrap();
assert_eq!(v["timing"]["lexical_ms"], 12);
assert_eq!(
v["lexical"][0]["score"].as_f64().unwrap() as f32,
0.42_f32
);
let back: SearchTrace = serde_json::from_value(v).unwrap();
assert_eq!(back, t);
}
#[test]
fn index_bytes_default_is_zero() {
let b = IndexBytes::default();
assert_eq!(b.sqlite, 0);
assert_eq!(b.lancedb, 0);
}
#[test]
fn search_opts_trace_default_false() {
let opts = SearchOpts::default();
assert!(!opts.trace);
}
#[test]
fn score_kind_serde_roundtrip() {
use ScoreKind::*;
for (kind, expected) in [(Rrf, "rrf"), (Bm25, "bm25"), (Cosine, "cosine")] {
let v = serde_json::to_value(kind).unwrap();
assert_eq!(v.as_str(), Some(expected));
let back: ScoreKind = serde_json::from_value(v).unwrap();
assert_eq!(back, kind);
}
}
#[test]
fn score_kind_default_is_rrf() {
assert_eq!(ScoreKind::default(), ScoreKind::Rrf);
}
#[test]
fn search_hit_deserialize_without_score_kind_defaults_to_rrf() {
let json = serde_json::json!({
"rank": 1,
"chunk_id": "c1",
"doc_id": "d1",
"doc_path": "a.md",
"heading_path": [],
"section_label": null,
"snippet": "x",
"citation": { "kind": "line", "path": "a.md", "start": 1, "end": 1, "section": null },
"retrieval": {
"method": "lexical",
"fusion_score": 0.5,
"lexical_score": 0.5,
"vector_score": null,
"lexical_rank": 1,
"vector_rank": null
},
"index_version": "v1",
"embedding_model": null,
"chunker_version": "c1",
"indexed_at": "2026-05-10T12:00:00Z",
"stale": false
});
let hit: SearchHit = serde_json::from_value(json).unwrap();
assert_eq!(hit.score_kind, ScoreKind::Rrf);
}
#[test]
fn bulk_search_summary_serde_roundtrip() {
let s = BulkSearchSummary {
total: 5,
succeeded: 4,
failed: 1,
};
let v = serde_json::to_value(s).unwrap();
assert_eq!(v["total"], 5);
assert_eq!(v["succeeded"], 4);
assert_eq!(v["failed"], 1);
let back: BulkSearchSummary = serde_json::from_value(v).unwrap();
assert_eq!(back, s);
}
#[test]
fn bulk_search_summary_default_is_zeros() {
let s = BulkSearchSummary::default();
assert_eq!(s.total, 0);
assert_eq!(s.succeeded, 0);
assert_eq!(s.failed, 0);
}
#[test]
fn bulk_search_item_serde_response_variant() {
let item = BulkSearchItem {
query: serde_json::json!({"query": "rust"}),
response: Some(serde_json::json!({"hits": []})),
error: None,
};
let v = serde_json::to_value(&item).unwrap();
assert!(v["response"].is_object());
assert!(v["error"].is_null());
}
#[test]
fn bulk_search_item_serde_error_variant() {
let item = BulkSearchItem {
query: serde_json::json!({"query": "rust"}),
response: None,
error: Some(serde_json::json!({"code": "config_invalid", "message": "bad"})),
};
let v = serde_json::to_value(&item).unwrap();
assert!(v["response"].is_null());
assert_eq!(v["error"]["code"], "config_invalid");
}
#[test]
fn search_hit_repo_and_code_lang_are_optional_and_omit_when_none() {
let hit = SearchHit {
rank: 1,
chunk_id: ChunkId("c1".into()),
doc_id: DocumentId("d1".into()),
doc_path: WorkspacePath("a.md".into()),
heading_path: vec![],
section_label: None,
snippet: "".into(),
citation: Citation::Line {
path: WorkspacePath("a.md".into()),
start: 1,
end: 2,
section: None,
},
retrieval: RetrievalDetail::default(),
index_version: IndexVersion("v1".into()),
embedding_model: None,
chunker_version: ChunkerVersion("md-heading-v1".into()),
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
stale: false,
score_kind: ScoreKind::Rrf,
repo: None,
code_lang: None,
};
let v = serde_json::to_value(&hit).unwrap();
assert!(v.get("repo").is_none(), "repo should be omitted when None");
assert!(v.get("code_lang").is_none(), "code_lang should be omitted when None");
}
#[test]
fn search_hit_repo_and_code_lang_present_when_some() {
let hit = SearchHit {
rank: 1,
chunk_id: ChunkId("c1".into()),
doc_id: DocumentId("d1".into()),
doc_path: WorkspacePath("a.rs".into()),
heading_path: vec![],
section_label: None,
snippet: "".into(),
citation: Citation::Code {
path: WorkspacePath("a.rs".into()),
line_start: 1,
line_end: 2,
symbol: None,
lang: Some("rust".into()),
},
retrieval: RetrievalDetail::default(),
index_version: IndexVersion("v1".into()),
embedding_model: None,
chunker_version: ChunkerVersion("code-rust-ast-v1".into()),
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
stale: false,
score_kind: ScoreKind::Rrf,
repo: Some("kebab".into()),
code_lang: Some("rust".into()),
};
let v = serde_json::to_value(&hit).unwrap();
assert_eq!(v["repo"], "kebab");
assert_eq!(v["code_lang"], "rust");
}
#[test]
fn search_filters_repo_and_code_lang_default_to_empty_vec() {
let f = SearchFilters::default();
assert!(f.repo.is_empty());
assert!(f.code_lang.is_empty());
}
}