trigram tokenizer 가 snippet 단위 + 단어 경계 + BM25 raw score 분포를 모두 바꿔서 unicode61 assumption 기반의 3 test 가 regression. - wire_search_response::search_json_truncates_with_max_tokens + search_plain_emits_truncated_hint_to_stderr: 단일 doc + 작은 max_tokens 로는 snippet 이 짧아서 budget loop 가 trip 안 함. 다중 doc fixture (5 doc) + budget 30 token 으로 hit-pop 경로 통해 truncated=true 보장. - fetch_integration::fetch_chunk_with_context_returns_neighbors: fixture body 의 2-char tokens (A1/A3 등) 가 trigram 비호환으로 0-hit. apples/banana/cherry/durian/elder 5-char unique words 로 갱신, query 도 cherry 로 deterministic pin. - eval/runner::runner_per_query_snapshot_matches_fixture: trigram token stream 으로 BM25 raw score 변동. UPDATE_SNAPSHOTS=1 로 regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
246 lines
8.2 KiB
Rust
246 lines
8.2 KiB
Rust
//! p9-fb-34: CLI search wire wrapper + budget controls.
//!
//! Lexical-only — no fastembed / no Ollama. Each test builds its own
//! TempDir KB via `common::write_config` + `common::ingest` and drives
//! `kebab search` through `common::run_search_with_args`. Verifies:
//!
//! - `--json` emits the `search_response.v1` wrapper (hits + cursor +
//!   truncated).
//! - `--max-tokens` flips `truncated: true` once the budget binds.
//! - `--cursor` advances paging (page 2 chunk_ids disjoint from page 1).
//! - A `--cursor` issued before a later ingest is rejected with an
//!   `error.v1` line on stderr whose `code` is `stale_cursor`.
//! - Plain (non-JSON) output prints the `[truncated; ...]` hint to
//!   stderr (stdout stays the hit list).
|
|
|
|
mod common;
|
|
|
|
use serde_json::Value;
|
|
use std::fs;
|
|
|
|
/// `--json` output must be a `search_response.v1` wrapper whose `hits`,
/// `next_cursor`, and `truncated` fields carry the expected JSON types.
#[test]
fn search_json_emits_search_response_v1_wrapper() {
    // One-document KB inside a throwaway directory.
    let dir = tempfile::tempdir().unwrap();
    let (config, kb_root, _data_dir) = common::write_config(dir.path(), 30);
    let doc_path = kb_root.join("a.md");
    fs::write(doc_path, "# T\n\napples are red.\n").unwrap();
    common::ingest(&config, &kb_root);

    let (stdout, _stderr) =
        common::run_search_with_args(&config, &["--json", "--mode", "lexical", "apples"]);
    let v = match serde_json::from_str::<Value>(stdout.trim()) {
        Ok(parsed) => parsed,
        Err(e) => panic!("not JSON: {stdout:?}: {e}"),
    };

    assert_eq!(v["schema_version"], "search_response.v1");
    assert!(v["hits"].is_array(), "hits must be array, got {v}");

    // The cursor is optional: either absent (null) or an opaque string.
    let next_cursor = &v["next_cursor"];
    assert!(
        matches!(next_cursor, Value::Null | Value::String(_)),
        "next_cursor must be null or string, got {}",
        next_cursor
    );

    let truncated = &v["truncated"];
    assert!(
        matches!(truncated, Value::Bool(_)),
        "truncated must be bool, got {}",
        truncated
    );
}
|
|
|
|
/// A small `--max-tokens` budget must surface as `truncated: true` in the
/// JSON response.
#[test]
fn search_json_truncates_with_max_tokens() {
    let dir = tempfile::tempdir().unwrap();
    let (config, kb_root, _data_dir) = common::write_config(dir.path(), 30);

    // With the v0.17.0 trigram tokenizer, FTS5 snippet() works on 3-char
    // tokens (unicode61 produced whole words), so one snippet is only
    // ~60 chars long and never exceeds the snippet-shorten budget cap when
    // the fixture yields a single hit. Ingesting several matching docs and
    // choosing a budget small enough that the loop must *pop* hits makes
    // truncated=true deterministic, independent of snippet length.
    for i in 0..5 {
        let doc_path = kb_root.join(format!("d{i}.md"));
        let body = format!("# T{i}\n\nrust ownership is a memory model.\n");
        fs::write(doc_path, body).unwrap();
    }
    common::ingest(&config, &kb_root);

    let (stdout, _stderr) = common::run_search_with_args(
        &config,
        &["--json", "--mode", "lexical", "--max-tokens", "30", "rust"],
    );
    let v = match serde_json::from_str::<Value>(stdout.trim()) {
        Ok(parsed) => parsed,
        Err(e) => panic!("not JSON: {stdout:?}: {e}"),
    };

    assert_eq!(
        v["truncated"], true,
        "30-token cap must trip truncation: {v}"
    );
}
|
|
|
|
/// Following `next_cursor` from page 1 must yield a non-empty page 2 whose
/// chunk_ids do not overlap with page 1.
#[test]
fn search_json_cursor_paginates() {
    /// Collects the `chunk_id` of every hit in a parsed search response.
    /// `missing_hits_msg` is the panic message used when `hits` is not an
    /// array.
    fn chunk_ids(response: &Value, missing_hits_msg: &str) -> Vec<String> {
        let hits = response["hits"].as_array().expect(missing_hits_msg);
        let mut ids = Vec::with_capacity(hits.len());
        for hit in hits {
            let id = hit["chunk_id"].as_str().expect("chunk_id string");
            ids.push(id.to_string());
        }
        ids
    }

    let dir = tempfile::tempdir().unwrap();
    let (config, kb_root, _data_dir) = common::write_config(dir.path(), 30);

    // Six matching documents so that a page size of 2 leaves more to fetch.
    for i in 0..6 {
        let doc_path = kb_root.join(format!("d{i}.md"));
        let body = format!("# T{i}\n\nrust topic {i}\n");
        fs::write(doc_path, body).unwrap();
    }
    common::ingest(&config, &kb_root);

    // Page 1: first two hits plus the cursor for the next page.
    let (page1, _) = common::run_search_with_args(
        &config,
        &["--json", "--mode", "lexical", "--k", "2", "rust"],
    );
    let v1 = match serde_json::from_str::<Value>(page1.trim()) {
        Ok(parsed) => parsed,
        Err(e) => panic!("page1 not JSON: {page1:?}: {e}"),
    };
    let cursor = match v1["next_cursor"].as_str() {
        Some(token) => token,
        None => panic!("next_cursor missing on page1: {v1}"),
    };

    // Page 2: same query, resumed from the page-1 cursor.
    let (page2, _) = common::run_search_with_args(
        &config,
        &[
            "--json", "--mode", "lexical", "--k", "2", "--cursor", cursor, "rust",
        ],
    );
    let v2 = match serde_json::from_str::<Value>(page2.trim()) {
        Ok(parsed) => parsed,
        Err(e) => panic!("page2 not JSON: {page2:?}: {e}"),
    };

    let p1_ids = chunk_ids(&v1, "page1 hits array");
    let p2_ids = chunk_ids(&v2, "page2 hits array");

    assert!(
        !p2_ids.is_empty(),
        "page2 must return at least one hit (cursor advanced past page1)"
    );
    let repeats_page1 = p2_ids.iter().any(|id| p1_ids.contains(id));
    assert!(
        !repeats_page1,
        "page2 must not repeat page1 chunk_ids: page1={p1_ids:?} page2={p2_ids:?}"
    );
}
|
|
|
|
/// End-to-end wire contract for stale cursors (p9-fb-34 round-1 review).
///
/// If corpus_revision bumps between the moment a cursor is issued and the
/// moment it is used, `kebab --json search --cursor <stale>` has to write an
/// `error.v1` ndjson line to stderr with `code = "stale_cursor"`. Before the
/// fix the code was `"generic"`: `App::search_with_opts` string-formatted
/// the typed payload into anyhow and the structured wrapper was lost.
#[test]
fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
    let dir = tempfile::tempdir().unwrap();
    let (config, kb_root, _data_dir) = common::write_config(dir.path(), 30);
    fs::write(kb_root.join("a.md"), "# T\n\napples\n").unwrap();
    common::ingest(&config, &kb_root);

    // Step 1: obtain a cursor that is valid for the current corpus.
    let (page1_stdout, _) = common::run_search_with_args(
        &config,
        &["--mode", "lexical", "--json", "--k", "1", "apples"],
    );
    let v1: Value = serde_json::from_str(page1_stdout.trim()).expect("json");
    let cursor: String = v1["next_cursor"]
        .as_str()
        .map(str::to_string)
        .expect("k=1 page must emit next_cursor — fixture too small if this fails");

    // Step 2: ingest a second document so corpus_revision moves on.
    fs::write(kb_root.join("b.md"), "# B\n\nbananas\n").unwrap();
    common::ingest(&config, &kb_root);

    // Step 3: replay the cursor. The binary is spawned directly rather than
    // through the success-asserting helper so stderr is readable on failure.
    // NOTE(review): `--json` is passed both before and after `search`, as in
    // the original invocation — presumably intentional; confirm.
    let mut command = std::process::Command::new(env!("CARGO_BIN_EXE_kebab"));
    command.args([
        "--config",
        config.to_str().expect("utf8"),
        "--json",
        "search",
        "--mode",
        "lexical",
        "--json",
        "--cursor",
        &cursor,
        "apples",
    ]);
    let out = command.output().expect("kebab search --cursor");

    // stderr is ndjson (one event per line): pick the first line that parses
    // as JSON and declares itself `error.v1`, keeping the raw text for the
    // assertion message.
    let stderr = String::from_utf8_lossy(&out.stderr);
    let (err_line, v) = stderr
        .lines()
        .find_map(|line| {
            serde_json::from_str::<Value>(line)
                .ok()
                .filter(|event| {
                    event.get("schema_version").and_then(Value::as_str) == Some("error.v1")
                })
                .map(|event| (line, event))
        })
        .unwrap_or_else(|| panic!("no error.v1 line on stderr: {stderr:?}"));

    assert_eq!(
        v["code"], "stale_cursor",
        "code must be stale_cursor: {err_line}"
    );
}
|
|
|
|
/// In plain (non-JSON) mode a binding `--max-tokens` budget must be reported
/// through a `[truncated; ...]` hint on stderr.
#[test]
fn search_plain_emits_truncated_hint_to_stderr() {
    let dir = tempfile::tempdir().unwrap();
    let (config, kb_root, _data_dir) = common::write_config(dir.path(), 30);

    // v0.17.0 trigram tokenizer: several matching docs are ingested for the
    // same reason as in `search_json_truncates_with_max_tokens`.
    (0..5)
        .map(|i| {
            (
                kb_root.join(format!("d{i}.md")),
                format!("# T{i}\n\nrust ownership is a memory model.\n"),
            )
        })
        .for_each(|(doc_path, body)| fs::write(doc_path, body).unwrap());
    common::ingest(&config, &kb_root);

    let (_stdout, stderr) = common::run_search_with_args(
        &config,
        &["--mode", "lexical", "--max-tokens", "30", "rust"],
    );

    let has_hint = stderr.contains("[truncated;");
    assert!(has_hint, "stderr must carry truncated hint: {stderr:?}");
}
|