test(v0.17.0/A5 follow-up): trigram tokenizer downstream test fixes

trigram tokenizer 가 snippet 단위 + 단어 경계 + BM25 raw score 분포를
모두 바꿔서 unicode61 assumption 기반의 test 4개 (3 file) 가 regression.

- wire_search_response::search_json_truncates_with_max_tokens +
  search_plain_emits_truncated_hint_to_stderr: 단일 doc + 작은
  max_tokens 로는 snippet 이 짧아서 budget loop 가 trip 안 함.
  다중 doc fixture (5 doc) + budget 30 token 으로 hit-pop 경로
  통해 truncated=true 보장.
- fetch_integration::fetch_chunk_with_context_returns_neighbors:
  fixture body 의 2-char tokens (A1/A3 등) 가 trigram 비호환으로
  0-hit. apples/banana/cherry/durian/elder 5–6-char unique words
  로 갱신, query 도 cherry 로 deterministic pin.
- eval/runner::runner_per_query_snapshot_matches_fixture: trigram
  token stream 으로 BM25 raw score 변동. UPDATE_SNAPSHOTS=1 로
  regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 12:21:34 +00:00
parent 8a68289499
commit 0ee18149e7
3 changed files with 31 additions and 8 deletions

View File

@@ -38,12 +38,16 @@ fn fetch_chunk_returns_target_only_when_no_context() {
#[test]
fn fetch_chunk_with_context_returns_neighbors() {
let env = common::TestEnv::new();
let body = "# H1\n\nA1\n\n# H2\n\nA2\n\n# H3\n\nA3\n\n# H4\n\nA4\n\n# H5\n\nA5\n";
// v0.17.0 trigram tokenizer: terms must be ≥3 Unicode chars to
// match. The earlier fixture used 2-char tokens like `A1`/`A3` for
// section bodies — those zero-hit under trigram. Use unique 5–6 char
// words per section so the query can pin one chunk deterministically.
let body = "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
common::ingest_md(&env, "multi.md", body);
let app = env.app();
let q = kebab_core::SearchQuery {
text: "A3".to_string(),
text: "cherry".to_string(),
mode: kebab_core::SearchMode::Lexical,
k: 1,
filters: kebab_core::SearchFilters::default(),

View File

@@ -47,8 +47,20 @@ fn search_json_emits_search_response_v1_wrapper() {
fn search_json_truncates_with_max_tokens() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
let body: String = "rust ownership is a memory model. ".repeat(10);
fs::write(workspace.join("a.md"), format!("# T\n\n{body}\n")).unwrap();
// v0.17.0 trigram tokenizer makes FTS5 snippet() tokens 3-char wide
// (was full words under unicode61), so an individual snippet stays
// around 60 chars — too short to ever exceed the snippet-shorten
// budget cap on a single-hit fixture. To still exercise the budget
// loop deterministically, we ingest multiple hits and pick a budget
// small enough that the loop has to *pop* hits, which flips
// truncated=true regardless of snippet length.
for i in 0..5 {
fs::write(
workspace.join(format!("d{i}.md")),
format!("# T{i}\n\nrust ownership is a memory model.\n"),
)
.unwrap();
}
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
@@ -211,8 +223,15 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
fn search_plain_emits_truncated_hint_to_stderr() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
let body: String = "rust ownership is a memory model. ".repeat(10);
fs::write(workspace.join("a.md"), format!("# T\n\n{body}\n")).unwrap();
// v0.17.0 trigram tokenizer — same multi-doc rationale as
// `search_json_truncates_with_max_tokens` above.
for i in 0..5 {
fs::write(
workspace.join(format!("d{i}.md")),
format!("# T{i}\n\nrust ownership is a memory model.\n"),
)
.unwrap();
}
common::ingest(&cfg, &workspace);
let (_stdout, stderr) = common::run_search_with_args(

View File

@@ -5,7 +5,7 @@
"chunk_id": "chunk000000000000000000000000000000",
"doc_id": "doc00000000000000000000000000000000",
"heading_path": [],
"score": 0.3429983854293823
"score": 0.35202541947364807
},
"has_answer": false,
"hits_count": 1,
@@ -19,7 +19,7 @@
"chunk_id": "chunk000000000000000000000000000002",
"doc_id": "doc00000000000000000000000000000002",
"heading_path": [],
"score": 0.3585492968559265
"score": 0.3414848744869232
},
"has_answer": false,
"hits_count": 1,