Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이 test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all 적용. 모든 import alphabetical reorder + line wrapping 정합. 추가 untracked artifact 동시 commit: - docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT) - docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT) workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
130 lines
5.2 KiB
Rust
130 lines
5.2 KiB
Rust
//! p9-fb-10: smoke pin that a Korean query reaches FTS5 and returns
|
|
//! the matching Hangul document. NFC normalization happens upstream
|
|
//! in `kebab-normalize`; this test only exercises the end-to-end
|
|
//! facade — ingest a Korean .md → lexical search → at least one hit.
|
|
|
|
mod common;
|
|
|
|
use common::TestEnv;
|
|
|
|
/// p9-fb-10 — end-to-end smoke check for Hangul text through the facade.
///
/// A Korean Markdown file is written into the temp workspace, ingested,
/// and then looked up with a lexical query for one of its tokens; the
/// file must show up among the hits. NFC normalization is handled
/// upstream in `kebab-normalize`, so this only pins that the
/// ingest → FTS5 → search round-trip neither drops nor corrupts CJK text.
#[test]
fn korean_lexical_query_returns_korean_document() {
    let harness = TestEnv::lexical_only();

    // Seed the temp workspace with a Hangul Markdown fixture.
    let fixture_path = harness.workspace_root.join("러스트-비동기.md");
    let fixture_body = "# 러스트 비동기 프로그래밍\n\n토큰: 러스트, 비동기, async, await\n";
    std::fs::write(&fixture_path, fixture_body).expect("write Korean fixture doc");

    // Ingest the workspace. The original note on this step states that
    // `lexical_only()` disables fastembed, so AVX is not required.
    kebab_app::ingest_with_config(harness.config.clone(), harness.scope(), true)
        .expect("ingest must succeed");

    // Run the lexical query for "러스트"; the Korean document must come back.
    let search_config = harness.config.clone();
    let results = kebab_app::search_with_config(search_config, common::lexical_query("러스트"))
        .expect("search must succeed");

    assert!(
        !results.is_empty(),
        "expected at least one hit for Korean lexical query '러스트'"
    );

    // "러스트-비동기" is the fixture's exact filename stem, so one combined
    // substring check is unambiguous and cannot be satisfied by other docs.
    let fixture_matched = results
        .iter()
        .map(|hit| &hit.doc_path.0)
        .any(|path| path.contains("러스트-비동기"));
    assert!(
        fixture_matched,
        "expected at least one hit on the Korean fixture doc, got: {:?}",
        results.iter().map(|hit| &hit.doc_path.0).collect::<Vec<_>>()
    );
}
|
|
|
|
/// A4 Step 1c — a multi-token Korean query (`해시 충돌`) must produce a
/// hit once the lexical builder routes it through a whole-phrase MATCH
/// candidate.
///
/// Expected: FAIL until A5 (the `build_match_string` redesign) lands.
/// The current builder emits `"해시" "충돌"` joined by AND, but the FTS5
/// trigram tokenizer has no 2-char terms, so each side yields 0 hits.
/// A5 adds a whole-phrase candidate (`"해시 충돌"`) OR'd with the token
/// AND, which restores hits for the dominant Korean usage pattern.
#[test]
fn lexical_multi_token_korean_query_hits() {
    let harness = TestEnv::lexical_only();

    // Locate the synthetic Korean fixture (introduced in A4 Step 0)
    // relative to this crate's manifest directory and copy it into the
    // test workspace. The fixture contains the exact phrase "해시 충돌"
    // multiple times.
    let mut fixture_src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture_src.extend(["..", "..", "fixtures", "search", "korean", "hash-table.md"]);
    let fixture_dest = harness.workspace_root.join("hash-table.md");
    std::fs::copy(&fixture_src, &fixture_dest).expect("copy korean fixture");

    kebab_app::ingest_with_config(harness.config.clone(), harness.scope(), true)
        .expect("ingest must succeed");

    let search_config = harness.config.clone();
    let results = kebab_app::search_with_config(search_config, common::lexical_query("해시 충돌"))
        .expect("search must succeed");

    // Only invoked when an assertion fails, to list the returned paths.
    let hit_paths = || results.iter().map(|hit| &hit.doc_path.0).collect::<Vec<_>>();

    assert!(
        !results.is_empty(),
        "multi-token Korean query '해시 충돌' must hit the hash-table fixture; got {:?}",
        hit_paths()
    );

    let fixture_matched = results
        .iter()
        .map(|hit| &hit.doc_path.0)
        .any(|path| path.contains("hash-table"));
    assert!(
        fixture_matched,
        "expected at least one hit on the hash-table fixture, got: {:?}",
        hit_paths()
    );
}
|
|
|
|
/// A4 Step 1c — a mixed Korean+English multi-token query (`Rust 충돌은`)
/// must hit the document that contains both tokens.
///
/// Both tokens are ≥3 chars, so the redesigned builder (A5) emits
/// `("Rust 충돌은") OR ("Rust" AND "충돌은")`. Under the trigram
/// tokenizer each token has substring coverage in the document, so the
/// AND branch alone is enough. Expected: FAIL pre-A5, PASS post-A5.
#[test]
fn lexical_mixed_korean_english_multi_token_query_hits() {
    let harness = TestEnv::lexical_only();

    // Write a fixture that mentions "Rust" and "충돌은" in one document.
    let fixture_body = "# Rust 해시 테이블\n\nRust 의 std::collections::HashMap 에서 \
                        해시 충돌은 SipHash 로 완화한다.\n";
    let fixture_path = harness.workspace_root.join("rust-hash.md");
    std::fs::write(&fixture_path, fixture_body).expect("write rust-hash fixture");

    kebab_app::ingest_with_config(harness.config.clone(), harness.scope(), true)
        .expect("ingest must succeed");

    let search_config = harness.config.clone();
    let results =
        kebab_app::search_with_config(search_config, common::lexical_query("Rust 충돌은"))
            .expect("search must succeed");

    // Only invoked when an assertion fails, to list the returned paths.
    let hit_paths = || results.iter().map(|hit| &hit.doc_path.0).collect::<Vec<_>>();

    assert!(
        !results.is_empty(),
        "mixed Korean+English multi-token query 'Rust 충돌은' must hit the rust-hash fixture; got {:?}",
        hit_paths()
    );

    let fixture_matched = results
        .iter()
        .map(|hit| &hit.doc_path.0)
        .any(|path| path.contains("rust-hash"));
    assert!(
        fixture_matched,
        "expected at least one hit on the rust-hash fixture, got: {:?}",
        hit_paths()
    );
}
|