style: cargo fmt --all (S3+S4+S5+S7 follow-up)

V009 morphological tokenizer 작업 (S3 chunk + S4 backfill + S5
short_query_hint 제거 + S7 신규 tests)의 형식 정리. 동작 변경 없음.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S11)
This commit is contained in:
2026-05-28 12:06:01 +00:00
parent 97fd895a10
commit 21b52bc285
5 changed files with 30 additions and 23 deletions

View File

@@ -187,9 +187,8 @@ fn korean_morphological_mixed_english_korean_query() {
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
);
let hits =
kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
.expect("search 최적화");
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
.expect("search 최적화");
assert!(
!hits.is_empty(),
"'최적화' Korean morpheme must hit; got {:?}",

View File

@@ -50,7 +50,7 @@ pub use pdf_page_v1::PdfPageV1Chunker;
// ── Korean morphological tokenizer ───────────────────────────────────────────
use lindera::dictionary::{load_embedded_dictionary, DictionaryKind};
use lindera::dictionary::{DictionaryKind, load_embedded_dictionary};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;

View File

@@ -663,14 +663,8 @@ mod tests {
assert!(build_match_string("").is_none());
assert!(build_match_string("").is_none());
// 2-char Korean queries now produce a valid expression (V009 unicode61).
assert_eq!(
build_match_string("충돌").unwrap(),
r#"text : ("충돌")"#
);
assert_eq!(
build_match_string(" 충돌 ").unwrap(),
r#"text : ("충돌")"#
);
assert_eq!(build_match_string("충돌").unwrap(), r#"text : ("충돌")"#);
assert_eq!(build_match_string(" 충돌 ").unwrap(), r#"text : ("충돌")"#);
}
/// V009 unicode61: `해시 충돌` — both tokens are 2 chars and now pass

View File

@@ -517,7 +517,9 @@ impl SqliteStore {
)
.map_err(StoreError::from)?;
let iter = stmt
.query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)))
.query_map([], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
})
.map_err(StoreError::from)?;
let mut out = Vec::new();
for r in iter {
@@ -1100,7 +1102,8 @@ impl SqliteStore {
/// means "delete everything older than now" (i.e. all past rows).
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
use time::format_description::well_known::Rfc3339;
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
let cutoff =
time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
let cutoff_ts = cutoff
.format(&Rfc3339)
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());

View File

@@ -465,8 +465,20 @@ fn backfill_tokenized_korean_text_populates_nullable_rows() {
// chunks 에 한국어 row 두 개 INSERT (tokenized_korean_text 는 chunks_ai trigger
// 가 채우지만, 여기서는 raw_conn_no_fk 로 직접 INSERT 하므로 NULL 로 남음).
let conn = raw_conn_no_fk(&env);
insert_chunk(&conn, &"a".repeat(32), &"d".repeat(32), "[]", "한국 문화는 오래되었다");
insert_chunk(&conn, &"b".repeat(32), &"d".repeat(32), "[]", "서울특별시는 한국의 수도");
insert_chunk(
&conn,
&"a".repeat(32),
&"d".repeat(32),
"[]",
"한국 문화는 오래되었다",
);
insert_chunk(
&conn,
&"b".repeat(32),
&"d".repeat(32),
"[]",
"서울특별시는 한국의 수도",
);
let null_count_before: i64 = conn
.query_row(
"SELECT COUNT(*) FROM chunks WHERE tokenized_korean_text IS NULL",
@@ -579,7 +591,11 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() {
assert_eq!(count_match(&conn, "충돌은"), 1, "whole-token '충돌은' hit");
assert_eq!(count_match(&conn, "해시"), 1, "whole-token '해시' hit");
// substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit.
assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit");
assert_eq!(
count_match(&conn, "발생한"),
0,
"substring '발생한' of '발생한다' 0-hit"
);
}
// ── 8. V009 morphological tokenizer behavior ──────────────────────────
@@ -602,12 +618,7 @@ fn fts_v009_korean_morphological_2char_query_hits() {
policy_hash, block_ids_json, created_at,
tokenized_korean_text
) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)",
rusqlite::params![
&"k".repeat(32),
&"d".repeat(32),
text,
tokenized,
],
rusqlite::params![&"k".repeat(32), &"d".repeat(32), text, tokenized,],
)
.expect("insert chunk with tokenized_korean_text");