From 21b52bc285423d7ae1e180c03c413514046cbe11 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 12:06:01 +0000 Subject: [PATCH] style: cargo fmt --all (S3+S4+S5+S7 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V009 morphological tokenizer 작업 (S3 chunk + S4 backfill + S5 short_query_hint 제거 + S7 신규 tests) 의 형식 정리. 동작 변경 없음. Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S11) --- crates/kebab-app/tests/search_korean.rs | 5 ++--- crates/kebab-chunk/src/lib.rs | 2 +- crates/kebab-search/src/lexical.rs | 10 ++------- crates/kebab-store-sqlite/src/store.rs | 7 ++++-- crates/kebab-store-sqlite/tests/fts.rs | 29 +++++++++++++++++-------- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/crates/kebab-app/tests/search_korean.rs b/crates/kebab-app/tests/search_korean.rs index d2b916e..146c1db 100644 --- a/crates/kebab-app/tests/search_korean.rs +++ b/crates/kebab-app/tests/search_korean.rs @@ -187,9 +187,8 @@ fn korean_morphological_mixed_english_korean_query() { hits.iter().map(|h| &h.doc_path.0).collect::>() ); - let hits = - kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화")) - .expect("search 최적화"); + let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화")) + .expect("search 최적화"); assert!( !hits.is_empty(), "'최적화' Korean morpheme must hit; got {:?}", diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 36225d6..0279935 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -50,7 +50,7 @@ pub use pdf_page_v1::PdfPageV1Chunker; // ── Korean morphological tokenizer ─────────────────────────────────────────── -use lindera::dictionary::{load_embedded_dictionary, DictionaryKind}; +use lindera::dictionary::{DictionaryKind, load_embedded_dictionary}; use lindera::mode::Mode; use lindera::segmenter::Segmenter; use lindera::tokenizer::Tokenizer; diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 2aec6d3..8101e5b 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -663,14 +663,8 @@ mod tests { assert!(build_match_string("키").is_none()); assert!(build_match_string("나").is_none()); // 2-char Korean queries now produce a valid expression (V009 unicode61). - assert_eq!( - build_match_string("충돌").unwrap(), - r#"text : ("충돌")"# - ); - assert_eq!( - build_match_string(" 충돌 ").unwrap(), - r#"text : ("충돌")"# - ); + assert_eq!(build_match_string("충돌").unwrap(), r#"text : ("충돌")"#); + assert_eq!(build_match_string(" 충돌 ").unwrap(), r#"text : ("충돌")"#); } /// V009 unicode61: `해시 충돌` — both tokens are 2 chars and now pass diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 0a0324c..0948470 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -517,7 +517,9 @@ impl SqliteStore { ) .map_err(StoreError::from)?; let iter = stmt - .query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))) + .query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + }) .map_err(StoreError::from)?; let mut out = Vec::new(); for r in iter { @@ -1100,7 +1102,8 @@ impl SqliteStore { /// means "delete everything older than now" (i.e. all past rows). pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result { use time::format_description::well_known::Rfc3339; - let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days)); + let cutoff = + time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days)); let cutoff_ts = cutoff .format(&Rfc3339) .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()); diff --git a/crates/kebab-store-sqlite/tests/fts.rs b/crates/kebab-store-sqlite/tests/fts.rs index fc52f25..7c4c08e 100644 --- a/crates/kebab-store-sqlite/tests/fts.rs +++ b/crates/kebab-store-sqlite/tests/fts.rs @@ -465,8 +465,20 @@ fn backfill_tokenized_korean_text_populates_nullable_rows() { // chunks 에 한국어 row 두 개 INSERT (tokenized_korean_text 는 chunks_ai trigger // 가 채우지만, 여기서는 raw_conn_no_fk 로 직접 INSERT 하므로 NULL 로 남음). let conn = raw_conn_no_fk(&env); - insert_chunk(&conn, &"a".repeat(32), &"d".repeat(32), "[]", "한국 문화는 오래되었다"); - insert_chunk(&conn, &"b".repeat(32), &"d".repeat(32), "[]", "서울특별시는 한국의 수도"); + insert_chunk( + &conn, + &"a".repeat(32), + &"d".repeat(32), + "[]", + "한국 문화는 오래되었다", + ); + insert_chunk( + &conn, + &"b".repeat(32), + &"d".repeat(32), + "[]", + "서울특별시는 한국의 수도", + ); let null_count_before: i64 = conn .query_row( "SELECT COUNT(*) FROM chunks WHERE tokenized_korean_text IS NULL", @@ -579,7 +591,11 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() { assert_eq!(count_match(&conn, "충돌은"), 1, "whole-token '충돌은' hit"); assert_eq!(count_match(&conn, "해시"), 1, "whole-token '해시' hit"); // substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit. - assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit"); + assert_eq!( + count_match(&conn, "발생한"), + 0, + "substring '발생한' of '발생한다' 0-hit" + ); } // ── 8. V009 morphological tokenizer behavior ────────────────────────── @@ -602,12 +618,7 @@ fn fts_v009_korean_morphological_2char_query_hits() { policy_hash, block_ids_json, created_at, tokenized_korean_text ) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)", - rusqlite::params![ - &"k".repeat(32), - &"d".repeat(32), - text, - tokenized, - ], + rusqlite::params![&"k".repeat(32), &"d".repeat(32), text, tokenized,], ) .expect("insert chunk with tokenized_korean_text");