style: cargo fmt --all (S3+S4+S5+S7 follow-up)
V009 morphological tokenizer 작업 (S3 chunk + S4 backfill + S5 short_query_hint 제거 + S7 신규 tests) 의 형식 정리. 동작 변경 없음.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S11)
This commit is contained in:
@@ -187,8 +187,7 @@ fn korean_morphological_mixed_english_korean_query() {
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
.expect("search 최적화");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
|
||||
@@ -50,7 +50,7 @@ pub use pdf_page_v1::PdfPageV1Chunker;
|
||||
|
||||
// ── Korean morphological tokenizer ───────────────────────────────────────────
|
||||
|
||||
use lindera::dictionary::{load_embedded_dictionary, DictionaryKind};
|
||||
use lindera::dictionary::{DictionaryKind, load_embedded_dictionary};
|
||||
use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
use lindera::tokenizer::Tokenizer;
|
||||
|
||||
@@ -663,14 +663,8 @@ mod tests {
|
||||
assert!(build_match_string("키").is_none());
|
||||
assert!(build_match_string("나").is_none());
|
||||
// 2-char Korean queries now produce a valid expression (V009 unicode61).
|
||||
assert_eq!(
|
||||
build_match_string("충돌").unwrap(),
|
||||
r#"text : ("충돌")"#
|
||||
);
|
||||
assert_eq!(
|
||||
build_match_string(" 충돌 ").unwrap(),
|
||||
r#"text : ("충돌")"#
|
||||
);
|
||||
assert_eq!(build_match_string("충돌").unwrap(), r#"text : ("충돌")"#);
|
||||
assert_eq!(build_match_string(" 충돌 ").unwrap(), r#"text : ("충돌")"#);
|
||||
}
|
||||
|
||||
/// V009 unicode61: `해시 충돌` — both tokens are 2 chars and now pass
|
||||
|
||||
@@ -517,7 +517,9 @@ impl SqliteStore {
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
let iter = stmt
|
||||
.query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)))
|
||||
.query_map([], |row| {
|
||||
Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
|
||||
})
|
||||
.map_err(StoreError::from)?;
|
||||
let mut out = Vec::new();
|
||||
for r in iter {
|
||||
@@ -1100,7 +1102,8 @@ impl SqliteStore {
|
||||
/// means "delete everything older than now" (i.e. all past rows).
|
||||
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
|
||||
let cutoff =
|
||||
time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
|
||||
let cutoff_ts = cutoff
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());
|
||||
|
||||
@@ -465,8 +465,20 @@ fn backfill_tokenized_korean_text_populates_nullable_rows() {
|
||||
// chunks 에 한국어 row 두 개 INSERT (tokenized_korean_text 는 chunks_ai trigger
|
||||
// 가 채우지만, 여기서는 raw_conn_no_fk 로 직접 INSERT 하므로 NULL 로 남음).
|
||||
let conn = raw_conn_no_fk(&env);
|
||||
insert_chunk(&conn, &"a".repeat(32), &"d".repeat(32), "[]", "한국 문화는 오래되었다");
|
||||
insert_chunk(&conn, &"b".repeat(32), &"d".repeat(32), "[]", "서울특별시는 한국의 수도");
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&"a".repeat(32),
|
||||
&"d".repeat(32),
|
||||
"[]",
|
||||
"한국 문화는 오래되었다",
|
||||
);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&"b".repeat(32),
|
||||
&"d".repeat(32),
|
||||
"[]",
|
||||
"서울특별시는 한국의 수도",
|
||||
);
|
||||
let null_count_before: i64 = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*) FROM chunks WHERE tokenized_korean_text IS NULL",
|
||||
@@ -579,7 +591,11 @@ fn fts_v009_unicode61_space_separated_korean_token_hits() {
|
||||
assert_eq!(count_match(&conn, "충돌은"), 1, "whole-token '충돌은' hit");
|
||||
assert_eq!(count_match(&conn, "해시"), 1, "whole-token '해시' hit");
|
||||
// substring (token 의 부분 문자열) 은 V009 unicode61 에서 0-hit.
|
||||
assert_eq!(count_match(&conn, "발생한"), 0, "substring '발생한' of '발생한다' 0-hit");
|
||||
assert_eq!(
|
||||
count_match(&conn, "발생한"),
|
||||
0,
|
||||
"substring '발생한' of '발생한다' 0-hit"
|
||||
);
|
||||
}
|
||||
|
||||
// ── 8. V009 morphological tokenizer behavior ──────────────────────────
|
||||
@@ -602,12 +618,7 @@ fn fts_v009_korean_morphological_2char_query_hits() {
|
||||
policy_hash, block_ids_json, created_at,
|
||||
tokenized_korean_text
|
||||
) VALUES (?, ?, ?, '[]', NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z', ?)",
|
||||
rusqlite::params![
|
||||
&"k".repeat(32),
|
||||
&"d".repeat(32),
|
||||
text,
|
||||
tokenized,
|
||||
],
|
||||
rusqlite::params![&"k".repeat(32), &"d".repeat(32), text, tokenized,],
|
||||
)
|
||||
.expect("insert chunk with tokenized_korean_text");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user