Files
kebab/crates/kebab-core/src/chunk.rs
altair823 b1c5feb3f3 refactor(core): Chunk.aliases 필드 제거
doc-side expansion(별칭) 제거 — Chunk 의 aliases: Option<String> 필드와
serde default 테스트 제거. Metadata.aliases(Vec, 문서 메타)는 유지.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 21:36:44 +00:00

55 lines
1.8 KiB
Rust

//! Chunk (§3.5).
use serde::{Deserialize, Serialize};
use crate::document::SourceSpan;
use crate::ids::{BlockId, ChunkId, DocumentId};
use crate::versions::ChunkerVersion;
/// A unit of retrievable text per design §3.5 + §5.5.
///
/// `policy_hash` is the chunker's hex digest of the active `ChunkPolicy`
/// (e.g. `target_tokens`, `overlap_tokens`). It mirrors the §5.5 SQLite
/// schema column so persistence is a straight copy, and feeds the
/// `chunk_id` recipe (§4.2) so policy edits invalidate downstream IDs.
///
/// The struct is serde-(de)serializable; every field is required on
/// deserialization except `tokenized_korean_text`, which falls back to
/// `None` when absent.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Chunk {
/// Identifier of this chunk. `policy_hash` is one input of its recipe
/// (§4.2), as described in the struct-level docs.
pub chunk_id: ChunkId,
/// Identifier of a document — presumably the one this chunk was cut
/// from; confirm against the chunker.
pub doc_id: DocumentId,
/// Block identifiers associated with this chunk — presumably the source
/// blocks its text covers; TODO confirm ordering guarantees.
pub block_ids: Vec<BlockId>,
/// The chunk's raw text. This is what gets FTS5-indexed on its own when
/// `tokenized_korean_text` is `None` (see that field).
pub text: String,
/// Heading path for the chunk — presumably the enclosing headings,
/// outermost first; TODO confirm.
pub heading_path: Vec<String>,
/// Source spans for the chunk; see `SourceSpan` for the span semantics.
pub source_spans: Vec<SourceSpan>,
/// Estimated token count for the chunk. NOTE(review): which tokenizer
/// the estimate refers to is not visible from this file.
pub token_estimate: usize,
/// Chunker version tag — presumably the version that produced this
/// chunk; confirm against `ChunkerVersion`.
pub chunker_version: ChunkerVersion,
/// Hex digest of the active `ChunkPolicy`; see the struct-level docs.
pub policy_hash: String,
/// Korean morpheme-segmented token sequence, joined with spaces. The
/// chunker pre-fills it using lindera ko-dic. When `None`, only the raw
/// text is indexed in FTS5.
/// Part of the V009 cascade that fixes Bug #8 (two-character Korean
/// queries).
#[serde(default)]
pub tokenized_korean_text: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// JSON written before `tokenized_korean_text` existed must still parse:
    /// `#[serde(default)]` supplies `None` for the missing field.
    #[test]
    fn tokenized_korean_text_defaults_to_none_on_deserialize() {
        // Legacy payload: every field is present except `tokenized_korean_text`.
        let legacy_json = r#"{
"chunk_id": "c1",
"doc_id": "d1",
"block_ids": [],
"text": "hello",
"heading_path": [],
"source_spans": [],
"token_estimate": 1,
"chunker_version": "md-heading-v1",
"policy_hash": "abc"
}"#;

        let parsed = serde_json::from_str::<Chunk>(legacy_json).unwrap();

        assert!(parsed.tokenized_korean_text.is_none());
    }
}