fix(chunk): close S3 reviewer blockers — get_chunk read + AST chunker cascade
S3 spec compliance reviewer (sonnet) 가 2 blocker 발견:
1. crates/kebab-store-sqlite/src/documents.rs: get_chunk 의 SELECT 가
tokenized_korean_text column 을 조회하지 않음 → DB 에 저장된 값이 read 시 유실.
SELECT column list 에 column 추가 + row → Chunk 변환 시 row.get(9) 인덱스 추가.
ChunkRow struct + chunk_row_from_sql + get_chunk 의 Chunk 생성까지 cascade.
2. crates/kebab-chunk/src/code_*_ast_v1.rs (9 files): make_chunk 가
tokenized_korean_text: None 을 하드코딩 → 한국어 주석을 가진 코드
파일이 FTS 에서 hit 되지 않음. tier2_shared 와 동일한 패턴으로
tokenize_korean_morphological(&text) 를 호출하도록 cascade.
이 commit 은 S3 의 rework — amend 가 아닌 별도 commit (S3 boundary
유지). spec §6.2 invariant ("모든 chunker 가 chunk emit 직전에
tokenize 호출") 충족.
Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §6.2
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S3 rework)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -145,7 +145,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -147,7 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
|
||||
@@ -223,7 +223,7 @@ impl kebab_core::DocumentStore for SqliteStore {
|
||||
"SELECT
|
||||
chunk_id, doc_id, text, heading_path_json,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json
|
||||
policy_hash, block_ids_json, tokenized_korean_text
|
||||
FROM chunks WHERE chunk_id = ?",
|
||||
params![id.0],
|
||||
chunk_row_from_sql,
|
||||
@@ -249,7 +249,7 @@ impl kebab_core::DocumentStore for SqliteStore {
|
||||
token_estimate: row.token_estimate as usize,
|
||||
chunker_version: kebab_core::ChunkerVersion(row.chunker_version),
|
||||
policy_hash: row.policy_hash,
|
||||
tokenized_korean_text: None,
|
||||
tokenized_korean_text: row.tokenized_korean_text,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -560,6 +560,7 @@ struct ChunkRow {
|
||||
chunker_version: String,
|
||||
policy_hash: String,
|
||||
block_ids_json: String,
|
||||
tokenized_korean_text: Option<String>,
|
||||
}
|
||||
|
||||
fn chunk_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkRow> {
|
||||
@@ -573,6 +574,7 @@ fn chunk_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkRow> {
|
||||
chunker_version: row.get(6)?,
|
||||
policy_hash: row.get(7)?,
|
||||
block_ids_json: row.get(8)?,
|
||||
tokenized_korean_text: row.get(9)?,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user