fix(chunk): close S3 reviewer blockers — get_chunk read + AST chunker cascade

S3 spec compliance reviewer (sonnet) 가 blocker 2건을 발견:

1. crates/kebab-store-sqlite/src/documents.rs: get_chunk 의 SELECT 가
   tokenized_korean_text column 을 조회하지 않음 → DB 에 저장된 값이 read 시 유실.
   수정: SELECT column list 에 tokenized_korean_text 를 추가하고, ChunkRow
   struct → chunk_row_from_sql (row.get(9)) → get_chunk 의 Chunk 생성까지 cascade.

2. crates/kebab-chunk/src/code_*_ast_v1.rs (9개 파일): make_chunk 가
   tokenized_korean_text: None 을 하드코딩 → 한국어 주석을 가진 코드
   파일이 FTS 에 hit 되지 않음. 수정: tier2_shared 와 동일한 패턴으로
   tokenize_korean_morphological(&text) 호출을 9개 파일에 cascade.

이 commit 은 S3 의 rework 이며, S3 boundary 를 유지하기 위해 amend 가
아닌 별도 commit 으로 분리. spec §6.2 invariant ("모든 chunker 가 chunk
emit 직전에 tokenize 호출") 를 충족.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md §6.2
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S3 rework)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 10:30:53 +00:00
parent b134ae9dd5
commit bd86f61c9c
10 changed files with 13 additions and 11 deletions

View File

@@ -145,7 +145,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -147,7 +147,7 @@ fn make_chunk(
chunk_id,
doc_id: DocumentId(doc.doc_id.0.clone()),
block_ids: block_ids.to_vec(),
-tokenized_korean_text: None,
+tokenized_korean_text: crate::tokenize_korean_morphological(&text),
text,
heading_path: Vec::new(),
source_spans: vec![span],

View File

@@ -223,7 +223,7 @@ impl kebab_core::DocumentStore for SqliteStore {
"SELECT
chunk_id, doc_id, text, heading_path_json,
source_spans_json, token_estimate, chunker_version,
-policy_hash, block_ids_json
+policy_hash, block_ids_json, tokenized_korean_text
FROM chunks WHERE chunk_id = ?",
params![id.0],
chunk_row_from_sql,
@@ -249,7 +249,7 @@ impl kebab_core::DocumentStore for SqliteStore {
token_estimate: row.token_estimate as usize,
chunker_version: kebab_core::ChunkerVersion(row.chunker_version),
policy_hash: row.policy_hash,
-tokenized_korean_text: None,
+tokenized_korean_text: row.tokenized_korean_text,
}))
}
@@ -560,6 +560,7 @@ struct ChunkRow {
chunker_version: String,
policy_hash: String,
block_ids_json: String,
+tokenized_korean_text: Option<String>,
}
fn chunk_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkRow> {
@@ -573,6 +574,7 @@ fn chunk_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkRow> {
chunker_version: row.get(6)?,
policy_hash: row.get(7)?,
block_ids_json: row.get(8)?,
+tokenized_korean_text: row.get(9)?,
})
}