feat(store): V010 chunk_aliases_fts + put_chunks 별칭 영속화

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 02:15:27 +00:00
parent 848b75c069
commit b12a616ab2
4 changed files with 208 additions and 11 deletions

View File

@@ -106,8 +106,8 @@ impl kebab_core::DocumentStore for SqliteStore {
chunk_id, doc_id, text, heading_path_json,
section_label, source_spans_json, token_estimate,
chunker_version, policy_hash, block_ids_json, created_at,
tokenized_korean_text
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
tokenized_korean_text, aliases
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
)
.map_err(StoreError::from)?;
for chunk in chunks {
@@ -136,6 +136,7 @@ impl kebab_core::DocumentStore for SqliteStore {
block_ids,
now,
chunk.tokenized_korean_text.as_deref(),
chunk.aliases.as_deref(),
])
.map_err(StoreError::from)?;
}

View File

@@ -0,0 +1,163 @@
//! V010 doc-side expansion: `put_chunks` 가 `chunk.aliases` 를 chunks.aliases
//! 컬럼에 영속화하고, chunk_aliases_ai trigger 가 별도 `chunk_aliases_fts`
//! 가상 테이블로 mirror 하는지 검증.
//!
//! `put_chunks` 는 store-owned conn(FK ON)에서 돌므로 chunks 의
//! `doc_id REFERENCES documents(doc_id)` FK 를 만족시키려면 asset +
//! document 그래프가 먼저 있어야 한다. 헬퍼는 `idempotency.rs` 패턴 복제.
//! 인덱싱 검증은 side-channel `env.with_conn` 으로 chunk_aliases_fts 를 직접
//! MATCH 한다(같은 established 패턴).
use std::path::PathBuf;
use kebab_core::{
AssetId, AssetStorage, Block, CanonicalDocument, Checksum, Chunk, ChunkerVersion, CommonBlock,
DocumentId, DocumentStore, HeadingBlock, Lang, MediaType, Metadata, ParserVersion, Provenance,
SourceSpan, SourceType, SourceUri, TextBlock, TrustLevel, WorkspacePath,
};
use kebab_store_sqlite::SqliteStore;
use time::OffsetDateTime;
mod common;
/// Builds the fixture `RawAsset`: a Markdown file reference at `/tmp/foo.md`
/// (workspace path `notes/foo.md`) whose id is 32 `a` characters and whose
/// checksum is the blake3 hex digest of the bytes `dummy`.
fn make_asset() -> kebab_core::RawAsset {
    let payload = b"dummy";
    // The same digest is used for both the asset checksum and the stored
    // reference's `sha`, so compute it once and clone.
    let digest = blake3::hash(payload).to_hex().to_string();
    let location = PathBuf::from("/tmp/foo.md");
    kebab_core::RawAsset {
        asset_id: AssetId("a".repeat(32)),
        source_uri: SourceUri::File(location.clone()),
        workspace_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
        media_type: MediaType::Markdown,
        byte_len: payload.len() as u64,
        checksum: Checksum(digest.clone()),
        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
        stored: AssetStorage::Reference {
            path: location,
            sha: Checksum(digest),
        },
    }
}
/// Builds the fixture `Metadata`: no aliases or tags, Markdown source,
/// primary trust level, identical created/updated timestamps, and every
/// optional field left unset.
fn make_metadata() -> Metadata {
    // Both timestamps use the same fixed instant.
    let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
    Metadata {
        aliases: Vec::new(),
        tags: Vec::new(),
        created_at: stamp,
        updated_at: stamp,
        source_type: SourceType::Markdown,
        trust_level: TrustLevel::Primary,
        user_id_alias: None,
        user: Default::default(),
        repo: None,
        git_branch: None,
        git_commit: None,
        code_lang: None,
    }
}
/// Builds the fixture `CanonicalDocument` (id: 32 `d` characters) that points
/// at the asset id of 32 `a` characters and contains two blocks: a level-1
/// heading "Title" and a paragraph "body" nested under that heading.
fn make_doc() -> CanonicalDocument {
    // Both blocks share the same single-line source span.
    let line_one = SourceSpan::Line { start: 1, end: 1 };

    let heading = Block::Heading(HeadingBlock {
        common: CommonBlock {
            block_id: kebab_core::BlockId("b".repeat(32)),
            heading_path: Vec::new(),
            source_span: line_one.clone(),
        },
        level: 1,
        text: "Title".into(),
    });

    let paragraph = Block::Paragraph(TextBlock {
        common: CommonBlock {
            block_id: kebab_core::BlockId("c".repeat(32)),
            heading_path: vec!["Title".into()],
            source_span: line_one,
        },
        text: "body".into(),
        inlines: Vec::new(),
    });

    CanonicalDocument {
        doc_id: DocumentId("d".repeat(32)),
        source_asset_id: AssetId("a".repeat(32)),
        workspace_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
        title: "Title".into(),
        lang: Lang("en".into()),
        blocks: vec![heading, paragraph],
        metadata: make_metadata(),
        provenance: Provenance { events: Vec::new() },
        parser_version: ParserVersion("test-parser".into()),
        schema_version: 1,
        doc_version: 1,
        last_chunker_version: None,
        last_embedding_version: None,
    }
}
/// Builds a single fixture chunk. Only `chunk_id`, `doc_id` and `aliases`
/// come from the caller; every other field is a fixed value (the text is
/// "Rust ownership and borrowing", with no Korean tokenized text).
fn base_chunk(chunk_id: &str, doc_id: &DocumentId, aliases: Option<String>) -> Chunk {
    let owning_block = kebab_core::BlockId("b".repeat(32));
    let span = SourceSpan::Line { start: 1, end: 1 };
    let id = kebab_core::ChunkId(chunk_id.into());

    Chunk {
        chunk_id: id,
        doc_id: doc_id.clone(),
        block_ids: vec![owning_block],
        text: "Rust ownership and borrowing".into(),
        heading_path: vec!["Title".into()],
        source_spans: vec![span],
        token_estimate: 5,
        chunker_version: ChunkerVersion("md-heading-v1".into()),
        policy_hash: "h".into(),
        tokenized_korean_text: None,
        aliases,
    }
}
/// Opens a store from the test environment's config, runs migrations, then
/// inserts the fixture asset and document, and returns the store.
fn open_store_with_document(env: &common::TestEnv) -> SqliteStore {
    let sqlite = {
        let cfg = env.config();
        SqliteStore::open(&cfg).unwrap()
    };
    sqlite.run_migrations().unwrap();

    // Insert the asset first, then the document that references it.
    let asset = make_asset();
    sqlite.put_asset(&asset).expect("put_asset");
    let document = make_doc();
    sqlite.put_document(&document).expect("put_document");

    sqlite
}
/// A chunk stored with `aliases = Some(..)` must be findable through
/// `chunk_aliases_fts` using a term that occurs only in the aliases.
#[test]
fn aliases_indexed_into_chunk_aliases_fts() {
    let env = common::TestEnv::new();
    let store = open_store_with_document(&env);

    let doc_id = DocumentId("d".repeat(32));
    let chunk_id = "e".repeat(32);
    let alias_text: String = "메모리 안전성\nwho owns the value".into();
    let chunk = base_chunk(&chunk_id, &doc_id, Some(alias_text));
    store.put_chunks(&doc_id, &[chunk]).unwrap();

    // Query chunk_aliases_fts with a Korean term that appears only in the
    // aliases (not in the chunk text); the chunk should be found.
    let sql = "SELECT count(*) FROM chunk_aliases_fts \
               WHERE chunk_aliases_fts MATCH 'aliases : (\"메모리\")'";
    let hits: i64 = env.with_conn(|conn| conn.query_row(sql, [], |row| row.get(0)));

    assert_eq!(
        hits, 1,
        "aliases 의 한국어 term 이 chunk_aliases_fts 에 색인돼야 한다"
    );
}
/// A chunk stored with `aliases = None` must leave `chunk_aliases_fts` empty.
#[test]
fn none_aliases_not_indexed() {
    let env = common::TestEnv::new();
    let store = open_store_with_document(&env);

    let doc_id = DocumentId("d".repeat(32));
    let chunk_id = "e".repeat(32);
    let chunk = base_chunk(&chunk_id, &doc_id, None);
    store.put_chunks(&doc_id, &[chunk]).unwrap();

    // Count every row in the alias FTS table; none should have been mirrored.
    let sql = "SELECT count(*) FROM chunk_aliases_fts";
    let rows: i64 = env.with_conn(|conn| conn.query_row(sql, [], |row| row.get(0)));

    assert_eq!(
        rows, 0,
        "aliases=None 이면 chunk_aliases_fts 에 행이 없어야 한다"
    );
}

View File

@@ -20,26 +20,26 @@ fn open_store(tmp: &TempDir) -> SqliteStore {
store
}
/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then V009
/// migration bumps it by one to invalidate any pre-V009 LRU cache —
/// so a fresh store after `run_migrations()` reads back as `1`.
/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then both V009
/// and V010 migrations bump it by one each to invalidate any stale LRU
/// cache — so a fresh store after `run_migrations()` reads back as `2`.
#[test]
fn fresh_store_starts_at_post_migration_baseline() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
assert_eq!(store.corpus_revision(), 1);
assert_eq!(store.corpus_revision(), 2);
}
/// Each `bump_corpus_revision` returns the new value monotonically
/// from the post-migration baseline.
/// from the post-migration baseline (V009 + V010 → 2).
#[test]
fn bump_increments_monotonically() {
let tmp = TempDir::new().unwrap();
let store = open_store(&tmp);
assert_eq!(store.bump_corpus_revision().unwrap(), 2);
assert_eq!(store.bump_corpus_revision().unwrap(), 3);
assert_eq!(store.bump_corpus_revision().unwrap(), 4);
assert_eq!(store.corpus_revision(), 4);
assert_eq!(store.bump_corpus_revision().unwrap(), 5);
assert_eq!(store.corpus_revision(), 5);
}
/// `corpus_revision` survives a store re-open (persisted in SQLite).
@@ -52,6 +52,6 @@ fn revision_persists_across_reopen() {
store.bump_corpus_revision().unwrap();
} // store dropped — file closed
let store = open_store(&tmp);
assert_eq!(store.corpus_revision(), 3);
assert_eq!(store.bump_corpus_revision().unwrap(), 4);
assert_eq!(store.corpus_revision(), 4);
assert_eq!(store.bump_corpus_revision().unwrap(), 5);
}

View File

@@ -0,0 +1,33 @@
-- V010__chunk_aliases.sql — search alias channel for doc-side expansion (Phase 2).
--
-- Design spec: docs/superpowers/specs/2026-05-30-doc-side-expansion-design.md §4.
-- Adds a nullable `aliases` column to chunks, a separate FTS5 table
-- chunk_aliases_fts, and separate sync triggers. The existing chunks_fts and
-- chunks_ai/ad/au (design §5.5 verbatim, CI test
-- fts_v009_matches_design_section_5_5_verbatim) are left unmodified.
-- aliases is additive: when aliases are not generated / the flag is off the
-- column is NULL, so chunk_aliases_fts starts empty, the second arm of the
-- search UNION yields 0 rows, and behavior matches the pre-V010 behavior.
-- There is no automatic backfill.
ALTER TABLE chunks ADD COLUMN aliases TEXT;
-- Full-text table for chunk aliases. Only the `aliases` column is indexed;
-- chunk_id and doc_id are declared UNINDEXED and ride along with each row.
-- Text is tokenized with the unicode61 tokenizer.
CREATE VIRTUAL TABLE chunk_aliases_fts USING fts5(
chunk_id UNINDEXED,
doc_id UNINDEXED,
aliases,
tokenize = 'unicode61'
);
-- After-insert sync: copy a new chunk's aliases into chunk_aliases_fts.
-- The WHEN clause skips chunks whose aliases is NULL, so they get no FTS row.
CREATE TRIGGER chunk_aliases_ai AFTER INSERT ON chunks WHEN new.aliases IS NOT NULL BEGIN
INSERT INTO chunk_aliases_fts(chunk_id, doc_id, aliases)
VALUES (new.chunk_id, new.doc_id, new.aliases);
END;
-- After-delete sync: drop the FTS row(s) belonging to the deleted chunk.
-- There is no WHEN clause, so this runs for every deleted chunk, including
-- ones whose aliases was NULL.
-- NOTE(review): chunk_id is UNINDEXED in chunk_aliases_fts, so this equality
-- filter presumably cannot use the full-text index — confirm the per-delete
-- cost is acceptable, or consider guarding with `WHEN old.aliases IS NOT NULL`.
CREATE TRIGGER chunk_aliases_ad AFTER DELETE ON chunks BEGIN
DELETE FROM chunk_aliases_fts WHERE chunk_id = old.chunk_id;
END;
-- After-update sync: remove the FTS row(s) for the old chunk_id, then
-- re-insert a row only if the updated chunk has non-NULL aliases (the
-- SELECT ... WHERE produces zero rows when new.aliases is NULL).
-- NOTE(review): the trigger is not restricted to specific columns, so it
-- fires on every UPDATE of chunks, even when aliases did not change —
-- confirm this is intended.
CREATE TRIGGER chunk_aliases_au AFTER UPDATE ON chunks BEGIN
DELETE FROM chunk_aliases_fts WHERE chunk_id = old.chunk_id;
INSERT INTO chunk_aliases_fts(chunk_id, doc_id, aliases)
SELECT new.chunk_id, new.doc_id, new.aliases WHERE new.aliases IS NOT NULL;
END;
-- Invalidate the in-process LRU search cache (same pattern as V009):
-- increment the integer stored as text under the 'corpus_revision' key in kv.
UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';