diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs
index e02db50..b1cc3de 100644
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -106,8 +106,8 @@ impl kebab_core::DocumentStore for SqliteStore {
                         chunk_id, doc_id, text, heading_path_json, section_label,
                         source_spans_json, token_estimate, chunker_version,
                         policy_hash, block_ids_json, created_at,
-                        tokenized_korean_text
-                     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+                        tokenized_korean_text, aliases
+                     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                 )
                 .map_err(StoreError::from)?;
             for chunk in chunks {
@@ -136,6 +136,7 @@ impl kebab_core::DocumentStore for SqliteStore {
                     block_ids,
                     now,
                     chunk.tokenized_korean_text.as_deref(),
+                    chunk.aliases.as_deref(),
                 ])
                 .map_err(StoreError::from)?;
             }
diff --git a/crates/kebab-store-sqlite/tests/chunk_aliases.rs b/crates/kebab-store-sqlite/tests/chunk_aliases.rs
new file mode 100644
index 0000000..96c7cd4
--- /dev/null
+++ b/crates/kebab-store-sqlite/tests/chunk_aliases.rs
@@ -0,0 +1,168 @@
+//! V010 doc-side expansion: verifies that `put_chunks` persists `chunk.aliases`
+//! into the chunks.aliases column and that the chunk_aliases_ai trigger mirrors
+//! it into the separate `chunk_aliases_fts` virtual table.
+//!
+//! `put_chunks` runs on the store-owned conn (FK ON), so the asset + document
+//! graph must exist first to satisfy the chunks
+//! `doc_id REFERENCES documents(doc_id)` FK. Helpers copy the `idempotency.rs`
+//! pattern. Indexing is verified by running MATCH directly against
+//! chunk_aliases_fts through the side-channel `env.with_conn` (same pattern).
+
+use std::path::PathBuf;
+
+use kebab_core::{
+    AssetId, AssetStorage, Block, CanonicalDocument, Checksum, Chunk, ChunkerVersion, CommonBlock,
+    DocumentId, DocumentStore, HeadingBlock, Lang, MediaType, Metadata, ParserVersion, Provenance,
+    SourceSpan, SourceType, SourceUri, TextBlock, TrustLevel, WorkspacePath,
+};
+use kebab_store_sqlite::SqliteStore;
+use time::OffsetDateTime;
+
+mod common;
+
+/// Builds the fixed Markdown asset (id `"a"` x 32) that `make_doc` refers to.
+fn make_asset() -> kebab_core::RawAsset {
+    let bytes = b"dummy";
+    kebab_core::RawAsset {
+        asset_id: AssetId("a".repeat(32)),
+        source_uri: SourceUri::File(PathBuf::from("/tmp/foo.md")),
+        workspace_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
+        media_type: MediaType::Markdown,
+        byte_len: bytes.len() as u64,
+        checksum: Checksum(blake3::hash(bytes).to_hex().to_string()),
+        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+        stored: AssetStorage::Reference {
+            path: PathBuf::from("/tmp/foo.md"),
+            sha: Checksum(blake3::hash(bytes).to_hex().to_string()),
+        },
+    }
+}
+
+/// Builds minimal document metadata: fixed timestamps, no aliases, no tags.
+fn make_metadata() -> Metadata {
+    Metadata {
+        aliases: vec![],
+        tags: vec![],
+        created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+        updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+        source_type: SourceType::Markdown,
+        trust_level: TrustLevel::Primary,
+        user_id_alias: None,
+        user: Default::default(),
+        repo: None,
+        git_branch: None,
+        git_commit: None,
+        code_lang: None,
+    }
+}
+
+/// Builds a two-block (heading + paragraph) document with id `"d"` x 32.
+fn make_doc() -> CanonicalDocument {
+    let doc_id = DocumentId("d".repeat(32));
+    let span = SourceSpan::Line { start: 1, end: 1 };
+    let block = Block::Heading(HeadingBlock {
+        common: CommonBlock {
+            block_id: kebab_core::BlockId("b".repeat(32)),
+            heading_path: vec![],
+            source_span: span.clone(),
+        },
+        level: 1,
+        text: "Title".into(),
+    });
+    let para = Block::Paragraph(TextBlock {
+        common: CommonBlock {
+            block_id: kebab_core::BlockId("c".repeat(32)),
+            heading_path: vec!["Title".into()],
+            source_span: span,
+        },
+        text: "body".into(),
+        inlines: vec![],
+    });
+    CanonicalDocument {
+        doc_id,
+        source_asset_id: AssetId("a".repeat(32)),
+        workspace_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
+        title: "Title".into(),
+        lang: Lang("en".into()),
+        blocks: vec![block, para],
+        metadata: make_metadata(),
+        provenance: Provenance { events: vec![] },
+        parser_version: ParserVersion("test-parser".into()),
+        schema_version: 1,
+        doc_version: 1,
+        last_chunker_version: None,
+        last_embedding_version: None,
+    }
+}
+
+/// Builds a single chunk; only `aliases` is chosen by the caller.
+fn base_chunk(chunk_id: &str, doc_id: &DocumentId, aliases: Option<String>) -> Chunk {
+    Chunk {
+        chunk_id: kebab_core::ChunkId(chunk_id.into()),
+        doc_id: doc_id.clone(),
+        block_ids: vec![kebab_core::BlockId("b".repeat(32))],
+        text: "Rust ownership and borrowing".into(),
+        heading_path: vec!["Title".into()],
+        source_spans: vec![SourceSpan::Line { start: 1, end: 1 }],
+        token_estimate: 5,
+        chunker_version: ChunkerVersion("md-heading-v1".into()),
+        policy_hash: "h".into(),
+        tokenized_korean_text: None,
+        aliases,
+    }
+}
+
+/// Returns a migrated store with the asset + document graph already inserted.
+fn open_store_with_document(env: &common::TestEnv) -> SqliteStore {
+    let store = SqliteStore::open(&env.config()).unwrap();
+    store.run_migrations().unwrap();
+    store.put_asset(&make_asset()).expect("put_asset");
+    store.put_document(&make_doc()).expect("put_document");
+    store
+}
+
+/// A chunk stored with aliases must be retrievable through `chunk_aliases_fts`.
+#[test]
+fn aliases_indexed_into_chunk_aliases_fts() {
+    let env = common::TestEnv::new();
+    let store = open_store_with_document(&env);
+    let doc = DocumentId("d".repeat(32));
+    let chunk = base_chunk(
+        &"e".repeat(32),
+        &doc,
+        Some("메모리 안전성\nwho owns the value".into()),
+    );
+    store.put_chunks(&doc, &[chunk]).unwrap();
+
+    // Query chunk_aliases_fts with a Korean term found only in the aliases → chunk is returned.
+    let n: i64 = env.with_conn(|c| {
+        c.query_row(
+            "SELECT count(*) FROM chunk_aliases_fts \
+             WHERE chunk_aliases_fts MATCH 'aliases : (\"메모리\")'",
+            [],
+            |r| r.get(0),
+        )
+    });
+    assert_eq!(
+        n, 1,
+        "aliases 의 한국어 term 이 chunk_aliases_fts 에 색인돼야 한다"
+    );
+}
+
+/// A chunk stored with `aliases = None` must leave `chunk_aliases_fts` empty.
+#[test]
+fn none_aliases_not_indexed() {
+    let env = common::TestEnv::new();
+    let store = open_store_with_document(&env);
+    let doc = DocumentId("d".repeat(32));
+    let chunk = base_chunk(&"e".repeat(32), &doc, None);
+    store.put_chunks(&doc, &[chunk]).unwrap();
+
+    let n: i64 = env.with_conn(|c| {
+        c.query_row("SELECT count(*) FROM chunk_aliases_fts", [], |r| r.get(0))
+    });
+    assert_eq!(
+        n, 0,
+        "aliases=None 이면 chunk_aliases_fts 에 행이 없어야 한다"
+    );
+}
diff --git a/crates/kebab-store-sqlite/tests/corpus_revision.rs b/crates/kebab-store-sqlite/tests/corpus_revision.rs
index 2ba6026..488d81a 100644
--- a/crates/kebab-store-sqlite/tests/corpus_revision.rs
+++ b/crates/kebab-store-sqlite/tests/corpus_revision.rs
@@ -20,26 +20,26 @@ fn open_store(tmp: &TempDir) -> SqliteStore {
     store
 }
 
-/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then V009
-/// migration bumps it by one to invalidate any pre-V009 LRU cache —
-/// so a fresh store after `run_migrations()` reads back as `1`.
+/// Fresh store baseline: V004 seeds `corpus_revision = 0`, then both V009
+/// and V010 migrations bump it by one each to invalidate any stale LRU
+/// cache — so a fresh store after `run_migrations()` reads back as `2`.
 #[test]
 fn fresh_store_starts_at_post_migration_baseline() {
     let tmp = TempDir::new().unwrap();
     let store = open_store(&tmp);
-    assert_eq!(store.corpus_revision(), 1);
+    assert_eq!(store.corpus_revision(), 2);
 }
 
 /// Each `bump_corpus_revision` returns the new value monotonically
-/// from the post-migration baseline.
+/// from the post-migration baseline (V009 + V010 → 2).
 #[test]
 fn bump_increments_monotonically() {
     let tmp = TempDir::new().unwrap();
     let store = open_store(&tmp);
-    assert_eq!(store.bump_corpus_revision().unwrap(), 2);
     assert_eq!(store.bump_corpus_revision().unwrap(), 3);
     assert_eq!(store.bump_corpus_revision().unwrap(), 4);
-    assert_eq!(store.corpus_revision(), 4);
+    assert_eq!(store.bump_corpus_revision().unwrap(), 5);
+    assert_eq!(store.corpus_revision(), 5);
 }
 
 /// `corpus_revision` survives a store re-open (persisted in SQLite).
@@ -52,6 +52,6 @@ fn revision_persists_across_reopen() {
         store.bump_corpus_revision().unwrap();
     } // store dropped — file closed
     let store = open_store(&tmp);
-    assert_eq!(store.corpus_revision(), 3);
-    assert_eq!(store.bump_corpus_revision().unwrap(), 4);
+    assert_eq!(store.corpus_revision(), 4);
+    assert_eq!(store.bump_corpus_revision().unwrap(), 5);
 }
diff --git a/migrations/V010__chunk_aliases.sql b/migrations/V010__chunk_aliases.sql
new file mode 100644
index 0000000..130dbad
--- /dev/null
+++ b/migrations/V010__chunk_aliases.sql
@@ -0,0 +1,39 @@
+-- V010__chunk_aliases.sql — alias channel for doc-side expansion (Phase 2) search.
+--
+-- Design spec: docs/superpowers/specs/2026-05-30-doc-side-expansion-design.md §4.
+-- Adds a nullable `aliases` column to chunks, a separate FTS5 table
+-- chunk_aliases_fts, and separate sync triggers. The existing chunks_fts and
+-- chunks_ai/ad/au (design §5.5 verbatim, CI test
+-- fts_v009_matches_design_section_5_5_verbatim) are left untouched.
+-- aliases is additive: when not generated / flag off it stays NULL, so
+-- chunk_aliases_fts starts empty and the second arm of the search UNION yields
+-- 0 rows → identical to the previous behaviour. No automatic backfill.
+
+ALTER TABLE chunks ADD COLUMN aliases TEXT;
+
+CREATE VIRTUAL TABLE chunk_aliases_fts USING fts5(
+    chunk_id UNINDEXED,
+    doc_id UNINDEXED,
+    aliases,
+    tokenize = 'unicode61'
+);
+
+-- chunk_aliases_fts only ever holds rows for chunks whose aliases is non-NULL,
+-- so the delete/update triggers are guarded the same way as the insert trigger
+-- and do no FTS work for chunks that have no aliases before or after the change.
+CREATE TRIGGER chunk_aliases_ai AFTER INSERT ON chunks WHEN new.aliases IS NOT NULL BEGIN
+    INSERT INTO chunk_aliases_fts(chunk_id, doc_id, aliases)
+    VALUES (new.chunk_id, new.doc_id, new.aliases);
+END;
+CREATE TRIGGER chunk_aliases_ad AFTER DELETE ON chunks WHEN old.aliases IS NOT NULL BEGIN
+    DELETE FROM chunk_aliases_fts WHERE chunk_id = old.chunk_id;
+END;
+CREATE TRIGGER chunk_aliases_au AFTER UPDATE ON chunks
+WHEN old.aliases IS NOT NULL OR new.aliases IS NOT NULL BEGIN
+    DELETE FROM chunk_aliases_fts WHERE chunk_id = old.chunk_id;
+    INSERT INTO chunk_aliases_fts(chunk_id, doc_id, aliases)
+    SELECT new.chunk_id, new.doc_id, new.aliases WHERE new.aliases IS NOT NULL;
+END;
+
+-- Invalidate the in-process LRU search cache (same pattern as V009).
+UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';