diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index 766a128..d565a81 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -165,7 +165,8 @@ impl kebab_core::DocumentStore for SqliteStore { doc_id, asset_id, workspace_path, title, lang, source_type, trust_level, parser_version, doc_version, schema_version, metadata_json, - provenance_json, created_at, updated_at + provenance_json, created_at, updated_at, + last_chunker_version, last_embedding_version FROM documents WHERE doc_id = ?", params![id.0], document_row_from_sql, @@ -221,8 +222,8 @@ impl kebab_core::DocumentStore for SqliteStore { // under that invariant. schema_version: row.schema_version as u32, doc_version: row.doc_version as u32, - last_chunker_version: None, - last_embedding_version: None, + last_chunker_version: row.last_chunker_version.map(kebab_core::ChunkerVersion), + last_embedding_version: row.last_embedding_version.map(kebab_core::EmbeddingVersion), })) } @@ -367,6 +368,8 @@ struct DocumentRow { provenance_json: String, // source_type / trust_level are loaded back via metadata_json round-trip, // so we do not need separate fields here for `get_document`. 
+ last_chunker_version: Option, + last_embedding_version: Option, } fn document_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result { @@ -385,6 +388,10 @@ fn document_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result RawAsset { + let bytes = b"incremental-ingest-test"; + RawAsset { + asset_id: AssetId("f".repeat(32)), + source_uri: SourceUri::File(PathBuf::from("/tmp/inc.md")), + workspace_path: WorkspacePath::new("notes/inc.md".into()).unwrap(), + media_type: MediaType::Markdown, + byte_len: bytes.len() as u64, + checksum: Checksum(blake3::hash(bytes).to_hex().to_string()), + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/inc.md"), + sha: Checksum(blake3::hash(bytes).to_hex().to_string()), + }, + } +} + +fn make_doc() -> CanonicalDocument { + let doc_id = DocumentId("d".repeat(32)); + let block = Block::Heading(HeadingBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".repeat(32)), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + level: 1, + text: "Incremental Title".into(), + }); + let metadata = Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Markdown, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + }; + CanonicalDocument { + doc_id, + source_asset_id: AssetId("f".repeat(32)), + workspace_path: WorkspacePath::new("notes/inc.md".into()).unwrap(), + title: "Incremental Title".into(), + lang: Lang("en".into()), + blocks: vec![block], + metadata, + provenance: Provenance { events: vec![] }, + parser_version: ParserVersion("test-parser".into()), + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +#[test] +fn 
put_then_get_document_roundtrips_version_stamps() { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).unwrap(); + store.run_migrations().unwrap(); + + let asset = make_asset(); + store.put_asset(&asset).unwrap(); + + let mut doc = make_doc(); + doc.last_chunker_version = Some(ChunkerVersion("md-heading-v1".into())); + doc.last_embedding_version = Some(EmbeddingVersion("multilingual-e5-small@v1".into())); + + store.put_document(&doc).unwrap(); + let loaded = store + .get_document(&doc.doc_id) + .unwrap() + .expect("doc round-trips"); + + assert_eq!(loaded.last_chunker_version, doc.last_chunker_version); + assert_eq!(loaded.last_embedding_version, doc.last_embedding_version); +} + +#[test] +fn put_then_get_document_roundtrips_none_stamps() { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).unwrap(); + store.run_migrations().unwrap(); + + let asset = make_asset(); + store.put_asset(&asset).unwrap(); + + let doc = make_doc(); // both version stamps are None by default + store.put_document(&doc).unwrap(); + let loaded = store + .get_document(&doc.doc_id) + .unwrap() + .expect("doc round-trips"); + + assert!( + loaded.last_chunker_version.is_none(), + "last_chunker_version must be None when not set" + ); + assert!( + loaded.last_embedding_version.is_none(), + "last_embedding_version must be None when not set" + ); +} diff --git a/migrations/V006__incremental_ingest.sql b/migrations/V006__incremental_ingest.sql new file mode 100644 index 0000000..1a2a30e --- /dev/null +++ b/migrations/V006__incremental_ingest.sql @@ -0,0 +1,6 @@ +-- p9-fb-23: incremental ingest needs to know which chunker / embedding +-- versions were used to populate this document so a re-ingest can +-- decide whether to skip (versions match) or re-process (any mismatch). +-- parser_version is already on documents from V001. 
+ALTER TABLE documents ADD COLUMN last_chunker_version TEXT;
+ALTER TABLE documents ADD COLUMN last_embedding_version TEXT;