feat(kebab-store-sqlite): p9-fb-23 task 3 — V006 migration + put/get_document round-trip version stamps
Add V006__incremental_ingest.sql to persist last_chunker_version and last_embedding_version on the documents table. Wire both columns into upsert_document (INSERT + ON CONFLICT UPDATE) and get_document (SELECT + row mapper), replacing the previously hardcoded None. Add two round-trip tests in tests/incremental_ingest.rs covering the set and None cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -165,7 +165,8 @@ impl kebab_core::DocumentStore for SqliteStore {
|
|||||||
doc_id, asset_id, workspace_path, title, lang,
|
doc_id, asset_id, workspace_path, title, lang,
|
||||||
source_type, trust_level, parser_version,
|
source_type, trust_level, parser_version,
|
||||||
doc_version, schema_version, metadata_json,
|
doc_version, schema_version, metadata_json,
|
||||||
provenance_json, created_at, updated_at
|
provenance_json, created_at, updated_at,
|
||||||
|
last_chunker_version, last_embedding_version
|
||||||
FROM documents WHERE doc_id = ?",
|
FROM documents WHERE doc_id = ?",
|
||||||
params![id.0],
|
params![id.0],
|
||||||
document_row_from_sql,
|
document_row_from_sql,
|
||||||
@@ -221,8 +222,8 @@ impl kebab_core::DocumentStore for SqliteStore {
|
|||||||
// under that invariant.
|
// under that invariant.
|
||||||
schema_version: row.schema_version as u32,
|
schema_version: row.schema_version as u32,
|
||||||
doc_version: row.doc_version as u32,
|
doc_version: row.doc_version as u32,
|
||||||
last_chunker_version: None,
|
last_chunker_version: row.last_chunker_version.map(kebab_core::ChunkerVersion),
|
||||||
last_embedding_version: None,
|
last_embedding_version: row.last_embedding_version.map(kebab_core::EmbeddingVersion),
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -367,6 +368,8 @@ struct DocumentRow {
|
|||||||
provenance_json: String,
|
provenance_json: String,
|
||||||
// source_type / trust_level are loaded back via metadata_json round-trip,
|
// source_type / trust_level are loaded back via metadata_json round-trip,
|
||||||
// so we do not need separate fields here for `get_document`.
|
// so we do not need separate fields here for `get_document`.
|
||||||
|
last_chunker_version: Option<String>,
|
||||||
|
last_embedding_version: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn document_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<DocumentRow> {
|
fn document_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<DocumentRow> {
|
||||||
@@ -385,6 +388,10 @@ fn document_row_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result<DocumentRo
|
|||||||
schema_version: row.get(9)?,
|
schema_version: row.get(9)?,
|
||||||
metadata_json: row.get(10)?,
|
metadata_json: row.get(10)?,
|
||||||
provenance_json: row.get(11)?,
|
provenance_json: row.get(11)?,
|
||||||
|
// 12: created_at, 13: updated_at — not stored in DocumentRow
|
||||||
|
// (only needed for list_documents). Columns 14-15 follow.
|
||||||
|
last_chunker_version: row.get(14)?,
|
||||||
|
last_embedding_version: row.get(15)?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -505,24 +512,27 @@ fn upsert_document(
|
|||||||
doc_id, asset_id, workspace_path, title, lang,
|
doc_id, asset_id, workspace_path, title, lang,
|
||||||
source_type, trust_level, parser_version,
|
source_type, trust_level, parser_version,
|
||||||
doc_version, schema_version, metadata_json,
|
doc_version, schema_version, metadata_json,
|
||||||
provenance_json, created_at, updated_at
|
provenance_json, created_at, updated_at,
|
||||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
last_chunker_version, last_embedding_version
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON CONFLICT(doc_id) DO UPDATE SET
|
ON CONFLICT(doc_id) DO UPDATE SET
|
||||||
asset_id = excluded.asset_id,
|
asset_id = excluded.asset_id,
|
||||||
workspace_path = excluded.workspace_path,
|
workspace_path = excluded.workspace_path,
|
||||||
title = excluded.title,
|
title = excluded.title,
|
||||||
lang = excluded.lang,
|
lang = excluded.lang,
|
||||||
source_type = excluded.source_type,
|
source_type = excluded.source_type,
|
||||||
trust_level = excluded.trust_level,
|
trust_level = excluded.trust_level,
|
||||||
parser_version = excluded.parser_version,
|
parser_version = excluded.parser_version,
|
||||||
-- doc_version: bump on update. excluded.doc_version is the
|
-- doc_version: bump on update. excluded.doc_version is the
|
||||||
-- caller's submitted value; we ignore it and add 1 to the
|
-- caller's submitted value; we ignore it and add 1 to the
|
||||||
-- existing column so each re-ingest cleanly increments.
|
-- existing column so each re-ingest cleanly increments.
|
||||||
doc_version = documents.doc_version + 1,
|
doc_version = documents.doc_version + 1,
|
||||||
schema_version = excluded.schema_version,
|
schema_version = excluded.schema_version,
|
||||||
metadata_json = excluded.metadata_json,
|
metadata_json = excluded.metadata_json,
|
||||||
provenance_json = excluded.provenance_json,
|
provenance_json = excluded.provenance_json,
|
||||||
updated_at = excluded.updated_at",
|
updated_at = excluded.updated_at,
|
||||||
|
last_chunker_version = excluded.last_chunker_version,
|
||||||
|
last_embedding_version = excluded.last_embedding_version",
|
||||||
params![
|
params![
|
||||||
doc.doc_id.0,
|
doc.doc_id.0,
|
||||||
doc.source_asset_id.0,
|
doc.source_asset_id.0,
|
||||||
@@ -538,6 +548,8 @@ fn upsert_document(
|
|||||||
provenance_json,
|
provenance_json,
|
||||||
created_at,
|
created_at,
|
||||||
now,
|
now,
|
||||||
|
doc.last_chunker_version.as_ref().map(|v| v.0.as_str()),
|
||||||
|
doc.last_embedding_version.as_ref().map(|v| v.0.as_str()),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
.map_err(StoreError::from)?;
|
.map_err(StoreError::from)?;
|
||||||
|
|||||||
119
crates/kebab-store-sqlite/tests/incremental_ingest.rs
Normal file
119
crates/kebab-store-sqlite/tests/incremental_ingest.rs
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
//! Round-trip tests for `last_chunker_version` / `last_embedding_version`
|
||||||
|
//! columns added by the V006 migration (p9-fb-23 task 3).
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use kebab_core::{
|
||||||
|
AssetId, AssetStorage, Block, CanonicalDocument, Checksum, ChunkerVersion, CommonBlock,
|
||||||
|
DocumentId, DocumentStore, EmbeddingVersion, HeadingBlock, Lang, MediaType, Metadata,
|
||||||
|
ParserVersion, Provenance, RawAsset, SourceSpan, SourceType, SourceUri, TrustLevel,
|
||||||
|
WorkspacePath,
|
||||||
|
};
|
||||||
|
use kebab_store_sqlite::SqliteStore;
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
fn make_asset() -> RawAsset {
|
||||||
|
let bytes = b"incremental-ingest-test";
|
||||||
|
RawAsset {
|
||||||
|
asset_id: AssetId("f".repeat(32)),
|
||||||
|
source_uri: SourceUri::File(PathBuf::from("/tmp/inc.md")),
|
||||||
|
workspace_path: WorkspacePath::new("notes/inc.md".into()).unwrap(),
|
||||||
|
media_type: MediaType::Markdown,
|
||||||
|
byte_len: bytes.len() as u64,
|
||||||
|
checksum: Checksum(blake3::hash(bytes).to_hex().to_string()),
|
||||||
|
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||||
|
stored: AssetStorage::Reference {
|
||||||
|
path: PathBuf::from("/tmp/inc.md"),
|
||||||
|
sha: Checksum(blake3::hash(bytes).to_hex().to_string()),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_doc() -> CanonicalDocument {
|
||||||
|
let doc_id = DocumentId("d".repeat(32));
|
||||||
|
let block = Block::Heading(HeadingBlock {
|
||||||
|
common: CommonBlock {
|
||||||
|
block_id: kebab_core::BlockId("b".repeat(32)),
|
||||||
|
heading_path: vec![],
|
||||||
|
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||||
|
},
|
||||||
|
level: 1,
|
||||||
|
text: "Incremental Title".into(),
|
||||||
|
});
|
||||||
|
let metadata = Metadata {
|
||||||
|
aliases: vec![],
|
||||||
|
tags: vec![],
|
||||||
|
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||||
|
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||||
|
source_type: SourceType::Markdown,
|
||||||
|
trust_level: TrustLevel::Primary,
|
||||||
|
user_id_alias: None,
|
||||||
|
user: Default::default(),
|
||||||
|
};
|
||||||
|
CanonicalDocument {
|
||||||
|
doc_id,
|
||||||
|
source_asset_id: AssetId("f".repeat(32)),
|
||||||
|
workspace_path: WorkspacePath::new("notes/inc.md".into()).unwrap(),
|
||||||
|
title: "Incremental Title".into(),
|
||||||
|
lang: Lang("en".into()),
|
||||||
|
blocks: vec![block],
|
||||||
|
metadata,
|
||||||
|
provenance: Provenance { events: vec![] },
|
||||||
|
parser_version: ParserVersion("test-parser".into()),
|
||||||
|
schema_version: 1,
|
||||||
|
doc_version: 1,
|
||||||
|
last_chunker_version: None,
|
||||||
|
last_embedding_version: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Version stamps written by `put_document` must come back unchanged from
/// `get_document`.
#[test]
fn put_then_get_document_roundtrips_version_stamps() {
    let test_env = common::TestEnv::new();
    let store = SqliteStore::open(&test_env.config()).unwrap();
    store.run_migrations().unwrap();

    // Store the asset whose id the document carries as `source_asset_id`.
    store.put_asset(&make_asset()).unwrap();

    // Start from the stamp-less fixture and fill in both version stamps.
    let doc = CanonicalDocument {
        last_chunker_version: Some(ChunkerVersion("md-heading-v1".into())),
        last_embedding_version: Some(EmbeddingVersion("multilingual-e5-small@v1".into())),
        ..make_doc()
    };
    store.put_document(&doc).unwrap();

    let lookup = store.get_document(&doc.doc_id).unwrap();
    let fetched = lookup.expect("doc round-trips");

    assert_eq!(fetched.last_chunker_version, doc.last_chunker_version);
    assert_eq!(fetched.last_embedding_version, doc.last_embedding_version);
}
|
||||||
|
|
||||||
|
/// A document saved without version stamps must load back with both stamps
/// still `None`.
#[test]
fn put_then_get_document_roundtrips_none_stamps() {
    let test_env = common::TestEnv::new();
    let store = SqliteStore::open(&test_env.config()).unwrap();
    store.run_migrations().unwrap();

    // Store the asset whose id the document carries as `source_asset_id`.
    store.put_asset(&make_asset()).unwrap();

    // `make_doc` leaves both version stamps unset.
    let doc = make_doc();
    store.put_document(&doc).unwrap();

    let lookup = store.get_document(&doc.doc_id).unwrap();
    let fetched = lookup.expect("doc round-trips");

    assert!(
        fetched.last_chunker_version.is_none(),
        "last_chunker_version must be None when not set"
    );
    assert!(
        fetched.last_embedding_version.is_none(),
        "last_embedding_version must be None when not set"
    );
}
|
||||||
6
migrations/V006__incremental_ingest.sql
Normal file
6
migrations/V006__incremental_ingest.sql
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
-- p9-fb-23: incremental ingest needs to know which chunker / embedding
|
||||||
|
-- versions were used to populate this document so a re-ingest can
|
||||||
|
-- decide whether to skip (versions match) or re-process (any mismatch).
|
||||||
|
-- parser_version is already on documents from V001.
|
||||||
|
-- Both columns are plain TEXT with no NOT NULL or DEFAULT clause, so a
-- document written without a stamp stores NULL in that column.
ALTER TABLE documents ADD COLUMN last_chunker_version TEXT;
|
||||||
|
ALTER TABLE documents ADD COLUMN last_embedding_version TEXT;
|
||||||
Reference in New Issue
Block a user