feat(kebab-app): p9-fb-23 task 5 — stamp chunker + embedding versions on CanonicalDocument before put_document
All three ingest flows (markdown, image, pdf) now set last_chunker_version on the CanonicalDocument before calling put_document, and also set last_embedding_version when an embedder is configured (it stays None for lexical-only runs). This gives Task 7's skip detection the data it needs on the second run. No skip path is added yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -781,7 +781,7 @@ fn ingest_one_asset(
|
||||
.map(|w| format!("{:?}: {}", w.kind, w.note))
|
||||
.collect();
|
||||
|
||||
let canonical = build_canonical_document(
|
||||
let mut canonical = build_canonical_document(
|
||||
asset,
|
||||
metadata,
|
||||
parsed_blocks,
|
||||
@@ -794,6 +794,13 @@ fn ingest_one_asset(
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::MdHeadingV1Chunker::chunk")?;
|
||||
|
||||
// Stamp chunker + embedding versions so Task 7's skip detection has
|
||||
// data on the second run.
|
||||
canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version());
|
||||
if let Some(emb) = embedder {
|
||||
canonical.last_embedding_version = Some(emb.model_version());
|
||||
}
|
||||
|
||||
// Persist. Each `put_*` call wraps its own short transaction
|
||||
// (per-document tx semantics per design §5.8); composing them is
|
||||
// the kb-app job. A failure mid-way leaves the DB in a state the
|
||||
@@ -1030,6 +1037,12 @@ fn ingest_one_image_asset(
|
||||
.context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
|
||||
|
||||
// 5. Persist + embed — identical sequence to markdown.
|
||||
// Stamp chunker + embedding versions (image uses MdHeadingV1Chunker
|
||||
// for its single-block doc, so we record that version).
|
||||
canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version());
|
||||
if let Some(emb) = embedder {
|
||||
canonical.last_embedding_version = Some(emb.model_version());
|
||||
}
|
||||
purge_vector_orphans_for_workspace_path(app, asset, vector_store)?;
|
||||
app.sqlite
|
||||
.put_asset_with_bytes(asset, &bytes)
|
||||
@@ -1244,7 +1257,7 @@ fn ingest_one_pdf_asset(
|
||||
workspace_root: &workspace_root,
|
||||
config: &extract_config,
|
||||
};
|
||||
let canonical = PdfTextExtractor::new()
|
||||
let mut canonical = PdfTextExtractor::new()
|
||||
.extract(&ctx, &bytes)
|
||||
.context("kb-parse-pdf::PdfTextExtractor::extract")?;
|
||||
|
||||
@@ -1257,6 +1270,13 @@ fn ingest_one_pdf_asset(
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::PdfPageV1Chunker::chunk")?;
|
||||
|
||||
// Stamp chunker + embedding versions so Task 7's skip detection has
|
||||
// data on the second run.
|
||||
canonical.last_chunker_version = Some(chunker.chunker_version());
|
||||
if let Some(emb) = embedder {
|
||||
canonical.last_embedding_version = Some(emb.model_version());
|
||||
}
|
||||
|
||||
purge_vector_orphans_for_workspace_path(app, asset, vector_store)?;
|
||||
app.sqlite
|
||||
.put_asset_with_bytes(asset, &bytes)
|
||||
|
||||
@@ -218,3 +218,41 @@ fn inspect_chunk_not_found_returns_actionable_error() {
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("not found"), "got: {msg}");
|
||||
}
|
||||
|
||||
/// p9-fb-23 task 5: every freshly-ingested markdown doc must carry
|
||||
/// `last_chunker_version`. With `provider="none"` (lexical-only),
|
||||
/// `last_embedding_version` stays `None`.
|
||||
#[test]
|
||||
fn ingest_stamps_chunker_version_on_document() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
assert!(!docs.is_empty(), "no docs after ingest");
|
||||
|
||||
for doc_entry in &docs {
|
||||
let canonical =
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
|
||||
.unwrap();
|
||||
assert!(
|
||||
canonical.last_chunker_version.is_some(),
|
||||
"last_chunker_version must be stamped for doc {}: got {:?}",
|
||||
doc_entry.doc_id.0,
|
||||
canonical.last_chunker_version,
|
||||
);
|
||||
// provider="none" → embedder is None → last_embedding_version stays None.
|
||||
assert!(
|
||||
canonical.last_embedding_version.is_none(),
|
||||
"last_embedding_version must be None when provider=none for doc {}: got {:?}",
|
||||
doc_entry.doc_id.0,
|
||||
canonical.last_embedding_version,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user