feat(kebab-app): p9-fb-23 task 5 — stamp chunker + embedding versions on CanonicalDocument before put_document

All three ingest flows (markdown, image, pdf) now set
last_chunker_version and last_embedding_version on the CanonicalDocument
before calling put_document, giving Task 7's skip detection the data it
needs on the second run. No skip path is added yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 18:01:48 +00:00
parent 366e89e5e2
commit a16e9c9215
2 changed files with 60 additions and 2 deletions

View File

@@ -781,7 +781,7 @@ fn ingest_one_asset(
.map(|w| format!("{:?}: {}", w.kind, w.note)) .map(|w| format!("{:?}: {}", w.kind, w.note))
.collect(); .collect();
let canonical = build_canonical_document( let mut canonical = build_canonical_document(
asset, asset,
metadata, metadata,
parsed_blocks, parsed_blocks,
@@ -794,6 +794,13 @@ fn ingest_one_asset(
.chunk(&canonical, chunk_policy) .chunk(&canonical, chunk_policy)
.context("kb-chunk::MdHeadingV1Chunker::chunk")?; .context("kb-chunk::MdHeadingV1Chunker::chunk")?;
// Stamp chunker + embedding versions so Task 7's skip detection has
// data on the second run.
canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version());
if let Some(emb) = embedder {
canonical.last_embedding_version = Some(emb.model_version());
}
// Persist. Each `put_*` call wraps its own short transaction // Persist. Each `put_*` call wraps its own short transaction
// (per-document tx semantics per design §5.8); composing them is // (per-document tx semantics per design §5.8); composing them is
// the kb-app job. A failure mid-way leaves the DB in a state the // the kb-app job. A failure mid-way leaves the DB in a state the
@@ -1030,6 +1037,12 @@ fn ingest_one_image_asset(
.context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?; .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
// 5. Persist + embed — identical sequence to markdown. // 5. Persist + embed — identical sequence to markdown.
// Stamp chunker + embedding versions (image uses MdHeadingV1Chunker
// for its single-block doc, so we record that version).
canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version());
if let Some(emb) = embedder {
canonical.last_embedding_version = Some(emb.model_version());
}
purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; purge_vector_orphans_for_workspace_path(app, asset, vector_store)?;
app.sqlite app.sqlite
.put_asset_with_bytes(asset, &bytes) .put_asset_with_bytes(asset, &bytes)
@@ -1244,7 +1257,7 @@ fn ingest_one_pdf_asset(
workspace_root: &workspace_root, workspace_root: &workspace_root,
config: &extract_config, config: &extract_config,
}; };
let canonical = PdfTextExtractor::new() let mut canonical = PdfTextExtractor::new()
.extract(&ctx, &bytes) .extract(&ctx, &bytes)
.context("kb-parse-pdf::PdfTextExtractor::extract")?; .context("kb-parse-pdf::PdfTextExtractor::extract")?;
@@ -1257,6 +1270,13 @@ fn ingest_one_pdf_asset(
.chunk(&canonical, chunk_policy) .chunk(&canonical, chunk_policy)
.context("kb-chunk::PdfPageV1Chunker::chunk")?; .context("kb-chunk::PdfPageV1Chunker::chunk")?;
// Stamp chunker + embedding versions so Task 7's skip detection has
// data on the second run.
canonical.last_chunker_version = Some(chunker.chunker_version());
if let Some(emb) = embedder {
canonical.last_embedding_version = Some(emb.model_version());
}
purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; purge_vector_orphans_for_workspace_path(app, asset, vector_store)?;
app.sqlite app.sqlite
.put_asset_with_bytes(asset, &bytes) .put_asset_with_bytes(asset, &bytes)

View File

@@ -218,3 +218,41 @@ fn inspect_chunk_not_found_returns_actionable_error() {
let msg = format!("{err:#}"); let msg = format!("{err:#}");
assert!(msg.contains("not found"), "got: {msg}"); assert!(msg.contains("not found"), "got: {msg}");
} }
/// p9-fb-23 task 5: after a fresh ingest, each listed document must have
/// `last_chunker_version` populated. The environment is lexical-only
/// (`provider="none"`), so `last_embedding_version` is expected to remain
/// `None` for every document.
#[test]
fn ingest_stamps_chunker_version_on_document() {
    let fixture = TestEnv::lexical_only();
    let config = &fixture.config;

    // Run the ingest and make sure it actually produced documents cleanly.
    let report = kebab_app::ingest_with_config(config.clone(), fixture.scope(), false).unwrap();
    assert!(report.new >= 1, "expected at least one new doc: {report:?}");
    assert_eq!(report.errors, 0, "no errors expected: {report:?}");

    let listed =
        kebab_app::list_docs_with_config(config.clone(), kebab_core::DocFilter::default())
            .unwrap();
    assert!(!listed.is_empty(), "no docs after ingest");

    // Re-read every listed document and check both version stamps.
    for entry in &listed {
        let stored = kebab_app::inspect_doc_with_config(config.clone(), &entry.doc_id).unwrap();

        let chunker_version = &stored.last_chunker_version;
        assert!(
            chunker_version.is_some(),
            "last_chunker_version must be stamped for doc {}: got {:?}",
            entry.doc_id.0,
            chunker_version,
        );

        // provider="none" means no embedder is available, so nothing
        // should have written an embedding version.
        let embedding_version = &stored.last_embedding_version;
        assert!(
            embedding_version.is_none(),
            "last_embedding_version must be None when provider=none for doc {}: got {:?}",
            entry.doc_id.0,
            embedding_version,
        );
    }
}