From a16e9c9215a387f3b996503a294dac3e7edf519c Mon Sep 17 00:00:00 2001 From: altair823 Date: Mon, 4 May 2026 18:01:48 +0000 Subject: [PATCH] =?UTF-8?q?feat(kebab-app):=20p9-fb-23=20task=205=20?= =?UTF-8?q?=E2=80=94=20stamp=20chunker=20+=20embedding=20versions=20on=20C?= =?UTF-8?q?anonicalDocument=20before=20put=5Fdocument?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All three ingest flows (markdown, image, pdf) now set last_chunker_version and last_embedding_version on the CanonicalDocument before calling put_document, giving Task 7's skip detection the data it needs on the second run. No skip path is added yet. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 24 +++++++++++++-- crates/kebab-app/tests/ingest_lexical.rs | 38 ++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 4220af9..7993b79 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -781,7 +781,7 @@ fn ingest_one_asset( .map(|w| format!("{:?}: {}", w.kind, w.note)) .collect(); - let canonical = build_canonical_document( + let mut canonical = build_canonical_document( asset, metadata, parsed_blocks, @@ -794,6 +794,13 @@ fn ingest_one_asset( .chunk(&canonical, chunk_policy) .context("kb-chunk::MdHeadingV1Chunker::chunk")?; + // Stamp chunker + embedding versions so Task 7's skip detection has + // data on the second run. + canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version()); + if let Some(emb) = embedder { + canonical.last_embedding_version = Some(emb.model_version()); + } + // Persist. Each `put_*` call wraps its own short transaction // (per-document tx semantics per design §5.8); composing them is // the kb-app job. A failure mid-way leaves the DB in a state the @@ -1030,6 +1037,12 @@ fn ingest_one_image_asset( .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?; // 5. 
Persist + embed — identical sequence to markdown. + // Stamp chunker + embedding versions (image uses MdHeadingV1Chunker + // for its single-block doc, so we record that version). + canonical.last_chunker_version = Some(MdHeadingV1Chunker.chunker_version()); + if let Some(emb) = embedder { + canonical.last_embedding_version = Some(emb.model_version()); + } purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; app.sqlite .put_asset_with_bytes(asset, &bytes) @@ -1244,7 +1257,7 @@ fn ingest_one_pdf_asset( workspace_root: &workspace_root, config: &extract_config, }; - let canonical = PdfTextExtractor::new() + let mut canonical = PdfTextExtractor::new() .extract(&ctx, &bytes) .context("kb-parse-pdf::PdfTextExtractor::extract")?; @@ -1257,6 +1270,13 @@ fn ingest_one_pdf_asset( .chunk(&canonical, chunk_policy) .context("kb-chunk::PdfPageV1Chunker::chunk")?; + // Stamp chunker + embedding versions so Task 7's skip detection has + // data on the second run. + canonical.last_chunker_version = Some(chunker.chunker_version()); + if let Some(emb) = embedder { + canonical.last_embedding_version = Some(emb.model_version()); + } + purge_vector_orphans_for_workspace_path(app, asset, vector_store)?; app.sqlite .put_asset_with_bytes(asset, &bytes) diff --git a/crates/kebab-app/tests/ingest_lexical.rs b/crates/kebab-app/tests/ingest_lexical.rs index 9999552..9eb9973 100644 --- a/crates/kebab-app/tests/ingest_lexical.rs +++ b/crates/kebab-app/tests/ingest_lexical.rs @@ -218,3 +218,41 @@ fn inspect_chunk_not_found_returns_actionable_error() { let msg = format!("{err:#}"); assert!(msg.contains("not found"), "got: {msg}"); } + +/// p9-fb-23 task 5: every freshly-ingested markdown doc must carry +/// `last_chunker_version`. With `provider="none"` (lexical-only), +/// `last_embedding_version` stays `None`. 
+#[test]
+fn ingest_stamps_chunker_version_on_document() {
+    // Lexical-only fixture; the config is cloned for every kebab_app call below.
+    let env = TestEnv::lexical_only();
+    let config = &env.config;
+    let report = kebab_app::ingest_with_config(config.clone(), env.scope(), false).unwrap();
+    assert!(report.new >= 1, "expected at least one new doc: {report:?}");
+    assert_eq!(report.errors, 0, "no errors expected: {report:?}");
+
+    let filter = kebab_core::DocFilter::default();
+    let listed = kebab_app::list_docs_with_config(config.clone(), filter).unwrap();
+    assert!(!listed.is_empty(), "no docs after ingest");
+
+    // Re-read every listed document and check both version stamps on it.
+    for entry in &listed {
+        let id = &entry.doc_id;
+        let stored = kebab_app::inspect_doc_with_config(config.clone(), id).unwrap();
+        let chunker_stamp = &stored.last_chunker_version;
+        let embedding_stamp = &stored.last_embedding_version;
+        assert!(
+            chunker_stamp.is_some(),
+            "last_chunker_version must be stamped for doc {}: got {:?}",
+            id.0,
+            chunker_stamp,
+        );
+        // The lexical-only fixture is expected to leave the embedding stamp unset.
+        assert!(
+            embedding_stamp.is_none(),
+            "last_embedding_version must be None when provider=none for doc {}: got {:?}",
+            id.0,
+            embedding_stamp,
+        );
+    }
+}