Files
kebab/crates/kebab-app/tests/ingest_lexical.rs
altair823 0e6d6073e7 feat(kebab-app): p9-fb-23 task 7 — early-skip Unchanged path in ingest
Adds the per-asset incremental-ingest skip block to all three flows
(markdown / image / pdf). When `IngestOpts::force_reingest = false`
AND the asset's blake3 checksum + parser/chunker/embedding versions
all match the existing DB record, ingest emits
`AssetFinished { result: Unchanged }`, bumps `aggregate.unchanged`,
and skips parse / chunk / embed / vector upsert entirely.

Shared `try_skip_unchanged` helper performs the four checks; per-flow
callers supply the active parser_version + chunker_version + optional
embedding_version. `force_reingest = true` bypasses the skip path so
`incremental_ingest::force_reingest_bypasses_skip` still sees `Updated`.

Tests:
- new `incremental_ingest.rs` covers both paths.
- existing `ingest_idempotent_on_second_run` /
  `re_ingest_image_produces_*` / `re_ingest_identical_pdf_produces_*`
  updated to assert `Unchanged` on identical-bytes re-ingest (the
  pre-task behaviour was `Updated`).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 18:12:47 +00:00

285 lines
10 KiB
Rust

//! Integration tests for `kebab_app::ingest` + `list_docs` + `inspect_*`
//! along the lexical-only path (no embeddings → no AVX requirement).
mod common;
use common::TestEnv;
/// Ingests the fixture corpus once, then checks that `list_docs` and
/// `inspect_doc` both see what the ingest report claims was stored.
#[test]
fn ingest_then_list_inspects_round_trip() {
    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    // Three markdown files live in the fixture, and on a first ingest
    // every one of them is expected to be classified as New.
    assert_eq!(report.scanned, 3, "scanned: {report:?}");
    assert_eq!(report.new, 3, "new: {report:?}");
    assert_eq!(report.updated, 0, "updated: {report:?}");
    assert_eq!(report.errors, 0, "errors: {report:?}");

    let items = report.items.as_ref().expect("items present");
    assert_eq!(items.len(), 3);
    for it in items {
        assert!(it.error.is_none(), "per-item error: {it:?}");
        assert!(it.doc_id.is_some());
        // Every fixture file is expected to produce at least one chunk;
        // a missing count is treated as zero and therefore fails.
        let chunk_total = it.chunk_count.unwrap_or(0);
        assert!(chunk_total >= 1, "chunks: {it:?}");
    }

    // Listing with the default (empty) filter must surface all three docs.
    let unfiltered = kebab_core::DocFilter::default();
    let docs = kebab_app::list_docs_with_config(env.config.clone(), unfiltered).unwrap();
    assert_eq!(docs.len(), 3, "docs: {docs:?}");

    // Inspecting any one listed doc must return that same doc, with content.
    let first_listed = &docs[0];
    let any_doc_id = first_listed.doc_id.clone();
    let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
    assert_eq!(canonical.doc_id, any_doc_id);
    assert!(!canonical.blocks.is_empty(), "blocks empty");
}
/// Ingesting the same, unmodified fixture twice must not create new or
/// updated documents on the second pass.
#[test]
fn ingest_idempotent_on_second_run() {
    let env = TestEnv::lexical_only();
    // Both passes use exactly the same config, scope and flags.
    let run_ingest =
        || kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    let r1 = run_ingest();
    assert_eq!(r1.new, 3);

    let r2 = run_ingest();
    // p9-fb-23 task 7 added an early-skip path: if the checksum and the
    // parser/chunker/embedding versions all match the stored record, the
    // asset is reported as `Unchanged`. Before that task this second run
    // reported `Updated`. The `force_reingest=true` path still yields
    // `Updated`; that case is covered in `incremental_ingest.rs`.
    assert_eq!(r2.scanned, 3, "second scan: {r2:?}");
    assert_eq!(r2.new, 0, "second run new should be 0: {r2:?}");
    assert_eq!(r2.updated, 0, "second run updated: {r2:?}");
    assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");

    // The store must still hold exactly three docs, i.e. no duplicates.
    let unfiltered = kebab_core::DocFilter::default();
    let docs = kebab_app::list_docs_with_config(env.config.clone(), unfiltered).unwrap();
    assert_eq!(docs.len(), 3);
}
/// With the summary-only flag set, the report keeps its aggregate counts
/// but carries no per-item list.
#[test]
fn ingest_summary_only_drops_items() {
    let env = TestEnv::lexical_only();
    let summary_only = true;
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), summary_only).unwrap();

    assert_eq!(report.scanned, 3);
    assert!(report.items.is_none(), "summary-only should null items");
}
/// Every ingest run must leave a row in `ingest_runs` whose count columns
/// are populated, even when the run was summary-only.
#[test]
fn ingest_records_ingest_runs_row_with_aggregate_counts() {
    // `ingest_runs` is the §5.7 sibling of `jobs`: it has dedicated count
    // columns (`scanned`, `new_count`, …) that are filled in at the end of
    // each run. A `summary_only=true` run stores `items_json=NULL`, yet the
    // counts MUST still be written.
    type RunRow = (i64, i64, i64, i64, i64, Option<String>);

    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    assert_eq!(report.scanned, 3);

    // Read the row back straight from the SQLite file under the data dir.
    let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
    let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");

    // Most recent run only. Continuation lines are flush-left so that no
    // indentation ends up inside the SQL text.
    let latest_run_sql = "SELECT scanned, new_count, updated_count, skipped_count,
error_count, items_json
FROM ingest_runs
ORDER BY started_at DESC
LIMIT 1";
    let latest_run: RunRow = conn
        .query_row(latest_run_sql, [], |row| {
            Ok((
                row.get(0)?,
                row.get(1)?,
                row.get(2)?,
                row.get(3)?,
                row.get(4)?,
                row.get(5)?,
            ))
        })
        .expect("ingest_runs row present");
    let (scanned_count, new_count, updated_count, skipped_count, error_count, items_json) =
        latest_run;

    assert_eq!(scanned_count, 3);
    assert_eq!(new_count, 3);
    assert_eq!(updated_count, 0);
    assert_eq!(skipped_count, 0);
    assert_eq!(error_count, 0);
    assert!(
        items_json.is_none(),
        "summary_only=true must store items_json=NULL: {items_json:?}"
    );
}
/// A lexical-only ingest (`provider="none"`) must not leave any LanceDB
/// table on disk.
#[test]
fn ingest_provider_none_skips_lance() {
    // With `provider="none"` the embedder and vector store are expected to
    // be skipped entirely, so ingest MUST NOT create LanceDB data.
    // `IngestReport` currently has no `embeddings_indexed` field, so the
    // check is made against the on-disk layout instead: either there is no
    // `<data_dir>/lancedb` directory, or it holds no `*.lance` entries.
    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert_eq!(report.errors, 0, "lexical-only run must not error");
    assert_eq!(report.new, 3);

    let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
    if !lance_dir.exists() {
        // No directory at all: nothing further to verify.
        return;
    }

    // The directory may exist (e.g. an earlier consumer touched the path),
    // but it MUST NOT contain any entry with a `.lance` extension. The scan
    // stops at the first such entry.
    let had_lance_table = std::fs::read_dir(&lance_dir)
        .expect("read lance_dir")
        .any(|entry| {
            let entry_path = entry.unwrap().path();
            entry_path.extension().and_then(|ext| ext.to_str()) == Some("lance")
        });
    assert!(
        !had_lance_table,
        "provider=none must not produce any *.lance table under {}",
        lance_dir.display()
    );
}
/// `DocFilter::tags_any` must narrow `list_docs` to documents carrying the
/// requested tag.
#[test]
fn list_docs_filters_by_tags_any() {
    let env = TestEnv::lexical_only();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();

    // Builds a filter that sets only `tags_any`; all other fields stay at
    // their defaults.
    let filter_for_tag = |tag: &str| kebab_core::DocFilter {
        tags_any: vec![tag.to_string()],
        ..Default::default()
    };

    let docs =
        kebab_app::list_docs_with_config(env.config.clone(), filter_for_tag("python")).unwrap();
    assert_eq!(docs.len(), 1, "expected only the python doc: {docs:?}");
    assert!(docs[0].tags.contains(&"python".to_string()));

    let rust_docs =
        kebab_app::list_docs_with_config(env.config.clone(), filter_for_tag("rust")).unwrap();
    // Two fixture files are tagged "rust": intro.md and notes/cargo.md.
    assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
}
/// Inspecting an unknown document id must fail with a message that says
/// the doc was not found and points the user towards listing docs.
#[test]
fn inspect_doc_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();
    // Nothing has been ingested into this environment, so this all-zero
    // id cannot resolve to a document.
    let bogus = kebab_core::DocumentId(
        "0000000000000000000000000000000000000000000000000000000000000000".to_string(),
    );

    let lookup = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus);
    let err = lookup.unwrap_err();
    // `{:#}` is used so the full error text, not just the top-level
    // message, is searched below.
    let msg = format!("{err:#}");

    let mentions_not_found = msg.contains("not found");
    assert!(mentions_not_found, "error must mention not-found: {msg}");

    let hints_at_listing = msg.contains("kb list docs") || msg.contains("list");
    assert!(
        hints_at_listing,
        "error must hint at `kb list docs`: {msg}"
    );
}
/// Inspecting an unknown chunk id must fail with a "not found" message.
#[test]
fn inspect_chunk_not_found_returns_actionable_error() {
    let env = TestEnv::lexical_only();
    // Nothing has been ingested into this environment, so this all-zero
    // id cannot resolve to a chunk.
    let zero_id = "0000000000000000000000000000000000000000000000000000000000000000".to_string();
    let bogus = kebab_core::ChunkId(zero_id);

    let lookup = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus);
    let err = lookup.unwrap_err();
    let msg = format!("{err:#}");
    assert!(msg.contains("not found"), "got: {msg}");
}
/// p9-fb-23 task 6: calling `ingest_with_config_opts` with
/// `IngestOpts::default()` must match plain `ingest_with_config` on a
/// first ingest — assets come back as new, with no errors and nothing
/// reported as unchanged.
#[test]
fn ingest_with_config_opts_default_matches_legacy_behaviour() {
    let env = TestEnv::lexical_only();

    let outcome = kebab_app::ingest_with_config_opts(
        env.config.clone(),
        env.scope(),
        false,
        kebab_app::IngestOpts::default(),
    );
    let report = outcome.unwrap();

    assert!(report.new >= 1, "expected at least one new doc: {report:?}");
    assert_eq!(report.errors, 0, "no errors expected: {report:?}");
    // Nothing was stored before this run, so nothing can be Unchanged.
    assert_eq!(
        report.unchanged, 0,
        "first ingest cannot have unchanged: {report:?}"
    );
}
/// p9-fb-23 task 5: each freshly ingested markdown document must have
/// `last_chunker_version` set. In the lexical-only setup
/// (`provider="none"`) `last_embedding_version` must remain `None`.
#[test]
fn ingest_stamps_chunker_version_on_document() {
    let env = TestEnv::lexical_only();
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
    assert!(report.new >= 1, "expected at least one new doc: {report:?}");
    assert_eq!(report.errors, 0, "no errors expected: {report:?}");

    let unfiltered = kebab_core::DocFilter::default();
    let docs = kebab_app::list_docs_with_config(env.config.clone(), unfiltered).unwrap();
    assert!(!docs.is_empty(), "no docs after ingest");

    // Check the version stamps on every listed document, not just one.
    for listed in &docs {
        let doc_id = &listed.doc_id;
        let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), doc_id).unwrap();

        assert!(
            canonical.last_chunker_version.is_some(),
            "last_chunker_version must be stamped for doc {}: got {:?}",
            doc_id.0,
            canonical.last_chunker_version,
        );
        // With provider="none" there is no embedder, so no embedding
        // version may be recorded.
        assert!(
            canonical.last_embedding_version.is_none(),
            "last_embedding_version must be None when provider=none for doc {}: got {:?}",
            doc_id.0,
            canonical.last_embedding_version,
        );
    }
}