Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| acf8cf3be2 | |||
| ea5f7b22c8 | |||
| 5497c6e7b5 | |||
| 5a90940f1c | |||
| 4389b887f0 | |||
| 360f825f3a | |||
| 641b92af7d | |||
| 08fb743598 | |||
| 0a2a7ae214 | |||
| 803d02b68b | |||
| 4e8b84c4e0 | |||
| 16dc02cfa2 | |||
| 74f1b0571b | |||
| 918ee6c0be |
47
Cargo.lock
generated
47
Cargo.lock
generated
@@ -4127,7 +4127,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -4172,7 +4172,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-chunk"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4187,7 +4187,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-cli"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -4208,7 +4208,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-config"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs 5.0.1",
|
||||
@@ -4223,7 +4223,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-core"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4237,7 +4237,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4251,7 +4251,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-local"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fastembed",
|
||||
@@ -4264,7 +4264,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-eval"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4283,7 +4283,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4292,7 +4292,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm-local"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4309,7 +4309,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-mcp"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4327,7 +4327,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-normalize"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4342,7 +4342,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-code"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"gix",
|
||||
@@ -4360,7 +4360,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
@@ -4384,7 +4384,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4401,7 +4401,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4414,7 +4414,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-types"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"kebab-core",
|
||||
"serde",
|
||||
@@ -4422,7 +4422,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-rag"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4443,7 +4443,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-search"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
@@ -4462,10 +4462,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-source-fs"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"globset",
|
||||
"ignore",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
@@ -4480,7 +4481,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-sqlite"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4501,7 +4502,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-vector"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrow",
|
||||
@@ -4525,7 +4526,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-tui"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossterm",
|
||||
|
||||
@@ -31,7 +31,7 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
|
||||
@@ -748,15 +748,18 @@ struct ImagePipeline<'a> {
|
||||
/// hold (per design §9 cascade rule):
|
||||
///
|
||||
/// 1. `force_reingest == false` — caller hasn't asked to bypass skip.
|
||||
/// 2. The freshly-scanned asset's blake3 checksum equals what the
|
||||
/// existing `assets` row stores at the same `workspace_path`.
|
||||
/// 3. The doc keyed on `(workspace_path, asset_id, current_parser_version)`
|
||||
/// exists. If the parser_version changed, `id_for_doc` produces a
|
||||
/// different `doc_id` so the lookup misses → no skip → re-process.
|
||||
/// 4. The existing doc's stamped `last_chunker_version` AND
|
||||
/// `last_embedding_version` match the values the caller is about
|
||||
/// to use (`Some(v) == Some(v)` and `None == None` — see design
|
||||
/// doc for the `None == None` rule when no embedder is configured).
|
||||
/// 2. A document already exists at this `workspace_path`
|
||||
/// (`get_document_by_workspace_path`). The lookup is document-side, not
|
||||
/// asset-side, so twin files (identical content at different paths) each
|
||||
/// hit their own stable doc row — `documents.workspace_path` is UNIQUE
|
||||
/// while `assets` may dedupe content into a single row with a flip-flop
|
||||
/// `workspace_path` column (dogfood bug #4, see `tasks/HOTFIXES.md`).
|
||||
/// 3. The existing doc's `source_asset_id` equals the freshly-scanned
|
||||
/// asset's blake3 checksum (content unchanged).
|
||||
/// 4. The existing doc's `parser_version` matches the current extractor's
|
||||
/// `parser_version` (extractor not upgraded). Combined with `last_chunker_version`
|
||||
/// and `last_embedding_version` checks immediately below — full cascade
|
||||
/// per design §9.
|
||||
///
|
||||
/// Returns `Ok(None)` (proceed with full re-process) when any check
|
||||
/// fails or any DB read errors out — the skip path is opportunistic;
|
||||
@@ -773,31 +776,19 @@ fn try_skip_unchanged(
|
||||
if force_reingest {
|
||||
return Ok(None);
|
||||
}
|
||||
let existing_asset = match app
|
||||
// Document-centric skip: look up the existing document row by
|
||||
// workspace_path directly. This avoids the twin-file flip-flop
|
||||
// that the old asset-side lookup suffers from — multiple files
|
||||
// with identical content share one `assets` row whose
|
||||
// `workspace_path` is overwritten on every UPSERT, so
|
||||
// `get_asset_by_workspace_path(path1)` could return the OTHER
|
||||
// twin's path (or None) after any ingest of the twin. The
|
||||
// `documents` table has a UNIQUE index on `workspace_path` (V001),
|
||||
// so each twin has its own stable row regardless of asset de-dup.
|
||||
let existing_doc = match app
|
||||
.sqlite
|
||||
.get_asset_by_workspace_path(&asset.workspace_path)
|
||||
.get_document_by_workspace_path(&asset.workspace_path)
|
||||
{
|
||||
Ok(Some(a)) => a,
|
||||
Ok(None) => return Ok(None),
|
||||
Err(e) => {
|
||||
tracing::debug!(
|
||||
target: "kebab-app",
|
||||
path = %asset.workspace_path.0,
|
||||
error = %e,
|
||||
"skip-check: get_asset_by_workspace_path failed; falling through to re-process"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
if existing_asset.checksum != asset.checksum {
|
||||
return Ok(None);
|
||||
}
|
||||
let candidate_doc_id = kebab_core::id_for_doc(
|
||||
&asset.workspace_path,
|
||||
&asset.asset_id,
|
||||
current_parser_version,
|
||||
);
|
||||
let existing_doc = match app.sqlite.get_document(&candidate_doc_id) {
|
||||
Ok(Some(d)) => d,
|
||||
Ok(None) => return Ok(None),
|
||||
Err(e) => {
|
||||
@@ -805,21 +796,37 @@ fn try_skip_unchanged(
|
||||
target: "kebab-app",
|
||||
path = %asset.workspace_path.0,
|
||||
error = %e,
|
||||
"skip-check: get_document failed; falling through to re-process"
|
||||
"skip-check: get_document_by_workspace_path failed; falling through to re-process"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
// 1. Content unchanged: the freshly-computed asset_id (blake3
|
||||
// content hash) must match what this document was ingested from.
|
||||
if existing_doc.source_asset_id != asset.asset_id {
|
||||
return Ok(None);
|
||||
}
|
||||
// 2. Parser unchanged: the row above was found by workspace_path, not
|
||||
// by doc_id, so a parser_version bump no longer shows up as a lookup
|
||||
// miss (the old `get_document(id_for_doc(..))` path got that for
|
||||
// free). This explicit comparison is what forces a re-process after
|
||||
// an extractor upgrade.
|
||||
if existing_doc.parser_version != *current_parser_version {
|
||||
return Ok(None);
|
||||
}
|
||||
// 3. Chunker unchanged.
|
||||
let chunker_match = existing_doc.last_chunker_version.as_ref()
|
||||
== Some(current_chunker_version);
|
||||
if !chunker_match {
|
||||
return Ok(None);
|
||||
}
|
||||
// 4. Embedder unchanged.
|
||||
let embedder_match = existing_doc.last_embedding_version.as_ref()
|
||||
== current_embedding_version;
|
||||
if !embedder_match {
|
||||
return Ok(None);
|
||||
}
|
||||
let candidate_doc_id = existing_doc.doc_id.clone();
|
||||
tracing::debug!(
|
||||
target: "kebab-app::ingest",
|
||||
path = %asset.workspace_path.0,
|
||||
|
||||
@@ -168,7 +168,9 @@ fn collect_stats(
|
||||
stale_doc_count: counts.stale_doc_count,
|
||||
// p10-1A-2: populated by the store query added in this task.
|
||||
code_lang_breakdown: store.code_lang_breakdown()?,
|
||||
repo_breakdown: std::collections::BTreeMap::new(),
|
||||
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
|
||||
// placeholder — mirror of code_lang_breakdown for the repo field.
|
||||
repo_breakdown: store.repo_breakdown()?,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
90
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
90
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
//! Regression test for the twin-file idempotency bug.
|
||||
//!
|
||||
//! Identical-content files at different workspace paths share one
|
||||
//! `assets` row (`asset_id` = blake3 content hash, PRIMARY KEY). The
|
||||
//! old UPSERT `ON CONFLICT(asset_id) DO UPDATE SET workspace_path =
|
||||
//! excluded.workspace_path` made each twin overwrite the other's path
|
||||
//! on every ingest, so `get_asset_by_workspace_path(path1)` returned
|
||||
//! None (or the wrong twin) → re-process every time.
|
||||
//!
|
||||
//! Fix: `try_skip_unchanged` now uses `get_document_by_workspace_path`
|
||||
//! instead. `documents.workspace_path` is UNIQUE (V001) so each twin
|
||||
//! has its own stable document row.
|
||||
//!
|
||||
//! Assertion contract:
|
||||
//! 1st ingest → 2 New (one per twin)
|
||||
//! 2nd ingest → 0 New, 0 Updated, 2 Unchanged
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
/// Regression test: two byte-identical files at different workspace paths
/// are ingested twice; the first pass must report both as `New`, the second
/// pass must report zero new/updated docs and both twins as `Unchanged`.
#[test]
|
||||
fn twin_files_second_ingest_is_unchanged() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write two files with identical content at different paths.
|
||||
let pkg_a = env.workspace_root.join("pkg_a");
|
||||
let pkg_b = env.workspace_root.join("pkg_b");
|
||||
std::fs::create_dir_all(&pkg_a).unwrap();
|
||||
std::fs::create_dir_all(&pkg_b).unwrap();
|
||||
|
||||
// Same bytes for both twins, so both files hash to the same content id.
let content = b"# shared\nThis content is identical in both files.\n";
|
||||
std::fs::write(pkg_a.join("__init__.py"), content).unwrap();
|
||||
std::fs::write(pkg_b.join("__init__.py"), content).unwrap();
|
||||
|
||||
// First ingest — both files must be New.
|
||||
// NOTE(review): the trailing `false` is presumably `force_reingest` — confirm
// against `ingest_with_config`'s signature.
let first = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest must succeed");
|
||||
assert_eq!(first.errors, 0, "first ingest: no errors; report={first:?}");
|
||||
|
||||
// `items` is optional on the report; this test requires it to be populated.
let items = first.items.as_ref().expect("items must be present");
|
||||
// Narrow to the two twins so any other reported items do not affect the counts.
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("__init__.py")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
2,
|
||||
"first ingest: expected exactly 2 __init__.py items; items={items:?}"
|
||||
);
|
||||
for item in &twin_items {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::New,
|
||||
"first ingest: each twin must be New; item={item:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Second ingest — same files, same content → both must be Unchanged.
|
||||
let second = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}");
|
||||
assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}");
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
|
||||
);
|
||||
|
||||
// Aggregate counters alone are not enough: also check each twin's own item kind.
let second_items = second.items.as_ref().expect("items must be present");
|
||||
let twin_items2: Vec<_> = second_items
|
||||
.iter()
|
||||
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items2.len(),
|
||||
2,
|
||||
"second ingest: expected exactly 2 __init__.py items; items={second_items:?}"
|
||||
);
|
||||
for item in &twin_items2 {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::Unchanged,
|
||||
"second ingest: each twin must be Unchanged; item={item:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -169,6 +169,20 @@ pub trait DocumentStore {
|
||||
&self,
|
||||
path: &WorkspacePath,
|
||||
) -> anyhow::Result<Option<RawAsset>>;
|
||||
|
||||
/// Look up a document row by its workspace path. Used by the
|
||||
/// document-centric skip path in `try_skip_unchanged` to avoid the
|
||||
/// twin-file flip-flop that the asset-side lookup suffers from
|
||||
/// (multiple files with identical content share one `assets` row
|
||||
/// whose `workspace_path` is overwritten on every UPSERT, so
|
||||
/// `get_asset_by_workspace_path` returns the wrong twin's path).
|
||||
///
|
||||
/// `documents.workspace_path` is UNIQUE (V001), so each twin has
|
||||
/// its own stable document row regardless of the asset de-dup.
|
||||
fn get_document_by_workspace_path(
|
||||
&self,
|
||||
path: &WorkspacePath,
|
||||
) -> anyhow::Result<Option<CanonicalDocument>>;
|
||||
}
|
||||
|
||||
pub trait VectorStore {
|
||||
|
||||
@@ -24,7 +24,7 @@ pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
|
||||
match ext.as_str() {
|
||||
"rs" => Some("rust"),
|
||||
"py" | "pyi" => Some("python"),
|
||||
"ts" | "tsx" => Some("typescript"),
|
||||
"ts" | "tsx" | "mts" | "cts" => Some("typescript"),
|
||||
"js" | "mjs" | "cjs" | "jsx" => Some("javascript"),
|
||||
"go" => Some("go"),
|
||||
"java" => Some("java"),
|
||||
@@ -82,7 +82,7 @@ pub fn module_path_for_python(workspace_path: &str) -> String {
|
||||
/// (no slash replacement, no source-root strip). See plan §Task C.
|
||||
pub fn module_path_for_tsjs(workspace_path: &str) -> String {
|
||||
let p = workspace_path;
|
||||
for ext in [".tsx", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
|
||||
for ext in [".tsx", ".mts", ".cts", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
|
||||
if let Some(stripped) = p.strip_suffix(ext) {
|
||||
return stripped.to_string();
|
||||
}
|
||||
@@ -110,7 +110,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn module_path_for_tsjs_keeps_slashes_and_strips_ext() {
|
||||
for ext in ["ts", "tsx", "js", "jsx", "mjs", "cjs"] {
|
||||
for ext in ["ts", "tsx", "mts", "cts", "js", "jsx", "mjs", "cjs"] {
|
||||
let p = format!("src/search/retriever/Retriever.{ext}");
|
||||
assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever");
|
||||
}
|
||||
|
||||
@@ -173,8 +173,9 @@ impl Extractor for TypescriptAstExtractor {
|
||||
}
|
||||
|
||||
/// Select the tree-sitter grammar based on the workspace path's
|
||||
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.d.ts`,
|
||||
/// missing extension) → TypeScript grammar.
|
||||
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.mts`,
|
||||
/// `.cts`, `.d.ts`, missing extension) → TypeScript grammar (the JSX-
|
||||
/// agnostic variants all share one grammar in tree-sitter-typescript 0.23).
|
||||
fn select_grammar(workspace_path: &str) -> tree_sitter::Language {
|
||||
if workspace_path.ends_with(".tsx") {
|
||||
tree_sitter_typescript::LANGUAGE_TSX.into()
|
||||
|
||||
@@ -9,6 +9,8 @@ fn known_extensions_map_to_canonical_identifiers() {
|
||||
("foo.pyi", Some("python")),
|
||||
("foo.ts", Some("typescript")),
|
||||
("foo.tsx", Some("typescript")),
|
||||
("foo.mts", Some("typescript")), // ESM TS — same grammar
|
||||
("foo.cts", Some("typescript")), // CommonJS TS — same grammar
|
||||
("foo.js", Some("javascript")),
|
||||
("foo.mjs", Some("javascript")),
|
||||
("foo.cjs", Some("javascript")),
|
||||
|
||||
@@ -346,6 +346,34 @@ fn run_query(
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
|
||||
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
|
||||
if !filters.code_lang.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
|
||||
));
|
||||
for lang in &filters.code_lang {
|
||||
params.push(Box::new(lang.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
|
||||
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
|
||||
if !filters.repo.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.repo.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
|
||||
));
|
||||
for repo in &filters.repo {
|
||||
params.push(Box::new(repo.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
|
||||
// fb-32 ingest path), so lexicographic >= compare is correct — but only
|
||||
|
||||
@@ -785,6 +785,19 @@ impl TestEnv {
|
||||
body: &str,
|
||||
media: MediaType,
|
||||
updated_at: OffsetDateTime,
|
||||
) -> DocumentId {
|
||||
self.insert_doc_full_with_metadata(path, body, media, updated_at, "{}")
|
||||
}
|
||||
|
||||
/// Like `insert_doc_full` but accepts an explicit `metadata_json` string
|
||||
/// so p10-1A-1 filter tests can set `metadata.code_lang` / `metadata.repo`.
|
||||
fn insert_doc_full_with_metadata(
|
||||
&self,
|
||||
path: &str,
|
||||
body: &str,
|
||||
media: MediaType,
|
||||
updated_at: OffsetDateTime,
|
||||
metadata_json: &str,
|
||||
) -> DocumentId {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let doc_id = self.next_id("doc");
|
||||
@@ -810,10 +823,10 @@ impl TestEnv {
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'pv1', 1, 1,
|
||||
?, '{\"events\":[]}',
|
||||
'2024-01-01T00:00:00Z', ?)",
|
||||
rusqlite::params![doc_id, asset_id, path, updated_at_str],
|
||||
rusqlite::params![doc_id, asset_id, path, metadata_json, updated_at_str],
|
||||
)
|
||||
.expect("insert document");
|
||||
|
||||
@@ -834,6 +847,21 @@ impl TestEnv {
|
||||
DocumentId(doc_id)
|
||||
}
|
||||
|
||||
/// Insert a code doc with explicit `code_lang` and optional `repo` in metadata.
|
||||
///
/// Builds a small `metadata_json` object (`code_lang`, plus `repo` when given)
/// and delegates to `insert_doc_full_with_metadata`, returning its `DocumentId`.
/// The values are interpolated verbatim — no JSON escaping — so callers must
/// pass strings without quotes or backslashes.
fn insert_code_doc(&self, path: &str, body: &str, code_lang: &str, repo: Option<&str>) -> DocumentId {
|
||||
// `{{` / `}}` are literal braces inside `format!`; the raw string avoids
// escaping the JSON double quotes.
let metadata_json = match repo {
|
||||
Some(r) => format!(r#"{{"code_lang":"{code_lang}","repo":"{r}"}}"#),
|
||||
None => format!(r#"{{"code_lang":"{code_lang}"}}"#),
|
||||
};
|
||||
self.insert_doc_full_with_metadata(
|
||||
path,
|
||||
body,
|
||||
// NOTE(review): media type is Markdown even though this helper inserts a
// "code" doc — confirm the filter tests do not depend on the media type.
MediaType::Markdown,
|
||||
// `updated_at` is the wall-clock time of the call, not a pinned value.
OffsetDateTime::now_utc(),
|
||||
&metadata_json,
|
||||
)
|
||||
}
|
||||
|
||||
fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
|
||||
let r = self.inner.retriever();
|
||||
let q = SearchQuery {
|
||||
@@ -934,6 +962,52 @@ fn lexical_empty_filters_match_default_behavior() {
|
||||
assert!(!with_default.is_empty());
|
||||
}
|
||||
|
||||
// ── p10-1A-1 filter tests ────────────────────────────────────────────────
|
||||
|
||||
/// A `code_lang` filter of `["python"]` must return only the doc whose
/// metadata carries `code_lang = "python"`, even though all three docs share
/// the same body text.
#[test]
|
||||
fn lexical_filter_by_code_lang() {
|
||||
// Three docs: python code, rust code, markdown (no code_lang).
|
||||
// Filter code_lang=["python"] → only the python doc should match.
|
||||
let env = TestEnv::new();
|
||||
env.insert_code_doc("src/main.py", "AsyncClient session", "python", None);
|
||||
env.insert_code_doc("src/lib.rs", "AsyncClient session", "rust", None);
|
||||
// NOTE(review): `insert_doc` is not visible here; presumably it stores
// metadata without a `code_lang` key — confirm.
env.insert_doc("docs/guide.md", "AsyncClient session");
|
||||
|
||||
// Only `code_lang` is set; every other filter keeps its default.
let filters = SearchFilters {
|
||||
code_lang: vec!["python".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let hits = env.run_search("AsyncClient", &filters);
|
||||
assert_eq!(hits.len(), 1, "only python doc should match code_lang filter");
|
||||
// Identify the surviving hit by its path suffix.
assert!(
|
||||
hits[0].doc_path.0.ends_with(".py"),
|
||||
"expected python path, got: {}",
|
||||
hits[0].doc_path.0
|
||||
);
|
||||
}
|
||||
|
||||
/// A `repo` filter of `["httpx"]` must return only the doc whose metadata
/// carries `repo = "httpx"`; docs from another repo or with no `repo` key are
/// excluded even though all three share the same body text.
#[test]
|
||||
fn lexical_filter_by_repo() {
|
||||
// Three docs: one in repo "httpx", one in repo "requests", one with no repo.
|
||||
// Filter repo=["httpx"] → only the httpx doc should match.
|
||||
let env = TestEnv::new();
|
||||
env.insert_code_doc("httpx/client.py", "session send request", "python", Some("httpx"));
|
||||
env.insert_code_doc("requests/api.py", "session send request", "python", Some("requests"));
|
||||
// `None` → the metadata object has no `repo` key at all.
env.insert_code_doc("standalone.py", "session send request", "python", None);
|
||||
|
||||
// Only `repo` is set; every other filter keeps its default.
let filters = SearchFilters {
|
||||
repo: vec!["httpx".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let hits = env.run_search("session", &filters);
|
||||
assert_eq!(hits.len(), 1, "only httpx doc should match repo filter");
|
||||
// Identify the surviving hit by its path prefix.
assert!(
|
||||
hits[0].doc_path.0.starts_with("httpx/"),
|
||||
"expected httpx path, got: {}",
|
||||
hits[0].doc_path.0
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snapshot_run_1() {
|
||||
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
|
||||
|
||||
@@ -18,6 +18,7 @@ blake3 = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
walkdir = "2"
|
||||
ignore = "0.4"
|
||||
globset = "0.4"
|
||||
|
||||
[dev-dependencies]
|
||||
serde_json = { workspace = true }
|
||||
|
||||
@@ -86,7 +86,7 @@ impl FsSourceConnector {
|
||||
excludes.extend(scope.exclude.iter().cloned());
|
||||
let kbignore = read_kbignore(&root)?;
|
||||
|
||||
let overrides = build_overrides(&root, &excludes, &kbignore)?;
|
||||
let overrides = build_overrides(&root, &excludes, &kbignore, &scope.include)?;
|
||||
Ok((root, overrides))
|
||||
}
|
||||
|
||||
@@ -103,8 +103,6 @@ impl FsSourceConnector {
|
||||
) -> Result<(Vec<RawAsset>, FsScanSkips)> {
|
||||
let (root, overrides) = self.resolve_scan_params(scope)?;
|
||||
|
||||
log_scope_include_warning(scope);
|
||||
|
||||
let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?;
|
||||
|
||||
// Accumulate per-category skip counts and sample paths.
|
||||
@@ -284,14 +282,6 @@ fn build_assets(
|
||||
Ok(assets)
|
||||
}
|
||||
|
||||
fn log_scope_include_warning(scope: &SourceScope) {
|
||||
if !scope.include.is_empty() {
|
||||
tracing::debug!(
|
||||
count = scope.include.len(),
|
||||
"FsSourceConnector ignores scope.include — handled by extractor router"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl SourceConnector for FsSourceConnector {
|
||||
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
|
||||
|
||||
@@ -19,7 +19,9 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
|
||||
.unwrap_or_default();
|
||||
|
||||
match ext.as_str() {
|
||||
"md" => MediaType::Markdown,
|
||||
// Markdown + MDX (markdown + JSX, treated as plain markdown — the
|
||||
// JSX islands are folded into raw passthrough by the md parser).
|
||||
"md" | "mdx" => MediaType::Markdown,
|
||||
"pdf" => MediaType::Pdf,
|
||||
|
||||
"png" => MediaType::Image(ImageType::Png),
|
||||
@@ -40,7 +42,8 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
|
||||
|
||||
// p10-1B: Python / TS / JS AST chunkers active.
|
||||
"py" | "pyi" => MediaType::Code("python".into()),
|
||||
"ts" | "tsx" => MediaType::Code("typescript".into()),
|
||||
// .mts / .cts are TypeScript ESM / CommonJS variants — same grammar.
|
||||
"ts" | "tsx" | "mts" | "cts" => MediaType::Code("typescript".into()),
|
||||
"js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()),
|
||||
|
||||
// Empty string (no extension) and any other extension: bucket as
|
||||
@@ -102,6 +105,20 @@ mod tests {
|
||||
assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into()));
|
||||
}
|
||||
|
||||
/// `.mts` and `.cts` paths must classify as `MediaType::Code("typescript")`.
#[test]
|
||||
fn ts_variants_mts_cts() {
|
||||
// .mts / .cts are TypeScript ESM / CommonJS — same grammar as .ts.
|
||||
assert_eq!(media_type_for(Path::new("a/b.mts")), MediaType::Code("typescript".into()));
|
||||
assert_eq!(media_type_for(Path::new("a/b.cts")), MediaType::Code("typescript".into()));
|
||||
}
|
||||
|
||||
/// A `.mdx` path must classify as `MediaType::Markdown`.
#[test]
|
||||
fn mdx_routes_to_markdown() {
|
||||
// MDX is markdown with JSX islands; the md parser folds the JSX
|
||||
// through as raw passthrough.
|
||||
assert_eq!(media_type_for(Path::new("docs/page.mdx")), MediaType::Markdown);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_and_missing_extension() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -44,6 +44,7 @@ use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use globset::{GlobBuilder, GlobSet, GlobSetBuilder};
|
||||
use ignore::overrides::{Override, OverrideBuilder};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
@@ -69,6 +70,11 @@ const DEFAULT_EXCLUDES: &[&str] = &[
|
||||
///
|
||||
/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude`
|
||||
/// — these do NOT map to any of the three named `IngestReport` counters.
|
||||
///
|
||||
/// `include` is the compiled `scope.include` allow-list. When the set is
|
||||
/// empty (no patterns) every file passes; when non-empty a file must match
|
||||
/// at least one pattern to be accepted (directories always pass, so the
|
||||
/// walker can still descend into them).
|
||||
pub(crate) struct WalkOverrides {
|
||||
/// Merged matcher — same as today's `Override`; used for the walk decision.
|
||||
pub combined: Override,
|
||||
@@ -78,6 +84,8 @@ pub(crate) struct WalkOverrides {
|
||||
pub kebabignore: Override,
|
||||
/// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only.
|
||||
pub builtin: Override,
|
||||
/// Compiled allow-list from `scope.include`. Empty set = pass all.
|
||||
pub include: GlobSet,
|
||||
}
|
||||
|
||||
/// Skip attribution category. Used by the connector when counting per-source
|
||||
@@ -161,10 +169,15 @@ fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result<Overri
|
||||
/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are
|
||||
/// built in addition to the combined one so the connector can attribute skips
|
||||
/// to the correct `IngestReport` counter without a second walker pass.
|
||||
///
|
||||
/// `include_patterns` (from `scope.include`) are compiled into an allow-list
|
||||
/// `GlobSet`. Empty slice → pass-all (backward-compat); non-empty → file
|
||||
/// must match at least one pattern to be accepted.
|
||||
pub(crate) fn build_overrides(
|
||||
root: &Path,
|
||||
config_exclude: &[String],
|
||||
kbignore_patterns: &[String],
|
||||
include_patterns: &[String],
|
||||
) -> Result<WalkOverrides> {
|
||||
let gitignore_patterns = read_gitignore(root)?;
|
||||
|
||||
@@ -209,14 +222,41 @@ pub(crate) fn build_overrides(
|
||||
.build()
|
||||
.context("failed to compile combined override set")?;
|
||||
|
||||
// Allow-list GlobSet: an empty pattern list compiles to an empty set, which
|
||||
// the walker treats as "no allow-list" (every file passes); a non-empty set
|
||||
// means a file must match at least one glob. Matching stays case-sensitive
|
||||
// (`GlobBuilder`'s default — `case_insensitive` is never enabled), which keeps
|
||||
// the semantics consistent with the OverrideBuilder exclude patterns above.
|
||||
let include = build_include_globset(include_patterns)?;
|
||||
|
||||
Ok(WalkOverrides {
|
||||
combined,
|
||||
gitignore,
|
||||
kebabignore,
|
||||
builtin,
|
||||
include,
|
||||
})
|
||||
}
|
||||
|
||||
/// Compile `scope.include` patterns into a `GlobSet` allow-list.
|
||||
///
|
||||
/// Each pattern uses `GlobBuilder` with `literal_separator = true` so that
|
||||
/// `**` can cross directory boundaries while `*` stops at `/`, matching the
|
||||
/// gitignore convention used throughout the rest of the walker.
|
||||
///
|
||||
/// An empty slice produces an empty `GlobSet` — callers interpret that as
|
||||
/// "pass all files" (no allow-list constraint).
|
||||
///
/// # Errors
///
/// Fails with context `invalid include pattern: {pat}` on the first pattern
/// that does not compile, or `failed to compile include globset` if the
/// final set cannot be built.
fn build_include_globset(patterns: &[String]) -> Result<GlobSet> {
|
||||
let mut builder = GlobSetBuilder::new();
|
||||
// Compile patterns one at a time so a bad pattern is named in the error.
for pat in patterns {
|
||||
let glob = GlobBuilder::new(pat)
|
||||
.literal_separator(true)
|
||||
.build()
|
||||
.with_context(|| format!("invalid include pattern: {pat}"))?;
|
||||
builder.add(glob);
|
||||
}
|
||||
// With no patterns added this yields an empty set (see the doc comment above).
builder.build().context("failed to compile include globset")
|
||||
}
|
||||
|
||||
/// Classify why a path was excluded, using per-source matchers in spec §5.2
|
||||
/// priority order: built-in > gitignore > kebabignore > other.
|
||||
///
|
||||
@@ -391,6 +431,13 @@ pub(crate) fn walk_files_with_skips(
|
||||
}
|
||||
|
||||
if entry.file_type().is_file() {
|
||||
// Apply include allow-list: if non-empty, the file's path
|
||||
// relative to root must match at least one pattern.
|
||||
if !overrides.include.is_empty() && !overrides.include.is_match(rel) {
|
||||
// Not in the allow-list — silently drop (no skip counter;
|
||||
// the include filter is not a "skip" source in IngestReport).
|
||||
continue;
|
||||
}
|
||||
accepted.push(path.to_path_buf());
|
||||
}
|
||||
}
|
||||
@@ -406,7 +453,7 @@ mod tests {
|
||||
#[test]
|
||||
fn empty_inputs_compile_into_an_override() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
||||
let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap();
|
||||
// Default-excludes only; non-special files should not match.
|
||||
let m = ov.combined.matched(Path::new("notes/alpha.md"), false);
|
||||
assert!(!m.is_ignore());
|
||||
@@ -415,7 +462,7 @@ mod tests {
|
||||
#[test]
|
||||
fn default_excludes_ds_store_and_resource_forks() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
||||
let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap();
|
||||
assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore());
|
||||
assert!(
|
||||
ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore()
|
||||
@@ -433,6 +480,7 @@ mod tests {
|
||||
dir.path(),
|
||||
&["*.tmp".to_string(), "node_modules/**".to_string()],
|
||||
&[],
|
||||
&[],
|
||||
)
|
||||
.unwrap();
|
||||
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
|
||||
@@ -452,6 +500,7 @@ mod tests {
|
||||
dir.path(),
|
||||
&["*.tmp".to_string()],
|
||||
&["secret/**".to_string()],
|
||||
&[],
|
||||
)
|
||||
.unwrap();
|
||||
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
|
||||
@@ -491,7 +540,7 @@ mod tests {
|
||||
fs::write(root.join("src/main.rs"), "x").unwrap();
|
||||
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
|
||||
|
||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
// Override::matched expects paths relative to the builder's root.
|
||||
let m_in = overrides.combined.matched(Path::new("src/main.rs"), false);
|
||||
let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false);
|
||||
@@ -514,7 +563,7 @@ mod tests {
|
||||
fs::create_dir_all(root.join("ok")).unwrap();
|
||||
fs::write(root.join("ok/z.txt"), "z").unwrap();
|
||||
|
||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
// Override::matched expects paths relative to the builder's root.
|
||||
for blacklisted in [
|
||||
"target/x/y.txt",
|
||||
@@ -544,7 +593,7 @@ mod tests {
|
||||
fs::create_dir_all(root.join("dist")).unwrap();
|
||||
fs::write(root.join("dist/bundle.js"), "x").unwrap();
|
||||
|
||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore());
|
||||
assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore());
|
||||
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
|
||||
@@ -562,7 +611,7 @@ mod tests {
|
||||
fs::write(root.join("src/main.rs"), "x").unwrap();
|
||||
|
||||
// No .gitignore present — patterns from .gitignore should not affect overrides.
|
||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore());
|
||||
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
|
||||
}
|
||||
@@ -577,7 +626,7 @@ mod tests {
|
||||
// semantics, but at minimum it must not produce double-`!` corruption.
|
||||
fs::write(root.join(".gitignore"), "!keep/\n").unwrap();
|
||||
// Just verify build_overrides doesn't error.
|
||||
let result = build_overrides(root, &[], &[]);
|
||||
let result = build_overrides(root, &[], &[], &[]);
|
||||
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
|
||||
}
|
||||
|
||||
@@ -594,7 +643,7 @@ mod tests {
|
||||
// .gitignore entry. Builtin must win (priority order §5.2).
|
||||
fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
|
||||
|
||||
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||
let ov = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
// node_modules/ dir itself
|
||||
let cat = classify_skip(Path::new("node_modules"), true, &ov);
|
||||
assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority");
|
||||
@@ -609,7 +658,7 @@ mod tests {
|
||||
let root = tmp.path();
|
||||
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
|
||||
|
||||
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||
let ov = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
let cat = classify_skip(Path::new("app.log"), false, &ov);
|
||||
assert_eq!(cat, SkipCategory::Gitignore);
|
||||
}
|
||||
@@ -621,7 +670,7 @@ mod tests {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let root = tmp.path();
|
||||
|
||||
let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap();
|
||||
let ov = build_overrides(root, &[], &["*.secret".to_string()], &[]).unwrap();
|
||||
let cat = classify_skip(Path::new("creds.secret"), false, &ov);
|
||||
assert_eq!(cat, SkipCategory::Kebabignore);
|
||||
}
|
||||
@@ -637,7 +686,7 @@ mod tests {
|
||||
fs::write(root.join("ok.md"), "# ok").unwrap();
|
||||
fs::write(root.join("skipme.log"), "x").unwrap();
|
||||
|
||||
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||
let ov = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
|
||||
|
||||
let accepted_names: Vec<_> = accepted
|
||||
@@ -677,7 +726,7 @@ mod tests {
|
||||
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
|
||||
fs::write(root.join("ok.md"), "# ok").unwrap();
|
||||
|
||||
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||
let ov = build_overrides(root, &[], &[], &[]).unwrap();
|
||||
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
|
||||
|
||||
let accepted_names: Vec<_> = accepted
|
||||
|
||||
111
crates/kebab-source-fs/tests/include_allowlist.rs
Normal file
111
crates/kebab-source-fs/tests/include_allowlist.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
//! Integration test: `scope.include` enforces an allow-list.
|
||||
//!
|
||||
//! Semantics (gitignore convention):
|
||||
//! - `include` is empty Vec → all files pass through (backward-compat).
|
||||
//! - `include` is non-empty → only files matching at least one pattern
|
||||
//! are accepted. `exclude` rules still apply after include.
|
||||
//!
|
||||
//! Layout (built per-test in a TempDir):
|
||||
//! root/
|
||||
//! ├── a.md
|
||||
//! ├── b.py
|
||||
//! ├── c.png
|
||||
//! └── d.pdf
|
||||
|
||||
use std::fs;
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{SourceConnector, SourceScope};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.exclude.clear();
|
||||
// Disable size / generated caps so small test files always pass.
|
||||
c.ingest.code.max_file_bytes = u64::MAX;
|
||||
c.ingest.code.max_file_lines = u32::MAX;
|
||||
c.ingest.code.skip_generated_header = false;
|
||||
c
|
||||
}
|
||||
|
||||
fn setup_mixed_dir() -> tempfile::TempDir {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
fs::write(root.join("a.md"), b"md").unwrap();
|
||||
fs::write(root.join("b.py"), b"py").unwrap();
|
||||
fs::write(root.join("c.png"), b"\x89PNG").unwrap();
|
||||
fs::write(root.join("d.pdf"), b"%PDF").unwrap();
|
||||
dir
|
||||
}
|
||||
|
||||
/// With an empty `include` list there is no allow-list: all four fixture
/// files must be returned (backward-compatible behaviour).
#[test]
fn include_empty_accepts_all_files() {
    let dir = setup_mixed_dir();
    let cfg = cfg_with_root(dir.path().to_str().unwrap());
    let conn = FsSourceConnector::new(&cfg).unwrap();
    let scope = SourceScope {
        include: Vec::new(),
        ..SourceScope::default()
    };

    let assets = conn.scan(&scope).unwrap();
    let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();

    for expected in ["a.md", "b.py", "c.png", "d.pdf"] {
        assert!(
            names.contains(&expected.to_string()),
            "{expected} missing; got: {names:?}"
        );
    }
    assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}");
}
|
||||
|
||||
/// A non-empty `include` acts as an allow-list: only the `.md` and `.py`
/// fixtures come back, while the `.png` and `.pdf` ones are dropped.
#[test]
fn include_nonempty_is_allowlist() {
    let dir = setup_mixed_dir();
    let cfg = cfg_with_root(dir.path().to_str().unwrap());
    let conn = FsSourceConnector::new(&cfg).unwrap();
    let scope = SourceScope {
        include: ["**/*.md", "**/*.py"].iter().map(|p| p.to_string()).collect(),
        ..SourceScope::default()
    };

    let assets = conn.scan(&scope).unwrap();
    let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();

    for allowed in ["a.md", "b.py"] {
        assert!(
            names.contains(&allowed.to_string()),
            "{allowed} should be accepted; got: {names:?}"
        );
    }
    for denied in ["c.png", "d.pdf"] {
        assert!(
            !names.contains(&denied.to_string()),
            "{denied} must be rejected by include allowlist; got: {names:?}"
        );
    }
    assert_eq!(names.len(), 2, "expected exactly 2 files; got: {names:?}");
}
|
||||
|
||||
/// `include` and `exclude` are ANDed: a file must match the allow-list and
/// must not match any exclude pattern to be accepted.
///
/// Fixture: `keep.md` (included), `drop.md` (included but excluded) and
/// `other.py` (not included). Only `keep.md` may survive, and the exact
/// result count is pinned so no unexpected extra asset slips through.
#[test]
fn include_and_exclude_are_anded() {
    let dir = tempfile::tempdir().unwrap();
    let root = dir.path();
    fs::write(root.join("keep.md"), b"keep").unwrap();
    fs::write(root.join("drop.md"), b"drop").unwrap();
    fs::write(root.join("other.py"), b"py").unwrap();

    let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap();
    let scope = SourceScope {
        include: vec!["**/*.md".to_string()],
        exclude: vec!["drop.md".to_string()],
        ..SourceScope::default()
    };
    let assets = conn.scan(&scope).unwrap();
    let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();

    assert!(
        names.contains(&"keep.md".to_string()),
        "keep.md should be accepted; got: {names:?}"
    );
    assert!(
        !names.contains(&"drop.md".to_string()),
        "drop.md should be excluded (matched exclude); got: {names:?}"
    );
    assert!(
        !names.contains(&"other.py".to_string()),
        "other.py should be excluded (not in include); got: {names:?}"
    );
    // Same exact-count guard as the sibling tests in this file.
    assert_eq!(names.len(), 1, "expected exactly 1 file; got: {names:?}");
}
|
||||
@@ -286,6 +286,72 @@ impl kebab_core::DocumentStore for SqliteStore {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_document_by_workspace_path(
|
||||
&self,
|
||||
path: &kebab_core::WorkspacePath,
|
||||
) -> Result<Option<kebab_core::CanonicalDocument>> {
|
||||
let conn = self.lock_conn();
|
||||
let row: Option<DocumentRow> = conn
|
||||
.query_row(
|
||||
"SELECT
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at,
|
||||
last_chunker_version, last_embedding_version
|
||||
FROM documents WHERE workspace_path = ?",
|
||||
params![path.0],
|
||||
document_row_from_sql,
|
||||
)
|
||||
.map(Some)
|
||||
.or_else(rows_optional)
|
||||
.map_err(StoreError::from)?;
|
||||
let Some(row) = row else { return Ok(None) };
|
||||
|
||||
let doc_id = kebab_core::DocumentId(row.doc_id.clone());
|
||||
let mut blocks_stmt = conn
|
||||
.prepare(
|
||||
"SELECT payload_json FROM blocks
|
||||
WHERE doc_id = ? ORDER BY ordinal ASC",
|
||||
)
|
||||
.map_err(StoreError::from)?;
|
||||
let block_rows = blocks_stmt
|
||||
.query_map(params![row.doc_id], |r| {
|
||||
let payload_json: String = r.get(0)?;
|
||||
Ok(payload_json)
|
||||
})
|
||||
.map_err(StoreError::from)?;
|
||||
let mut blocks: Vec<kebab_core::Block> = Vec::new();
|
||||
for block_row in block_rows {
|
||||
let payload_json = block_row.map_err(StoreError::from)?;
|
||||
let block: kebab_core::Block = serde_json::from_str(&payload_json)
|
||||
.context("deserialize block payload_json")?;
|
||||
blocks.push(block);
|
||||
}
|
||||
|
||||
let metadata: kebab_core::Metadata = serde_json::from_str(&row.metadata_json)
|
||||
.context("deserialize metadata_json")?;
|
||||
let provenance: kebab_core::Provenance =
|
||||
serde_json::from_str(&row.provenance_json)
|
||||
.context("deserialize provenance_json")?;
|
||||
|
||||
Ok(Some(kebab_core::CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: kebab_core::AssetId(row.asset_id),
|
||||
workspace_path: kebab_core::WorkspacePath(row.workspace_path),
|
||||
title: row.title.unwrap_or_default(),
|
||||
lang: kebab_core::Lang(row.lang.unwrap_or_default()),
|
||||
blocks,
|
||||
metadata,
|
||||
provenance,
|
||||
parser_version: kebab_core::ParserVersion(row.parser_version),
|
||||
schema_version: row.schema_version as u32,
|
||||
doc_version: row.doc_version as u32,
|
||||
last_chunker_version: row.last_chunker_version.map(kebab_core::ChunkerVersion),
|
||||
last_embedding_version: row.last_embedding_version.map(kebab_core::EmbeddingVersion),
|
||||
}))
|
||||
}
|
||||
|
||||
fn list_documents(
|
||||
&self,
|
||||
filter: &kebab_core::DocFilter,
|
||||
|
||||
@@ -153,6 +153,34 @@ impl SqliteStore {
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
|
||||
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
|
||||
if !filters.code_lang.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
|
||||
));
|
||||
for lang in &filters.code_lang {
|
||||
bind.push(Box::new(lang.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
|
||||
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
|
||||
if !filters.repo.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.repo.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
|
||||
));
|
||||
for repo in &filters.repo {
|
||||
bind.push(Box::new(repo.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
|
||||
// lexicographic >= compare is correct — but only when the filter
|
||||
@@ -408,6 +436,78 @@ mod tests {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// Variant of `seed_committed_full` that additionally accepts a
|
||||
/// `metadata_json` string so p10-1A-1 filter tests can set
|
||||
/// `metadata.code_lang` / `metadata.repo` without going through the
|
||||
/// full ingest pipeline.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn seed_committed_with_metadata(
|
||||
store: &SqliteStore,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
media_type_json: &str,
|
||||
metadata_json: &str,
|
||||
) {
|
||||
let asset_id = format!("a{}", &doc_id[..31]);
|
||||
{
|
||||
let conn = store.lock_conn();
|
||||
conn.execute(
|
||||
"INSERT INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef',
|
||||
'reference', ?, '1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file://{workspace_path}"),
|
||||
workspace_path,
|
||||
media_type_json,
|
||||
workspace_path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, parser_version, doc_version, schema_version,
|
||||
metadata_json, provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'v1', 1, 1,
|
||||
?, '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, workspace_path, metadata_json],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, 'code snippet', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let embed_row = EmbeddingRecordRow {
|
||||
embedding_id: format!("e{}", &chunk_id[..31]),
|
||||
chunk_id: chunk_id.to_string(),
|
||||
model_id: "m".to_string(),
|
||||
model_version: "v1".to_string(),
|
||||
dimensions: 4,
|
||||
lance_table: "t".to_string(),
|
||||
created_at: OffsetDateTime::UNIX_EPOCH,
|
||||
};
|
||||
store
|
||||
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
|
||||
.unwrap();
|
||||
store
|
||||
.mark_embedding_records_committed(std::slice::from_ref(
|
||||
&embed_row.embedding_id,
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn cid(s: &str) -> ChunkId {
|
||||
ChunkId(s.to_string())
|
||||
}
|
||||
@@ -671,6 +771,78 @@ mod tests {
|
||||
assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
|
||||
}
|
||||
|
||||
// ── p10-1A-1 new filter arms ─────────────────────────────────────────
|
||||
|
||||
#[test]
fn filter_chunks_code_lang_keeps_matching_lang() {
    // Three seeded chunks: c1 is python, c2 is rust, c3 is markdown with no
    // `code_lang` at all. Filtering on code_lang=["python"] must keep c1 only.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);
    let c1 = "11111111111111111111111111111111";
    let c2 = "22222222222222222222222222222222";
    let c3 = "33333333333333333333333333333333";

    // (chunk_id, doc_id, workspace_path, media_type_json, metadata_json)
    let fixtures = [
        (
            c1,
            "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
            "src/main.py",
            r#""code""#,
            r#"{"code_lang":"python"}"#,
        ),
        (
            c2,
            "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
            "src/lib.rs",
            r#""code""#,
            r#"{"code_lang":"rust"}"#,
        ),
        (
            c3,
            "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
            "README.md",
            r#""markdown""#,
            r#"{}"#,
        ),
    ];
    for (chunk, doc, path, media, meta) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, meta);
    }

    let filters = SearchFilters {
        code_lang: vec!["python".to_string()],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &filters).unwrap();
    assert_eq!(out, vec![cid(c1)], "only python chunk should survive code_lang filter");
}
|
||||
|
||||
#[test]
fn filter_chunks_repo_keeps_matching_repo() {
    // Three seeded chunks: c1 belongs to repo "httpx", c2 to "requests",
    // c3 has no repo key. Filtering on repo=["httpx"] must keep c1 only.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);
    let c1 = "11111111111111111111111111111111";
    let c2 = "22222222222222222222222222222222";
    let c3 = "33333333333333333333333333333333";

    // (chunk_id, doc_id, workspace_path, media_type_json, metadata_json)
    let fixtures = [
        (
            c1,
            "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
            "httpx/client.py",
            r#""code""#,
            r#"{"repo":"httpx","code_lang":"python"}"#,
        ),
        (
            c2,
            "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
            "requests/api.py",
            r#""code""#,
            r#"{"repo":"requests","code_lang":"python"}"#,
        ),
        (
            c3,
            "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
            "standalone.py",
            r#""code""#,
            r#"{"code_lang":"python"}"#,
        ),
    ];
    for (chunk, doc, path, media, meta) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, meta);
    }

    let filters = SearchFilters {
        repo: vec!["httpx".to_string()],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &filters).unwrap();
    assert_eq!(out, vec![cid(c1)], "only httpx chunk should survive repo filter");
}
|
||||
|
||||
#[test]
|
||||
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
|
||||
// Regression test for the non-UTC offset lex-compare bug.
|
||||
|
||||
@@ -701,6 +701,39 @@ impl SqliteStore {
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for
|
||||
/// `schema.v1`.
|
||||
///
|
||||
/// Reads `metadata_json->'$.repo'`, groups by the value, and skips rows
|
||||
/// where `repo` is NULL (documents without an explicit repo tag).
|
||||
/// Returns `BTreeMap<String, u32>` — key is the repo name as stored in
|
||||
/// frontmatter, value is the doc count.
|
||||
pub fn repo_breakdown(
|
||||
&self,
|
||||
) -> anyhow::Result<std::collections::BTreeMap<String, u32>> {
|
||||
use anyhow::Context;
|
||||
let conn = self.read_conn();
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT json_extract(metadata_json, '$.repo') AS rp, COUNT(*) \
|
||||
FROM documents \
|
||||
WHERE rp IS NOT NULL \
|
||||
GROUP BY rp",
|
||||
)
|
||||
.context("prepare repo_breakdown")?;
|
||||
let rows = stmt
|
||||
.query_map([], |r| {
|
||||
Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)? as u32))
|
||||
})
|
||||
.context("query repo_breakdown")?;
|
||||
let mut out = std::collections::BTreeMap::new();
|
||||
for row in rows {
|
||||
let (k, v) = row.context("read repo_breakdown row")?;
|
||||
out.insert(k, v);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the design §5 / task-spec pragmas. Called once per connection.
|
||||
@@ -817,5 +850,79 @@ mod tests {
|
||||
// only one key total
|
||||
assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
|
||||
}
|
||||
|
||||
/// p10-1A-2 follow-up: `repo_breakdown` groups documents by
/// `metadata_json.repo`.
///
/// Two documents are inserted:
/// - one tagged `repo = "my-repo"`, expected in the result with count 1;
/// - one with `repo = null`, expected to be absent (NULL is skipped).
///
/// The rows are written through a side rusqlite connection with
/// `PRAGMA foreign_keys = OFF`, so no matching `assets` rows are needed and
/// the test stays self-contained.
#[test]
fn repo_breakdown_counts_by_repo() {
    let (dir, store) = open_fresh_store();

    let insert_tagged = "INSERT INTO documents (
            doc_id, asset_id, workspace_path,
            source_type, trust_level, parser_version,
            doc_version, schema_version,
            metadata_json, provenance_json,
            created_at, updated_at
         ) VALUES (
            'doc-repo-1', 'asset-r1', 'my-repo/README.md',
            'markdown', 'primary', 'test-v1',
            1, 1,
            '{\"repo\":\"my-repo\"}', '{}',
            '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
         )";
    let insert_untagged = "INSERT INTO documents (
            doc_id, asset_id, workspace_path,
            source_type, trust_level, parser_version,
            doc_version, schema_version,
            metadata_json, provenance_json,
            created_at, updated_at
         ) VALUES (
            'doc-norepo-1', 'asset-r2', 'standalone/notes.md',
            'markdown', 'primary', 'test-v1',
            1, 1,
            '{\"repo\":null}', '{}',
            '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
         )";

    // The side connection lives only inside this scope, so it is released
    // before the store is queried.
    {
        let db_path = dir.path().join("kebab.sqlite");
        let side = rusqlite::Connection::open(&db_path).unwrap();
        side.pragma_update(None, "foreign_keys", "OFF").unwrap();
        for sql in [insert_tagged, insert_untagged] {
            side.execute(sql, []).unwrap();
        }
    }

    let bd = store.repo_breakdown().unwrap();

    // The tagged document is counted once under its repo name.
    assert_eq!(
        bd.get("my-repo"),
        Some(&1u32),
        "expected my-repo=1 in repo_breakdown, got: {bd:?}"
    );
    // The null repo must not leak in as a literal "null" key.
    assert!(
        !bd.contains_key("null"),
        "null repo must not appear in breakdown, got: {bd:?}"
    );
    // Nothing else may be present.
    assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user