Compare commits

...

7 Commits

Author SHA1 Message Date
08fb743598 chore: bump version 0.8.1 → 0.8.2
dogfood-discovered fixes (PR #145) land in production:
- schema.v1.repo_breakdown 가 실제로 채워짐 (이전: 항상 빈 BTreeMap)
- workspace.include glob 가 walker 에서 enforce 됨 (이전: 완전 무시)

patch bump 사유: 둘 다 advertised surface 의 정상 동작 복원.
새 wire / config / surface 변경 없음.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 05:20:48 +00:00
0a2a7ae214 Merge pull request 'fix(dogfood): schema.repo_breakdown + workspace.include walker enforcement (dogfood-discovered)' (#145) from fix/dogfood-bugs-schema-walker-incremental into main 2026-05-20 05:18:59 +00:00
803d02b68b fix(dogfood): enforce workspace.include in walker (allow-list semantics)
config.workspace.include was completely ignored by the walker — connector.rs
log_scope_include_warning literally said "handled by extractor router" but
no extractor router exists. Dogfooding (PR #142 1B + multi-root corpus
kebab-docs + httpx + zod + lodash) showed user-set include of code+md still
ingested 84 .png + 8 .pdf files.

Fix: walker treats scope.include as an allow-list — empty Vec preserves
backward-compat (all files pass), non-empty requires file path to match at
least one pattern (AND with the existing exclude rules). Removed the
misleading debug log.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 05:15:04 +00:00
4e8b84c4e0 fix(dogfood): populate schema.v1.repo_breakdown (Task 9 follow-up)
Dogfooding (PR #142 1B + multi-root corpus: kebab-docs + httpx + zod + lodash)
revealed schema.v1.repo_breakdown is always {} despite the 1A-2 Task 9
having added the code_lang_breakdown sibling. The schema.rs:171 placeholder
`BTreeMap::new()` was left in place. Mirror Task 9's code_lang_breakdown
query for the repo field — same metadata_json JSON-path pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 05:09:19 +00:00
16dc02cfa2 chore: bump version 0.8.0 → 0.8.1
dogfood-discovered code_lang/repo filter bug (PR #144) fix lands in
production. patch bump because:
- 1A-1 advertised CLI flags --code-lang / --repo were live but inert
  (SearchFilters fields propagated but never applied to retriever SQL)
- fix restores intended behavior; no new wire surface
- user has dogfooded against httpx + zod + lodash and re-validating
  needs the fixed binary

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 03:35:36 +00:00
74f1b0571b Merge pull request 'fix(p10-1a-1): apply code_lang + repo filters in lexical SQL and filter_chunks (dogfood)' (#144) from fix/p10-1a-1-code-lang-repo-filter-sql into main 2026-05-20 03:34:53 +00:00
918ee6c0be fix(p10-1a-1): apply code_lang + repo filters in lexical SQL and filter_chunks (dogfood-discovered)
p10-1A-1 (PR #139) added SearchFilters.code_lang + .repo fields and the CLI
--code-lang / --repo flags propagate them correctly into SearchFilters, but
neither the lexical retriever's FTS SQL nor the shared filter_chunks helper
(used by the vector retriever) ever applied them — so a code-lang-filtered
search returned all-doc hits (markdown / pdf / code mixed).

Discovered while dogfooding p10-1B with httpx + zod + lodash clones:
`kebab search 'AsyncClient' --code-lang python --json` returned markdown
hits from httpx/docs/ first.

Fix: add IN-list filters on json_extract(d.metadata_json, '$.code_lang')
and '$.repo' to both lexical.rs and filters.rs, mirroring the existing
media filter pattern. Two regression tests added in each crate covering
the new filter behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 03:27:01 +00:00
11 changed files with 586 additions and 51 deletions

47
Cargo.lock generated
View File

@@ -4127,7 +4127,7 @@ dependencies = [
[[package]]
name = "kebab-app"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -4172,7 +4172,7 @@ dependencies = [
[[package]]
name = "kebab-chunk"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4187,7 +4187,7 @@ dependencies = [
[[package]]
name = "kebab-cli"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"clap",
@@ -4208,7 +4208,7 @@ dependencies = [
[[package]]
name = "kebab-config"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"dirs 5.0.1",
@@ -4223,7 +4223,7 @@ dependencies = [
[[package]]
name = "kebab-core"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4237,7 +4237,7 @@ dependencies = [
[[package]]
name = "kebab-embed"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4251,7 +4251,7 @@ dependencies = [
[[package]]
name = "kebab-embed-local"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"fastembed",
@@ -4264,7 +4264,7 @@ dependencies = [
[[package]]
name = "kebab-eval"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-app",
@@ -4283,7 +4283,7 @@ dependencies = [
[[package]]
name = "kebab-llm"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-core",
@@ -4292,7 +4292,7 @@ dependencies = [
[[package]]
name = "kebab-llm-local"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-config",
@@ -4309,7 +4309,7 @@ dependencies = [
[[package]]
name = "kebab-mcp"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-app",
@@ -4327,7 +4327,7 @@ dependencies = [
[[package]]
name = "kebab-normalize"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-core",
@@ -4342,7 +4342,7 @@ dependencies = [
[[package]]
name = "kebab-parse-code"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"gix",
@@ -4360,7 +4360,7 @@ dependencies = [
[[package]]
name = "kebab-parse-image"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"ab_glyph",
"anyhow",
@@ -4384,7 +4384,7 @@ dependencies = [
[[package]]
name = "kebab-parse-md"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"kebab-core",
@@ -4401,7 +4401,7 @@ dependencies = [
[[package]]
name = "kebab-parse-pdf"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4414,7 +4414,7 @@ dependencies = [
[[package]]
name = "kebab-parse-types"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"kebab-core",
"serde",
@@ -4422,7 +4422,7 @@ dependencies = [
[[package]]
name = "kebab-rag"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4443,7 +4443,7 @@ dependencies = [
[[package]]
name = "kebab-search"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"globset",
@@ -4462,10 +4462,11 @@ dependencies = [
[[package]]
name = "kebab-source-fs"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
"globset",
"ignore",
"kebab-config",
"kebab-core",
@@ -4480,7 +4481,7 @@ dependencies = [
[[package]]
name = "kebab-store-sqlite"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"blake3",
@@ -4501,7 +4502,7 @@ dependencies = [
[[package]]
name = "kebab-store-vector"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"arrow",
@@ -4525,7 +4526,7 @@ dependencies = [
[[package]]
name = "kebab-tui"
version = "0.8.0"
version = "0.8.2"
dependencies = [
"anyhow",
"crossterm",

View File

@@ -31,7 +31,7 @@ edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
version = "0.8.0"
version = "0.8.2"
[workspace.dependencies]
anyhow = "1"

View File

@@ -168,7 +168,9 @@ fn collect_stats(
stale_doc_count: counts.stale_doc_count,
// p10-1A-2: populated by the store query added in this task.
code_lang_breakdown: store.code_lang_breakdown()?,
repo_breakdown: std::collections::BTreeMap::new(),
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
// placeholder — mirror of code_lang_breakdown for the repo field.
repo_breakdown: store.repo_breakdown()?,
})
}

View File

@@ -346,6 +346,34 @@ fn run_query(
}
}
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
if !filters.code_lang.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
));
for lang in &filters.code_lang {
params.push(Box::new(lang.clone()));
}
}
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
if !filters.repo.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.repo.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
));
for repo in &filters.repo {
params.push(Box::new(repo.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
// fb-32 ingest path), so lexicographic >= compare is correct — but only

View File

@@ -785,6 +785,19 @@ impl TestEnv {
body: &str,
media: MediaType,
updated_at: OffsetDateTime,
) -> DocumentId {
self.insert_doc_full_with_metadata(path, body, media, updated_at, "{}")
}
/// Like `insert_doc_full` but accepts an explicit `metadata_json` string
/// so p10-1A-1 filter tests can set `metadata.code_lang` / `metadata.repo`.
fn insert_doc_full_with_metadata(
&self,
path: &str,
body: &str,
media: MediaType,
updated_at: OffsetDateTime,
metadata_json: &str,
) -> DocumentId {
use time::format_description::well_known::Rfc3339;
let doc_id = self.next_id("doc");
@@ -810,10 +823,10 @@ impl TestEnv {
source_type, trust_level, parser_version,
doc_version, schema_version, metadata_json,
provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
'{}', '{\"events\":[]}',
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'pv1', 1, 1,
?, '{\"events\":[]}',
'2024-01-01T00:00:00Z', ?)",
rusqlite::params![doc_id, asset_id, path, updated_at_str],
rusqlite::params![doc_id, asset_id, path, metadata_json, updated_at_str],
)
.expect("insert document");
@@ -834,6 +847,21 @@ impl TestEnv {
DocumentId(doc_id)
}
/// Seed a document whose `metadata_json` carries `code_lang` and, when
/// `repo` is `Some`, a `repo` key as well. Delegates to
/// `insert_doc_full_with_metadata` with the current UTC time as `updated_at`.
///
/// NOTE(review): `code_lang` / `repo` are interpolated into the JSON text
/// without escaping, so values containing `"` or `\` would yield invalid
/// JSON — fine for the fixed literals the tests pass today.
/// NOTE(review): the media argument is `MediaType::Markdown` even though the
/// helper seeds "code" docs — confirm this is intentional.
fn insert_code_doc(&self, path: &str, body: &str, code_lang: &str, repo: Option<&str>) -> DocumentId {
    let metadata_json = repo.map_or_else(
        || format!(r#"{{"code_lang":"{code_lang}"}}"#),
        |r| format!(r#"{{"code_lang":"{code_lang}","repo":"{r}"}}"#),
    );
    let now = OffsetDateTime::now_utc();
    self.insert_doc_full_with_metadata(path, body, MediaType::Markdown, now, &metadata_json)
}
fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
let r = self.inner.retriever();
let q = SearchQuery {
@@ -934,6 +962,52 @@ fn lexical_empty_filters_match_default_behavior() {
assert!(!with_default.is_empty());
}
// ── p10-1A-1 filter tests ────────────────────────────────────────────────
#[test]
fn lexical_filter_by_code_lang() {
    // Corpus: one python code doc, one rust code doc, and one markdown doc
    // (no code_lang); all three share the same body text so only the filter
    // can tell them apart.
    let env = TestEnv::new();
    for (path, lang) in [("src/main.py", "python"), ("src/lib.rs", "rust")] {
        env.insert_code_doc(path, "AsyncClient session", lang, None);
    }
    env.insert_doc("docs/guide.md", "AsyncClient session");

    // code_lang = ["python"] must leave exactly the python doc.
    let python_only = SearchFilters {
        code_lang: vec![String::from("python")],
        ..Default::default()
    };
    let hits = env.run_search("AsyncClient", &python_only);

    assert_eq!(hits.len(), 1, "only python doc should match code_lang filter");
    let matched = &hits[0].doc_path.0;
    assert!(matched.ends_with(".py"), "expected python path, got: {}", matched);
}
#[test]
fn lexical_filter_by_repo() {
    // Corpus: three python docs with identical bodies — one tagged with repo
    // "httpx", one with repo "requests", and one with no repo at all.
    let env = TestEnv::new();
    let corpus = [
        ("httpx/client.py", Some("httpx")),
        ("requests/api.py", Some("requests")),
        ("standalone.py", None),
    ];
    for (path, repo) in corpus {
        env.insert_code_doc(path, "session send request", "python", repo);
    }

    // repo = ["httpx"] must leave exactly the httpx doc.
    let httpx_only = SearchFilters {
        repo: vec![String::from("httpx")],
        ..Default::default()
    };
    let hits = env.run_search("session", &httpx_only);

    assert_eq!(hits.len(), 1, "only httpx doc should match repo filter");
    let matched = &hits[0].doc_path.0;
    assert!(matched.starts_with("httpx/"), "expected httpx path, got: {}", matched);
}
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of

View File

@@ -18,6 +18,7 @@ blake3 = { workspace = true }
tracing = { workspace = true }
walkdir = "2"
ignore = "0.4"
globset = "0.4"
[dev-dependencies]
serde_json = { workspace = true }

View File

@@ -86,7 +86,7 @@ impl FsSourceConnector {
excludes.extend(scope.exclude.iter().cloned());
let kbignore = read_kbignore(&root)?;
let overrides = build_overrides(&root, &excludes, &kbignore)?;
let overrides = build_overrides(&root, &excludes, &kbignore, &scope.include)?;
Ok((root, overrides))
}
@@ -103,8 +103,6 @@ impl FsSourceConnector {
) -> Result<(Vec<RawAsset>, FsScanSkips)> {
let (root, overrides) = self.resolve_scan_params(scope)?;
log_scope_include_warning(scope);
let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?;
// Accumulate per-category skip counts and sample paths.
@@ -284,14 +282,6 @@ fn build_assets(
Ok(assets)
}
fn log_scope_include_warning(scope: &SourceScope) {
if !scope.include.is_empty() {
tracing::debug!(
count = scope.include.len(),
"FsSourceConnector ignores scope.include — handled by extractor router"
);
}
}
impl SourceConnector for FsSourceConnector {
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {

View File

@@ -44,6 +44,7 @@ use std::collections::HashSet;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use globset::{GlobBuilder, GlobSet, GlobSetBuilder};
use ignore::overrides::{Override, OverrideBuilder};
use walkdir::WalkDir;
@@ -69,6 +70,11 @@ const DEFAULT_EXCLUDES: &[&str] = &[
///
/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude`
/// — these do NOT map to any of the three named `IngestReport` counters.
///
/// `include` is the compiled `scope.include` allow-list. When the set is
/// empty (no patterns) every file passes; when non-empty a file must match
/// at least one pattern to be accepted (directories always pass, so the
/// walker can still descend into them).
pub(crate) struct WalkOverrides {
/// Merged matcher — same as today's `Override`; used for the walk decision.
pub combined: Override,
@@ -78,6 +84,8 @@ pub(crate) struct WalkOverrides {
pub kebabignore: Override,
/// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only.
pub builtin: Override,
/// Compiled allow-list from `scope.include`. Empty set = pass all.
pub include: GlobSet,
}
/// Skip attribution category. Used by the connector when counting per-source
@@ -161,10 +169,15 @@ fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result<Overri
/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are
/// built in addition to the combined one so the connector can attribute skips
/// to the correct `IngestReport` counter without a second walker pass.
///
/// `include_patterns` (from `scope.include`) are compiled into an allow-list
/// `GlobSet`. Empty slice → pass-all (backward-compat); non-empty → file
/// must match at least one pattern to be accepted.
pub(crate) fn build_overrides(
root: &Path,
config_exclude: &[String],
kbignore_patterns: &[String],
include_patterns: &[String],
) -> Result<WalkOverrides> {
let gitignore_patterns = read_gitignore(root)?;
@@ -209,14 +222,41 @@ pub(crate) fn build_overrides(
.build()
.context("failed to compile combined override set")?;
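// Allow-list GlobSet: an empty `include_patterns` slice compiles to an empty
// set, which the walker treats as "no constraint" (every file passes); a
// non-empty set means a file must match at least one glob to be accepted.
// `build_include_globset` does not enable case-insensitive matching on its
// `GlobBuilder`, which keeps the semantics consistent with the
// OverrideBuilder exclude patterns above.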
let include = build_include_globset(include_patterns)?;
Ok(WalkOverrides {
combined,
gitignore,
kebabignore,
builtin,
include,
})
}
/// Compile `scope.include` patterns into a `GlobSet` allow-list.
///
/// Each pattern uses `GlobBuilder` with `literal_separator = true` so that
/// `**` can cross directory boundaries while `*` stops at `/`, matching the
/// gitignore convention used throughout the rest of the walker.
///
/// An empty slice produces an empty `GlobSet` — callers interpret that as
/// "pass all files" (no allow-list constraint).
fn build_include_globset(patterns: &[String]) -> Result<GlobSet> {
let mut builder = GlobSetBuilder::new();
for pat in patterns {
let glob = GlobBuilder::new(pat)
.literal_separator(true)
.build()
.with_context(|| format!("invalid include pattern: {pat}"))?;
builder.add(glob);
}
builder.build().context("failed to compile include globset")
}
/// Classify why a path was excluded, using per-source matchers in spec §5.2
/// priority order: built-in > gitignore > kebabignore > other.
///
@@ -391,6 +431,13 @@ pub(crate) fn walk_files_with_skips(
}
if entry.file_type().is_file() {
// Apply include allow-list: if non-empty, the file's path
// relative to root must match at least one pattern.
if !overrides.include.is_empty() && !overrides.include.is_match(rel) {
// Not in the allow-list — silently drop (no skip counter;
// the include filter is not a "skip" source in IngestReport).
continue;
}
accepted.push(path.to_path_buf());
}
}
@@ -406,7 +453,7 @@ mod tests {
#[test]
fn empty_inputs_compile_into_an_override() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap();
// Default-excludes only; non-special files should not match.
let m = ov.combined.matched(Path::new("notes/alpha.md"), false);
assert!(!m.is_ignore());
@@ -415,7 +462,7 @@ mod tests {
#[test]
fn default_excludes_ds_store_and_resource_forks() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap();
assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore());
assert!(
ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore()
@@ -433,6 +480,7 @@ mod tests {
dir.path(),
&["*.tmp".to_string(), "node_modules/**".to_string()],
&[],
&[],
)
.unwrap();
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
@@ -452,6 +500,7 @@ mod tests {
dir.path(),
&["*.tmp".to_string()],
&["secret/**".to_string()],
&[],
)
.unwrap();
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
@@ -491,7 +540,7 @@ mod tests {
fs::write(root.join("src/main.rs"), "x").unwrap();
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
let overrides = build_overrides(root, &[], &[]).unwrap();
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
// Override::matched expects paths relative to the builder's root.
let m_in = overrides.combined.matched(Path::new("src/main.rs"), false);
let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false);
@@ -514,7 +563,7 @@ mod tests {
fs::create_dir_all(root.join("ok")).unwrap();
fs::write(root.join("ok/z.txt"), "z").unwrap();
let overrides = build_overrides(root, &[], &[]).unwrap();
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
// Override::matched expects paths relative to the builder's root.
for blacklisted in [
"target/x/y.txt",
@@ -544,7 +593,7 @@ mod tests {
fs::create_dir_all(root.join("dist")).unwrap();
fs::write(root.join("dist/bundle.js"), "x").unwrap();
let overrides = build_overrides(root, &[], &[]).unwrap();
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore());
assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore());
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
@@ -562,7 +611,7 @@ mod tests {
fs::write(root.join("src/main.rs"), "x").unwrap();
// No .gitignore present — patterns from .gitignore should not affect overrides.
let overrides = build_overrides(root, &[], &[]).unwrap();
let overrides = build_overrides(root, &[], &[], &[]).unwrap();
assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore());
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
}
@@ -577,7 +626,7 @@ mod tests {
// semantics, but at minimum it must not produce double-`!` corruption.
fs::write(root.join(".gitignore"), "!keep/\n").unwrap();
// Just verify build_overrides doesn't error.
let result = build_overrides(root, &[], &[]);
let result = build_overrides(root, &[], &[], &[]);
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
}
@@ -594,7 +643,7 @@ mod tests {
// .gitignore entry. Builtin must win (priority order §5.2).
fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let ov = build_overrides(root, &[], &[], &[]).unwrap();
// node_modules/ dir itself
let cat = classify_skip(Path::new("node_modules"), true, &ov);
assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority");
@@ -609,7 +658,7 @@ mod tests {
let root = tmp.path();
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let ov = build_overrides(root, &[], &[], &[]).unwrap();
let cat = classify_skip(Path::new("app.log"), false, &ov);
assert_eq!(cat, SkipCategory::Gitignore);
}
@@ -621,7 +670,7 @@ mod tests {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap();
let ov = build_overrides(root, &[], &["*.secret".to_string()], &[]).unwrap();
let cat = classify_skip(Path::new("creds.secret"), false, &ov);
assert_eq!(cat, SkipCategory::Kebabignore);
}
@@ -637,7 +686,7 @@ mod tests {
fs::write(root.join("ok.md"), "# ok").unwrap();
fs::write(root.join("skipme.log"), "x").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let ov = build_overrides(root, &[], &[], &[]).unwrap();
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
let accepted_names: Vec<_> = accepted
@@ -677,7 +726,7 @@ mod tests {
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
fs::write(root.join("ok.md"), "# ok").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let ov = build_overrides(root, &[], &[], &[]).unwrap();
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
let accepted_names: Vec<_> = accepted

View File

@@ -0,0 +1,111 @@
//! Integration test: `scope.include` enforces an allow-list.
//!
//! Semantics (gitignore convention):
//! - `include` is empty Vec → all files pass through (backward-compat).
//! - `include` is non-empty → only files matching at least one pattern
//! are accepted. `exclude` rules still apply after include.
//!
//! Layout (built per-test in a TempDir):
//! root/
//! ├── a.md
//! ├── b.py
//! ├── c.png
//! └── d.pdf
use std::fs;
use kebab_config::Config;
use kebab_core::{SourceConnector, SourceScope};
use kebab_source_fs::FsSourceConnector;
/// Build a test `Config` from `Config::defaults()` rooted at `root`, with the
/// config-level exclude list emptied and the code-ingest caps neutralised.
fn cfg_with_root(root: &str) -> Config {
    let mut cfg = Config::defaults();

    // Disable size / generated caps so small test files always pass.
    let code = &mut cfg.ingest.code;
    code.max_file_bytes = u64::MAX;
    code.max_file_lines = u32::MAX;
    code.skip_generated_header = false;

    cfg.workspace.exclude.clear();
    cfg.workspace.root = root.to_owned();
    cfg
}
/// Create a temp dir holding one file per kind used by the include tests:
/// `a.md`, `b.py`, `c.png` and `d.pdf`.
fn setup_mixed_dir() -> tempfile::TempDir {
    let dir = tempfile::tempdir().unwrap();
    let fixtures: [(&str, &[u8]); 4] = [
        ("a.md", &b"md"[..]),
        ("b.py", &b"py"[..]),
        ("c.png", &b"\x89PNG"[..]),
        ("d.pdf", &b"%PDF"[..]),
    ];
    for (name, bytes) in fixtures {
        fs::write(dir.path().join(name), bytes).unwrap();
    }
    dir
}
/// With no include patterns the scan must return all four fixture files
/// (backward-compatible behaviour).
#[test]
fn include_empty_accepts_all_files() {
    let dir = setup_mixed_dir();
    let root = dir.path().to_str().unwrap();
    let conn = FsSourceConnector::new(&cfg_with_root(root)).unwrap();

    let scope = SourceScope {
        include: Vec::new(),
        ..SourceScope::default()
    };
    let names: Vec<String> = conn
        .scan(&scope)
        .unwrap()
        .iter()
        .map(|asset| asset.workspace_path.0.clone())
        .collect();

    for expected in ["a.md", "b.py", "c.png", "d.pdf"] {
        assert!(
            names.iter().any(|n| n.as_str() == expected),
            "{expected} missing; got: {names:?}"
        );
    }
    assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}");
}
/// With `**/*.md` and `**/*.py` as include patterns only the markdown and
/// python fixtures may come back; the png and pdf must be dropped.
#[test]
fn include_nonempty_is_allowlist() {
    let dir = setup_mixed_dir();
    let root = dir.path().to_str().unwrap();
    let conn = FsSourceConnector::new(&cfg_with_root(root)).unwrap();

    let scope = SourceScope {
        include: ["**/*.md", "**/*.py"].map(String::from).to_vec(),
        ..SourceScope::default()
    };
    let names: Vec<String> = conn
        .scan(&scope)
        .unwrap()
        .iter()
        .map(|asset| asset.workspace_path.0.clone())
        .collect();

    for allowed in ["a.md", "b.py"] {
        assert!(
            names.iter().any(|n| n.as_str() == allowed),
            "{allowed} should be accepted; got: {names:?}"
        );
    }
    for rejected in ["c.png", "d.pdf"] {
        assert!(
            !names.iter().any(|n| n.as_str() == rejected),
            "{rejected} must be rejected by include allowlist; got: {names:?}"
        );
    }
    assert_eq!(names.len(), 2, "expected exactly 2 files; got: {names:?}");
}
/// include + exclude are ANDed: a file matching include but also matching
/// exclude must be rejected.
///
/// Fixture: `keep.md`, `drop.md`, `other.py` in a fresh temp dir, scanned
/// with include `**/*.md` and exclude `drop.md`. Only `keep.md` may survive;
/// the final count assertion pins that nothing else slips through, matching
/// the exact-count checks in the sibling include tests.
#[test]
fn include_and_exclude_are_anded() {
    let dir = tempfile::tempdir().unwrap();
    let root = dir.path();
    fs::write(root.join("keep.md"), b"keep").unwrap();
    fs::write(root.join("drop.md"), b"drop").unwrap();
    fs::write(root.join("other.py"), b"py").unwrap();

    let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap();
    let scope = SourceScope {
        include: vec!["**/*.md".to_string()],
        exclude: vec!["drop.md".to_string()],
        ..SourceScope::default()
    };
    let assets = conn.scan(&scope).unwrap();
    let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();

    assert!(names.contains(&"keep.md".to_string()), "keep.md should be accepted; got: {names:?}");
    assert!(
        !names.contains(&"drop.md".to_string()),
        "drop.md should be excluded (matched exclude); got: {names:?}"
    );
    assert!(
        !names.contains(&"other.py".to_string()),
        "other.py should be excluded (not in include); got: {names:?}"
    );
    // Guard against any unexpected extra file being accepted.
    assert_eq!(names.len(), 1, "expected exactly 1 file; got: {names:?}");
}

View File

@@ -153,6 +153,34 @@ impl SqliteStore {
}
}
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
if !filters.code_lang.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
));
for lang in &filters.code_lang {
bind.push(Box::new(lang.clone()));
}
}
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
if !filters.repo.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.repo.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
));
for repo in &filters.repo {
bind.push(Box::new(repo.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
// lexicographic >= compare is correct — but only when the filter
@@ -408,6 +436,78 @@ mod tests {
.unwrap();
}
/// Variant of `seed_committed_full` that additionally accepts a
/// `metadata_json` string so p10-1A-1 filter tests can set
/// `metadata.code_lang` / `metadata.repo` without going through the
/// full ingest pipeline.
///
/// Inserts one `assets` row, one `documents` row (with `source_type`
/// hard-coded to `'code'` and the caller's `metadata_json`), one `chunks`
/// row with the fixed text `'code snippet'`, then writes an embedding record
/// for the chunk as pending and immediately marks it committed.
///
/// # Panics
/// Panics if `doc_id` or `chunk_id` is shorter than 31 bytes (both are
/// sliced with `[..31]` to derive the asset / embedding ids), or if any of
/// the inserts / store calls fails (all are `unwrap`ped).
// NOTE(review): this helper takes six parameters; the `too_many_arguments`
// allow below may be unnecessary — confirm against the project's clippy config.
#[allow(clippy::too_many_arguments)]
fn seed_committed_with_metadata(
store: &SqliteStore,
chunk_id: &str,
doc_id: &str,
workspace_path: &str,
media_type_json: &str,
metadata_json: &str,
) {
// Derive the asset id from the doc id: "a" + first 31 bytes of `doc_id`.
let asset_id = format!("a{}", &doc_id[..31]);
// Inner scope so the connection guard is dropped before the store-level
// embedding calls below run.
{
let conn = store.lock_conn();
// Asset row: `workspace_path` is reused as the storage path, and the
// source URI is `file://` + `workspace_path`.
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef',
'reference', ?, '1970-01-01T00:00:00Z')",
params![
asset_id,
format!("file://{workspace_path}"),
workspace_path,
media_type_json,
workspace_path,
],
)
.unwrap();
// Document row: the only caller-controlled payload is `metadata_json`,
// which is what the code_lang / repo filters read.
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, parser_version, doc_version, schema_version,
metadata_json, provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'v1', 1, 1,
?, '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
params![doc_id, asset_id, workspace_path, metadata_json],
)
.unwrap();
// Single chunk attached to the document above.
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'code snippet', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
'1970-01-01T00:00:00Z')",
params![chunk_id, doc_id],
)
.unwrap();
}
// Embedding record for the chunk; id is "e" + first 31 bytes of `chunk_id`.
let embed_row = EmbeddingRecordRow {
embedding_id: format!("e{}", &chunk_id[..31]),
chunk_id: chunk_id.to_string(),
model_id: "m".to_string(),
model_version: "v1".to_string(),
dimensions: 4,
lance_table: "t".to_string(),
created_at: OffsetDateTime::UNIX_EPOCH,
};
// Two-step write: register as pending, then flip to committed.
store
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
.unwrap();
store
.mark_embedding_records_committed(std::slice::from_ref(
&embed_row.embedding_id,
))
.unwrap();
}
fn cid(s: &str) -> ChunkId {
ChunkId(s.to_string())
}
@@ -671,6 +771,78 @@ mod tests {
assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
}
// ── p10-1A-1 new filter arms ─────────────────────────────────────────
#[test]
fn filter_chunks_code_lang_keeps_matching_lang() {
    // Three committed chunks: c1 is python, c2 is rust, c3 is markdown with
    // no code_lang in its metadata. code_lang = ["python"] must keep only c1.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let (c1, c2, c3) = (
        "11111111111111111111111111111111",
        "22222222222222222222222222222222",
        "33333333333333333333333333333333",
    );
    // (chunk id, doc id, workspace path, media_type JSON, metadata_json)
    let fixtures = [
        (c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "src/main.py", r#""code""#, r#"{"code_lang":"python"}"#),
        (c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "src/lib.rs", r#""code""#, r#"{"code_lang":"rust"}"#),
        (c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "README.md", r#""markdown""#, r#"{}"#),
    ];
    for (chunk, doc, path, media, metadata) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, metadata);
    }

    let python_only = SearchFilters {
        code_lang: vec![String::from("python")],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &python_only).unwrap();

    assert_eq!(out, vec![cid(c1)], "only python chunk should survive code_lang filter");
}
#[test]
fn filter_chunks_repo_keeps_matching_repo() {
    // Three committed python chunks: c1 belongs to repo "httpx", c2 to repo
    // "requests", c3 has no repo key. repo = ["httpx"] must keep only c1.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let (c1, c2, c3) = (
        "11111111111111111111111111111111",
        "22222222222222222222222222222222",
        "33333333333333333333333333333333",
    );
    // (chunk id, doc id, workspace path, media_type JSON, metadata_json)
    let fixtures = [
        (c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "httpx/client.py", r#""code""#, r#"{"repo":"httpx","code_lang":"python"}"#),
        (c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "requests/api.py", r#""code""#, r#"{"repo":"requests","code_lang":"python"}"#),
        (c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "standalone.py", r#""code""#, r#"{"code_lang":"python"}"#),
    ];
    for (chunk, doc, path, media, metadata) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, metadata);
    }

    let httpx_only = SearchFilters {
        repo: vec![String::from("httpx")],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &httpx_only).unwrap();

    assert_eq!(out, vec![cid(c1)], "only httpx chunk should survive repo filter");
}
#[test]
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
// Regression test for the non-UTC offset lex-compare bug.

View File

@@ -701,6 +701,39 @@ impl SqliteStore {
}
Ok(out)
}
/// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for
/// `schema.v1`.
///
/// Extracts `$.repo` from each document's `metadata_json`, groups by that
/// value, and skips rows where the extracted value is NULL (no `repo` key,
/// or an explicit JSON `null`).
///
/// Returns a `BTreeMap<String, u32>` keyed by the repo value as stored in
/// `metadata_json`; the value is the number of documents carrying it.
///
/// # Errors
/// Fails if the statement cannot be prepared or run, or if a row cannot be
/// read — including when the extracted `repo` is not readable as a `String`.
pub fn repo_breakdown(
    &self,
) -> anyhow::Result<std::collections::BTreeMap<String, u32>> {
    use anyhow::Context;

    let conn = self.read_conn();
    let mut stmt = conn
        .prepare(
            "SELECT json_extract(metadata_json, '$.repo') AS rp, COUNT(*) \
             FROM documents \
             WHERE rp IS NOT NULL \
             GROUP BY rp",
        )
        .context("prepare repo_breakdown")?;
    let rows = stmt
        .query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)))
        .context("query repo_breakdown")?;

    let mut out = std::collections::BTreeMap::new();
    for row in rows {
        let (repo, count) = row.context("read repo_breakdown row")?;
        // COUNT(*) is never negative, so the conversion can only fail when
        // the count exceeds u32::MAX; saturate instead of silently wrapping
        // the way a bare `as u32` cast would.
        out.insert(repo, u32::try_from(count).unwrap_or(u32::MAX));
    }
    Ok(out)
}
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.
@@ -817,5 +850,79 @@ mod tests {
// only one key total
assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
}
/// p10-1A-2 follow-up: `repo_breakdown` counts docs by
/// `metadata_json.repo`.
///
/// Inserts:
/// - one doc with `repo = "my-repo"` → must appear with count 1
/// - one doc with `repo` explicitly set to JSON `null` → must NOT appear
///   (NULL skipped)
///
/// Uses a side rusqlite connection with `PRAGMA foreign_keys = OFF` so the
/// `documents` rows can be inserted without matching `assets` rows, keeping
/// the test self-contained.
#[test]
fn repo_breakdown_counts_by_repo() {
let (dir, store) = open_fresh_store();
// Open a second connection straight onto the store's database file.
let db_path = dir.path().join("kebab.sqlite");
let conn = rusqlite::Connection::open(&db_path).unwrap();
conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
// Doc 1: metadata has repo = "my-repo".
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path,
source_type, trust_level, parser_version,
doc_version, schema_version,
metadata_json, provenance_json,
created_at, updated_at
) VALUES (
'doc-repo-1', 'asset-r1', 'my-repo/README.md',
'markdown', 'primary', 'test-v1',
1, 1,
'{\"repo\":\"my-repo\"}', '{}',
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
)",
[],
)
.unwrap();
// Doc 2: the `repo` key is present but holds an explicit JSON null.
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path,
source_type, trust_level, parser_version,
doc_version, schema_version,
metadata_json, provenance_json,
created_at, updated_at
) VALUES (
'doc-norepo-1', 'asset-r2', 'standalone/notes.md',
'markdown', 'primary', 'test-v1',
1, 1,
'{\"repo\":null}', '{}',
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z'
)",
[],
)
.unwrap();
drop(conn); // release side connection before querying via store
let bd = store.repo_breakdown().unwrap();
// "my-repo" must appear with count 1
assert_eq!(
bd.get("my-repo"),
Some(&1u32),
"expected my-repo=1 in repo_breakdown, got: {bd:?}"
);
// The null repo must not surface under the literal key "null".
assert!(
!bd.contains_key("null"),
"null repo must not appear in breakdown, got: {bd:?}"
);
// Exactly one key overall, so the null-repo doc produced no entry at all.
assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}");
}
}