Files
kebab/crates/kebab-source-fs/tests/include_allowlist.rs
altair823 58ac62d53a feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
2026-06-21 08:35:19 +00:00

133 lines
4.4 KiB
Rust

//! Integration test: `scope.include` enforces an allow-list.
//!
//! Semantics (gitignore convention):
//! - `include` is empty Vec → all files pass through (backward-compat).
//! - `include` is non-empty → only files matching at least one pattern
//! are accepted. `exclude` rules still apply after include.
//!
//! Layout (built per-test in a TempDir):
//! root/
//! ├── a.md
//! ├── b.py
//! ├── c.png
//! └── d.pdf
use std::fs;
use kebab_config::Config;
use kebab_core::{SourceConnector, SourceScope};
use kebab_source_fs::FsSourceConnector;
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = Some(root.to_string());
c.workspace.exclude.clear();
// Disable size / generated caps so small test files always pass.
c.ingest.code.max_file_bytes = u64::MAX;
c.ingest.code.max_file_lines = u32::MAX;
c.ingest.code.skip_generated_header = false;
c
}
fn setup_mixed_dir() -> tempfile::TempDir {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
fs::write(root.join("a.md"), b"md").unwrap();
fs::write(root.join("b.py"), b"py").unwrap();
fs::write(root.join("c.png"), b"\x89PNG").unwrap();
fs::write(root.join("d.pdf"), b"%PDF").unwrap();
dir
}
/// Empty include → all 4 files pass (backward-compat).
#[test]
fn include_empty_accepts_all_files() {
let dir = setup_mixed_dir();
let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap();
let scope = SourceScope {
include: vec![],
..SourceScope::default()
};
let assets = conn.scan(&scope).unwrap();
let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();
assert!(
names.contains(&"a.md".to_string()),
"a.md missing; got: {names:?}"
);
assert!(
names.contains(&"b.py".to_string()),
"b.py missing; got: {names:?}"
);
assert!(
names.contains(&"c.png".to_string()),
"c.png missing; got: {names:?}"
);
assert!(
names.contains(&"d.pdf".to_string()),
"d.pdf missing; got: {names:?}"
);
assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}");
}
/// Non-empty include → only md + py come back; png + pdf are excluded.
#[test]
fn include_nonempty_is_allowlist() {
let dir = setup_mixed_dir();
let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap();
let scope = SourceScope {
include: vec!["**/*.md".to_string(), "**/*.py".to_string()],
..SourceScope::default()
};
let assets = conn.scan(&scope).unwrap();
let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();
assert!(
names.contains(&"a.md".to_string()),
"a.md should be accepted; got: {names:?}"
);
assert!(
names.contains(&"b.py".to_string()),
"b.py should be accepted; got: {names:?}"
);
assert!(
!names.contains(&"c.png".to_string()),
"c.png must be rejected by include allowlist; got: {names:?}"
);
assert!(
!names.contains(&"d.pdf".to_string()),
"d.pdf must be rejected by include allowlist; got: {names:?}"
);
assert_eq!(names.len(), 2, "expected exactly 2 files; got: {names:?}");
}
/// include + exclude are ANDed: a file matching include but also matching
/// exclude must be rejected.
#[test]
fn include_and_exclude_are_anded() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
fs::write(root.join("keep.md"), b"keep").unwrap();
fs::write(root.join("drop.md"), b"drop").unwrap();
fs::write(root.join("other.py"), b"py").unwrap();
let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap();
let scope = SourceScope {
include: vec!["**/*.md".to_string()],
exclude: vec!["drop.md".to_string()],
..SourceScope::default()
};
let assets = conn.scan(&scope).unwrap();
let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect();
assert!(
names.contains(&"keep.md".to_string()),
"keep.md should be accepted; got: {names:?}"
);
assert!(
!names.contains(&"drop.md".to_string()),
"drop.md should be excluded (matched exclude); got: {names:?}"
);
assert!(
!names.contains(&"other.py".to_string()),
"other.py should be excluded (not in include); got: {names:?}"
);
}