Files
kebab/crates/kebab-source-fs/tests/snapshot_tree1.rs
altair823 58ac62d53a feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
2026-06-21 08:35:19 +00:00

139 lines
5.0 KiB
Rust

//! Snapshot + determinism tests against `fixtures/source-fs/tree-1`.
//!
//! Layout (committed under `<repo>/fixtures/source-fs/tree-1/`):
//!
//! ```
//! tree-1/
//! ├── README.md
//! ├── notes/
//! │ ├── alpha.md
//! │ └── beta.md
//! ├── ignored/
//! │ └── skip.tmp # excluded by .kebabignore
//! ├── .kebabignore # contains: *.tmp
//! └── .DS_Store # implicitly excluded
//! ```
//!
//! Two assertions:
//! 1. Snapshot stability — `scan` output (with the volatile fields
//!    stripped) equals the committed baseline JSON when both are compared
//!    as parsed `serde_json::Value`s.
//! 2. Determinism — running `scan` twice produces byte-identical JSON
//!    after the same stripping.
//!
//! `discovered_at` is wall-clock and intentionally NOT part of the
//! contract: the task spec says strip it before comparison. The absolute
//! paths in `source_uri.value` and `stored.path` are stripped as well so
//! the baseline stays portable across checkout locations.
use std::path::PathBuf;
use kebab_config::Config;
use kebab_core::{SourceConnector, SourceScope};
use kebab_source_fs::FsSourceConnector;
use serde_json::Value;
/// Repository root, found by walking two levels up from
/// `CARGO_MANIFEST_DIR` (this crate lives at `crates/kebab-source-fs`).
///
/// Panics if the manifest directory has fewer than two ancestors.
fn repo_root() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the
    // grandparent: <crate dir> -> `crates/` -> repo root.
    manifest_dir.ancestors().nth(2).unwrap().to_path_buf()
}
/// Directory of the committed `tree-1` fixture, resolved under the repo root.
fn fixture_root() -> PathBuf {
    let mut dir = repo_root();
    dir.push("fixtures/source-fs/tree-1");
    dir
}
/// Location of the committed snapshot baseline JSON, resolved under the repo root.
fn baseline_path() -> PathBuf {
    let mut file = repo_root();
    file.push("fixtures/source-fs/tree-1.snapshot.json");
    file
}
/// Build a default [`Config`] whose workspace root is `root` and whose
/// workspace exclude list has been emptied.
fn cfg_for_fixture(root: &str) -> Config {
    let mut config = Config::defaults();
    // Drop the default excludes (`.git/**`, `node_modules/**`,
    // `.obsidian/**`) so the snapshot depends only on the fixture, its
    // .kebabignore, and the baked-in default-excludes.
    config.workspace.exclude.clear();
    config.workspace.root = Some(root.to_owned());
    config
}
/// Scan the fixture tree and return the serialized assets as JSON with the
/// machine-dependent fields overwritten by the literal string "<stripped>".
///
/// Stripped fields, per top-level array element that is an object:
/// - `discovered_at` — always set to the placeholder (wall-clock value).
/// - `source_uri.value` and `stored.path` — replaced only when present;
///   they hold absolute paths that vary by checkout location, and the
///   snapshot must be portable across machines and CI checkout dirs.
///
/// Panics if the connector cannot be created, the scan fails, or the
/// assets cannot be serialized.
fn scan_and_strip() -> Value {
    const STRIPPED: &str = "<stripped>";
    // (parent object, field inside it) pairs that carry absolute paths:
    //   source_uri = { kind: "file", value: "<abs>" }
    //   stored     = { kind: "copied"|"reference", path: "<abs>", ... }
    const NESTED_PATHS: [(&str, &str); 2] = [("source_uri", "value"), ("stored", "path")];

    let fixture = fixture_root();
    let config = cfg_for_fixture(fixture.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");
    let assets = connector
        .scan(&SourceScope::default())
        .expect("scan must succeed against committed fixture");
    let mut snapshot = serde_json::to_value(&assets).expect("serialize");

    // Visit every object element of the top-level array; anything else
    // (a non-array root, non-object elements) is left untouched.
    let records = snapshot
        .as_array_mut()
        .into_iter()
        .flatten()
        .filter_map(Value::as_object_mut);
    for record in records {
        record.insert(
            "discovered_at".to_string(),
            Value::String(STRIPPED.to_string()),
        );
        for (parent, field) in NESTED_PATHS {
            // Only overwrite a field that already exists inside an object
            // parent; never create the key.
            let slot = record
                .get_mut(parent)
                .and_then(Value::as_object_mut)
                .and_then(|inner| inner.get_mut(field));
            if let Some(slot) = slot {
                *slot = Value::String(STRIPPED.to_string());
            }
        }
    }
    snapshot
}
/// Compares the stripped scan of the fixture against the committed baseline.
///
/// When `KEBAB_REGEN_SNAPSHOT` is set, the baseline file is (re)written
/// from the current scan and the test then panics, asking for a rerun
/// without the variable to verify.
///
/// # Panics
/// - after regenerating the baseline (see above),
/// - when the baseline cannot be read or is not valid JSON,
/// - when the scan output differs from the baseline (both sides are
///   pretty-printed into the panic message).
#[test]
fn tree_1_snapshot_matches_baseline() {
    let actual = scan_and_strip();
    let baseline = baseline_path();
    // If KEBAB_REGEN_SNAPSHOT is set, (re)write the baseline and exit
    // *before* attempting to read it. This is the only path that may
    // create the file from scratch.
    if std::env::var_os("KEBAB_REGEN_SNAPSHOT").is_some() {
        let pretty = serde_json::to_string_pretty(&actual).unwrap() + "\n";
        std::fs::write(&baseline, pretty).expect("write baseline");
        panic!("regenerated baseline; rerun without KEBAB_REGEN_SNAPSHOT to verify");
    }
    // Report the underlying I/O error too: the file may exist but be
    // unreadable, in which case "missing" alone would be misleading.
    let baseline_text = std::fs::read_to_string(&baseline).unwrap_or_else(|err| {
        panic!(
            "cannot read baseline at {}: {err} — regenerate via `KEBAB_REGEN_SNAPSHOT=1 cargo test \
             -p kebab-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`",
            baseline.display()
        )
    });
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline JSON must parse");
    // Compared as parsed values, so whitespace/formatting of the committed
    // file does not matter.
    if actual != expected {
        let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
        let expected_pretty = serde_json::to_string_pretty(&expected).unwrap();
        panic!(
            "snapshot drift.\n--- expected ---\n{expected_pretty}\n--- actual ---\n{actual_pretty}\n"
        );
    }
}
/// Two back-to-back stripped scans of the fixture must serialize to the
/// exact same JSON string.
#[test]
fn tree_1_scan_is_deterministic() {
    // One full scan + strip + compact serialization per call.
    let render = || serde_json::to_string(&scan_and_strip()).unwrap();
    let first = render();
    let second = render();
    assert_eq!(first, second, "two consecutive scans diverged");
}