Files
kebab/crates/kebab-source-fs/tests/symlink_cycle.rs
altair823 58ac62d53a feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
2026-06-21 08:35:19 +00:00

169 lines
6.3 KiB
Rust

//! Integration test: a `notes/` symlink whose target points back at the
//! workspace root MUST NOT cause `scan` to loop forever or panic.
//!
//! Layout (built per-test in a tempdir):
//!
//! ```text
//! root/
//! ├── alpha.md
//! └── notes/   (symlink → root)   ← cycle: root → notes → root → …
//! ```
//!
//! Expected: `scan` returns in O(seconds), every emitted path is unique,
//! and `alpha.md` appears at least once.
//!
//! The cycle guard lives in `walker::walk_files_with_skips`; this test exists to
//! prove it catches the realistic shape (cycle through one or more
//! symlinks) end-to-end via the public API.
#![cfg(unix)]
use std::os::unix::fs::symlink;
use kebab_config::Config;
use kebab_core::{SourceConnector, SourceScope};
use kebab_source_fs::FsSourceConnector;
/// Builds the `Config` used by every test in this file.
///
/// Starts from `Config::defaults()`, points `workspace.root` at `root`, and
/// empties `workspace.exclude` (presumably so no default exclusion pattern
/// can filter out the fixture files — confirm against `Config::defaults`).
fn cfg_with_root(root: &str) -> Config {
    let mut config = Config::defaults();
    config.workspace.exclude.clear();
    config.workspace.root = Some(String::from(root));
    config
}
/// A symlink that points back at the workspace root must not make `scan`
/// hang or fail, must not yield any workspace path twice, and must still
/// surface the one real file (`alpha.md`).
#[test]
fn symlink_cycle_does_not_loop_or_crash() {
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();
    std::fs::write(root.join("alpha.md"), b"alpha").unwrap();

    // `root/notes` links to `root` itself, so following the link leads
    // straight back into the directory that is being walked.
    symlink(root, root.join("notes")).unwrap();

    let config = cfg_with_root(root.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");
    let assets = connector
        .scan(&SourceScope::default())
        .expect("scan must return, not loop");

    // Every emitted workspace path must be unique.
    let mut unique_paths = std::collections::HashSet::new();
    assets.iter().for_each(|asset| {
        let newly_inserted = unique_paths.insert(asset.workspace_path.0.clone());
        assert!(
            newly_inserted,
            "duplicate workspace_path: {}",
            asset.workspace_path.0
        );
    });

    // The real file at the workspace root must be part of the output.
    let alpha_present = assets
        .iter()
        .any(|asset| asset.workspace_path.0 == "alpha.md");
    assert!(
        alpha_present,
        "expected alpha.md in scan output, got: {:?}",
        assets
            .iter()
            .map(|asset| &asset.workspace_path.0)
            .collect::<Vec<_>>()
    );
}
/// Two symlinks that only point at each other must not make `scan` crash or
/// loop, and the real file next to them must still be reported.
#[test]
fn dangling_symlink_pseudo_cycle_does_not_crash() {
    // root/
    // ├── alpha.md
    // ├── a → b   (b does not exist as a real file/dir)
    // └── b → a   (a does not exist as a real file/dir)
    //
    // Both symlinks are dangling — neither resolves to anything. This is
    // NOT a real two-step directory cycle (see
    // `two_step_directory_cycle_visited_set_breaks_loop` for that case);
    // it merely verifies the scan tolerates broken-link pseudo-cycles
    // without crashing or looping.
    let dir = tempfile::tempdir().unwrap();
    let root = dir.path();
    std::fs::write(root.join("alpha.md"), b"alpha").unwrap();
    symlink(root.join("b"), root.join("a")).unwrap();
    symlink(root.join("a"), root.join("b")).unwrap();

    let conn =
        FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).expect("connector init");

    // Even though a→b→a never resolves to a real directory (broken
    // pseudo-cycle of dangling symlinks), the scan must complete and
    // surface alpha.md.
    let v = conn
        .scan(&SourceScope::default())
        .expect("scan must return");

    // Print the scanned paths on failure, matching the other tests in this
    // file, so a regression shows what `scan` actually returned.
    assert!(
        v.iter().any(|a| a.workspace_path.0 == "alpha.md"),
        "expected alpha.md in scan output, got: {:?}",
        v.iter().map(|a| &a.workspace_path.0).collect::<Vec<_>>()
    );
}
/// A genuine two-directory cycle built from symlinks must produce a finite,
/// repeatable, duplicate-free scan that still contains both real files.
#[test]
fn two_step_directory_cycle_visited_set_breaks_loop() {
    // Fixture:
    //   root/
    //   ├── a/
    //   │   ├── inside_a.md
    //   │   └── loop → ../b   (symlink, target IS a real directory)
    //   └── b/
    //       ├── inside_b.md
    //       └── loop → ../a   (symlink, target IS a real directory)
    //
    // Following the links naively gives a → a/loop (= b) → a/loop/loop (= a)
    // → … without end. The canonical-path visited-set in
    // `walker::walk_files_with_skips` is what has to stop that descent.
    let workspace = tempfile::tempdir().unwrap();
    let root = workspace.path();

    for dir_name in ["a", "b"] {
        std::fs::create_dir(root.join(dir_name)).unwrap();
    }
    for (rel_path, content) in [
        ("a/inside_a.md", "a-content"),
        ("b/inside_b.md", "b-content"),
    ] {
        std::fs::write(root.join(rel_path), content).unwrap();
    }
    // Relative targets: each link resolves to the sibling directory no
    // matter where the tempdir was created.
    for (target, link) in [("../b", "a/loop"), ("../a", "b/loop")] {
        symlink(target, root.join(link)).unwrap();
    }

    let config = cfg_with_root(root.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");

    // Two independent scans: each has to terminate, and both have to list
    // the same workspace paths in the same order.
    let first_scan = connector
        .scan(&SourceScope::default())
        .expect("scan must return");
    let second_scan = connector
        .scan(&SourceScope::default())
        .expect("scan must return");
    let names1: Vec<String> = first_scan
        .iter()
        .map(|asset| asset.workspace_path.0.clone())
        .collect();
    let names2: Vec<String> = second_scan
        .iter()
        .map(|asset| asset.workspace_path.0.clone())
        .collect();
    assert_eq!(names1, names2, "scan must be deterministic across runs");

    // The cycle must not cause the same workspace path to be emitted twice.
    let mut seen = std::collections::HashSet::new();
    for name in &names1 {
        assert!(seen.insert(name.as_str()), "duplicate workspace_path: {}", name);
    }

    // Both real files have to show up. Which side of the cycle the walker
    // enters first decides their relative path, so match on the file name
    // suffix only.
    let any_ends_with = |suffix: &str| names1.iter().any(|name| name.ends_with(suffix));
    assert!(
        any_ends_with("inside_a.md"),
        "expected inside_a.md in scan output, got: {names1:?}"
    );
    assert!(
        any_ends_with("inside_b.md"),
        "expected inside_b.md in scan output, got: {names1:?}"
    );

    // Sanity bound: two real files plus a working cycle guard means a tiny
    // result. Reaching 50 or more entries indicates the guard has regressed.
    let emitted = first_scan.len();
    assert!(
        emitted < 50,
        "scan emitted {} assets — cycle guard likely regressed: {:?}",
        emitted,
        names1
    );
}