feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type

혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
2026-06-21 08:35:19 +00:00
parent 403e162ac0
commit 58ac62d53a
101 changed files with 1201 additions and 111 deletions

View File

@@ -12,6 +12,12 @@ mod paths;
pub mod migrate;
pub use paths::{expand_path, expand_path_with_base};
/// Implicit source id used when a single-root `[workspace]` config (no
/// `[[workspace.sources]]`) is normalized into the multi-source model, and
/// the `DEFAULT` value of the `documents.source_id` column.
///
/// Must be kept in sync with two other places that spell the same value:
/// the column default declared by the V014 `documents.source_id` database
/// migration, and the `"default"` literal written by the v3 → v4 config
/// migration (`migrate::step_3_to_4`).
pub const DEFAULT_SOURCE_ID: &str = "default";
/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다.
/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다.
/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고
@@ -88,8 +94,67 @@ pub struct Config {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct WorkspaceCfg {
pub root: String,
/// Single-root workspace (legacy / common case). `Option` so that a
/// config that declares only `[[workspace.sources]]` (no bare `root`)
/// parses — and, symmetrically, a legacy single-`root` config (no
/// `sources`) still parses unchanged. The two shapes are reconciled at
/// read time by [`Config::resolved_sources`], which always yields a
/// non-empty list and synthesizes an `id = "default"` source from `root`
/// when `sources` is empty. The stored `sources` field itself is not
/// rewritten by that call.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub root: Option<String>,
/// Workspace-wide exclude globs. [`Config::resolved_sources`] places
/// these in front of each source's own `exclude` entries.
pub exclude: Vec<String>,
/// `[[workspace.sources]]`: named multi-source declaration. When empty,
/// [`Config::resolved_sources`] falls back to a single implicit
/// `default` source built from `root` (or from the built-in
/// `~/KnowledgeBase` default when `root` is unset as well). Each entry
/// stamps its `id` onto every document it ingests and supplies
/// per-source `trust_level` / `source_type` defaults (frontmatter still
/// wins per the §0 Q9 derive table).
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sources: Vec<SourceCfg>,
}
/// One named source under `[[workspace.sources]]`.
///
/// `trust_level` / `source_type` are the **source-level defaults**: they
/// apply when a document's frontmatter does not specify the field. The
/// precedence is `frontmatter > source default > hardcoded`
/// (`TrustLevel::Primary` / `SourceType::Markdown`) — implemented in the
/// markdown derive via `BodyHints::fallback_trust_level`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SourceCfg {
/// Stable identifier stamped onto `documents.source_id` for every
/// document ingested from this source. Must be unique and non-empty
/// (after trimming) across the workspace; enforced at load time by
/// `Config::validate_sources`.
pub id: String,
/// Root directory to walk for this source. Accepts the same
/// absolute / `~` / `${VAR}` / relative(=config-dir-based) forms as
/// the legacy `workspace.root`. Must not be blank — an empty or
/// whitespace-only value is rejected by `Config::validate_sources`.
pub root: String,
/// Per-source denylist globs, appended after `workspace.exclude` when
/// the source is resolved.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub exclude: Vec<String>,
/// Per-source default `trust_level` (frontmatter overrides it).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub trust_level: Option<kebab_core::TrustLevel>,
/// Per-source default `source_type` (frontmatter overrides it).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_type: Option<kebab_core::SourceType>,
}
/// A source with its `root` resolved to an absolute path and its `exclude`
/// merged with `workspace.exclude`. Produced by [`Config::resolved_sources`]
/// — the single entry point the ingest pipeline iterates over.
#[derive(Clone, Debug, PartialEq)]
pub struct ResolvedSource {
/// Stamped onto `documents.source_id`.
pub id: String,
/// Absolute walk root (tilde / `${VAR}` / relative-to-config resolved).
pub root: PathBuf,
/// `workspace.exclude` followed by the per-source `exclude` entries
/// (concatenated in that order; duplicates are not removed).
pub exclude: Vec<String>,
/// Per-source default trust level (None → fall back to `Primary`).
pub trust_level: Option<kebab_core::TrustLevel>,
/// Per-source default source type (None → fall back to `Markdown`).
pub source_type: Option<kebab_core::SourceType>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -782,12 +847,13 @@ impl Config {
Self {
schema_version: crate::migrate::CURRENT_SCHEMA_VERSION,
workspace: WorkspaceCfg {
root: "~/KnowledgeBase".to_string(),
root: Some("~/KnowledgeBase".to_string()),
exclude: vec![
".git/**".to_string(),
"node_modules/**".to_string(),
".obsidian/**".to_string(),
],
sources: vec![],
},
storage: StorageCfg {
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kebab".to_string(),
@@ -906,7 +972,78 @@ impl Config {
PathBuf::from(".")
})
});
paths::expand_path_with_base(&self.workspace.root, "", &base)
paths::expand_path_with_base(&self.primary_root_raw(), "", &base)
}
/// Unexpanded string of the *primary* workspace root, consumed by
/// [`resolve_workspace_root`](Self::resolve_workspace_root) and by the
/// implicit-`default` branch of `resolved_sources`.
///
/// Lookup order:
/// 1. the `root` of the first `[[workspace.sources]]` entry,
/// 2. the bare `workspace.root`,
/// 3. the built-in `~/KnowledgeBase` default.
///
/// This lets single-root call sites keep working when a config declares
/// only `sources`.
///
/// NOTE(review): because step 1 wins over step 2, the `KEBAB_WORKSPACE_ROOT`
/// env override (which only assigns `workspace.root`) has no effect once
/// `[[workspace.sources]]` is non-empty — and the v3 → v4 config migration
/// adds a `default` source to every migrated file. Confirm this is intended.
fn primary_root_raw(&self) -> String {
    let ws = &self.workspace;
    match (ws.sources.first(), ws.root.as_ref()) {
        (Some(first_source), _) => first_source.root.clone(),
        (None, Some(bare_root)) => bare_root.clone(),
        (None, None) => "~/KnowledgeBase".to_string(),
    }
}
/// Directory against which relative source roots are resolved.
///
/// Returns the directory of the config file when the config was loaded
/// from disk (`source_dir` is set); otherwise the process's current
/// directory. If the current directory cannot be determined, a warning is
/// logged and `"."` is used. Mirrors the fallback chain of
/// [`resolve_workspace_root`](Self::resolve_workspace_root).
fn root_resolution_base(&self) -> PathBuf {
    if let Some(config_dir) = &self.source_dir {
        return config_dir.clone();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd,
        Err(e) => {
            tracing::warn!(
                target: "kebab-config",
                error = %e,
                "current_dir() failed; falling back to '.' for source root resolution"
            );
            PathBuf::from(".")
        }
    }
}
/// Returns the sources to ingest, with roots expanded and excludes merged.
/// The result is never empty.
///
/// - With `[[workspace.sources]]` declared: one [`ResolvedSource`] per
///   entry, in declaration order. Each `root` is expanded against the
///   config directory (or the current directory), and `exclude` is
///   `workspace.exclude` followed by the entry's own `exclude`.
/// - Without any declared source: a single implicit source whose id is
///   [`DEFAULT_SOURCE_ID`], rooted at the primary workspace root, carrying
///   only `workspace.exclude` and no trust-level / source-type defaults.
///
/// Single-root and multi-source configs therefore share one code path in
/// the ingest pipeline.
pub fn resolved_sources(&self) -> Vec<ResolvedSource> {
    let base = self.root_resolution_base();
    let ws = &self.workspace;

    // Legacy single-root shape: synthesize the implicit `default` source.
    if ws.sources.is_empty() {
        let implicit = ResolvedSource {
            id: DEFAULT_SOURCE_ID.to_string(),
            root: paths::expand_path_with_base(&self.primary_root_raw(), "", &base),
            exclude: ws.exclude.clone(),
            trust_level: None,
            source_type: None,
        };
        return vec![implicit];
    }

    let mut resolved = Vec::with_capacity(ws.sources.len());
    for declared in &ws.sources {
        let root = paths::expand_path_with_base(&declared.root, "", &base);
        // Workspace-wide globs first, then the source's own additions.
        let exclude: Vec<String> = ws
            .exclude
            .iter()
            .chain(declared.exclude.iter())
            .cloned()
            .collect();
        resolved.push(ResolvedSource {
            id: declared.id.clone(),
            root,
            exclude,
            trust_level: declared.trust_level,
            source_type: declared.source_type,
        });
    }
    resolved
}
/// Read config from disk and merge env overrides on top of it. If the
@@ -1019,10 +1156,41 @@ impl Config {
cause: format!("parse_failed: {e}"),
})
})?;
cfg.validate_sources().map_err(|cause| {
anyhow::Error::new(ConfigInvalid {
path: path.to_path_buf(),
cause,
})
})?;
cfg.source_dir = path.parent().map(Path::to_path_buf);
Ok(cfg)
}
/// Checks the `[[workspace.sources]]` declarations.
///
/// Entries are examined in declaration order and the first violation is
/// reported. For each entry:
/// - `id` must not be empty or whitespace-only,
/// - `root` must not be empty or whitespace-only,
/// - `id` must not repeat an earlier entry's `id` (exact string match).
///
/// An empty `sources` list (legacy single-root config) is always valid.
///
/// # Errors
///
/// Returns the human-readable cause string that the load path wraps into
/// `ConfigInvalid`.
fn validate_sources(&self) -> Result<(), String> {
    let mut known_ids: std::collections::HashSet<&str> = std::collections::HashSet::new();
    self.workspace.sources.iter().try_for_each(|src| {
        if src.id.trim().is_empty() {
            Err("workspace.sources: an entry has an empty `id`".to_string())
        } else if src.root.trim().is_empty() {
            Err(format!(
                "workspace.sources: source `{}` has an empty `root`",
                src.id
            ))
        } else if known_ids.insert(src.id.as_str()) {
            // First time this id is seen.
            Ok(())
        } else {
            Err(format!(
                "workspace.sources: duplicate source id `{}` (ids must be unique)",
                src.id
            ))
        }
    })
}
/// Apply `KEBAB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
///
/// The mapping is an explicit grep-friendly whitelist — one match arm
@@ -1037,7 +1205,7 @@ impl Config {
}
match k.as_str() {
// workspace
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = Some(v.clone()),
// storage
"KEBAB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(),
@@ -2034,7 +2202,7 @@ max_context_tokens = 8000
#[test]
fn legacy_include_field_is_ignored_silently() {
let mut cfg = Config::defaults();
cfg.workspace.root = "/tmp/kebab-legacy".to_string();
cfg.workspace.root = Some("/tmp/kebab-legacy".to_string());
let mut toml_text = toml::to_string(&cfg).expect("default round-trips");
// Inject a legacy `include = [...]` line into the [workspace] block.
toml_text = toml_text.replace(
@@ -2048,20 +2216,105 @@ max_context_tokens = 8000
parsed.err()
);
let cfg = parsed.unwrap();
assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy");
assert_eq!(cfg.workspace.root.as_deref(), Some("/tmp/kebab-legacy"));
}
/// p9-fb-25: `WorkspaceCfg` must NOT have an `include` field.
/// Compile-time proof: exhaustive destructure.
#[test]
fn workspace_cfg_has_only_root_and_exclude_fields() {
fn workspace_cfg_has_only_root_exclude_sources_fields() {
let ws = Config::defaults().workspace;
let WorkspaceCfg {
root: _,
exclude: _,
sources: _,
} = &ws;
}
/// A config with only `workspace.root` (no `[[workspace.sources]]`) resolves
/// to exactly one source: id `default`, rooted at that path, with no
/// per-source trust default.
#[test]
fn legacy_single_root_normalizes_to_default_source() {
    let cfg = {
        let mut single_root = Config::defaults();
        single_root.workspace.root = Some("/tmp/kb-notes".to_string());
        single_root
    };

    let sources = cfg.resolved_sources();
    assert_eq!(sources.len(), 1);

    let only = &sources[0];
    assert_eq!(only.id, DEFAULT_SOURCE_ID);
    assert_eq!(only.root, std::path::PathBuf::from("/tmp/kb-notes"));
    assert_eq!(only.trust_level, None);
}
/// A config that declares two sources and no bare `root` survives a TOML
/// round-trip, validates, and resolves each source with the workspace-wide
/// excludes placed in front of its own.
#[test]
fn multi_source_config_resolves_each_with_merged_exclude() {
    let notes = SourceCfg {
        id: "notes".to_string(),
        root: "/tmp/notes".to_string(),
        exclude: vec![],
        trust_level: Some(kebab_core::TrustLevel::Primary),
        source_type: None,
    };
    let refs = SourceCfg {
        id: "refs".to_string(),
        root: "/tmp/refs".to_string(),
        exclude: vec!["draft/**".to_string()],
        trust_level: Some(kebab_core::TrustLevel::Secondary),
        source_type: Some(kebab_core::SourceType::Reference),
    };

    let mut original = Config::defaults();
    original.workspace.root = None;
    original.workspace.exclude = vec![".git/**".to_string()];
    original.workspace.sources = vec![notes, refs];

    // A multi-source config (no bare root) must round-trip through TOML.
    let serialized = toml::to_string(&original).expect("multi-source serializes");
    let reloaded: Config = toml::from_str(&serialized).expect("multi-source parses");
    reloaded.validate_sources().expect("valid sources");

    let sources = reloaded.resolved_sources();
    assert_eq!(sources.len(), 2);

    let first = &sources[0];
    assert_eq!(first.id, "notes");
    assert_eq!(first.root, std::path::PathBuf::from("/tmp/notes"));
    assert_eq!(first.exclude, vec![".git/**".to_string()]);
    assert_eq!(first.trust_level, Some(kebab_core::TrustLevel::Primary));

    let second = &sources[1];
    assert_eq!(second.id, "refs");
    // workspace.exclude first, then the per-source exclude.
    assert_eq!(
        second.exclude,
        vec![".git/**".to_string(), "draft/**".to_string()]
    );
    assert_eq!(
        second.source_type,
        Some(kebab_core::SourceType::Reference)
    );
    assert_eq!(
        second.trust_level,
        Some(kebab_core::TrustLevel::Secondary)
    );
}
/// Test helper: a minimal `SourceCfg` with the given `id` and `root`, no
/// per-source excludes and no trust-level / source-type defaults.
fn source_cfg(id: &str, root: &str) -> SourceCfg {
    let (id, root) = (id.to_owned(), root.to_owned());
    SourceCfg {
        id,
        root,
        exclude: Vec::new(),
        trust_level: None,
        source_type: None,
    }
}
/// Two sources sharing one `id` must be rejected, and the reported cause
/// must be the duplicate-id check (not some unrelated validation failure).
#[test]
fn duplicate_source_ids_rejected() {
    let mut cfg = Config::defaults();
    cfg.workspace.sources = vec![source_cfg("dup", "/a"), source_cfg("dup", "/b")];

    let cause = cfg
        .validate_sources()
        .expect_err("duplicate ids must fail");
    // Pin the failure reason so the test cannot pass for the wrong cause.
    assert!(
        cause.contains("duplicate source id `dup`"),
        "unexpected cause: {cause}"
    );
}
/// A source whose `id` is empty — or only whitespace, which the validator
/// trims — must be rejected with the empty-id cause.
#[test]
fn empty_source_id_rejected() {
    for blank in ["", "   "] {
        let mut cfg = Config::defaults();
        cfg.workspace.sources = vec![source_cfg(blank, "/a")];

        let cause = cfg.validate_sources().expect_err("empty id must fail");
        // Pin the failure reason so the test cannot pass for the wrong cause.
        assert!(
            cause.contains("empty `id`"),
            "unexpected cause for id {blank:?}: {cause}"
        );
    }
}
#[test]
fn default_stale_threshold_is_30() {
let c = Config::defaults();

View File

@@ -9,7 +9,7 @@ use toml_edit::{DocumentMut, Item};
/// 현재 바이너리가 이해하는 config 스키마 버전. 마이그레이션 완료 시
/// 사용자 파일의 `schema_version` 을 이 값으로 stamp 한다.
pub const CURRENT_SCHEMA_VERSION: u32 = 3;
pub const CURRENT_SCHEMA_VERSION: u32 = 4;
/// 한 번의 마이그레이션에서 발생한 개별 변경.
#[derive(Clone, Debug, PartialEq, serde::Serialize)]
@@ -68,6 +68,7 @@ const HEADER: &str = "\
fn section_comment(path: &str) -> Option<&'static str> {
Some(match path {
"workspace" => "# 색인 대상 워크스페이스.",
"workspace.sources" => "# named multi-source (각 source 의 id 가 documents.source_id 로 stamp).",
"storage" => "# XDG 저장 경로(데이터/sqlite/벡터/에셋/모델).",
"indexing" => "# 병렬도 + 파일시스템 watch.",
"chunking" => "# 청크 크기·오버랩·heading 존중.",
@@ -376,6 +377,39 @@ pub fn step_2_to_3(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
copy_image_paddle_to_pdf(doc);
}
/// v3 → v4: mirror the single `workspace.root` into `[[workspace.sources]]`
/// as the implicit `default` source (`id = "default"`, `root = <existing root>`).
///
/// The existing `workspace.root` key is left in place: `resolved_sources()`
/// prefers `sources` when present, so the leftover key is harmless, and
/// keeping it stops the defaults reconcile from trying to add `root` back.
///
/// Idempotent: a no-op when `workspace` already has a `sources` key. Also a
/// no-op when there is no `[workspace]` table or no string-valued `root`.
/// On change, one `AddedSection` entry for `workspace.sources` is appended
/// to `changes`.
pub fn step_3_to_4(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
    let workspace = match doc.get_mut("workspace").and_then(Item::as_table_mut) {
        Some(table) => table,
        None => return,
    };

    // Already declared (array-of-tables or inline): leave it untouched.
    if workspace.contains_key("sources") {
        return;
    }

    // Nothing to mirror without a root (defaults always carry one; defensive).
    let legacy_root = match workspace.get("root").and_then(Item::as_str) {
        Some(root) => root.to_string(),
        None => return,
    };

    let mut default_source = toml_edit::Table::new();
    default_source.insert("id", toml_edit::value("default"));
    default_source.insert("root", toml_edit::value(legacy_root));

    let mut sources = toml_edit::ArrayOfTables::new();
    sources.push(default_source);
    workspace.insert("sources", Item::ArrayOfTables(sources));

    changes.push(MigrationChange {
        kind: ChangeKind::AddedSection,
        path: "workspace.sources".to_string(),
        detail: "workspace.root → [[workspace.sources]] id=default".to_string(),
    });
}
/// 파일의 schema_version(없으면 1) 부터 CURRENT 까지 step 적용.
fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange>) {
if from < 2 {
@@ -384,6 +418,9 @@ fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange
if from < 3 {
step_2_to_3(doc, changes);
}
if from < 4 {
step_3_to_4(doc, changes);
}
}
/// 사용자 config.toml 텍스트를 받아 step 체인 + reconciliation + version
@@ -648,6 +685,76 @@ engine = \"paddle-onnx\"
assert!(again.is_empty(), "not idempotent: {again:?}");
}
/// `step_3_to_4` on a v3 document adds a `default` source mirroring
/// `workspace.root`, keeps the original `root` key, and is a no-op when run
/// a second time.
#[test]
fn step_3_to_4_mirrors_root_into_default_source() {
    let v3 = "\
schema_version = 3
[workspace]
root = \"/my/notes\"
exclude = [\".git/**\"]
";
    let mut doc = v3.parse::<DocumentMut>().unwrap();
    let mut first_pass = Vec::new();
    step_3_to_4(&mut doc, &mut first_pass);

    let rendered = doc.to_string();
    // A new array-of-tables entry is added with id=default.
    assert!(rendered.contains("[[workspace.sources]]"), "{rendered}");
    assert!(rendered.contains("id = \"default\""), "{rendered}");
    // The existing root is preserved (so reconcile does not re-add it).
    assert!(rendered.contains("root = \"/my/notes\""), "{rendered}");

    // After re-parsing, the first source mirrors the root.
    let reparsed = rendered.parse::<DocumentMut>().unwrap();
    let first_source = reparsed["workspace"]["sources"][0].as_table().unwrap();
    assert_eq!(first_source["id"].as_str(), Some("default"));
    assert_eq!(first_source["root"].as_str(), Some("/my/notes"));

    // Idempotent: a second run records no changes.
    let mut second_pass = Vec::new();
    step_3_to_4(&mut doc, &mut second_pass);
    assert!(second_pass.is_empty(), "step_3_to_4 not idempotent");
}
/// When `[[workspace.sources]]` is already declared, `step_3_to_4` records
/// no change and does not inject a `default` source.
#[test]
fn step_3_to_4_noop_when_sources_already_present() {
    let v4 = "\
schema_version = 4
[workspace]
root = \"/my/notes\"
exclude = []
[[workspace.sources]]
id = \"notes\"
root = \"/my/notes\"
";
    let mut doc = v4.parse::<DocumentMut>().unwrap();
    let mut recorded = Vec::new();
    step_3_to_4(&mut doc, &mut recorded);

    assert!(recorded.is_empty(), "must not touch existing sources");
    // Only the pre-existing source remains (no `default` was added).
    let rendered = doc.to_string();
    assert!(!rendered.contains("id = \"default\""));
}
/// Full `migrate_document` run on a v3 file: bumps the schema to 4, adds
/// `[[workspace.sources]]`, and a second run over the output changes nothing.
#[test]
fn migrate_document_v3_to_v4_adds_sources_and_is_idempotent() {
    let v3 = "\
schema_version = 3
[workspace]
root = \"/n\"
exclude = []
";
    let first = migrate_document(v3);
    assert_eq!(first.from_schema_version, 3);
    assert_eq!(first.to_schema_version, 4);
    assert!(first.changed());

    let migrated_text = &first.new_text;
    assert!(migrated_text.contains("[[workspace.sources]]"));
    assert_eq!(read_schema_version(migrated_text), 4);

    // Re-running on the migrated text must be a byte-for-byte no-op.
    let second = migrate_document(migrated_text);
    assert!(!second.changed(), "not idempotent: {:?}", second.changes);
    assert_eq!(second.new_text, first.new_text);
}
#[test]
fn migrate_document_missing_schema_version_treated_as_v1() {
let old = "[workspace]\nroot = \"/n\"\n";

View File

@@ -11,11 +11,16 @@ const USER_V2: &str = include_str!("fixtures/user_v2_config.toml");
fn user_v2_migrates_losslessly() {
let out = migrate_document(USER_V2);
assert_eq!(out.from_schema_version, 2);
assert_eq!(out.to_schema_version, 3);
// v2 → CURRENT(=4): v3 의 [ingest.*] relocation 에 더해 v4 의
// [[workspace.sources]] default source 미러링까지 적용된다.
assert_eq!(out.to_schema_version, 4);
let t = &out.new_text;
// 사용자 값 보존.
assert!(t.contains("root = \"/Users/user/Obsidian/Default\""), "{t}");
// v4: workspace.root → [[workspace.sources]] id=default 미러링.
assert!(t.contains("[[workspace.sources]]"), "v4 sources 누락:\n{t}");
assert!(t.contains("id = \"default\""), "default source 누락:\n{t}");
assert!(t.contains("model = \"snowflake-arctic-embed2\""));
assert!(t.contains("endpoint = \"http://192.168.0.2:11943\""));
// 사용자 주석/대안 줄 보존.