feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance 레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버. - config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type); 단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음. - config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등) - V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0) - Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary) - ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp - 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR) - CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep) 도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810, --source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값). version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor). follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨. 자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
@@ -42,6 +42,16 @@ pub struct BodyHints {
|
||||
/// Optional language fallback used when neither frontmatter nor lingua
|
||||
/// detection produce a value. If `None` the final fallback is `"und"`.
|
||||
pub fallback_lang: Option<String>,
|
||||
/// `[[workspace.sources]]`: id of the source this document is being
|
||||
/// ingested from. Copied verbatim into `Metadata.source_id` (frontmatter
|
||||
/// does not override the source id — it is an ingest-time provenance
|
||||
/// stamp, not a user-authored field). `None` when single-root /
|
||||
/// unspecified, in which case `Metadata.source_id` stays `None`.
|
||||
pub source_id: Option<String>,
|
||||
/// `[[workspace.sources]]`: per-source default `trust_level`. Consulted
|
||||
/// only when the frontmatter does not specify `trust_level`. Precedence:
|
||||
/// frontmatter > this source default > hardcoded `Primary`.
|
||||
pub fallback_trust_level: Option<TrustLevel>,
|
||||
}
|
||||
|
||||
/// Byte range of the frontmatter region inside the input slice.
|
||||
@@ -444,8 +454,12 @@ fn derive_metadata(
|
||||
};
|
||||
|
||||
// ---- trust_level ----
|
||||
// Precedence: frontmatter > per-source default (hints.fallback_trust_level)
|
||||
// > hardcoded Primary. An *unknown* frontmatter value warns and also falls
|
||||
// through to the source default (then Primary), so a typo doesn't silently
|
||||
// promote past the source's intended trust.
|
||||
let trust_level = match raw.trust_level.as_deref() {
|
||||
None => TrustLevel::Primary,
|
||||
None => hints.fallback_trust_level.unwrap_or(TrustLevel::Primary),
|
||||
Some(s) => {
|
||||
if let Some(tl) = parse_trust_level(s) {
|
||||
tl
|
||||
@@ -454,7 +468,7 @@ fn derive_metadata(
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("unknown trust_level={s}, defaulted to primary"),
|
||||
});
|
||||
TrustLevel::Primary
|
||||
hints.fallback_trust_level.unwrap_or(TrustLevel::Primary)
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -477,6 +491,10 @@ fn derive_metadata(
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
// `[[workspace.sources]]`: ingest-time provenance stamp. Frontmatter
|
||||
// does not override the source id — it is supplied by the caller
|
||||
// (kebab-app) from the matching source's config `id`.
|
||||
source_id: hints.source_id.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,6 +622,8 @@ mod tests {
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -695,6 +715,47 @@ source_type: alien\n\
|
||||
assert!(warns.iter().any(|w| w.note.contains("source_type=alien")));
|
||||
}
|
||||
|
||||
fn hints_with_source(id: &str, trust: Option<TrustLevel>) -> BodyHints {
|
||||
BodyHints {
|
||||
source_id: Some(id.to_string()),
|
||||
fallback_trust_level: trust,
|
||||
..hints()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn source_default_trust_applied_when_frontmatter_absent() {
    // The frontmatter carries no `trust_level:` key, so the per-source
    // default supplied through the hints must be used instead of the
    // hardcoded Primary.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let md = b"---\ntitle: Doc\n---\nbody\n";

    let parsed = parse_frontmatter(md, &source_hints);
    let (meta, _, warns) = parsed.unwrap();

    assert!(warns.is_empty(), "warnings: {warns:?}");
    assert_eq!(meta.trust_level, TrustLevel::Secondary);
    // The source id is stamped from the hints.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
|
||||
|
||||
#[test]
fn frontmatter_trust_overrides_source_default() {
    // A `trust_level` written in the frontmatter takes precedence over
    // the per-source default carried by the hints.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let md = b"---\ntrust_level: generated\n---\nbody\n";

    let parsed = parse_frontmatter(md, &source_hints);
    let (meta, _, _) = parsed.unwrap();

    assert_eq!(meta.trust_level, TrustLevel::Generated);
    // The source id still comes from the hints.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
|
||||
|
||||
#[test]
fn no_source_id_leaves_metadata_source_id_none() {
    // Baseline hints carry neither a source id nor a source trust default.
    let baseline = hints();
    let md = b"---\ntitle: Doc\n---\nbody\n";

    let parsed = parse_frontmatter(md, &baseline);
    let (meta, _, _) = parsed.unwrap();

    // With no source default to consult, the hardcoded Primary applies.
    assert_eq!(meta.trust_level, TrustLevel::Primary);
    assert!(meta.source_id.is_none());
}
|
||||
|
||||
#[test]
|
||||
fn malformed_yaml_emits_warning_and_defaults() {
|
||||
// Unclosed quote → YAML parse fails.
|
||||
|
||||
@@ -469,6 +469,7 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,8 @@ fn pinned_hints() -> BodyHints {
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -86,6 +86,8 @@ fn code_and_table_canonical_snapshot() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
let (metadata, fm_span, _fm_warns) =
|
||||
parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
|
||||
|
||||
Reference in New Issue
Block a user