feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type

혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: scope.root·scope.include 가 모두 비어 있을 때(--root 미지정 기본 경로) resolved_sources() 순회 + doc 마다 source_id/trust stamp; 그 외에는 단일 implicit `default` source 로 스캔
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
2026-06-21 08:35:19 +00:00
parent 403e162ac0
commit 58ac62d53a
101 changed files with 1201 additions and 111 deletions

View File

@@ -727,8 +727,7 @@ impl App {
// Load (or create) the session header.
let now_unix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0);
.map_or(0, |d| d.as_secs() as i64);
let existing = self.sqlite.get_session(session_id)?;
let prior_turns = match &existing {
Some(_) => self.sqlite.list_turns(session_id)?,
@@ -1111,7 +1110,7 @@ fn trim_to_chars(s: &str, n: usize) -> String {
/// terminates early) rather than panic in the budget loop.
fn estimate_chars(hits: &[SearchHit]) -> usize {
hits.iter()
.map(|h| serde_json::to_string(h).map(|s| s.len()).unwrap_or(0))
.map(|h| serde_json::to_string(h).map_or(0, |s| s.len()))
.sum()
}

View File

@@ -206,6 +206,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
doc_id,
repo: vec![],
code_lang: vec![],
source_type: vec![],
source_id: vec![],
};
let opts = SearchOpts {

View File

@@ -49,7 +49,8 @@ use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
SearchHit, SearchQuery, SourceScope, SourceType, SourceUri, TrustLevel, VectorRecord,
VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{
@@ -304,7 +305,12 @@ pub fn ingest_with_config_opts(
0
});
// Walk the workspace.
// Walk the workspace. `[[workspace.sources]]`: when the caller pinned
// neither an explicit `scope.root` nor a `scope.include` restriction (the
// normal `kebab ingest` path), iterate over every configured source — each
// scanned with its own root + exclude and tagged with its `id` + default
// trust. When either IS pinned (single-file ingest, `--root` override,
// include-restricted ingest), scan that one scope as the implicit `default`
// source with no source-level trust — preserving pre-multi-source behavior.
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ScanStarted {
@@ -313,9 +319,50 @@ pub fn ingest_with_config_opts(
);
let connector =
FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
let (assets, fs_skips) = connector
.scan_with_skips(&scope)
.context("kb-app::ingest: scan workspace")?;
// Per-source scan plan: (source_id, source_trust, scan_scope).
let scan_plan: Vec<(String, Option<TrustLevel>, SourceScope)> =
if scope.root.as_os_str().is_empty() && scope.include.is_empty() {
app.config
.resolved_sources()
.into_iter()
.map(|s| {
let scan_scope = SourceScope {
root: s.root,
include: scope.include.clone(),
exclude: s.exclude,
};
(s.id, s.trust_level, scan_scope)
})
.collect()
} else {
// Explicit-root / single-file / include-restricted ingest: one
// ad-hoc `default` source rooted at the pinned scope.
vec![(
kebab_config::DEFAULT_SOURCE_ID.to_string(),
None,
scope.clone(),
)]
};
// Accumulate assets across sources + a per-path lookup of which source
// (id + trust) each asset came from. workspace_path is unique per asset
// within a scan; on the rare overlap across sources, last-write-wins
// (sources should not share roots — a config smell, not enforced).
let mut assets: Vec<RawAsset> = Vec::new();
let mut source_by_path: std::collections::HashMap<String, (String, Option<TrustLevel>)> =
std::collections::HashMap::new();
let mut fs_skips = kebab_source_fs::FsScanSkips::default();
for (sid, strust, scan_scope) in &scan_plan {
let (src_assets, src_skips) = connector
.scan_with_skips(scan_scope)
.with_context(|| format!("kb-app::ingest: scan source `{sid}`"))?;
for a in &src_assets {
source_by_path.insert(a.workspace_path.0.clone(), (sid.clone(), *strust));
}
assets.extend(src_assets);
fs_skips.merge(src_skips);
}
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ScanCompleted {
@@ -468,6 +515,14 @@ pub fn ingest_with_config_opts(
media: crate::ingest_progress::media_label(&asset.media_type).to_string(),
},
);
// `[[workspace.sources]]`: resolve which source this asset came from.
// Missing only if an asset slipped in outside the scan plan (defensive
// — fall back to the implicit `default` source).
let (source_id, source_trust) = source_by_path
.get(&asset.workspace_path.0)
.map_or((kebab_config::DEFAULT_SOURCE_ID, None), |(id, trust)| {
(id.as_str(), *trust)
});
let item = ingest_one_asset(
&app,
&asset,
@@ -478,6 +533,8 @@ pub fn ingest_with_config_opts(
embedder.as_ref(),
vector_store.as_ref(),
&existing_doc_ids,
source_id,
source_trust,
&image_pipeline,
force_reingest,
pdf_ocr_engine.as_deref(),
@@ -738,8 +795,8 @@ pub fn ingest_with_config_opts(
if let Ok(mut w) = lw.lock() {
let run_id = w.run_id().to_string();
let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
let pages = ocr_pages_cnt.lock().map_or(0, |v| *v);
let failures = ocr_failures_cnt.lock().map_or(0, |v| *v);
let summary = crate::ingest_log::IngestSummary::new(
crate::ingest_log::now_ts(),
run_id,
@@ -1173,6 +1230,11 @@ fn ingest_one_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
// `[[workspace.sources]]`: id of the source this asset belongs to (stamped
// onto `documents.source_id`) + that source's default trust level
// (markdown frontmatter overrides it).
source_id: &str,
source_trust: Option<TrustLevel>,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
pdf_ocr_engine: Option<&dyn OcrEngine>,
@@ -1206,6 +1268,7 @@ fn ingest_one_asset(
embedder,
vector_store,
existing_doc_ids,
source_id,
image_pipeline,
force_reingest,
progress,
@@ -1221,6 +1284,7 @@ fn ingest_one_asset(
embedder,
vector_store,
existing_doc_ids,
source_id,
force_reingest,
pdf_ocr_engine,
progress,
@@ -1263,6 +1327,7 @@ fn ingest_one_asset(
existing_doc_ids,
force_reingest,
lang.as_str(),
source_id,
);
}
// p10-1A-2: non-Rust Code, Audio, and Other are not yet wired;
@@ -1338,7 +1403,7 @@ fn ingest_one_asset(
let bytes = std::fs::read(&path)
.with_context(|| format!("read asset bytes from {}", path.display()))?;
let body_hints = build_body_hints(asset);
let body_hints = build_body_hints(asset, Some(source_id), source_trust);
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
@@ -1572,6 +1637,7 @@ fn ingest_one_image_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
source_id: &str,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
@@ -1646,6 +1712,9 @@ fn ingest_one_image_asset(
// `image-meta-v1`, which already fixed doc_id). Skip compare + stored
// field must agree for next-run detection.
canonical.parser_version = eff_parser_version.clone();
// `[[workspace.sources]]`: stamp the owning source id (image extractor
// leaves it None).
canonical.metadata.source_id = Some(source_id.to_string());
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
@@ -2157,6 +2226,7 @@ fn ingest_one_pdf_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
source_id: &str,
force_reingest: bool,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
@@ -2224,6 +2294,9 @@ fn ingest_one_pdf_asset(
// v0.26.2: store the composite parser_version (base `pdf-text-v1` already
// fixed doc_id) so the next run's skip compare matches.
canonical.parser_version = eff_parser_version.clone();
// `[[workspace.sources]]`: stamp the owning source id (pdf extractor
// leaves it None).
canonical.metadata.source_id = Some(source_id.to_string());
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
@@ -2523,6 +2596,7 @@ fn ingest_one_code_asset(
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
code_lang: &str, // <-- NEW (p10-1b Task D)
source_id: &str,
) -> anyhow::Result<kebab_core::IngestItem> {
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
@@ -2679,6 +2753,11 @@ fn ingest_one_code_asset(
}
};
// `[[workspace.sources]]`: stamp the owning source id on the synthesized /
// extracted code doc (covers both Tier 1 extract_for and Tier 2/3
// synthesize paths — neither knows the source id).
canonical.metadata.source_id = Some(source_id.to_string());
// p10-1b Task D/G/J/L: chunker per-lang.
// p10-3: track whether the extract stage already fell back to Tier 3.
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
@@ -2898,7 +2977,7 @@ fn synthesize_tier2_document(
use anyhow::Context as _;
use kebab_core::{
BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent,
ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc,
ProvenanceKind, SourceSpan, id_for_block, id_for_doc,
};
let text = std::str::from_utf8(bytes)
@@ -2986,6 +3065,10 @@ fn synthesize_tier2_document(
git_branch,
git_commit,
code_lang: Some(code_lang.to_string()),
// `[[workspace.sources]]`: stamped by the caller
// (`ingest_one_code_asset`) post-build so Tier 1 (extract_for) and
// Tier 2/3 (this synthesizer) share one code path.
source_id: None,
};
tracing::debug!(
@@ -3044,12 +3127,20 @@ fn count_lines_in(bytes: &[u8]) -> u32 {
/// overhead for large workspaces and the source-of-truth timestamps
/// are written into the document's frontmatter when the user wants
/// authoritative values.
fn build_body_hints(asset: &RawAsset) -> BodyHints {
fn build_body_hints(
asset: &RawAsset,
source_id: Option<&str>,
source_trust: Option<TrustLevel>,
) -> BodyHints {
BodyHints {
first_h1: None,
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: None,
// `[[workspace.sources]]`: stamp the owning source id + inject the
// per-source default trust level (frontmatter still overrides it).
source_id: source_id.map(str::to_string),
fallback_trust_level: source_trust,
}
}

View File

@@ -114,7 +114,7 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
if ft.is_dir() {
total += walk(&e.path());
} else if ft.is_file() {
total += e.metadata().map(|m| m.len()).unwrap_or(0);
total += e.metadata().map_or(0, |m| m.len());
}
}
total

View File

@@ -51,7 +51,7 @@ impl TestEnv {
std::fs::create_dir_all(&model_dir).unwrap();
let mut config = Config::defaults();
config.workspace.root = workspace_root.to_string_lossy().into_owned();
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
// Drop the ".obsidian" / "node_modules" excludes — they bring
// in nothing useful for fixtures and just hide debugging.
config.workspace.exclude.clear();

View File

@@ -14,7 +14,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -53,7 +53,7 @@ fn ingest_file_idempotent_on_second_call() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -78,7 +78,7 @@ fn ingest_file_errors_on_missing_path() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -97,7 +97,7 @@ fn ingest_file_errors_on_unsupported_extension() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -17,7 +17,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
std::fs::create_dir_all(&model_dir).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
@@ -130,7 +130,7 @@ fn ingest_log_disabled_emits_no_file() {
std::fs::create_dir_all(&model_dir).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();

View File

@@ -192,7 +192,7 @@ fn pdf_ocr_progress_emits_started_finished_events() {
std::fs::create_dir_all(&data_dir).expect("create data dir");
let mut config = kebab_config::Config::defaults();
config.workspace.root = workspace.to_string_lossy().into_owned();
config.workspace.root = Some(workspace.to_string_lossy().into_owned());
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
config.models.embedding.provider = "none".to_string();
config.models.embedding.dimensions = 0;

View File

@@ -12,7 +12,7 @@ fn fresh_cfg(dir: &std::path::Path) -> Config {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -34,7 +34,7 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
assert_eq!(report.new, 1, "{report:?}");
// _external/ contains exactly one .md file with frontmatter.
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let ext_dir = cfg.resolve_workspace_root().join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)
@@ -56,7 +56,7 @@ fn ingest_stdin_without_source_uri() {
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
assert_eq!(report.new, 1);
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let ext_dir = cfg.resolve_workspace_root().join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)

View File

@@ -6,7 +6,7 @@ use kebab_core::SourceScope;
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
let mut cfg = Config::defaults();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();

View File

@@ -8,7 +8,7 @@ use kebab_core::SourceScope;
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
let mut config = Config::defaults();
config.workspace.root = workspace_root.to_string_lossy().into_owned();
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
config.workspace.exclude.clear();
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();

View File

@@ -9,7 +9,7 @@ use common::TestEnv;
#[test]
fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
let env = TestEnv::lexical_only();
let workspace_root = std::path::PathBuf::from(&env.config.workspace.root);
let workspace_root = env.config.resolve_workspace_root();
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();