feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance 레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type); 단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810, --source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).

follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
@@ -727,8 +727,7 @@ impl App {
|
||||
// Load (or create) the session header.
|
||||
let now_unix = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.map(|d| d.as_secs() as i64)
|
||||
.unwrap_or(0);
|
||||
.map_or(0, |d| d.as_secs() as i64);
|
||||
let existing = self.sqlite.get_session(session_id)?;
|
||||
let prior_turns = match &existing {
|
||||
Some(_) => self.sqlite.list_turns(session_id)?,
|
||||
@@ -1111,7 +1110,7 @@ fn trim_to_chars(s: &str, n: usize) -> String {
|
||||
/// terminates early) rather than panic in the budget loop.
|
||||
fn estimate_chars(hits: &[SearchHit]) -> usize {
|
||||
hits.iter()
|
||||
.map(|h| serde_json::to_string(h).map(|s| s.len()).unwrap_or(0))
|
||||
.map(|h| serde_json::to_string(h).map_or(0, |s| s.len()))
|
||||
.sum()
|
||||
}
|
||||
|
||||
|
||||
@@ -206,6 +206,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
doc_id,
|
||||
repo: vec![],
|
||||
code_lang: vec![],
|
||||
source_type: vec![],
|
||||
source_id: vec![],
|
||||
};
|
||||
|
||||
let opts = SearchOpts {
|
||||
|
||||
@@ -49,7 +49,8 @@ use kebab_core::{
|
||||
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
|
||||
ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
|
||||
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
|
||||
SearchHit, SearchQuery, SourceScope, SourceType, SourceUri, TrustLevel, VectorRecord,
|
||||
VectorStore,
|
||||
};
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_image::{
|
||||
@@ -304,7 +305,12 @@ pub fn ingest_with_config_opts(
|
||||
0
|
||||
});
|
||||
|
||||
// Walk the workspace.
|
||||
// Walk the workspace. `[[workspace.sources]]`: when the caller did not
|
||||
// pin an explicit `scope.root` or `scope.include` (the normal `kebab ingest` path), iterate
|
||||
// over every configured source — each scanned with its own root + exclude
|
||||
// and tagged with its `id` + default trust. When `scope.root` IS pinned
|
||||
// (single-file ingest, `--root` override) or `scope.include` is non-empty, scan that one scope as the
|
||||
// implicit `default` source — preserving pre-multi-source behavior.
|
||||
crate::ingest_progress::emit(
|
||||
progress,
|
||||
crate::ingest_progress::IngestEvent::ScanStarted {
|
||||
@@ -313,9 +319,50 @@ pub fn ingest_with_config_opts(
|
||||
);
|
||||
let connector =
|
||||
FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
|
||||
let (assets, fs_skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("kb-app::ingest: scan workspace")?;
|
||||
|
||||
// Per-source scan plan: (source_id, source_trust, scan_scope).
|
||||
let scan_plan: Vec<(String, Option<TrustLevel>, SourceScope)> =
|
||||
if scope.root.as_os_str().is_empty() && scope.include.is_empty() {
|
||||
app.config
|
||||
.resolved_sources()
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
let scan_scope = SourceScope {
|
||||
root: s.root,
|
||||
include: scope.include.clone(),
|
||||
exclude: s.exclude,
|
||||
};
|
||||
(s.id, s.trust_level, scan_scope)
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
// Explicit-root / single-file / include-restricted ingest: one
|
||||
// ad-hoc `default` source rooted at the pinned scope.
|
||||
vec![(
|
||||
kebab_config::DEFAULT_SOURCE_ID.to_string(),
|
||||
None,
|
||||
scope.clone(),
|
||||
)]
|
||||
};
|
||||
|
||||
// Accumulate assets across sources + a per-path lookup of which source
|
||||
// (id + trust) each asset came from. workspace_path is unique per asset
|
||||
// within a scan; on the rare overlap across sources, last-write-wins
|
||||
// (sources should not share roots — a config smell, not enforced).
|
||||
let mut assets: Vec<RawAsset> = Vec::new();
|
||||
let mut source_by_path: std::collections::HashMap<String, (String, Option<TrustLevel>)> =
|
||||
std::collections::HashMap::new();
|
||||
let mut fs_skips = kebab_source_fs::FsScanSkips::default();
|
||||
for (sid, strust, scan_scope) in &scan_plan {
|
||||
let (src_assets, src_skips) = connector
|
||||
.scan_with_skips(scan_scope)
|
||||
.with_context(|| format!("kb-app::ingest: scan source `{sid}`"))?;
|
||||
for a in &src_assets {
|
||||
source_by_path.insert(a.workspace_path.0.clone(), (sid.clone(), *strust));
|
||||
}
|
||||
assets.extend(src_assets);
|
||||
fs_skips.merge(src_skips);
|
||||
}
|
||||
crate::ingest_progress::emit(
|
||||
progress,
|
||||
crate::ingest_progress::IngestEvent::ScanCompleted {
|
||||
@@ -468,6 +515,14 @@ pub fn ingest_with_config_opts(
|
||||
media: crate::ingest_progress::media_label(&asset.media_type).to_string(),
|
||||
},
|
||||
);
|
||||
// `[[workspace.sources]]`: resolve which source this asset came from.
|
||||
// Missing only if an asset slipped in outside the scan plan (defensive
|
||||
// — fall back to the implicit `default` source).
|
||||
let (source_id, source_trust) = source_by_path
|
||||
.get(&asset.workspace_path.0)
|
||||
.map_or((kebab_config::DEFAULT_SOURCE_ID, None), |(id, trust)| {
|
||||
(id.as_str(), *trust)
|
||||
});
|
||||
let item = ingest_one_asset(
|
||||
&app,
|
||||
&asset,
|
||||
@@ -478,6 +533,8 @@ pub fn ingest_with_config_opts(
|
||||
embedder.as_ref(),
|
||||
vector_store.as_ref(),
|
||||
&existing_doc_ids,
|
||||
source_id,
|
||||
source_trust,
|
||||
&image_pipeline,
|
||||
force_reingest,
|
||||
pdf_ocr_engine.as_deref(),
|
||||
@@ -738,8 +795,8 @@ pub fn ingest_with_config_opts(
|
||||
if let Ok(mut w) = lw.lock() {
|
||||
let run_id = w.run_id().to_string();
|
||||
let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
|
||||
let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
|
||||
let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
|
||||
let pages = ocr_pages_cnt.lock().map_or(0, |v| *v);
|
||||
let failures = ocr_failures_cnt.lock().map_or(0, |v| *v);
|
||||
let summary = crate::ingest_log::IngestSummary::new(
|
||||
crate::ingest_log::now_ts(),
|
||||
run_id,
|
||||
@@ -1173,6 +1230,11 @@ fn ingest_one_asset(
|
||||
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
// `[[workspace.sources]]`: id of the source this asset belongs to (stamped
|
||||
// onto `documents.source_id`) + that source's default trust level
|
||||
// (markdown frontmatter overrides it).
|
||||
source_id: &str,
|
||||
source_trust: Option<TrustLevel>,
|
||||
image_pipeline: &ImagePipeline<'_>,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&dyn OcrEngine>,
|
||||
@@ -1206,6 +1268,7 @@ fn ingest_one_asset(
|
||||
embedder,
|
||||
vector_store,
|
||||
existing_doc_ids,
|
||||
source_id,
|
||||
image_pipeline,
|
||||
force_reingest,
|
||||
progress,
|
||||
@@ -1221,6 +1284,7 @@ fn ingest_one_asset(
|
||||
embedder,
|
||||
vector_store,
|
||||
existing_doc_ids,
|
||||
source_id,
|
||||
force_reingest,
|
||||
pdf_ocr_engine,
|
||||
progress,
|
||||
@@ -1263,6 +1327,7 @@ fn ingest_one_asset(
|
||||
existing_doc_ids,
|
||||
force_reingest,
|
||||
lang.as_str(),
|
||||
source_id,
|
||||
);
|
||||
}
|
||||
// p10-1A-2: non-Rust Code, Audio, and Other are not yet wired;
|
||||
@@ -1338,7 +1403,7 @@ fn ingest_one_asset(
|
||||
let bytes = std::fs::read(&path)
|
||||
.with_context(|| format!("read asset bytes from {}", path.display()))?;
|
||||
|
||||
let body_hints = build_body_hints(asset);
|
||||
let body_hints = build_body_hints(asset, Some(source_id), source_trust);
|
||||
|
||||
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
|
||||
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
|
||||
@@ -1572,6 +1637,7 @@ fn ingest_one_image_asset(
|
||||
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
source_id: &str,
|
||||
image_pipeline: &ImagePipeline<'_>,
|
||||
force_reingest: bool,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
@@ -1646,6 +1712,9 @@ fn ingest_one_image_asset(
|
||||
// `image-meta-v1`, which already fixed doc_id). Skip compare + stored
|
||||
// field must agree for next-run detection.
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
// `[[workspace.sources]]`: stamp the owning source id (image extractor
|
||||
// leaves it None).
|
||||
canonical.metadata.source_id = Some(source_id.to_string());
|
||||
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
|
||||
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
|
||||
@@ -2157,6 +2226,7 @@ fn ingest_one_pdf_asset(
|
||||
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
source_id: &str,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&dyn OcrEngine>,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
@@ -2224,6 +2294,9 @@ fn ingest_one_pdf_asset(
|
||||
// v0.26.2: store the composite parser_version (base `pdf-text-v1` already
|
||||
// fixed doc_id) so the next run's skip compare matches.
|
||||
canonical.parser_version = eff_parser_version.clone();
|
||||
// `[[workspace.sources]]`: stamp the owning source id (pdf extractor
|
||||
// leaves it None).
|
||||
canonical.metadata.source_id = Some(source_id.to_string());
|
||||
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
|
||||
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
|
||||
@@ -2523,6 +2596,7 @@ fn ingest_one_code_asset(
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
force_reingest: bool,
|
||||
code_lang: &str, // <-- NEW (p10-1b Task D)
|
||||
source_id: &str,
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
let path = match &asset.source_uri {
|
||||
SourceUri::File(p) => p.clone(),
|
||||
@@ -2679,6 +2753,11 @@ fn ingest_one_code_asset(
|
||||
}
|
||||
};
|
||||
|
||||
// `[[workspace.sources]]`: stamp the owning source id on the synthesized /
|
||||
// extracted code doc (covers both Tier 1 extract_for and Tier 2/3
|
||||
// synthesize paths — neither knows the source id).
|
||||
canonical.metadata.source_id = Some(source_id.to_string());
|
||||
|
||||
// p10-1b Task D/G/J/L: chunker per-lang.
|
||||
// p10-3: track whether the extract stage already fell back to Tier 3.
|
||||
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
|
||||
@@ -2898,7 +2977,7 @@ fn synthesize_tier2_document(
|
||||
use anyhow::Context as _;
|
||||
use kebab_core::{
|
||||
BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent,
|
||||
ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc,
|
||||
ProvenanceKind, SourceSpan, id_for_block, id_for_doc,
|
||||
};
|
||||
|
||||
let text = std::str::from_utf8(bytes)
|
||||
@@ -2986,6 +3065,10 @@ fn synthesize_tier2_document(
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some(code_lang.to_string()),
|
||||
// `[[workspace.sources]]`: stamped by the caller
|
||||
// (`ingest_one_code_asset`) post-build so Tier 1 (extract_for) and
|
||||
// Tier 2/3 (this synthesizer) share one code path.
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
@@ -3044,12 +3127,20 @@ fn count_lines_in(bytes: &[u8]) -> u32 {
|
||||
/// overhead for large workspaces and the source-of-truth timestamps
|
||||
/// are written into the document's frontmatter when the user wants
|
||||
/// authoritative values.
|
||||
fn build_body_hints(asset: &RawAsset) -> BodyHints {
|
||||
fn build_body_hints(
|
||||
asset: &RawAsset,
|
||||
source_id: Option<&str>,
|
||||
source_trust: Option<TrustLevel>,
|
||||
) -> BodyHints {
|
||||
BodyHints {
|
||||
first_h1: None,
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: None,
|
||||
// `[[workspace.sources]]`: stamp the owning source id + inject the
|
||||
// per-source default trust level (frontmatter still overrides it).
|
||||
source_id: source_id.map(str::to_string),
|
||||
fallback_trust_level: source_trust,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -114,7 +114,7 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
||||
if ft.is_dir() {
|
||||
total += walk(&e.path());
|
||||
} else if ft.is_file() {
|
||||
total += e.metadata().map(|m| m.len()).unwrap_or(0);
|
||||
total += e.metadata().map_or(0, |m| m.len());
|
||||
}
|
||||
}
|
||||
total
|
||||
|
||||
@@ -51,7 +51,7 @@ impl TestEnv {
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut config = Config::defaults();
|
||||
config.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
// Drop the ".obsidian" / "node_modules" excludes — they bring
|
||||
// in nothing useful for fixtures and just hide debugging.
|
||||
config.workspace.exclude.clear();
|
||||
|
||||
@@ -14,7 +14,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
@@ -53,7 +53,7 @@ fn ingest_file_idempotent_on_second_call() {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
@@ -78,7 +78,7 @@ fn ingest_file_errors_on_missing_path() {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
@@ -97,7 +97,7 @@ fn ingest_file_errors_on_unsupported_extension() {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -17,7 +17,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
@@ -130,7 +130,7 @@ fn ingest_log_disabled_emits_no_file() {
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
|
||||
@@ -192,7 +192,7 @@ fn pdf_ocr_progress_emits_started_finished_events() {
|
||||
std::fs::create_dir_all(&data_dir).expect("create data dir");
|
||||
|
||||
let mut config = kebab_config::Config::defaults();
|
||||
config.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
config.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -12,7 +12,7 @@ fn fresh_cfg(dir: &std::path::Path) -> Config {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
@@ -34,7 +34,7 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
|
||||
assert_eq!(report.new, 1, "{report:?}");
|
||||
|
||||
// _external/ contains exactly one .md file with frontmatter.
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let ext_dir = cfg.resolve_workspace_root().join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
@@ -56,7 +56,7 @@ fn ingest_stdin_without_source_uri() {
|
||||
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
|
||||
assert_eq!(report.new, 1);
|
||||
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let ext_dir = cfg.resolve_workspace_root().join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
|
||||
@@ -6,7 +6,7 @@ use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
|
||||
@@ -8,7 +8,7 @@ use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut config = Config::defaults();
|
||||
config.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
config.workspace.exclude.clear();
|
||||
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
|
||||
@@ -9,7 +9,7 @@ use common::TestEnv;
|
||||
#[test]
|
||||
fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let workspace_root = std::path::PathBuf::from(&env.config.workspace.root);
|
||||
let workspace_root = env.config.resolve_workspace_root();
|
||||
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
|
||||
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
|
||||
|
||||
|
||||
@@ -242,6 +242,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -244,6 +244,7 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -450,6 +450,7 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: kebab_core::ParserVersion("test-parser-0".into()),
|
||||
|
||||
@@ -355,6 +355,7 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
@@ -533,6 +534,7 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
|
||||
@@ -111,6 +111,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -118,6 +118,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -68,6 +68,7 @@ fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some(lang.into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -67,6 +67,7 @@ fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("dockerfile".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -67,6 +67,7 @@ fn yaml_doc(yaml_text: &str) -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("yaml".into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -58,6 +58,8 @@ fn long_section_chunks_snapshot() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
let (metadata, fm_span, _fm_warns) =
|
||||
parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
|
||||
@@ -133,6 +135,8 @@ fn long_section_chunks_are_deterministic() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
|
||||
let policy = ChunkPolicy {
|
||||
|
||||
@@ -67,6 +67,7 @@ fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some(lang.into()),
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
|
||||
@@ -193,6 +193,31 @@ enum Cmd {
|
||||
)]
|
||||
code_lang: Vec<String>,
|
||||
|
||||
/// Phase-2: filter by document source_type
|
||||
/// (`markdown`, `note`, `paper`, `reference`, `inbox`).
|
||||
/// Repeatable or comma-separated. Empty = no filter.
|
||||
/// The clean source/provenance lever for mixed-source KBs.
|
||||
#[arg(
|
||||
long = "source-type",
|
||||
value_name = "TYPE",
|
||||
num_args = 1,
|
||||
value_delimiter = ','
|
||||
)]
|
||||
source_type: Vec<String>,
|
||||
|
||||
/// [[workspace.sources]]: filter by source id — the `id` of the
|
||||
/// `[[workspace.sources]]` entry a document was ingested from
|
||||
/// (e.g. `default`, `notes`, `code`). Repeatable or
|
||||
/// comma-separated. Empty = no filter. The named-source
|
||||
/// provenance lever for multi-source KBs.
|
||||
#[arg(
|
||||
long = "source",
|
||||
value_name = "ID",
|
||||
num_args = 1,
|
||||
value_delimiter = ','
|
||||
)]
|
||||
source: Vec<String>,
|
||||
|
||||
/// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
|
||||
/// lists + per-stage timing in the response. Bypasses cache
|
||||
/// (debug intent — fresh run guaranteed). Requires embeddings
|
||||
@@ -615,12 +640,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
force_reingest,
|
||||
} => {
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
let scope = kebab_core::SourceScope {
|
||||
root: root
|
||||
.clone()
|
||||
.unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
// [[workspace.sources]]: when the user passes `--root <dir>` we pin
|
||||
// that single root (one ad-hoc `default` source). Otherwise we
|
||||
// leave `scope.root` EMPTY so the app iterates every configured
|
||||
// source (`config.resolved_sources()`); a bare empty scope.exclude
|
||||
// is fine because each source carries its own merged exclude.
|
||||
let scope = match root.clone() {
|
||||
Some(r) => kebab_core::SourceScope {
|
||||
root: r,
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
},
|
||||
None => kebab_core::SourceScope::default(),
|
||||
};
|
||||
|
||||
// p9-fb-02: spawn the progress display on a background
|
||||
@@ -629,8 +660,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
// call returns, the `Sender` drops and the display thread
|
||||
// sees `recv()` return Err — exits cleanly.
|
||||
let plain_env = std::env::var("KEBAB_PROGRESS")
|
||||
.map(|v| v.eq_ignore_ascii_case("plain"))
|
||||
.unwrap_or(false);
|
||||
.is_ok_and(|v| v.eq_ignore_ascii_case("plain"));
|
||||
let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env);
|
||||
|
||||
// Surface the active embedding backend/device on the terminal so the
|
||||
@@ -828,6 +858,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
doc_id,
|
||||
repo,
|
||||
code_lang,
|
||||
source_type,
|
||||
source,
|
||||
trace,
|
||||
bulk,
|
||||
} => {
|
||||
@@ -967,6 +999,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())),
|
||||
repo: repo.clone(),
|
||||
code_lang: code_lang.clone(),
|
||||
source_type: source_type.clone(),
|
||||
source_id: source.clone(),
|
||||
};
|
||||
|
||||
let q = kebab_core::SearchQuery {
|
||||
|
||||
@@ -12,6 +12,12 @@ mod paths;
|
||||
pub mod migrate;
|
||||
pub use paths::{expand_path, expand_path_with_base};
|
||||
|
||||
/// Implicit source id used when a single-root `[workspace]` config (no
|
||||
/// `[[workspace.sources]]`) is normalized into the multi-source model, and
|
||||
/// the `DEFAULT` value of the `documents.source_id` column. Kept in sync
|
||||
/// with the `DEFAULT 'default'` of the V014 `documents.source_id` migration.
|
||||
pub const DEFAULT_SOURCE_ID: &str = "default";
|
||||
|
||||
/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다.
|
||||
/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다.
|
||||
/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고
|
||||
@@ -88,8 +94,67 @@ pub struct Config {
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspaceCfg {
|
||||
pub root: String,
|
||||
/// Single-root workspace (legacy / common case). `Option` so that a
|
||||
/// config that declares only `[[workspace.sources]]` (no bare `root`)
|
||||
/// parses — and, symmetrically, a legacy single-`root` config (no
|
||||
/// `sources`) still parses unchanged. The load-time normalizer
|
||||
/// ([`Config::normalize_sources`]) reconciles the two into a single
|
||||
/// non-empty `sources` list (`id = "default"` synthesized from `root`).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub root: Option<String>,
|
||||
pub exclude: Vec<String>,
|
||||
/// `[[workspace.sources]]`: named multi-source declaration. When empty
|
||||
/// and `root` is set, the load path normalizes to a single implicit
|
||||
/// `default` source. Each entry stamps its `id` onto every document it
|
||||
/// ingests and supplies per-source `trust_level` / `source_type`
|
||||
/// defaults (frontmatter still wins per the §0 Q9 derive table).
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub sources: Vec<SourceCfg>,
|
||||
}
|
||||
|
||||
/// One named source under `[[workspace.sources]]`.
|
||||
///
|
||||
/// `trust_level` / `source_type` are the **source-level defaults**: they
|
||||
/// apply when a document's frontmatter does not specify the field. The
|
||||
/// precedence is `frontmatter > source default > hardcoded`
|
||||
/// (`TrustLevel::Primary` / `SourceType::Markdown`) — implemented in the
|
||||
/// markdown derive via `BodyHints::fallback_trust_level`.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SourceCfg {
|
||||
/// Stable identifier stamped onto `documents.source_id` for every
|
||||
/// document ingested from this source. Must be unique and non-empty
|
||||
/// across the workspace (enforced in [`Config::validate`]).
|
||||
pub id: String,
|
||||
/// Root directory to walk for this source. Accepts the same
|
||||
/// absolute / `~` / `${VAR}` / relative(=config-dir-based) forms as
|
||||
/// the legacy `workspace.root`.
|
||||
pub root: String,
|
||||
/// Per-source denylist globs, merged on top of `workspace.exclude`.
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub exclude: Vec<String>,
|
||||
/// Per-source default `trust_level` (frontmatter overrides it).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub trust_level: Option<kebab_core::TrustLevel>,
|
||||
/// Per-source default `source_type` (frontmatter overrides it).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub source_type: Option<kebab_core::SourceType>,
|
||||
}
|
||||
|
||||
/// A source with its `root` resolved to an absolute path and its `exclude`
|
||||
/// merged with `workspace.exclude`. Produced by [`Config::resolved_sources`]
|
||||
/// — the single entry point the ingest pipeline iterates over.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct ResolvedSource {
|
||||
/// Stamped onto `documents.source_id`.
|
||||
pub id: String,
|
||||
/// Absolute walk root (tilde / `${VAR}` / relative-to-config resolved).
|
||||
pub root: PathBuf,
|
||||
/// `workspace.exclude` ∪ per-source `exclude`.
|
||||
pub exclude: Vec<String>,
|
||||
/// Per-source default trust level (None → fall back to `Primary`).
|
||||
pub trust_level: Option<kebab_core::TrustLevel>,
|
||||
/// Per-source default source type (None → fall back to `Markdown`).
|
||||
pub source_type: Option<kebab_core::SourceType>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -782,12 +847,13 @@ impl Config {
|
||||
Self {
|
||||
schema_version: crate::migrate::CURRENT_SCHEMA_VERSION,
|
||||
workspace: WorkspaceCfg {
|
||||
root: "~/KnowledgeBase".to_string(),
|
||||
root: Some("~/KnowledgeBase".to_string()),
|
||||
exclude: vec![
|
||||
".git/**".to_string(),
|
||||
"node_modules/**".to_string(),
|
||||
".obsidian/**".to_string(),
|
||||
],
|
||||
sources: vec![],
|
||||
},
|
||||
storage: StorageCfg {
|
||||
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kebab".to_string(),
|
||||
@@ -906,7 +972,78 @@ impl Config {
|
||||
PathBuf::from(".")
|
||||
})
|
||||
});
|
||||
paths::expand_path_with_base(&self.workspace.root, "", &base)
|
||||
paths::expand_path_with_base(&self.primary_root_raw(), "", &base)
|
||||
}
|
||||
|
||||
/// The raw (unexpanded) string for the *primary* workspace root, used by
|
||||
/// [`resolve_workspace_root`](Self::resolve_workspace_root) and any
|
||||
/// single-root code path. Order: first `[[workspace.sources]]` entry's
|
||||
/// `root` → bare `workspace.root` → `~/KnowledgeBase` default. This keeps
|
||||
/// every pre-existing single-root call site working when only `sources`
|
||||
/// is declared.
|
||||
fn primary_root_raw(&self) -> String {
|
||||
if let Some(s) = self.workspace.sources.first() {
|
||||
return s.root.clone();
|
||||
}
|
||||
self.workspace
|
||||
.root
|
||||
.clone()
|
||||
.unwrap_or_else(|| "~/KnowledgeBase".to_string())
|
||||
}
|
||||
|
||||
/// The base directory for resolving relative source roots: the config
|
||||
/// file's directory when loaded from disk, else the current dir (mirrors
|
||||
/// [`resolve_workspace_root`](Self::resolve_workspace_root)).
|
||||
fn root_resolution_base(&self) -> PathBuf {
|
||||
self.source_dir.clone().unwrap_or_else(|| {
|
||||
std::env::current_dir().unwrap_or_else(|e| {
|
||||
tracing::warn!(
|
||||
target: "kebab-config",
|
||||
error = %e,
|
||||
"current_dir() failed; falling back to '.' for source root resolution"
|
||||
);
|
||||
PathBuf::from(".")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Normalized, resolved list of sources to ingest. Always non-empty:
|
||||
///
|
||||
/// - If `[[workspace.sources]]` is declared, each entry is returned with
|
||||
/// its `root` expanded and `exclude` merged with `workspace.exclude`.
|
||||
/// - Otherwise a single implicit source `id = "default"` is synthesized
|
||||
/// from `workspace.root` (the legacy single-root path).
|
||||
///
|
||||
/// This is the single entry point the ingest pipeline iterates over, so
|
||||
/// single-root and multi-source configs share one code path.
|
||||
pub fn resolved_sources(&self) -> Vec<ResolvedSource> {
|
||||
let base = self.root_resolution_base();
|
||||
if self.workspace.sources.is_empty() {
|
||||
let root = paths::expand_path_with_base(&self.primary_root_raw(), "", &base);
|
||||
return vec![ResolvedSource {
|
||||
id: DEFAULT_SOURCE_ID.to_string(),
|
||||
root,
|
||||
exclude: self.workspace.exclude.clone(),
|
||||
trust_level: None,
|
||||
source_type: None,
|
||||
}];
|
||||
}
|
||||
self.workspace
|
||||
.sources
|
||||
.iter()
|
||||
.map(|s| {
|
||||
let root = paths::expand_path_with_base(&s.root, "", &base);
|
||||
let mut exclude = self.workspace.exclude.clone();
|
||||
exclude.extend(s.exclude.iter().cloned());
|
||||
ResolvedSource {
|
||||
id: s.id.clone(),
|
||||
root,
|
||||
exclude,
|
||||
trust_level: s.trust_level,
|
||||
source_type: s.source_type,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Read config from disk and merge env overrides on top of it. If the
|
||||
@@ -1019,10 +1156,41 @@ impl Config {
|
||||
cause: format!("parse_failed: {e}"),
|
||||
})
|
||||
})?;
|
||||
cfg.validate_sources().map_err(|cause| {
|
||||
anyhow::Error::new(ConfigInvalid {
|
||||
path: path.to_path_buf(),
|
||||
cause,
|
||||
})
|
||||
})?;
|
||||
cfg.source_dir = path.parent().map(Path::to_path_buf);
|
||||
Ok(cfg)
|
||||
}
|
||||
|
||||
/// Validate `[[workspace.sources]]`: every `id` must be non-empty and
|
||||
/// unique across the workspace. Empty `sources` (legacy single-root) is
|
||||
/// always valid. Returns the failure cause string for `ConfigInvalid`.
|
||||
fn validate_sources(&self) -> Result<(), String> {
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
for s in &self.workspace.sources {
|
||||
if s.id.trim().is_empty() {
|
||||
return Err("workspace.sources: an entry has an empty `id`".to_string());
|
||||
}
|
||||
if s.root.trim().is_empty() {
|
||||
return Err(format!(
|
||||
"workspace.sources: source `{}` has an empty `root`",
|
||||
s.id
|
||||
));
|
||||
}
|
||||
if !seen.insert(s.id.as_str()) {
|
||||
return Err(format!(
|
||||
"workspace.sources: duplicate source id `{}` (ids must be unique)",
|
||||
s.id
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply `KEBAB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
|
||||
///
|
||||
/// The mapping is an explicit grep-friendly whitelist — one match arm
|
||||
@@ -1037,7 +1205,7 @@ impl Config {
|
||||
}
|
||||
match k.as_str() {
|
||||
// workspace
|
||||
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
|
||||
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = Some(v.clone()),
|
||||
|
||||
// storage
|
||||
"KEBAB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(),
|
||||
@@ -2034,7 +2202,7 @@ max_context_tokens = 8000
|
||||
#[test]
|
||||
fn legacy_include_field_is_ignored_silently() {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = "/tmp/kebab-legacy".to_string();
|
||||
cfg.workspace.root = Some("/tmp/kebab-legacy".to_string());
|
||||
let mut toml_text = toml::to_string(&cfg).expect("default round-trips");
|
||||
// Inject a legacy `include = [...]` line into the [workspace] block.
|
||||
toml_text = toml_text.replace(
|
||||
@@ -2048,20 +2216,105 @@ max_context_tokens = 8000
|
||||
parsed.err()
|
||||
);
|
||||
let cfg = parsed.unwrap();
|
||||
assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy");
|
||||
assert_eq!(cfg.workspace.root.as_deref(), Some("/tmp/kebab-legacy"));
|
||||
}
|
||||
|
||||
/// p9-fb-25: `WorkspaceCfg` must NOT have an `include` field.
/// Compile-time proof: the pattern below names every field without `..`,
/// so adding or removing a field on `WorkspaceCfg` breaks compilation of
/// this test until the pattern is updated.
#[test]
fn workspace_cfg_has_only_root_exclude_sources_fields() {
    let ws = Config::defaults().workspace;
    let WorkspaceCfg {
        root: _,
        exclude: _,
        sources: _,
    } = &ws;
}
|
||||
|
||||
#[test]
fn legacy_single_root_normalizes_to_default_source() {
    // Without any [[workspace.sources]] entry, resolution must yield exactly
    // one source: id "default", rooted at workspace.root, no trust default.
    let cfg = {
        let mut single_root = Config::defaults();
        single_root.workspace.root = Some("/tmp/kb-notes".to_string());
        single_root
    };

    let sources = cfg.resolved_sources();
    assert_eq!(sources.len(), 1);

    let only = &sources[0];
    assert_eq!(only.id, DEFAULT_SOURCE_ID);
    assert_eq!(only.root, std::path::PathBuf::from("/tmp/kb-notes"));
    assert_eq!(only.trust_level, None);
}
|
||||
|
||||
#[test]
fn multi_source_config_resolves_each_with_merged_exclude() {
    let notes = SourceCfg {
        id: "notes".to_string(),
        root: "/tmp/notes".to_string(),
        exclude: vec![],
        trust_level: Some(kebab_core::TrustLevel::Primary),
        source_type: None,
    };
    let refs = SourceCfg {
        id: "refs".to_string(),
        root: "/tmp/refs".to_string(),
        exclude: vec!["draft/**".to_string()],
        trust_level: Some(kebab_core::TrustLevel::Secondary),
        source_type: Some(kebab_core::SourceType::Reference),
    };

    let mut declared = Config::defaults();
    declared.workspace.root = None;
    declared.workspace.exclude = vec![".git/**".to_string()];
    declared.workspace.sources = vec![notes, refs];

    // A config with sources but no bare root has to survive a TOML
    // round-trip before it is validated and resolved.
    let toml_text = toml::to_string(&declared).expect("multi-source serializes");
    let cfg: Config = toml::from_str(&toml_text).expect("multi-source parses");
    cfg.validate_sources().expect("valid sources");

    let resolved = cfg.resolved_sources();
    assert_eq!(resolved.len(), 2);
    let (first, second) = (&resolved[0], &resolved[1]);

    assert_eq!(first.id, "notes");
    assert_eq!(first.root, std::path::PathBuf::from("/tmp/notes"));
    assert_eq!(first.exclude, vec![".git/**".to_string()]);
    assert_eq!(first.trust_level, Some(kebab_core::TrustLevel::Primary));

    assert_eq!(second.id, "refs");
    // Workspace-wide globs first, then the per-source ones.
    assert_eq!(
        second.exclude,
        vec![".git/**".to_string(), "draft/**".to_string()]
    );
    assert_eq!(
        second.source_type,
        Some(kebab_core::SourceType::Reference)
    );
    assert_eq!(
        second.trust_level,
        Some(kebab_core::TrustLevel::Secondary)
    );
}
|
||||
|
||||
fn source_cfg(id: &str, root: &str) -> SourceCfg {
|
||||
SourceCfg {
|
||||
id: id.to_string(),
|
||||
root: root.to_string(),
|
||||
exclude: vec![],
|
||||
trust_level: None,
|
||||
source_type: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn duplicate_source_ids_rejected() {
    // Two entries sharing the id "dup" (different roots) must not validate.
    let mut cfg = Config::defaults();
    cfg.workspace.sources = ["/a", "/b"]
        .into_iter()
        .map(|root| source_cfg("dup", root))
        .collect();

    let outcome = cfg.validate_sources();
    assert!(outcome.is_err(), "duplicate ids must fail");
}
|
||||
|
||||
#[test]
fn empty_source_id_rejected() {
    // A source whose id is the empty string must not validate.
    let mut cfg = Config::defaults();
    let nameless = source_cfg("", "/a");
    cfg.workspace.sources = vec![nameless];

    let outcome = cfg.validate_sources();
    assert!(outcome.is_err(), "empty id must fail");
}
|
||||
|
||||
#[test]
|
||||
fn default_stale_threshold_is_30() {
|
||||
let c = Config::defaults();
|
||||
|
||||
@@ -9,7 +9,7 @@ use toml_edit::{DocumentMut, Item};
|
||||
|
||||
/// Config schema version understood by the current binary. Once a
/// migration completes, the `schema_version` in the user's file is stamped
/// with this value.
pub const CURRENT_SCHEMA_VERSION: u32 = 4;
|
||||
|
||||
/// 한 번의 마이그레이션에서 발생한 개별 변경.
|
||||
#[derive(Clone, Debug, PartialEq, serde::Serialize)]
|
||||
@@ -68,6 +68,7 @@ const HEADER: &str = "\
|
||||
fn section_comment(path: &str) -> Option<&'static str> {
|
||||
Some(match path {
|
||||
"workspace" => "# 색인 대상 워크스페이스.",
|
||||
"workspace.sources" => "# named multi-source (각 source 의 id 가 documents.source_id 로 stamp).",
|
||||
"storage" => "# XDG 저장 경로(데이터/sqlite/벡터/에셋/모델).",
|
||||
"indexing" => "# 병렬도 + 파일시스템 watch.",
|
||||
"chunking" => "# 청크 크기·오버랩·heading 존중.",
|
||||
@@ -376,6 +377,39 @@ pub fn step_2_to_3(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
|
||||
copy_image_paddle_to_pdf(doc);
|
||||
}
|
||||
|
||||
/// v3 → v4: 단일 `workspace.root` 를 `[[workspace.sources]]` 의 implicit
|
||||
/// `default` source 로 미러링한다(`id = "default"`, `root = <기존 root>`).
|
||||
/// 기존 `workspace.root` 키는 그대로 둔다 — `resolved_sources()` 가 sources
|
||||
/// 가 있으면 그쪽을 우선하므로 무해하고, defaults reconcile 이 root 를 다시
|
||||
/// 추가하려 하지 않게 한다. 멱등: `[[workspace.sources]]` 가 이미 있으면 no-op.
|
||||
pub fn step_3_to_4(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
|
||||
let Some(ws) = doc.get_mut("workspace").and_then(Item::as_table_mut) else {
|
||||
return;
|
||||
};
|
||||
// 이미 sources 가 선언돼 있으면(array-of-tables 든 inline 이든) 손대지 않음.
|
||||
if ws.contains_key("sources") {
|
||||
return;
|
||||
}
|
||||
// root 가 없으면 만들 게 없음(defaults 에는 항상 있지만 방어).
|
||||
let Some(root_val) = ws.get("root").and_then(Item::as_str).map(str::to_string) else {
|
||||
return;
|
||||
};
|
||||
|
||||
let mut entry = toml_edit::Table::new();
|
||||
entry.insert("id", toml_edit::value("default"));
|
||||
entry.insert("root", toml_edit::value(root_val));
|
||||
|
||||
let mut aot = toml_edit::ArrayOfTables::new();
|
||||
aot.push(entry);
|
||||
ws.insert("sources", Item::ArrayOfTables(aot));
|
||||
|
||||
changes.push(MigrationChange {
|
||||
kind: ChangeKind::AddedSection,
|
||||
path: "workspace.sources".to_string(),
|
||||
detail: "workspace.root → [[workspace.sources]] id=default".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
/// 파일의 schema_version(없으면 1) 부터 CURRENT 까지 step 적용.
|
||||
fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange>) {
|
||||
if from < 2 {
|
||||
@@ -384,6 +418,9 @@ fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange
|
||||
if from < 3 {
|
||||
step_2_to_3(doc, changes);
|
||||
}
|
||||
if from < 4 {
|
||||
step_3_to_4(doc, changes);
|
||||
}
|
||||
}
|
||||
|
||||
/// 사용자 config.toml 텍스트를 받아 step 체인 + reconciliation + version
|
||||
@@ -648,6 +685,76 @@ engine = \"paddle-onnx\"
|
||||
assert!(again.is_empty(), "not idempotent: {again:?}");
|
||||
}
|
||||
|
||||
#[test]
fn step_3_to_4_mirrors_root_into_default_source() {
    let v3 = concat!(
        "schema_version = 3\n",
        "\n",
        "[workspace]\n",
        "root = \"/my/notes\"\n",
        "exclude = [\".git/**\"]\n",
    );
    let mut doc: DocumentMut = v3.parse().unwrap();
    let mut first_pass = Vec::new();
    step_3_to_4(&mut doc, &mut first_pass);
    let out = doc.to_string();

    // A new array-of-tables entry with id=default is added.
    assert!(out.contains("[[workspace.sources]]"), "{out}");
    assert!(out.contains("id = \"default\""), "{out}");
    // The original root key is preserved.
    assert!(out.contains("root = \"/my/notes\""), "{out}");

    // After re-parsing, the first source mirrors the root.
    let reparsed: DocumentMut = out.parse().unwrap();
    let mirrored = reparsed["workspace"]["sources"][0].as_table().unwrap();
    assert_eq!(mirrored["id"].as_str(), Some("default"));
    assert_eq!(mirrored["root"].as_str(), Some("/my/notes"));

    // Idempotence: a second run records no change.
    let mut second_pass = Vec::new();
    step_3_to_4(&mut doc, &mut second_pass);
    assert!(second_pass.is_empty(), "step_3_to_4 not idempotent");
}
|
||||
|
||||
#[test]
fn step_3_to_4_noop_when_sources_already_present() {
    let v4 = concat!(
        "schema_version = 4\n",
        "\n",
        "[workspace]\n",
        "root = \"/my/notes\"\n",
        "exclude = []\n",
        "\n",
        "[[workspace.sources]]\n",
        "id = \"notes\"\n",
        "root = \"/my/notes\"\n",
    );
    let mut doc: DocumentMut = v4.parse().unwrap();
    let mut recorded = Vec::new();
    step_3_to_4(&mut doc, &mut recorded);

    assert!(recorded.is_empty(), "must not touch existing sources");
    // Only the pre-existing source remains; no `default` entry was added.
    let rendered = doc.to_string();
    assert!(!rendered.contains("id = \"default\""));
}
|
||||
|
||||
#[test]
fn migrate_document_v3_to_v4_adds_sources_and_is_idempotent() {
    let v3 = concat!(
        "schema_version = 3\n",
        "\n",
        "[workspace]\n",
        "root = \"/n\"\n",
        "exclude = []\n",
    );

    // First run: v3 is lifted to v4 and gains a sources section.
    let outcome = migrate_document(v3);
    assert_eq!(outcome.from_schema_version, 3);
    assert_eq!(outcome.to_schema_version, 4);
    assert!(outcome.changed());
    assert!(outcome.new_text.contains("[[workspace.sources]]"));
    assert_eq!(read_schema_version(&outcome.new_text), 4);

    // Second run over the migrated text: no changes, identical output.
    let again = migrate_document(&outcome.new_text);
    assert!(!again.changed(), "not idempotent: {:?}", again.changes);
    assert_eq!(again.new_text, outcome.new_text);
}
|
||||
|
||||
#[test]
|
||||
fn migrate_document_missing_schema_version_treated_as_v1() {
|
||||
let old = "[workspace]\nroot = \"/n\"\n";
|
||||
|
||||
@@ -11,11 +11,16 @@ const USER_V2: &str = include_str!("fixtures/user_v2_config.toml");
|
||||
fn user_v2_migrates_losslessly() {
|
||||
let out = migrate_document(USER_V2);
|
||||
assert_eq!(out.from_schema_version, 2);
|
||||
assert_eq!(out.to_schema_version, 3);
|
||||
// v2 → CURRENT(=4): v3 의 [ingest.*] relocation 에 더해 v4 의
|
||||
// [[workspace.sources]] default source 미러링까지 적용된다.
|
||||
assert_eq!(out.to_schema_version, 4);
|
||||
let t = &out.new_text;
|
||||
|
||||
// 사용자 값 보존.
|
||||
assert!(t.contains("root = \"/Users/user/Obsidian/Default\""), "{t}");
|
||||
// v4: workspace.root → [[workspace.sources]] id=default 미러링.
|
||||
assert!(t.contains("[[workspace.sources]]"), "v4 sources 누락:\n{t}");
|
||||
assert!(t.contains("id = \"default\""), "default source 누락:\n{t}");
|
||||
assert!(t.contains("model = \"snowflake-arctic-embed2\""));
|
||||
assert!(t.contains("endpoint = \"http://192.168.0.2:11943\""));
|
||||
// 사용자 주석/대안 줄 보존.
|
||||
|
||||
@@ -36,6 +36,14 @@ pub struct Metadata {
|
||||
/// for markdown / pdf / image. Set by the local-filesystem source connector during ingest.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub code_lang: Option<String>,
|
||||
|
||||
/// `[[workspace.sources]]`: id of the named source this document was
|
||||
/// ingested from (the `id` of the matching `[[workspace.sources]]`
|
||||
/// entry; `"default"` for single-root workspaces normalized to the
|
||||
/// implicit `default` source). null on documents ingested before the
|
||||
/// multi-source feature; the store column defaults to `"default"`.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub source_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
@@ -105,12 +113,14 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
let v = serde_json::to_value(&m).unwrap();
|
||||
assert!(v.get("repo").is_none());
|
||||
assert!(v.get("git_branch").is_none());
|
||||
assert!(v.get("git_commit").is_none());
|
||||
assert!(v.get("code_lang").is_none());
|
||||
assert!(v.get("source_id").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -128,8 +138,10 @@ mod tests {
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("a".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
source_id: Some("notes".into()),
|
||||
};
|
||||
let v = serde_json::to_value(&m).unwrap();
|
||||
assert_eq!(v["source_id"], "notes");
|
||||
assert_eq!(v["repo"], "kebab");
|
||||
assert_eq!(v["git_branch"], "main");
|
||||
assert_eq!(v["git_commit"].as_str().unwrap().len(), 40);
|
||||
|
||||
@@ -69,6 +69,20 @@ pub struct SearchFilters {
|
||||
/// Unknown values produce empty hits (consistent with `media` policy).
|
||||
#[serde(default)]
|
||||
pub code_lang: Vec<String>,
|
||||
/// Phase-2 (jira-contamination experiment): filter by `documents.source_type`
|
||||
/// (`markdown` | `note` | `paper` | `reference` | `inbox`). Empty = no filter;
|
||||
/// multi-value = OR. Direct indexed column — the clean provenance/source lever:
|
||||
/// filtering recovers concept-query precision without the see-saw of global
|
||||
/// trust-weighting (see tasks/HOTFIXES.md A/B evidence).
|
||||
#[serde(default)]
|
||||
pub source_type: Vec<String>,
|
||||
/// `[[workspace.sources]]`: filter by `documents.source_id` (the `id` of
|
||||
/// the `[[workspace.sources]]` entry a document was ingested from; e.g.
|
||||
/// `default`, `notes`, `code`). Empty = no filter; multi-value = OR.
|
||||
/// Direct indexed column (idx_docs_source_id) — the named-source
|
||||
/// provenance lever for multi-source KBs.
|
||||
#[serde(default)]
|
||||
pub source_id: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
|
||||
@@ -107,6 +107,8 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult {
|
||||
doc_id: input.doc_id.clone().map(kebab_core::DocumentId),
|
||||
repo: vec![],
|
||||
code_lang: vec![],
|
||||
source_type: vec![],
|
||||
source_id: vec![],
|
||||
};
|
||||
|
||||
let query = kebab_core::SearchQuery {
|
||||
|
||||
@@ -10,7 +10,7 @@ async fn schema_tool_emits_error_v1_when_db_missing() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
|
||||
cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned());
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
// Note: NO ingest call — kebab.sqlite is absent → schema_with_config
|
||||
|
||||
@@ -10,7 +10,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -27,7 +27,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -12,7 +12,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -9,10 +9,10 @@ async fn doctor_tool_returns_doctor_v1_json() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = dir.path().join("data").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned());
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
std::fs::create_dir_all(&cfg.workspace.root).unwrap();
|
||||
std::fs::create_dir_all(cfg.resolve_workspace_root()).unwrap();
|
||||
|
||||
// Pass None for config_path — doctor falls back to XDG default probe
|
||||
// (path won't exist in the tempdir, which is fine; doctor reports it
|
||||
|
||||
@@ -16,7 +16,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -15,7 +15,7 @@ async fn ingest_file_tool_returns_ingest_report_v1() {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
@@ -61,7 +61,7 @@ async fn ingest_file_tool_idempotent_on_second_call() {
|
||||
std::fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -14,7 +14,7 @@ fn fresh_state(dir: &std::path::Path) -> KebabAppState {
|
||||
fs::create_dir_all(&data).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
|
||||
cfg.storage.data_dir = data.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
|
||||
@@ -131,6 +131,7 @@ impl Extractor for CAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("c".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -155,6 +155,7 @@ impl Extractor for CppAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("cpp".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -133,6 +133,7 @@ impl Extractor for GoAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("go".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -144,6 +144,7 @@ impl Extractor for JavaAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("java".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -151,6 +151,7 @@ impl Extractor for JavascriptAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("javascript".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -149,6 +149,7 @@ impl Extractor for KotlinAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("kotlin".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -133,6 +133,7 @@ impl Extractor for PythonAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("python".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -136,6 +136,7 @@ impl Extractor for RustAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("rust".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -144,6 +144,7 @@ impl Extractor for TypescriptAstExtractor {
|
||||
git_branch,
|
||||
git_commit,
|
||||
code_lang: Some("typescript".to_string()),
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -203,6 +203,7 @@ impl Extractor for ImageExtractor {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -42,6 +42,16 @@ pub struct BodyHints {
|
||||
/// Optional language fallback used when neither frontmatter nor lingua
|
||||
/// detection produce a value. If `None` the final fallback is `"und"`.
|
||||
pub fallback_lang: Option<String>,
|
||||
/// `[[workspace.sources]]`: id of the source this document is being
|
||||
/// ingested from. Copied verbatim into `Metadata.source_id` (frontmatter
|
||||
/// does not override the source id — it is an ingest-time provenance
|
||||
/// stamp, not a user-authored field). `None` when single-root /
|
||||
/// unspecified, in which case `Metadata.source_id` stays `None`.
|
||||
pub source_id: Option<String>,
|
||||
/// `[[workspace.sources]]`: per-source default `trust_level`. Consulted
|
||||
/// only when the frontmatter does not specify `trust_level`. Precedence:
|
||||
/// frontmatter > this source default > hardcoded `Primary`.
|
||||
pub fallback_trust_level: Option<TrustLevel>,
|
||||
}
|
||||
|
||||
/// Byte range of the frontmatter region inside the input slice.
|
||||
@@ -444,8 +454,12 @@ fn derive_metadata(
|
||||
};
|
||||
|
||||
// ---- trust_level ----
|
||||
// Precedence: frontmatter > per-source default (hints.fallback_trust_level)
|
||||
// > hardcoded Primary. An *unknown* frontmatter value warns and also falls
|
||||
// through to the source default (then Primary), so a typo doesn't silently
|
||||
// promote past the source's intended trust.
|
||||
let trust_level = match raw.trust_level.as_deref() {
|
||||
None => TrustLevel::Primary,
|
||||
None => hints.fallback_trust_level.unwrap_or(TrustLevel::Primary),
|
||||
Some(s) => {
|
||||
if let Some(tl) = parse_trust_level(s) {
|
||||
tl
|
||||
@@ -454,7 +468,7 @@ fn derive_metadata(
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("unknown trust_level={s}, defaulted to primary"),
|
||||
});
|
||||
TrustLevel::Primary
|
||||
hints.fallback_trust_level.unwrap_or(TrustLevel::Primary)
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -477,6 +491,10 @@ fn derive_metadata(
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
// `[[workspace.sources]]`: ingest-time provenance stamp. Frontmatter
|
||||
// does not override the source id — it is supplied by the caller
|
||||
// (kebab-app) from the matching source's config `id`.
|
||||
source_id: hints.source_id.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,6 +622,8 @@ mod tests {
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -695,6 +715,47 @@ source_type: alien\n\
|
||||
assert!(warns.iter().any(|w| w.note.contains("source_type=alien")));
|
||||
}
|
||||
|
||||
fn hints_with_source(id: &str, trust: Option<TrustLevel>) -> BodyHints {
|
||||
BodyHints {
|
||||
source_id: Some(id.to_string()),
|
||||
fallback_trust_level: trust,
|
||||
..hints()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn source_default_trust_applied_when_frontmatter_absent() {
    // The frontmatter only carries a title, so `trust_level` has to come from
    // the source's default (Secondary) instead of the hardcoded Primary.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let parsed = parse_frontmatter(b"---\ntitle: Doc\n---\nbody\n", &source_hints);
    let (meta, _span, warns) = parsed.unwrap();

    assert!(warns.is_empty(), "warnings: {warns:?}");
    assert_eq!(meta.trust_level, TrustLevel::Secondary);
    // The source id from the hints is stamped onto the metadata.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
|
||||
|
||||
#[test]
fn frontmatter_trust_overrides_source_default() {
    // The source default is Secondary, but the document itself declares
    // `generated`; the explicit frontmatter value must take precedence.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let parsed = parse_frontmatter(b"---\ntrust_level: generated\n---\nbody\n", &source_hints);
    let (meta, _span, _warns) = parsed.unwrap();

    assert_eq!(meta.trust_level, TrustLevel::Generated);
    // The source id stamp is unaffected by the frontmatter override.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
|
||||
|
||||
#[test]
fn no_source_id_leaves_metadata_source_id_none() {
    // Plain hints: no source id and no per-source trust default.
    let plain_hints = hints();
    let parsed = parse_frontmatter(b"---\ntitle: Doc\n---\nbody\n", &plain_hints);
    let (meta, _span, _warns) = parsed.unwrap();

    // Nothing was supplied by the caller, so nothing is stamped.
    assert_eq!(meta.source_id, None);
    // With no source default either, the hardcoded Primary still applies.
    assert_eq!(meta.trust_level, TrustLevel::Primary);
}
|
||||
|
||||
#[test]
|
||||
fn malformed_yaml_emits_warning_and_defaults() {
|
||||
// Unclosed quote → YAML parse fails.
|
||||
|
||||
@@ -469,6 +469,7 @@ mod tests {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,8 @@ fn pinned_hints() -> BodyHints {
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -86,6 +86,8 @@ fn code_and_table_canonical_snapshot() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
let (metadata, fm_span, _fm_warns) =
|
||||
parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
|
||||
|
||||
@@ -203,6 +203,7 @@ impl Extractor for PdfTextExtractor {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
|
||||
@@ -419,6 +419,31 @@ fn run_query(
|
||||
}
|
||||
}
|
||||
|
||||
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
|
||||
// column). Empty Vec = no filter; multi-value = OR. Mirrors filters.rs.
|
||||
if !filters.source_type.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
|
||||
for st in &filters.source_type {
|
||||
params.push(Box::new(st.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// [[workspace.sources]]: source_id filter (IN-list on the direct
|
||||
// `documents.source_id` column). Empty Vec = no filter; multi-value = OR.
|
||||
// Mirrors filters.rs.
|
||||
if !filters.source_id.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
|
||||
for sid in &filters.source_id {
|
||||
params.push(Box::new(sid.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
|
||||
// fb-32 ingest path), so lexicographic >= compare is correct — but only
|
||||
|
||||
@@ -231,6 +231,47 @@ pub struct FsScanSkips {
|
||||
pub events: Vec<FsSkipEvent>,
|
||||
}
|
||||
|
||||
impl FsScanSkips {
    /// `[[workspace.sources]]`: absorbs the scan skips of another source into
    /// `self`, so a multi-source ingest can report aggregate numbers.
    ///
    /// * Every skip counter is summed with saturating arithmetic.
    /// * Each per-category sample list takes entries from `other`, in order,
    ///   only while it holds fewer than five items (spec §5.5). A list that
    ///   is already at or above the cap is left untouched.
    /// * `other.events` are appended after the events already in `self`.
    pub fn merge(&mut self, other: FsScanSkips) {
        // Upper bound on retained sample entries per skip category.
        const SAMPLE_CAP: usize = 5;

        // Appends as many entries of `extra` as still fit under the cap.
        fn top_up(samples: &mut Vec<String>, extra: Vec<String>) {
            let room = SAMPLE_CAP.saturating_sub(samples.len());
            samples.extend(extra.into_iter().take(room));
        }

        // Counters.
        self.skipped_gitignore = self.skipped_gitignore.saturating_add(other.skipped_gitignore);
        self.skipped_kebabignore =
            self.skipped_kebabignore.saturating_add(other.skipped_kebabignore);
        self.skipped_builtin_blacklist =
            self.skipped_builtin_blacklist.saturating_add(other.skipped_builtin_blacklist);
        self.skipped_generated = self.skipped_generated.saturating_add(other.skipped_generated);
        self.skipped_size_exceeded =
            self.skipped_size_exceeded.saturating_add(other.skipped_size_exceeded);

        // Sample lists.
        // NOTE(review): no kebabignore sample list is merged here — confirm
        // that `skip_examples` has no such category.
        let mine = &mut self.skip_examples;
        let theirs = other.skip_examples;
        top_up(&mut mine.generated, theirs.generated);
        top_up(&mut mine.size_exceeded, theirs.size_exceeded);
        top_up(&mut mine.builtin_blacklist, theirs.builtin_blacklist);
        top_up(&mut mine.gitignore, theirs.gitignore);

        // Structured per-file events are kept in full.
        self.events.extend(other.events);
    }
}
|
||||
|
||||
/// A single per-file skip event for structured ingest log (v0.20.x).
|
||||
#[derive(Debug)]
|
||||
pub struct FsSkipEvent {
|
||||
@@ -326,7 +367,7 @@ mod tests {
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.root = Some(root.to_string());
|
||||
c.workspace.exclude.clear();
|
||||
c
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.root = Some(root.to_string());
|
||||
c.workspace.exclude.clear();
|
||||
// Disable size / generated caps so small test files always pass.
|
||||
c.ingest.code.max_file_bytes = u64::MAX;
|
||||
|
||||
@@ -50,7 +50,7 @@ fn baseline_path() -> PathBuf {
|
||||
|
||||
fn cfg_for_fixture(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.root = Some(root.to_string());
|
||||
// Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`)
|
||||
// so the snapshot is purely a function of the fixture + .kebabignore +
|
||||
// baked-in default-excludes.
|
||||
|
||||
@@ -23,7 +23,7 @@ use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.root = Some(root.to_string());
|
||||
c.workspace.exclude.clear();
|
||||
c
|
||||
}
|
||||
|
||||
@@ -745,6 +745,14 @@ fn upsert_document(
|
||||
// `markdown` for the column).
|
||||
let source_type = source_type_label(&doc.metadata.source_type);
|
||||
let trust_level = trust_level_label(&doc.metadata.trust_level);
|
||||
// `[[workspace.sources]]`: id of the source this doc came from. Falls back
|
||||
// to the column default `"default"` for docs without an explicit source
|
||||
// (single-root workspaces / pre-multi-source ingests).
|
||||
let source_id = doc
|
||||
.metadata
|
||||
.source_id
|
||||
.as_deref()
|
||||
.unwrap_or(kebab_config::DEFAULT_SOURCE_ID);
|
||||
let created_at = doc
|
||||
.metadata
|
||||
.created_at
|
||||
@@ -757,11 +765,11 @@ fn upsert_document(
|
||||
tx.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
source_type, trust_level, source_id, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at,
|
||||
last_chunker_version, last_embedding_version
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(doc_id) DO UPDATE SET
|
||||
asset_id = excluded.asset_id,
|
||||
workspace_path = excluded.workspace_path,
|
||||
@@ -769,6 +777,7 @@ fn upsert_document(
|
||||
lang = excluded.lang,
|
||||
source_type = excluded.source_type,
|
||||
trust_level = excluded.trust_level,
|
||||
source_id = excluded.source_id,
|
||||
parser_version = excluded.parser_version,
|
||||
-- doc_version: bump on update. excluded.doc_version is the
|
||||
-- caller's submitted value; we ignore it and add 1 to the
|
||||
@@ -788,6 +797,7 @@ fn upsert_document(
|
||||
doc.lang.0,
|
||||
source_type,
|
||||
trust_level,
|
||||
source_id,
|
||||
doc.parser_version.0,
|
||||
i64::from(doc.doc_version),
|
||||
i64::from(doc.schema_version),
|
||||
|
||||
@@ -191,6 +191,31 @@ impl SqliteStore {
|
||||
}
|
||||
}
|
||||
|
||||
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
|
||||
// column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR.
|
||||
if !filters.source_type.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
|
||||
for st in &filters.source_type {
|
||||
bind.push(Box::new(st.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// [[workspace.sources]]: source_id filter (IN-list on the direct
|
||||
// `documents.source_id` column, idx_docs_source_id). Empty Vec = no
|
||||
// filter; multi-value = OR. Mirrors the source_type filter above.
|
||||
if !filters.source_id.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
|
||||
for sid in &filters.source_id {
|
||||
bind.push(Box::new(sid.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
|
||||
// lexicographic >= compare is correct — but only when the filter
|
||||
@@ -1000,6 +1025,121 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// [[workspace.sources]]: filtering by `source_id` must keep exactly those
/// chunks whose owning document has a `documents.source_id` value contained
/// in the filter's list, and keep everything when the list is empty.
#[test]
fn filter_chunks_source_id_keeps_matching_source() {
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let c1 = "11111111111111111111111111111111";
    let c2 = "22222222222222222222222222222222";
    let c3 = "33333333333333333333333333333333";

    // One document per source id: (chunk id, doc id, workspace path, source id).
    let fixtures = [
        (c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "notes"),
        (c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "code/b.rs", "code"),
        (c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "x.md", "default"),
    ];
    for (chunk, doc, path, source) in fixtures {
        seed_with_source_id(&store, chunk, doc, path, source);
    }

    // Runs the filter over all three candidate chunks.
    let surviving = |filters: &SearchFilters| {
        store
            .filter_chunks(&[cid(c1), cid(c2), cid(c3)], filters)
            .unwrap()
    };

    // A single source id selects only that source's chunk.
    let only_notes = SearchFilters {
        source_id: vec!["notes".to_string()],
        ..Default::default()
    };
    assert_eq!(
        surviving(&only_notes),
        vec![cid(c1)],
        "only the `notes` source chunk survives"
    );

    // Several source ids are combined with OR.
    let notes_or_code = SearchFilters {
        source_id: vec!["notes".to_string(), "code".to_string()],
        ..Default::default()
    };
    assert_eq!(
        surviving(&notes_or_code),
        vec![cid(c1), cid(c2)],
        "notes OR code survive"
    );

    // The default filter has no source ids and therefore filters nothing.
    let unfiltered = SearchFilters::default();
    assert_eq!(surviving(&unfiltered), vec![cid(c1), cid(c2), cid(c3)]);
}
|
||||
|
||||
/// Seed one committed doc + chunk + embedding with an explicit
|
||||
/// `documents.source_id` column value (the DEFAULT is `'default'`).
|
||||
fn seed_with_source_id(
|
||||
store: &SqliteStore,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
source_id: &str,
|
||||
) {
|
||||
let asset_id = format!("a{}", &doc_id[..31]);
|
||||
{
|
||||
let conn = store.lock_conn();
|
||||
conn.execute(
|
||||
"INSERT INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?,
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file://{workspace_path}"),
|
||||
workspace_path,
|
||||
workspace_path,
|
||||
workspace_path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, source_id, parser_version, doc_version,
|
||||
schema_version, metadata_json, provenance_json,
|
||||
created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1',
|
||||
1, 1, '{}', '{}', '1970-01-01T00:00:00Z',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, workspace_path, source_id],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
let embed_row = EmbeddingRecordRow {
|
||||
embedding_id: format!("e{}", &chunk_id[..31]),
|
||||
chunk_id: chunk_id.to_string(),
|
||||
model_id: "m".to_string(),
|
||||
model_version: "v1".to_string(),
|
||||
dimensions: 4,
|
||||
lance_table: "t".to_string(),
|
||||
created_at: OffsetDateTime::UNIX_EPOCH,
|
||||
};
|
||||
store
|
||||
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
|
||||
.unwrap();
|
||||
store
|
||||
.mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
|
||||
// Regression test for the non-UTC offset lex-compare bug.
|
||||
|
||||
@@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result<Br
|
||||
/// the LanceDB directory tree. Missing files / dir = 0.
|
||||
pub fn index_bytes(data_dir: &Path) -> std::io::Result<IndexBytes> {
|
||||
fn file_size_or_zero(p: &Path) -> u64 {
|
||||
std::fs::metadata(p).map(|m| m.len()).unwrap_or(0)
|
||||
std::fs::metadata(p).map_or(0, |m| m.len())
|
||||
}
|
||||
fn dir_walk_sum(p: &Path) -> std::io::Result<u64> {
|
||||
if !p.exists() {
|
||||
|
||||
@@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap();
|
||||
let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
|
||||
@@ -45,6 +45,7 @@ fn make_metadata() -> Metadata {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
|
||||
@@ -58,6 +58,7 @@ fn make_doc(
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
let doc = CanonicalDocument {
|
||||
doc_id,
|
||||
|
||||
@@ -598,8 +598,7 @@ fn spawn_ask_worker(state: &mut App) {
|
||||
fn make_conversation_id() -> String {
|
||||
let nanos = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map(|d| d.as_nanos())
|
||||
.unwrap_or(0);
|
||||
.map_or(0, |d| d.as_nanos());
|
||||
format!("conv_{nanos:032x}")
|
||||
}
|
||||
|
||||
|
||||
@@ -34,11 +34,10 @@ pub fn start_ingest(app: &mut App) -> anyhow::Result<()> {
|
||||
anyhow::bail!("ingest already running");
|
||||
}
|
||||
let cfg = app.config.clone();
|
||||
let scope = SourceScope {
|
||||
root: std::path::PathBuf::from(&cfg.workspace.root),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
// [[workspace.sources]]: leave `scope.root` empty so the app iterates
|
||||
// every configured source (`config.resolved_sources()`), mirroring the
|
||||
// CLI `kebab ingest` path. Each source carries its own merged exclude.
|
||||
let scope = SourceScope::default();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let cancel = Arc::new(AtomicBool::new(false));
|
||||
let cancel_for_worker = cancel.clone();
|
||||
|
||||
@@ -304,10 +304,11 @@ pub fn handle_key_search(state: &mut App, key: KeyEvent) -> KeyOutcome {
|
||||
// `terminal.clear()` couldn't happen — leaving the
|
||||
// previous frame leaking through the new draw.
|
||||
let editor = std::env::var("EDITOR").unwrap_or_else(|_| "vi".into());
|
||||
// `~/...` / `${XDG_…}` expansion via `kebab-config::expand_path`
|
||||
// — same helper used by the markdown / image / PDF ingest
|
||||
// paths (HOTFIXES 2026-05-02 P9-4 follow-up).
|
||||
let workspace_root = kebab_config::expand_path(&state.config.workspace.root, "");
|
||||
// [[workspace.sources]]: resolve the primary workspace root
|
||||
// (first source / legacy `root`). `resolve_workspace_root` applies
|
||||
// the same `~` / `${XDG_…}` / relative-to-config expansion as the
|
||||
// markdown / image / PDF ingest paths (HOTFIXES 2026-05-02 P9-4).
|
||||
let workspace_root = state.config.resolve_workspace_root();
|
||||
state.pending_editor = Some(crate::app::EditorRequest {
|
||||
citation: citation.unwrap(),
|
||||
editor_env: editor,
|
||||
|
||||
@@ -19,7 +19,7 @@ use time::OffsetDateTime;
|
||||
fn fresh_app() -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-ask-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-ask-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-ask-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = Pane::Ask;
|
||||
// p9-fb-12 follow-up: mirror the run loop's auto-flip on pane
|
||||
|
||||
@@ -12,7 +12,7 @@ use ratatui::layout::Rect;
|
||||
fn fresh_app(focus: Pane) -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-cheatsheet-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = focus;
|
||||
app
|
||||
|
||||
@@ -23,7 +23,7 @@ use time::OffsetDateTime;
|
||||
fn fresh_app() -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-inspect-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-inspect-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-inspect-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = Pane::Inspect;
|
||||
app.inspect = Some(InspectState::default());
|
||||
@@ -85,6 +85,7 @@ fn make_doc() -> CanonicalDocument {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
},
|
||||
provenance: Provenance {
|
||||
events: vec![ProvenanceEvent {
|
||||
|
||||
@@ -9,7 +9,7 @@ use kebab_tui::{App, Mode, Pane, mode_intercept};
|
||||
fn fresh_app(focus: Pane) -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-mode-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-mode-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-mode-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = focus;
|
||||
app.mode = Mode::auto_for(focus);
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::path::Path;
|
||||
fn fresh_app() -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-search-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-search-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-search-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = Pane::Search;
|
||||
// p9-fb-12 follow-up: mirror the run loop's auto-flip — Search
|
||||
|
||||
@@ -9,7 +9,7 @@ use ratatui::layout::Rect;
|
||||
fn fresh_app(focus: Pane) -> App {
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = "/tmp/kebab-tui-status-bar-tests-noop".to_string();
|
||||
config.workspace.root = "/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string();
|
||||
config.workspace.root = Some("/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string());
|
||||
let mut app = App::new(config).expect("App::new");
|
||||
app.focus = focus;
|
||||
app
|
||||
|
||||
Reference in New Issue
Block a user