feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type #208

Merged
altair823 merged 1 commit from feat/source-type-filter into main 2026-06-23 15:52:35 +00:00
101 changed files with 1201 additions and 111 deletions

48
Cargo.lock generated
View File

@@ -4751,7 +4751,7 @@ dependencies = [
[[package]]
name = "kebab-app"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -4799,7 +4799,7 @@ dependencies = [
[[package]]
name = "kebab-chunk"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -4817,7 +4817,7 @@ dependencies = [
[[package]]
name = "kebab-cli"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"clap",
@@ -4838,7 +4838,7 @@ dependencies = [
[[package]]
name = "kebab-config"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"dirs 5.0.1",
@@ -4854,7 +4854,7 @@ dependencies = [
[[package]]
name = "kebab-core"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -4868,7 +4868,7 @@ dependencies = [
[[package]]
name = "kebab-embed"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -4882,7 +4882,7 @@ dependencies = [
[[package]]
name = "kebab-embed-candle"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"candle-core",
@@ -4902,7 +4902,7 @@ dependencies = [
[[package]]
name = "kebab-embed-local"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"fastembed",
@@ -4915,7 +4915,7 @@ dependencies = [
[[package]]
name = "kebab-embed-ollama"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-config",
@@ -4930,7 +4930,7 @@ dependencies = [
[[package]]
name = "kebab-eval"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4949,7 +4949,7 @@ dependencies = [
[[package]]
name = "kebab-llm"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -4958,7 +4958,7 @@ dependencies = [
[[package]]
name = "kebab-llm-local"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-config",
@@ -4975,7 +4975,7 @@ dependencies = [
[[package]]
name = "kebab-mcp"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4993,7 +4993,7 @@ dependencies = [
[[package]]
name = "kebab-nli"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"hf-hub",
@@ -5008,7 +5008,7 @@ dependencies = [
[[package]]
name = "kebab-parse-code"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"gix",
@@ -5031,7 +5031,7 @@ dependencies = [
[[package]]
name = "kebab-parse-image"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"ab_glyph",
"anyhow",
@@ -5059,7 +5059,7 @@ dependencies = [
[[package]]
name = "kebab-parse-md"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -5076,7 +5076,7 @@ dependencies = [
[[package]]
name = "kebab-parse-pdf"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -5091,7 +5091,7 @@ dependencies = [
[[package]]
name = "kebab-rag"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -5113,7 +5113,7 @@ dependencies = [
[[package]]
name = "kebab-search"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"globset",
@@ -5132,7 +5132,7 @@ dependencies = [
[[package]]
name = "kebab-source-fs"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -5150,7 +5150,7 @@ dependencies = [
[[package]]
name = "kebab-store-sqlite"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"blake3",
@@ -5170,7 +5170,7 @@ dependencies = [
[[package]]
name = "kebab-store-vector"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"arrow",
@@ -5194,7 +5194,7 @@ dependencies = [
[[package]]
name = "kebab-tui"
version = "0.28.0"
version = "0.29.0"
dependencies = [
"anyhow",
"crossterm",

View File

@@ -32,7 +32,7 @@ edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
version = "0.28.0" # v0.28.0 — config 스키마 v2→v3 재편: 미디어 형식 설정을 `[ingest.*]` 우산으로 통합(`[indexing]`→`[ingest]` 스칼라, `[chunking]`/`[image.ocr]`/`[image.caption]`/`[pdf.ocr]`→`[ingest.*]`). 기존 v2 파일은 load 시 메모리 자동 변환(디스크 미변경), 파일 갱신은 `kebab config migrate`(값·주석 보존). env 이름(LHS) 100% 보존 + RHS 만 새 경로, 신규 `KEBAB_PDF_OCR_{DET_MODEL,REC_MODEL,DICT,SCORE_THRESH,UNCLIP_RATIO,MAX_BOXES}`. `ingest_config_signature` 바이트 불변(재색인 0). PdfOcrCfg paddle 대칭 키. 신규 인터페이스(config 레이아웃 rename + env 추가) → minor. — CLAUDE.md §Release
version = "0.29.0" # v0.29.0 — provenance 출처 필터: `[[workspace.sources]]` 멀티소스 + 검색 `--source <id>` / `--source-type <type>`(lexical+vector 두 site, OR). `documents.source_id` 컬럼(V014, additive·DEFAULT 'default'·재색인 0) + config v3→v4 migration(`step_3_to_4`, 단일 root→implicit `default` source 미러, 멱등). per-source `trust_level`/`source_type` 기본값(우선순위 frontmatter > source 기본값 > Primary). 단일 root 사용자 무영향. 설계 근거: 전역 trust 곱셈가중(weighted-RRF)은 A/B 반증(incident MRR 절벽), 출처 필터가 see-saw 없는 레버. 신규 CLI flag + config 키 + migration → minor. — CLAUDE.md §Release
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
# intentional allow-list. The allowed lints are either cosmetic (doc style),

View File

@@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
- **2026-06-21 provenance 출처 필터: `[[workspace.sources]]` 멀티소스 + `--source`/`--source-type`** — v0.29.0. 혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 레버. config `[[workspace.sources]]`(각 id/root/trust_level/source_type) + `documents.source_id` 컬럼(V014, additive, 재색인 0) + config v3→v4 migration(`step_3_to_4`, 단일 root→implicit `default` source, 멱등) + 검색 `--source <id>` / `--source-type <type>`(lexical+vector 두 site, OR). trust precedence = frontmatter > per-source 기본값 > Primary. **설계 근거**: 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽) — 필터가 see-saw 없는 올바른 레버. 도그푸딩(620 doc, jira400+wiki220): `--source wiki` concept 0.780→0.810, `--source jira` incident 0.918→0.975. **follow-up**: MCP search 필터 미노출 · `kebab list` source_id 미표시 · RAG provenance 라벨 미구현. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-21).
- **2026-06-04 PP-OCRv5 ONNX Rust 네이티브 OCR** — v0.27.0. `[image.ocr] engine = "paddle-onnx"` 로 PP-OCRv5(검출+인식) ONNX 를 in-process(`ort` =2.0.0-rc.9) 실행 — Python 런타임/원격 호출 없이 큰 페이지 CPU <4초(Ollama vision ~50초 대비). default 는 여전히 `"ollama-vision"`. 후처리(min-area rect/unclip)는 pure-Rust. **함정**: unclip 은 corner 를 centroid 에서 방사 확장하면 안 되고 edge 별 polygon offset 이어야 함(방사 확장 시 wide/short 텍스트 박스 높이가 안 커져 글자 윗부분 잘림 → ㄷ→ㄴ, e2e CER 0.26). 수정 후 CER 0.005. 모델 ONNX 는 `crates/kebab-parse-image/assets/paddleocr-onnx/`(LFS). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-04 PP-OCRv5 ONNX), spec/plan `docs/superpowers/{specs,plans}/2026-06-04-rust-native-ocr-*.md`.
- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`.
- **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`.

View File

@@ -86,7 +86,7 @@ Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go
| `kebab ingest [<path>]` | 워크스페이스 스캔 후 새/변경 문서 색인 (idempotent · incremental, `--force-reingest` 로 강제 재처리). 미지원 확장자는 자동 skip. 진행바는 현재 **파일명** · 느린 **phase(ocr/caption/embed)+모델명** · **경과초** `(Ns)` · 문서별 청크 수 · phase별 소요시간(parse/chunk/ocr/caption/embed/store)을 표시하고, 종료 시 **최장 소요 파일 top-5** 를 요약한다 (`--json` 은 `asset_phase`/`asset_chunked`/`asset_timings` 이벤트로, 사람용 요약은 미출력) |
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능 — `_external/` 로 deterministic copy) |
| `kebab ingest-stdin --title <T>` | stdin 의 markdown 본문 ingest |
| `kebab search --mode {lexical,vector,hybrid} "<query>" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 필터/budget flag 는 `--help` |
| `kebab search --mode {lexical,vector,hybrid} "<query>" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 출처 필터 `--source <id>` (`[[workspace.sources]]` id) · `--source-type {markdown,note,paper,reference,inbox}` (둘 다 repeatable/comma-sep, OR). 그 외 필터/budget flag 는 `--help` |
| `kebab ask "<query>" [flags]` | RAG 답변 + 근거 인용 (Ollama 필요). `--session` (multi-turn) · `--stream` · `--multi-hop` |
| `kebab list docs` | 색인된 문서 목록 |
| `kebab inspect doc <id>` / `inspect chunk <id>` | raw record 보기 |
@@ -108,6 +108,19 @@ Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go
[workspace]
root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경로 가능.
# 상대 경로의 base 는 config.toml 위치 (cwd 무관).
# 단일 root 는 implicit `default` source 로 정규화된다.
# 멀티소스 (선택) — 출처별로 검색을 좁히려면 root 대신 명명 source 를 선언한다.
# 각 source 의 id 가 모든 문서에 stamp 되고, `kebab search --source <id>` 로 필터.
# trust_level / source_type 은 frontmatter 가 없을 때의 source 기본값
# (우선순위: frontmatter > source 기본값 > 하드코딩 Primary/Markdown).
# [[workspace.sources]]
# id = "notes"
# root = "~/KnowledgeBase/notes"
# [[workspace.sources]]
# id = "jira"
# root = "~/exports/jira"
# trust_level = "secondary" # 낮은 신뢰 출처 — `--trust-min primary` 로 배제 가능.
[models.embedding]
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)

View File

@@ -727,8 +727,7 @@ impl App {
// Load (or create) the session header.
let now_unix = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0);
.map_or(0, |d| d.as_secs() as i64);
let existing = self.sqlite.get_session(session_id)?;
let prior_turns = match &existing {
Some(_) => self.sqlite.list_turns(session_id)?,
@@ -1111,7 +1110,7 @@ fn trim_to_chars(s: &str, n: usize) -> String {
/// terminates early) rather than panic in the budget loop.
fn estimate_chars(hits: &[SearchHit]) -> usize {
hits.iter()
.map(|h| serde_json::to_string(h).map(|s| s.len()).unwrap_or(0))
.map(|h| serde_json::to_string(h).map_or(0, |s| s.len()))
.sum()
}

View File

@@ -206,6 +206,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
doc_id,
repo: vec![],
code_lang: vec![],
source_type: vec![],
source_id: vec![],
};
let opts = SearchOpts {

View File

@@ -49,7 +49,8 @@ use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
SearchHit, SearchQuery, SourceScope, SourceType, SourceUri, TrustLevel, VectorRecord,
VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{
@@ -304,7 +305,12 @@ pub fn ingest_with_config_opts(
0
});
// Walk the workspace.
// Walk the workspace. `[[workspace.sources]]`: when the caller pinned
// neither an explicit `scope.root` nor any `scope.include` entries (the
// normal `kebab ingest` path), iterate over every configured source — each
// scanned with its own root + exclude and tagged with its `id` + default
// trust. Otherwise (single-file ingest, `--root` override, include-restricted
// ingest) scan the pinned scope once as the implicit `default` source with no
// per-source trust default — preserving pre-multi-source behavior.
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ScanStarted {
@@ -313,9 +319,50 @@ pub fn ingest_with_config_opts(
);
let connector =
FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
let (assets, fs_skips) = connector
.scan_with_skips(&scope)
.context("kb-app::ingest: scan workspace")?;
// Per-source scan plan: (source_id, source_trust, scan_scope).
let scan_plan: Vec<(String, Option<TrustLevel>, SourceScope)> =
if scope.root.as_os_str().is_empty() && scope.include.is_empty() {
app.config
.resolved_sources()
.into_iter()
.map(|s| {
let scan_scope = SourceScope {
root: s.root,
include: scope.include.clone(),
exclude: s.exclude,
};
(s.id, s.trust_level, scan_scope)
})
.collect()
} else {
// Explicit-root / single-file / include-restricted ingest: one
// ad-hoc `default` source rooted at the pinned scope.
vec![(
kebab_config::DEFAULT_SOURCE_ID.to_string(),
None,
scope.clone(),
)]
};
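// Accumulate assets across sources + a per-path lookup of which source
// (id + trust) each asset came from. workspace_path is unique per asset
// within a single scan. If two sources yield the same workspace_path, both
// assets stay in `assets` but the lookup keeps only the last source scanned
// (last-write-wins), so both are tagged with that source. Sources should
// not share roots — a config smell, not enforced.
// NOTE(review): if workspace_path is relative to each source's root, equal
// relative paths under different roots would collide here too — confirm.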
let mut assets: Vec<RawAsset> = Vec::new();
let mut source_by_path: std::collections::HashMap<String, (String, Option<TrustLevel>)> =
std::collections::HashMap::new();
let mut fs_skips = kebab_source_fs::FsScanSkips::default();
for (sid, strust, scan_scope) in &scan_plan {
let (src_assets, src_skips) = connector
.scan_with_skips(scan_scope)
.with_context(|| format!("kb-app::ingest: scan source `{sid}`"))?;
for a in &src_assets {
source_by_path.insert(a.workspace_path.0.clone(), (sid.clone(), *strust));
}
assets.extend(src_assets);
fs_skips.merge(src_skips);
}
crate::ingest_progress::emit(
progress,
crate::ingest_progress::IngestEvent::ScanCompleted {
@@ -468,6 +515,14 @@ pub fn ingest_with_config_opts(
media: crate::ingest_progress::media_label(&asset.media_type).to_string(),
},
);
// `[[workspace.sources]]`: resolve which source this asset came from.
// Missing only if an asset slipped in outside the scan plan (defensive
// — fall back to the implicit `default` source).
let (source_id, source_trust) = source_by_path
.get(&asset.workspace_path.0)
.map_or((kebab_config::DEFAULT_SOURCE_ID, None), |(id, trust)| {
(id.as_str(), *trust)
});
let item = ingest_one_asset(
&app,
&asset,
@@ -478,6 +533,8 @@ pub fn ingest_with_config_opts(
embedder.as_ref(),
vector_store.as_ref(),
&existing_doc_ids,
source_id,
source_trust,
&image_pipeline,
force_reingest,
pdf_ocr_engine.as_deref(),
@@ -738,8 +795,8 @@ pub fn ingest_with_config_opts(
if let Ok(mut w) = lw.lock() {
let run_id = w.run_id().to_string();
let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
let pages = ocr_pages_cnt.lock().map_or(0, |v| *v);
let failures = ocr_failures_cnt.lock().map_or(0, |v| *v);
let summary = crate::ingest_log::IngestSummary::new(
crate::ingest_log::now_ts(),
run_id,
@@ -1173,6 +1230,11 @@ fn ingest_one_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
// `[[workspace.sources]]`: id of the source this asset belongs to (stamped
// onto `documents.source_id`) + that source's default trust level
// (markdown frontmatter overrides it).
source_id: &str,
source_trust: Option<TrustLevel>,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
pdf_ocr_engine: Option<&dyn OcrEngine>,
@@ -1206,6 +1268,7 @@ fn ingest_one_asset(
embedder,
vector_store,
existing_doc_ids,
source_id,
image_pipeline,
force_reingest,
progress,
@@ -1221,6 +1284,7 @@ fn ingest_one_asset(
embedder,
vector_store,
existing_doc_ids,
source_id,
force_reingest,
pdf_ocr_engine,
progress,
@@ -1263,6 +1327,7 @@ fn ingest_one_asset(
existing_doc_ids,
force_reingest,
lang.as_str(),
source_id,
);
}
// p10-1A-2: non-Rust Code, Audio, and Other are not yet wired;
@@ -1338,7 +1403,7 @@ fn ingest_one_asset(
let bytes = std::fs::read(&path)
.with_context(|| format!("read asset bytes from {}", path.display()))?;
let body_hints = build_body_hints(asset);
let body_hints = build_body_hints(asset, Some(source_id), source_trust);
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
@@ -1572,6 +1637,7 @@ fn ingest_one_image_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
source_id: &str,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
@@ -1646,6 +1712,9 @@ fn ingest_one_image_asset(
// `image-meta-v1`, which already fixed doc_id). Skip compare + stored
// field must agree for next-run detection.
canonical.parser_version = eff_parser_version.clone();
// `[[workspace.sources]]`: stamp the owning source id (image extractor
// leaves it None).
canonical.metadata.source_id = Some(source_id.to_string());
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
@@ -2157,6 +2226,7 @@ fn ingest_one_pdf_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
source_id: &str,
force_reingest: bool,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
@@ -2224,6 +2294,9 @@ fn ingest_one_pdf_asset(
// v0.26.2: store the composite parser_version (base `pdf-text-v1` already
// fixed doc_id) so the next run's skip compare matches.
canonical.parser_version = eff_parser_version.clone();
// `[[workspace.sources]]`: stamp the owning source id (pdf extractor
// leaves it None).
canonical.metadata.source_id = Some(source_id.to_string());
let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX);
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
@@ -2523,6 +2596,7 @@ fn ingest_one_code_asset(
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
code_lang: &str, // <-- NEW (p10-1b Task D)
source_id: &str,
) -> anyhow::Result<kebab_core::IngestItem> {
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
@@ -2679,6 +2753,11 @@ fn ingest_one_code_asset(
}
};
// `[[workspace.sources]]`: stamp the owning source id on the synthesized /
// extracted code doc (covers both Tier 1 extract_for and Tier 2/3
// synthesize paths — neither knows the source id).
canonical.metadata.source_id = Some(source_id.to_string());
// p10-1b Task D/G/J/L: chunker per-lang.
// p10-3: track whether the extract stage already fell back to Tier 3.
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
@@ -2898,7 +2977,7 @@ fn synthesize_tier2_document(
use anyhow::Context as _;
use kebab_core::{
BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent,
ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc,
ProvenanceKind, SourceSpan, id_for_block, id_for_doc,
};
let text = std::str::from_utf8(bytes)
@@ -2986,6 +3065,10 @@ fn synthesize_tier2_document(
git_branch,
git_commit,
code_lang: Some(code_lang.to_string()),
// `[[workspace.sources]]`: stamped by the caller
// (`ingest_one_code_asset`) post-build so Tier 1 (extract_for) and
// Tier 2/3 (this synthesizer) share one code path.
source_id: None,
};
tracing::debug!(
@@ -3044,12 +3127,20 @@ fn count_lines_in(bytes: &[u8]) -> u32 {
/// overhead for large workspaces and the source-of-truth timestamps
/// are written into the document's frontmatter when the user wants
/// authoritative values.
fn build_body_hints(asset: &RawAsset) -> BodyHints {
fn build_body_hints(
asset: &RawAsset,
source_id: Option<&str>,
source_trust: Option<TrustLevel>,
) -> BodyHints {
BodyHints {
first_h1: None,
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: None,
// `[[workspace.sources]]`: stamp the owning source id + inject the
// per-source default trust level (frontmatter still overrides it).
source_id: source_id.map(str::to_string),
fallback_trust_level: source_trust,
}
}

View File

@@ -114,7 +114,7 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
if ft.is_dir() {
total += walk(&e.path());
} else if ft.is_file() {
total += e.metadata().map(|m| m.len()).unwrap_or(0);
total += e.metadata().map_or(0, |m| m.len());
}
}
total

View File

@@ -51,7 +51,7 @@ impl TestEnv {
std::fs::create_dir_all(&model_dir).unwrap();
let mut config = Config::defaults();
config.workspace.root = workspace_root.to_string_lossy().into_owned();
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
// Drop the ".obsidian" / "node_modules" excludes — they bring
// in nothing useful for fixtures and just hide debugging.
config.workspace.exclude.clear();

View File

@@ -14,7 +14,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -53,7 +53,7 @@ fn ingest_file_idempotent_on_second_call() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -78,7 +78,7 @@ fn ingest_file_errors_on_missing_path() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -97,7 +97,7 @@ fn ingest_file_errors_on_unsupported_extension() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -17,7 +17,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
std::fs::create_dir_all(&model_dir).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
@@ -130,7 +130,7 @@ fn ingest_log_disabled_emits_no_file() {
std::fs::create_dir_all(&model_dir).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();

View File

@@ -192,7 +192,7 @@ fn pdf_ocr_progress_emits_started_finished_events() {
std::fs::create_dir_all(&data_dir).expect("create data dir");
let mut config = kebab_config::Config::defaults();
config.workspace.root = workspace.to_string_lossy().into_owned();
config.workspace.root = Some(workspace.to_string_lossy().into_owned());
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
config.models.embedding.provider = "none".to_string();
config.models.embedding.dimensions = 0;

View File

@@ -12,7 +12,7 @@ fn fresh_cfg(dir: &std::path::Path) -> Config {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -34,7 +34,7 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
assert_eq!(report.new, 1, "{report:?}");
// _external/ contains exactly one .md file with frontmatter.
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let ext_dir = cfg.resolve_workspace_root().join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)
@@ -56,7 +56,7 @@ fn ingest_stdin_without_source_uri() {
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
assert_eq!(report.new, 1);
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let ext_dir = cfg.resolve_workspace_root().join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)

View File

@@ -6,7 +6,7 @@ use kebab_core::SourceScope;
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
let mut cfg = Config::defaults();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();

View File

@@ -8,7 +8,7 @@ use kebab_core::SourceScope;
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
let mut config = Config::defaults();
config.workspace.root = workspace_root.to_string_lossy().into_owned();
config.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
config.workspace.exclude.clear();
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();

View File

@@ -9,7 +9,7 @@ use common::TestEnv;
#[test]
fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
let env = TestEnv::lexical_only();
let workspace_root = std::path::PathBuf::from(&env.config.workspace.root);
let workspace_root = env.config.resolve_workspace_root();
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();

View File

@@ -242,6 +242,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("c".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("cpp".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("go".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("java".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("javascript".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("kotlin".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("python".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("rust".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -244,6 +244,7 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("typescript".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -450,6 +450,7 @@ mod tests {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: kebab_core::ParserVersion("test-parser-0".into()),

View File

@@ -355,6 +355,7 @@ mod tests {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version,
@@ -533,6 +534,7 @@ mod tests {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version,

View File

@@ -111,6 +111,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("c".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -118,6 +118,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("cpp".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("go".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("java".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("javascript".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("kotlin".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("python".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("rust".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -68,6 +68,7 @@ fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some(lang.into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("typescript".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -67,6 +67,7 @@ fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("dockerfile".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -67,6 +67,7 @@ fn yaml_doc(yaml_text: &str) -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("yaml".into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -58,6 +58,8 @@ fn long_section_chunks_snapshot() {
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: Some("en".into()),
source_id: None,
fallback_trust_level: None,
};
let (metadata, fm_span, _fm_warns) =
parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
@@ -133,6 +135,8 @@ fn long_section_chunks_are_deterministic() {
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: Some("en".into()),
source_id: None,
fallback_trust_level: None,
};
let policy = ChunkPolicy {

View File

@@ -67,6 +67,7 @@ fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument {
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some(lang.into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,

View File

@@ -193,6 +193,31 @@ enum Cmd {
)]
code_lang: Vec<String>,
/// Phase-2: filter by document source_type
/// (`markdown`, `note`, `paper`, `reference`, `inbox`).
/// Repeatable or comma-separated. Empty = no filter.
/// The clean source/provenance lever for mixed-source KBs.
#[arg(
long = "source-type",
value_name = "TYPE",
num_args = 1,
value_delimiter = ','
)]
source_type: Vec<String>,
/// [[workspace.sources]]: filter by source id — the `id` of the
/// `[[workspace.sources]]` entry a document was ingested from
/// (e.g. `default`, `notes`, `code`). Repeatable or
/// comma-separated. Empty = no filter. The named-source
/// provenance lever for multi-source KBs.
#[arg(
long = "source",
value_name = "ID",
num_args = 1,
value_delimiter = ','
)]
source: Vec<String>,
/// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
/// lists + per-stage timing in the response. Bypasses cache
/// (debug intent — fresh run guaranteed). Requires embeddings
@@ -615,12 +640,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
force_reingest,
} => {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
let scope = kebab_core::SourceScope {
root: root
.clone()
.unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
// [[workspace.sources]]: when the user passes `--root <dir>` we pin
// that single root (one ad-hoc `default` source). Otherwise we
// leave `scope.root` EMPTY so the app iterates every configured
// source (`config.resolved_sources()`); a bare empty scope.exclude
// is fine because each source carries its own merged exclude.
let scope = match root.clone() {
Some(r) => kebab_core::SourceScope {
root: r,
exclude: cfg.workspace.exclude.clone(),
..Default::default()
},
None => kebab_core::SourceScope::default(),
};
// p9-fb-02: spawn the progress display on a background
@@ -629,8 +660,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
// call returns, the `Sender` drops and the display thread
// sees `recv()` return Err — exits cleanly.
let plain_env = std::env::var("KEBAB_PROGRESS")
.map(|v| v.eq_ignore_ascii_case("plain"))
.unwrap_or(false);
.is_ok_and(|v| v.eq_ignore_ascii_case("plain"));
let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env);
// Surface the active embedding backend/device on the terminal so the
@@ -828,6 +858,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
doc_id,
repo,
code_lang,
source_type,
source,
trace,
bulk,
} => {
@@ -967,6 +999,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())),
repo: repo.clone(),
code_lang: code_lang.clone(),
source_type: source_type.clone(),
source_id: source.clone(),
};
let q = kebab_core::SearchQuery {

View File

@@ -12,6 +12,12 @@ mod paths;
pub mod migrate;
pub use paths::{expand_path, expand_path_with_base};
/// Implicit source id used when a single-root `[workspace]` config (no
/// `[[workspace.sources]]`) is normalized into the multi-source model, and
/// the `DEFAULT` value of the `documents.source_id` column. Kept in sync
/// with the migration default in `migrations/V0XX__documents_source_id.sql`.
pub const DEFAULT_SOURCE_ID: &str = "default";
/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다.
/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다.
/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고
@@ -88,8 +94,67 @@ pub struct Config {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct WorkspaceCfg {
pub root: String,
/// Single-root workspace (legacy / common case). `Option` so that a
/// config that declares only `[[workspace.sources]]` (no bare `root`)
/// parses — and, symmetrically, a legacy single-`root` config (no
/// `sources`) still parses unchanged. [`Config::resolved_sources`]
/// reconciles the two on demand into a single non-empty source list
/// (`id = "default"` synthesized from `root`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub root: Option<String>,
pub exclude: Vec<String>,
/// `[[workspace.sources]]`: named multi-source declaration. When empty
/// and `root` is set, the load path normalizes to a single implicit
/// `default` source. Each entry stamps its `id` onto every document it
/// ingests and supplies per-source `trust_level` / `source_type`
/// defaults (frontmatter still wins per the §0 Q9 derive table).
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sources: Vec<SourceCfg>,
}
/// One named source under `[[workspace.sources]]`.
///
/// `trust_level` / `source_type` are the **source-level defaults**: they
/// apply when a document's frontmatter does not specify the field. The
/// precedence is `frontmatter > source default > hardcoded`
/// (`TrustLevel::Primary` / `SourceType::Markdown`) — implemented in the
/// markdown derive via `BodyHints::fallback_trust_level`.
///
/// All optional fields are omitted from the serialized form when unset
/// (`skip_serializing_if`), so a minimal entry round-trips as just
/// `id` + `root`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SourceCfg {
/// Stable identifier stamped onto `documents.source_id` for every
/// document ingested from this source. Must be non-empty and unique
/// across the workspace; this is checked by `Config::validate_sources`
/// when a config file is loaded.
pub id: String,
/// Root directory to walk for this source. Must be non-empty (also
/// checked by `Config::validate_sources`). Accepts the same
/// absolute / `~` / `${VAR}` / relative forms as the legacy
/// `workspace.root`; a relative path is resolved against the config
/// file's directory, or the current directory when the config was not
/// loaded from a file.
pub root: String,
/// Per-source denylist globs, appended after `workspace.exclude` when
/// the source is resolved.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub exclude: Vec<String>,
/// Per-source default `trust_level` (frontmatter overrides it).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub trust_level: Option<kebab_core::TrustLevel>,
/// Per-source default `source_type` (frontmatter overrides it).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_type: Option<kebab_core::SourceType>,
}
/// A source with its `root` resolved to an absolute path and its `exclude`
/// merged with `workspace.exclude`. Produced by [`Config::resolved_sources`]
/// — the single entry point the ingest pipeline iterates over.
#[derive(Clone, Debug, PartialEq)]
pub struct ResolvedSource {
/// Stamped onto `documents.source_id`.
pub id: String,
/// Absolute walk root (tilde / `${VAR}` / relative-to-config resolved).
pub root: PathBuf,
/// `workspace.exclude` followed by the per-source `exclude` entries
/// (for the implicit `default` source this is just `workspace.exclude`).
pub exclude: Vec<String>,
/// Per-source default trust level (None → fall back to `Primary`).
pub trust_level: Option<kebab_core::TrustLevel>,
/// Per-source default source type (None → fall back to `Markdown`).
pub source_type: Option<kebab_core::SourceType>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -782,12 +847,13 @@ impl Config {
Self {
schema_version: crate::migrate::CURRENT_SCHEMA_VERSION,
workspace: WorkspaceCfg {
root: "~/KnowledgeBase".to_string(),
root: Some("~/KnowledgeBase".to_string()),
exclude: vec![
".git/**".to_string(),
"node_modules/**".to_string(),
".obsidian/**".to_string(),
],
sources: vec![],
},
storage: StorageCfg {
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kebab".to_string(),
@@ -906,7 +972,78 @@ impl Config {
PathBuf::from(".")
})
});
paths::expand_path_with_base(&self.workspace.root, "", &base)
paths::expand_path_with_base(&self.primary_root_raw(), "", &base)
}
/// Unexpanded string of the workspace's *primary* root, consumed by
/// [`resolve_workspace_root`](Self::resolve_workspace_root) and by
/// [`resolved_sources`](Self::resolved_sources) on the single-root path.
///
/// Lookup order:
/// 1. the `root` of the first `[[workspace.sources]]` entry, if any;
/// 2. the bare `workspace.root`, if set;
/// 3. the built-in `~/KnowledgeBase` default.
///
/// A declared source therefore always wins over `workspace.root`, which is
/// what lets single-root call sites keep working when only `sources` is
/// declared.
fn primary_root_raw(&self) -> String {
    let workspace = &self.workspace;
    match (workspace.sources.first(), workspace.root.as_ref()) {
        (Some(first_source), _) => first_source.root.clone(),
        (None, Some(bare_root)) => bare_root.clone(),
        (None, None) => "~/KnowledgeBase".to_string(),
    }
}
/// Directory that relative source roots are resolved against.
///
/// Returns the recorded config directory (`source_dir`) when there is one;
/// otherwise the process's current directory. If the current directory
/// cannot be read, a warning is logged and `"."` is returned instead of
/// failing (mirrors
/// [`resolve_workspace_root`](Self::resolve_workspace_root)).
fn root_resolution_base(&self) -> PathBuf {
    if let Some(config_dir) = &self.source_dir {
        return config_dir.clone();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd,
        Err(e) => {
            tracing::warn!(
                target: "kebab-config",
                error = %e,
                "current_dir() failed; falling back to '.' for source root resolution"
            );
            PathBuf::from(".")
        }
    }
}
/// The sources to ingest, normalized and with roots expanded. Never empty.
///
/// - With `[[workspace.sources]]` declared: one [`ResolvedSource`] per
///   entry, in declaration order. Each `root` is expanded against the
///   resolution base, and `exclude` is `workspace.exclude` followed by the
///   entry's own `exclude`.
/// - Without any declared source: exactly one implicit source whose id is
///   [`DEFAULT_SOURCE_ID`], rooted at the primary workspace root, carrying
///   `workspace.exclude` and no per-source trust/source-type defaults.
///
/// Single-root and multi-source configs share this one code path, which is
/// what the ingest pipeline iterates over.
pub fn resolved_sources(&self) -> Vec<ResolvedSource> {
    let base = self.root_resolution_base();
    let workspace = &self.workspace;

    // Legacy single-root config: synthesize the implicit `default` source.
    if workspace.sources.is_empty() {
        let implicit = ResolvedSource {
            id: DEFAULT_SOURCE_ID.to_string(),
            root: paths::expand_path_with_base(&self.primary_root_raw(), "", &base),
            exclude: workspace.exclude.clone(),
            trust_level: None,
            source_type: None,
        };
        return vec![implicit];
    }

    let mut resolved = Vec::with_capacity(workspace.sources.len());
    for src in &workspace.sources {
        // Workspace-wide globs first, then the source's own additions.
        let merged_exclude: Vec<String> = workspace
            .exclude
            .iter()
            .chain(src.exclude.iter())
            .cloned()
            .collect();
        resolved.push(ResolvedSource {
            id: src.id.clone(),
            root: paths::expand_path_with_base(&src.root, "", &base),
            exclude: merged_exclude,
            trust_level: src.trust_level,
            source_type: src.source_type,
        });
    }
    resolved
}
/// Read config from disk and merge env overrides on top of it. If the
@@ -1019,10 +1156,41 @@ impl Config {
cause: format!("parse_failed: {e}"),
})
})?;
cfg.validate_sources().map_err(|cause| {
anyhow::Error::new(ConfigInvalid {
path: path.to_path_buf(),
cause,
})
})?;
cfg.source_dir = path.parent().map(Path::to_path_buf);
Ok(cfg)
}
/// Validate `[[workspace.sources]]`.
///
/// Rules, checked per entry in declaration order (the first violation is
/// reported):
/// - `id` must not be empty or whitespace-only;
/// - `root` must not be empty or whitespace-only;
/// - `id` must be unique across the workspace. Uniqueness is compared on
///   the *trimmed* id, consistent with the emptiness check, so two ids that
///   differ only by surrounding whitespace (`"notes"` vs `"notes "`) are
///   rejected as duplicates rather than accepted as two visually
///   indistinguishable sources.
///
/// An empty `sources` list (legacy single-root config) is always valid.
///
/// Returns `Err` with the failure cause string used for `ConfigInvalid`.
fn validate_sources(&self) -> Result<(), String> {
    let mut seen = std::collections::HashSet::new();
    for s in &self.workspace.sources {
        let id = s.id.trim();
        if id.is_empty() {
            return Err("workspace.sources: an entry has an empty `id`".to_string());
        }
        if s.root.trim().is_empty() {
            return Err(format!(
                "workspace.sources: source `{}` has an empty `root`",
                s.id
            ));
        }
        // `insert` returns false when the (trimmed) id was already present.
        if !seen.insert(id) {
            return Err(format!(
                "workspace.sources: duplicate source id `{}` (ids must be unique)",
                s.id
            ));
        }
    }
    Ok(())
}
/// Apply `KEBAB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
///
/// The mapping is an explicit grep-friendly whitelist — one match arm
@@ -1037,7 +1205,7 @@ impl Config {
}
match k.as_str() {
// workspace
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = Some(v.clone()),
// storage
"KEBAB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(),
@@ -2034,7 +2202,7 @@ max_context_tokens = 8000
#[test]
fn legacy_include_field_is_ignored_silently() {
let mut cfg = Config::defaults();
cfg.workspace.root = "/tmp/kebab-legacy".to_string();
cfg.workspace.root = Some("/tmp/kebab-legacy".to_string());
let mut toml_text = toml::to_string(&cfg).expect("default round-trips");
// Inject a legacy `include = [...]` line into the [workspace] block.
toml_text = toml_text.replace(
@@ -2048,20 +2216,105 @@ max_context_tokens = 8000
parsed.err()
);
let cfg = parsed.unwrap();
assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy");
assert_eq!(cfg.workspace.root.as_deref(), Some("/tmp/kebab-legacy"));
}
/// p9-fb-25: `WorkspaceCfg` must NOT have an `include` field.
/// Compile-time proof: exhaustive destructure.
#[test]
fn workspace_cfg_has_only_root_and_exclude_fields() {
fn workspace_cfg_has_only_root_exclude_sources_fields() {
let ws = Config::defaults().workspace;
let WorkspaceCfg {
root: _,
exclude: _,
sources: _,
} = &ws;
}
#[test]
fn legacy_single_root_normalizes_to_default_source() {
    // With no [[workspace.sources]] declared, a bare `workspace.root` must
    // surface as exactly one source: id = DEFAULT_SOURCE_ID, rooted at that
    // path, with no per-source trust default.
    let mut cfg = Config::defaults();
    cfg.workspace.root = Some("/tmp/kb-notes".to_string());

    let sources = cfg.resolved_sources();
    assert_eq!(sources.len(), 1);

    let only = &sources[0];
    assert_eq!(only.id, DEFAULT_SOURCE_ID);
    assert_eq!(only.root, std::path::PathBuf::from("/tmp/kb-notes"));
    assert_eq!(only.trust_level, None);
}
#[test]
fn multi_source_config_resolves_each_with_merged_exclude() {
    let notes = SourceCfg {
        id: "notes".to_string(),
        root: "/tmp/notes".to_string(),
        exclude: vec![],
        trust_level: Some(kebab_core::TrustLevel::Primary),
        source_type: None,
    };
    let refs = SourceCfg {
        id: "refs".to_string(),
        root: "/tmp/refs".to_string(),
        exclude: vec!["draft/**".to_string()],
        trust_level: Some(kebab_core::TrustLevel::Secondary),
        source_type: Some(kebab_core::SourceType::Reference),
    };

    let mut authored = Config::defaults();
    authored.workspace.root = None;
    authored.workspace.exclude = vec![".git/**".to_string()];
    authored.workspace.sources = vec![notes, refs];

    // A config that declares only sources (no bare root) must survive a
    // TOML serialize → parse round trip and still validate.
    let toml_text = toml::to_string(&authored).expect("multi-source serializes");
    let reloaded: Config = toml::from_str(&toml_text).expect("multi-source parses");
    reloaded.validate_sources().expect("valid sources");

    let resolved = reloaded.resolved_sources();
    assert_eq!(resolved.len(), 2);
    let (first, second) = (&resolved[0], &resolved[1]);

    // First source has no exclude of its own: only workspace.exclude.
    assert_eq!(first.id, "notes");
    assert_eq!(first.root, std::path::PathBuf::from("/tmp/notes"));
    assert_eq!(first.exclude, vec![".git/**".to_string()]);
    assert_eq!(first.trust_level, Some(kebab_core::TrustLevel::Primary));

    // Second source: workspace.exclude followed by its per-source exclude.
    assert_eq!(second.id, "refs");
    assert_eq!(
        second.exclude,
        vec![".git/**".to_string(), "draft/**".to_string()]
    );
    assert_eq!(
        second.source_type,
        Some(kebab_core::SourceType::Reference)
    );
    assert_eq!(
        second.trust_level,
        Some(kebab_core::TrustLevel::Secondary)
    );
}
/// Test helper: a `SourceCfg` with the given `id` and `root`, no per-source
/// excludes and no trust-level / source-type defaults.
fn source_cfg(id: &str, root: &str) -> SourceCfg {
    let (id, root) = (id.to_owned(), root.to_owned());
    SourceCfg {
        id,
        root,
        exclude: Vec::new(),
        trust_level: None,
        source_type: None,
    }
}
#[test]
fn duplicate_source_ids_rejected() {
    // Two sources sharing the id "dup" (different roots) must not validate.
    let first = source_cfg("dup", "/a");
    let second = source_cfg("dup", "/b");

    let mut cfg = Config::defaults();
    cfg.workspace.sources = vec![first, second];

    let outcome = cfg.validate_sources();
    assert!(outcome.is_err(), "duplicate ids must fail");
}
#[test]
fn empty_source_id_rejected() {
    // A source whose id is the empty string must not validate.
    let nameless = source_cfg("", "/a");

    let mut cfg = Config::defaults();
    cfg.workspace.sources = vec![nameless];

    let outcome = cfg.validate_sources();
    assert!(outcome.is_err(), "empty id must fail");
}
#[test]
fn default_stale_threshold_is_30() {
let c = Config::defaults();

View File

@@ -9,7 +9,7 @@ use toml_edit::{DocumentMut, Item};
/// 현재 바이너리가 이해하는 config 스키마 버전. 마이그레이션 완료 시
/// 사용자 파일의 `schema_version` 을 이 값으로 stamp 한다.
pub const CURRENT_SCHEMA_VERSION: u32 = 3;
pub const CURRENT_SCHEMA_VERSION: u32 = 4;
/// 한 번의 마이그레이션에서 발생한 개별 변경.
#[derive(Clone, Debug, PartialEq, serde::Serialize)]
@@ -68,6 +68,7 @@ const HEADER: &str = "\
fn section_comment(path: &str) -> Option<&'static str> {
Some(match path {
"workspace" => "# 색인 대상 워크스페이스.",
"workspace.sources" => "# named multi-source (각 source 의 id 가 documents.source_id 로 stamp).",
"storage" => "# XDG 저장 경로(데이터/sqlite/벡터/에셋/모델).",
"indexing" => "# 병렬도 + 파일시스템 watch.",
"chunking" => "# 청크 크기·오버랩·heading 존중.",
@@ -376,6 +377,39 @@ pub fn step_2_to_3(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
copy_image_paddle_to_pdf(doc);
}
/// v3 → v4: mirror the single `workspace.root` into `[[workspace.sources]]`
/// as the implicit `default` source (`id = "default"`, `root = <existing root>`).
///
/// The existing `workspace.root` key is left in place: `resolved_sources()`
/// prefers `sources` when any are declared, so the leftover key is harmless,
/// and keeping it stops the defaults reconcile from trying to add `root` back.
///
/// The step is a no-op when:
/// - there is no `[workspace]` table;
/// - `workspace.sources` already exists in any form (this is what makes the
///   step idempotent);
/// - `workspace.root` is missing or not a string;
/// - `workspace.root` is blank. Mirroring a blank root would create a
///   `default` source with an empty `root`, which source validation rejects,
///   turning a config that loaded under v3 into one that fails to load.
///
/// On success one `AddedSection` change is appended to `changes`.
pub fn step_3_to_4(doc: &mut DocumentMut, changes: &mut Vec<MigrationChange>) {
    let Some(ws) = doc.get_mut("workspace").and_then(Item::as_table_mut) else {
        return;
    };
    // Sources already declared (array-of-tables or inline): leave untouched.
    if ws.contains_key("sources") {
        return;
    }
    // No string root means there is nothing to mirror (defaults always carry
    // one, but be defensive).
    let Some(root_val) = ws.get("root").and_then(Item::as_str).map(str::to_string) else {
        return;
    };
    // A blank root cannot become a valid source entry; keep the legacy
    // single-root form instead of emitting an entry that fails validation.
    if root_val.trim().is_empty() {
        return;
    }

    let mut entry = toml_edit::Table::new();
    entry.insert("id", toml_edit::value("default"));
    entry.insert("root", toml_edit::value(root_val));
    let mut aot = toml_edit::ArrayOfTables::new();
    aot.push(entry);
    ws.insert("sources", Item::ArrayOfTables(aot));

    changes.push(MigrationChange {
        kind: ChangeKind::AddedSection,
        path: "workspace.sources".to_string(),
        detail: "workspace.root → [[workspace.sources]] id=default".to_string(),
    });
}
/// 파일의 schema_version(없으면 1) 부터 CURRENT 까지 step 적용.
fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange>) {
if from < 2 {
@@ -384,6 +418,9 @@ fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec<MigrationChange
if from < 3 {
step_2_to_3(doc, changes);
}
if from < 4 {
step_3_to_4(doc, changes);
}
}
/// 사용자 config.toml 텍스트를 받아 step 체인 + reconciliation + version
@@ -648,6 +685,76 @@ engine = \"paddle-onnx\"
assert!(again.is_empty(), "not idempotent: {again:?}");
}
#[test]
fn step_3_to_4_mirrors_root_into_default_source() {
// A v3 single-root config: the step must add a `default` source that
// mirrors `workspace.root`, keep the original key, and be idempotent.
let v3 = "\
schema_version = 3
[workspace]
root = \"/my/notes\"
exclude = [\".git/**\"]
";
let mut doc: DocumentMut = v3.parse().unwrap();
let mut changes = Vec::new();
step_3_to_4(&mut doc, &mut changes);
let out = doc.to_string();
// A new array-of-tables entry is added with id=default.
assert!(out.contains("[[workspace.sources]]"), "{out}");
assert!(out.contains("id = \"default\""), "{out}");
// The existing root is preserved (so reconcile does not re-add it).
assert!(out.contains("root = \"/my/notes\""), "{out}");
// After re-parsing, the first source entry mirrors the root.
let reparsed: DocumentMut = out.parse().unwrap();
let src0 = reparsed["workspace"]["sources"][0].as_table().unwrap();
assert_eq!(src0["id"].as_str(), Some("default"));
assert_eq!(src0["root"].as_str(), Some("/my/notes"));
// Idempotent: a second run on the same document records no changes.
let mut changes2 = Vec::new();
step_3_to_4(&mut doc, &mut changes2);
assert!(changes2.is_empty(), "step_3_to_4 not idempotent");
}
#[test]
fn step_3_to_4_noop_when_sources_already_present() {
// A config that already declares a source must be left untouched.
let v4 = "\
schema_version = 4
[workspace]
root = \"/my/notes\"
exclude = []
[[workspace.sources]]
id = \"notes\"
root = \"/my/notes\"
";
let mut doc: DocumentMut = v4.parse().unwrap();
let mut changes = Vec::new();
step_3_to_4(&mut doc, &mut changes);
assert!(changes.is_empty(), "must not touch existing sources");
// Only the pre-existing source remains (no `default` entry was added).
assert!(!doc.to_string().contains("id = \"default\""));
}
#[test]
fn migrate_document_v3_to_v4_adds_sources_and_is_idempotent() {
// End-to-end through `migrate_document`: a v3 document is stamped as v4
// and gains `[[workspace.sources]]`.
let v3 = "\
schema_version = 3
[workspace]
root = \"/n\"
exclude = []
";
let outcome = migrate_document(v3);
assert_eq!(outcome.from_schema_version, 3);
assert_eq!(outcome.to_schema_version, 4);
assert!(outcome.changed());
assert!(outcome.new_text.contains("[[workspace.sources]]"));
assert_eq!(read_schema_version(&outcome.new_text), 4);
// Idempotent: migrating the migrated text changes nothing.
let again = migrate_document(&outcome.new_text);
assert!(!again.changed(), "not idempotent: {:?}", again.changes);
assert_eq!(again.new_text, outcome.new_text);
}
#[test]
fn migrate_document_missing_schema_version_treated_as_v1() {
let old = "[workspace]\nroot = \"/n\"\n";

View File

@@ -11,11 +11,16 @@ const USER_V2: &str = include_str!("fixtures/user_v2_config.toml");
fn user_v2_migrates_losslessly() {
let out = migrate_document(USER_V2);
assert_eq!(out.from_schema_version, 2);
assert_eq!(out.to_schema_version, 3);
// v2 → CURRENT(=4): v3 의 [ingest.*] relocation 에 더해 v4 의
// [[workspace.sources]] default source 미러링까지 적용된다.
assert_eq!(out.to_schema_version, 4);
let t = &out.new_text;
// 사용자 값 보존.
assert!(t.contains("root = \"/Users/user/Obsidian/Default\""), "{t}");
// v4: workspace.root → [[workspace.sources]] id=default 미러링.
assert!(t.contains("[[workspace.sources]]"), "v4 sources 누락:\n{t}");
assert!(t.contains("id = \"default\""), "default source 누락:\n{t}");
assert!(t.contains("model = \"snowflake-arctic-embed2\""));
assert!(t.contains("endpoint = \"http://192.168.0.2:11943\""));
// 사용자 주석/대안 줄 보존.

View File

@@ -36,6 +36,14 @@ pub struct Metadata {
/// for markdown / pdf / image. Set by the local-filesystem source connector during ingest.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub code_lang: Option<String>,
/// `[[workspace.sources]]`: id of the named source this document was
/// ingested from (the `id` of the matching `[[workspace.sources]]`
/// entry; `"default"` for single-root workspaces normalized to the
/// implicit `default` source). null on documents ingested before the
/// multi-source feature; the store column defaults to `"default"`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_id: Option<String>,
}
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
@@ -105,12 +113,14 @@ mod tests {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
let v = serde_json::to_value(&m).unwrap();
assert!(v.get("repo").is_none());
assert!(v.get("git_branch").is_none());
assert!(v.get("git_commit").is_none());
assert!(v.get("code_lang").is_none());
assert!(v.get("source_id").is_none());
}
#[test]
@@ -128,8 +138,10 @@ mod tests {
git_branch: Some("main".into()),
git_commit: Some("a".repeat(40)),
code_lang: Some("rust".into()),
source_id: Some("notes".into()),
};
let v = serde_json::to_value(&m).unwrap();
assert_eq!(v["source_id"], "notes");
assert_eq!(v["repo"], "kebab");
assert_eq!(v["git_branch"], "main");
assert_eq!(v["git_commit"].as_str().unwrap().len(), 40);

View File

@@ -69,6 +69,20 @@ pub struct SearchFilters {
/// Unknown values produce empty hits (consistent with `media` policy).
#[serde(default)]
pub code_lang: Vec<String>,
/// Phase-2 (jira-contamination experiment): filter by `documents.source_type`
/// (`markdown` | `note` | `paper` | `reference` | `inbox`). Empty = no filter;
/// multi-value = OR. Direct indexed column — the clean provenance/source lever:
/// filtering recovers concept-query precision without the see-saw of global
/// trust-weighting (see tasks/HOTFIXES.md A/B evidence).
#[serde(default)]
pub source_type: Vec<String>,
/// `[[workspace.sources]]`: filter by `documents.source_id` (the `id` of
/// the `[[workspace.sources]]` entry a document was ingested from; e.g.
/// `default`, `notes`, `code`). Empty = no filter; multi-value = OR.
/// Direct indexed column (idx_docs_source_id) — the named-source
/// provenance lever for multi-source KBs.
#[serde(default)]
pub source_id: Vec<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]

View File

@@ -107,6 +107,8 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult {
doc_id: input.doc_id.clone().map(kebab_core::DocumentId),
repo: vec![],
code_lang: vec![],
source_type: vec![],
source_id: vec![],
};
let query = kebab_core::SearchQuery {

View File

@@ -10,7 +10,7 @@ async fn schema_tool_emits_error_v1_when_db_missing() {
let dir = tempfile::tempdir().unwrap();
let mut cfg = Config::defaults();
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned();
cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned());
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
// Note: NO ingest call — kebab.sqlite is absent → schema_with_config

View File

@@ -10,7 +10,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -27,7 +27,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -12,7 +12,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -9,10 +9,10 @@ async fn doctor_tool_returns_doctor_v1_json() {
let dir = tempfile::tempdir().unwrap();
let mut cfg = Config::defaults();
cfg.storage.data_dir = dir.path().join("data").to_string_lossy().into_owned();
cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned();
cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned());
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
std::fs::create_dir_all(&cfg.workspace.root).unwrap();
std::fs::create_dir_all(cfg.resolve_workspace_root()).unwrap();
// Pass None for config_path — doctor falls back to XDG default probe
// (path won't exist in the tempdir, which is fine; doctor reports it

View File

@@ -16,7 +16,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -15,7 +15,7 @@ async fn ingest_file_tool_returns_ingest_report_v1() {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;
@@ -61,7 +61,7 @@ async fn ingest_file_tool_idempotent_on_second_call() {
std::fs::create_dir_all(&data).unwrap();
let mut cfg = kebab_config::Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -14,7 +14,7 @@ fn fresh_state(dir: &std::path::Path) -> KebabAppState {
fs::create_dir_all(&data).unwrap();
let mut cfg = Config::defaults();
cfg.workspace.root = workspace.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace.to_string_lossy().into_owned());
cfg.storage.data_dir = data.to_string_lossy().into_owned();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
let mut cfg = Config::defaults();
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned());
cfg.workspace.exclude.clear();
cfg.models.embedding.provider = "none".to_string();
cfg.models.embedding.dimensions = 0;

View File

@@ -131,6 +131,7 @@ impl Extractor for CAstExtractor {
git_branch,
git_commit,
code_lang: Some("c".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -155,6 +155,7 @@ impl Extractor for CppAstExtractor {
git_branch,
git_commit,
code_lang: Some("cpp".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -133,6 +133,7 @@ impl Extractor for GoAstExtractor {
git_branch,
git_commit,
code_lang: Some("go".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -144,6 +144,7 @@ impl Extractor for JavaAstExtractor {
git_branch,
git_commit,
code_lang: Some("java".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -151,6 +151,7 @@ impl Extractor for JavascriptAstExtractor {
git_branch,
git_commit,
code_lang: Some("javascript".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -149,6 +149,7 @@ impl Extractor for KotlinAstExtractor {
git_branch,
git_commit,
code_lang: Some("kotlin".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -133,6 +133,7 @@ impl Extractor for PythonAstExtractor {
git_branch,
git_commit,
code_lang: Some("python".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -136,6 +136,7 @@ impl Extractor for RustAstExtractor {
git_branch,
git_commit,
code_lang: Some("rust".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -144,6 +144,7 @@ impl Extractor for TypescriptAstExtractor {
git_branch,
git_commit,
code_lang: Some("typescript".to_string()),
source_id: None,
};
tracing::debug!(

View File

@@ -203,6 +203,7 @@ impl Extractor for ImageExtractor {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
tracing::debug!(

View File

@@ -42,6 +42,16 @@ pub struct BodyHints {
/// Optional language fallback used when neither frontmatter nor lingua
/// detection produce a value. If `None` the final fallback is `"und"`.
pub fallback_lang: Option<String>,
/// `[[workspace.sources]]`: id of the source this document is being
/// ingested from. Copied verbatim into `Metadata.source_id` (frontmatter
/// does not override the source id — it is an ingest-time provenance
/// stamp, not a user-authored field). `None` when single-root /
/// unspecified, in which case `Metadata.source_id` stays `None`.
pub source_id: Option<String>,
/// `[[workspace.sources]]`: per-source default `trust_level`. Consulted
/// only when the frontmatter does not specify `trust_level`. Precedence:
/// frontmatter > this source default > hardcoded `Primary`.
pub fallback_trust_level: Option<TrustLevel>,
}
/// Byte range of the frontmatter region inside the input slice.
@@ -444,8 +454,12 @@ fn derive_metadata(
};
// ---- trust_level ----
// Precedence: frontmatter > per-source default (hints.fallback_trust_level)
// > hardcoded Primary. An *unknown* frontmatter value warns and also falls
// through to the source default (then Primary), so a typo doesn't silently
// promote past the source's intended trust.
let trust_level = match raw.trust_level.as_deref() {
None => TrustLevel::Primary,
None => hints.fallback_trust_level.unwrap_or(TrustLevel::Primary),
Some(s) => {
if let Some(tl) = parse_trust_level(s) {
tl
@@ -454,7 +468,7 @@ fn derive_metadata(
kind: WarningKind::MalformedFrontmatter,
note: format!("unknown trust_level={s}, defaulted to primary"),
});
TrustLevel::Primary
hints.fallback_trust_level.unwrap_or(TrustLevel::Primary)
}
}
};
@@ -477,6 +491,10 @@ fn derive_metadata(
git_branch: None,
git_commit: None,
code_lang: None,
// `[[workspace.sources]]`: ingest-time provenance stamp. Frontmatter
// does not override the source id — it is supplied by the caller
// (kebab-app) from the matching source's config `id`.
source_id: hints.source_id.clone(),
}
}
@@ -604,6 +622,8 @@ mod tests {
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
fallback_lang: None,
source_id: None,
fallback_trust_level: None,
}
}
@@ -695,6 +715,47 @@ source_type: alien\n\
assert!(warns.iter().any(|w| w.note.contains("source_type=alien")));
}
/// Test helper: the baseline `hints()` fixture with the two
/// `[[workspace.sources]]` fields overridden — `source_id` is set to `id`
/// and `fallback_trust_level` to `trust`. Every other field keeps the value
/// `hints()` provides.
fn hints_with_source(id: &str, trust: Option<TrustLevel>) -> BodyHints {
    let mut base = hints();
    base.source_id = Some(id.to_owned());
    base.fallback_trust_level = trust;
    base
}
#[test]
fn source_default_trust_applied_when_frontmatter_absent() {
    // The frontmatter below has no `trust_level:` key, so the per-source
    // default supplied through the hints must be used instead of the
    // hardcoded Primary.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let parsed = parse_frontmatter(b"---\ntitle: Doc\n---\nbody\n", &source_hints);
    let (meta, _span, warns) = parsed.unwrap();

    assert!(warns.is_empty(), "warnings: {warns:?}");
    assert_eq!(meta.trust_level, TrustLevel::Secondary);
    // The source id is stamped from the hints as well.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
#[test]
fn frontmatter_trust_overrides_source_default() {
    // An explicit `trust_level:` in the frontmatter takes precedence over
    // the per-source default carried by the hints.
    let source_hints = hints_with_source("notes", Some(TrustLevel::Secondary));
    let parsed = parse_frontmatter(b"---\ntrust_level: generated\n---\nbody\n", &source_hints);
    let (meta, _span, _warns) = parsed.unwrap();

    assert_eq!(meta.trust_level, TrustLevel::Generated);
    // The source id still comes from the hints, not from the frontmatter.
    assert_eq!(meta.source_id.as_deref(), Some("notes"));
}
#[test]
fn no_source_id_leaves_metadata_source_id_none() {
    // Baseline hints carry neither a source id nor a source-level trust
    // default.
    let plain_hints = hints();
    let (meta, _span, _warns) =
        parse_frontmatter(b"---\ntitle: Doc\n---\nbody\n", &plain_hints).unwrap();

    // With no source default in play, the hardcoded Primary still applies.
    assert_eq!(meta.trust_level, TrustLevel::Primary);
    assert!(meta.source_id.is_none());
}
#[test]
fn malformed_yaml_emits_warning_and_defaults() {
// Unclosed quote → YAML parse fails.

View File

@@ -469,6 +469,7 @@ mod tests {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
}
}

View File

@@ -37,6 +37,8 @@ fn pinned_hints() -> BodyHints {
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
fallback_lang: None,
source_id: None,
fallback_trust_level: None,
}
}

View File

@@ -86,6 +86,8 @@ fn code_and_table_canonical_snapshot() {
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: Some("en".into()),
source_id: None,
fallback_trust_level: None,
};
let (metadata, fm_span, _fm_warns) =
parse_frontmatter(&bytes, &hints).expect("frontmatter parses");

View File

@@ -203,6 +203,7 @@ impl Extractor for PdfTextExtractor {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
tracing::debug!(

View File

@@ -419,6 +419,31 @@ fn run_query(
}
}
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
// column). Empty Vec = no filter; multi-value = OR. Mirrors filters.rs.
if !filters.source_type.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
for st in &filters.source_type {
params.push(Box::new(st.clone()));
}
}
// [[workspace.sources]]: source_id filter (IN-list on the direct
// `documents.source_id` column). Empty Vec = no filter; multi-value = OR.
// Mirrors filters.rs.
if !filters.source_id.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
for sid in &filters.source_id {
params.push(Box::new(sid.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
// fb-32 ingest path), so lexicographic >= compare is correct — but only

View File

@@ -231,6 +231,47 @@ pub struct FsScanSkips {
pub events: Vec<FsSkipEvent>,
}
impl FsScanSkips {
    /// `[[workspace.sources]]`: absorb the skip report of another source's
    /// scan into `self`, so that a multi-source ingest reports aggregate
    /// numbers.
    ///
    /// * Every `skipped_*` counter is summed with saturating arithmetic.
    /// * Each per-category sample list takes entries from `other` only
    ///   until it holds five items (the cap from spec §5.5); entries already
    ///   in `self` are kept and come first.
    /// * `other.events` are appended after the existing events.
    pub fn merge(&mut self, other: FsScanSkips) {
        /// Upper bound on the sample paths kept per skip category.
        const SAMPLE_CAP: usize = 5;

        /// Move entries from `src` onto the end of `dst`, stopping once
        /// `dst` holds `SAMPLE_CAP` items. A `dst` that is already at (or
        /// above) the cap is left untouched.
        fn merge_samples(dst: &mut Vec<String>, src: Vec<String>) {
            let room = SAMPLE_CAP.saturating_sub(dst.len());
            dst.extend(src.into_iter().take(room));
        }

        // Counters.
        self.skipped_gitignore = self.skipped_gitignore.saturating_add(other.skipped_gitignore);
        self.skipped_kebabignore = self
            .skipped_kebabignore
            .saturating_add(other.skipped_kebabignore);
        self.skipped_builtin_blacklist = self
            .skipped_builtin_blacklist
            .saturating_add(other.skipped_builtin_blacklist);
        self.skipped_generated = self.skipped_generated.saturating_add(other.skipped_generated);
        self.skipped_size_exceeded = self
            .skipped_size_exceeded
            .saturating_add(other.skipped_size_exceeded);

        // Capped sample lists, one per category.
        let theirs = other.skip_examples;
        let mine = &mut self.skip_examples;
        merge_samples(&mut mine.generated, theirs.generated);
        merge_samples(&mut mine.size_exceeded, theirs.size_exceeded);
        merge_samples(&mut mine.builtin_blacklist, theirs.builtin_blacklist);
        merge_samples(&mut mine.gitignore, theirs.gitignore);

        // Per-file events are not capped here.
        self.events.extend(other.events);
    }
}
/// A single per-file skip event for structured ingest log (v0.20.x).
#[derive(Debug)]
pub struct FsSkipEvent {
@@ -326,7 +367,7 @@ mod tests {
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.root = Some(root.to_string());
c.workspace.exclude.clear();
c
}

View File

@@ -20,7 +20,7 @@ use kebab_source_fs::FsSourceConnector;
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.root = Some(root.to_string());
c.workspace.exclude.clear();
// Disable size / generated caps so small test files always pass.
c.ingest.code.max_file_bytes = u64::MAX;

View File

@@ -50,7 +50,7 @@ fn baseline_path() -> PathBuf {
fn cfg_for_fixture(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.root = Some(root.to_string());
// Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`)
// so the snapshot is purely a function of the fixture + .kebabignore +
// baked-in default-excludes.

View File

@@ -23,7 +23,7 @@ use kebab_source_fs::FsSourceConnector;
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.root = Some(root.to_string());
c.workspace.exclude.clear();
c
}

View File

@@ -745,6 +745,14 @@ fn upsert_document(
// `markdown` for the column).
let source_type = source_type_label(&doc.metadata.source_type);
let trust_level = trust_level_label(&doc.metadata.trust_level);
// `[[workspace.sources]]`: id of the source this doc came from. Falls back
// to the column default `"default"` for docs without an explicit source
// (single-root workspaces / pre-multi-source ingests).
let source_id = doc
.metadata
.source_id
.as_deref()
.unwrap_or(kebab_config::DEFAULT_SOURCE_ID);
let created_at = doc
.metadata
.created_at
@@ -757,11 +765,11 @@ fn upsert_document(
tx.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang,
source_type, trust_level, parser_version,
source_type, trust_level, source_id, parser_version,
doc_version, schema_version, metadata_json,
provenance_json, created_at, updated_at,
last_chunker_version, last_embedding_version
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(doc_id) DO UPDATE SET
asset_id = excluded.asset_id,
workspace_path = excluded.workspace_path,
@@ -769,6 +777,7 @@ fn upsert_document(
lang = excluded.lang,
source_type = excluded.source_type,
trust_level = excluded.trust_level,
source_id = excluded.source_id,
parser_version = excluded.parser_version,
-- doc_version: bump on update. excluded.doc_version is the
-- caller's submitted value; we ignore it and add 1 to the
@@ -788,6 +797,7 @@ fn upsert_document(
doc.lang.0,
source_type,
trust_level,
source_id,
doc.parser_version.0,
i64::from(doc.doc_version),
i64::from(doc.schema_version),

View File

@@ -191,6 +191,31 @@ impl SqliteStore {
}
}
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
// column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR.
if !filters.source_type.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
for st in &filters.source_type {
bind.push(Box::new(st.clone()));
}
}
// [[workspace.sources]]: source_id filter (IN-list on the direct
// `documents.source_id` column, idx_docs_source_id). Empty Vec = no
// filter; multi-value = OR. Mirrors the source_type filter above.
if !filters.source_id.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
for sid in &filters.source_id {
bind.push(Box::new(sid.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
// lexicographic >= compare is correct — but only when the filter
@@ -1000,6 +1025,121 @@ mod tests {
);
}
/// [[workspace.sources]]: `filter_chunks` with a non-empty
/// `SearchFilters.source_id` must return only the chunks whose owning
/// document has a `documents.source_id` value contained in that list; an
/// empty list must not filter anything.
#[test]
fn filter_chunks_source_id_keeps_matching_source() {
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let (c1, c2, c3) = (
        "11111111111111111111111111111111",
        "22222222222222222222222222222222",
        "33333333333333333333333333333333",
    );

    // One document per chunk, each stamped with a different source id.
    for (chunk, doc, path, source) in [
        (c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "notes"),
        (c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "code/b.rs", "code"),
        (c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "x.md", "default"),
    ] {
        seed_with_source_id(&store, chunk, doc, path, source);
    }

    // Every case below filters the same three candidates.
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let surviving = |filters: SearchFilters| store.filter_chunks(&candidates, &filters).unwrap();
    let by_source = |ids: &[&str]| SearchFilters {
        source_id: ids.iter().map(|id| id.to_string()).collect(),
        ..Default::default()
    };

    // A single source id keeps exactly that source's chunk.
    assert_eq!(
        surviving(by_source(&["notes"])),
        vec![cid(c1)],
        "only the `notes` source chunk survives"
    );

    // Several source ids are OR-ed together.
    assert_eq!(
        surviving(by_source(&["notes", "code"])),
        vec![cid(c1), cid(c2)],
        "notes OR code survive"
    );

    // Default filters (empty `source_id`) leave every candidate in place.
    assert_eq!(
        surviving(SearchFilters::default()),
        vec![cid(c1), cid(c2), cid(c3)]
    );
}
/// Test fixture: seed one asset, one document, one chunk and one committed
/// embedding record, with the document's `documents.source_id` column set
/// explicitly to `source_id` (the column DEFAULT is `'default'`).
///
/// * `chunk_id` — id of the chunk row; its first 31 bytes also form the
///   embedding id.
/// * `doc_id` — id of the document row; its first 31 bytes also form the
///   asset id.
/// * `workspace_path` — stored on both the asset and the document row.
/// * `source_id` — value written to `documents.source_id`.
///
/// Panics if any insert or store call fails, or if `doc_id` / `chunk_id`
/// cannot be sliced at byte offset 31.
fn seed_with_source_id(
store: &SqliteStore,
chunk_id: &str,
doc_id: &str,
workspace_path: &str,
source_id: &str,
) {
// Synthetic asset id: the letter `a` followed by the first 31 bytes of
// `doc_id`.
let asset_id = format!("a{}", &doc_id[..31]);
// NOTE(review): the inner scope presumably exists so that whatever
// `lock_conn()` returns is dropped before the store methods further down
// are called — confirm against `SqliteStore`.
{
let conn = store.lock_conn();
// Parent asset row. `workspace_path` is bound three times: to the
// `workspace_path`, `checksum` and `storage_path` columns.
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?,
'1970-01-01T00:00:00Z')",
params![
asset_id,
format!("file://{workspace_path}"),
workspace_path,
workspace_path,
workspace_path,
],
)
.unwrap();
// Document row carrying the `source_id` under test; the remaining columns
// are fixed placeholder values.
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, source_id, parser_version, doc_version,
schema_version, metadata_json, provenance_json,
created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1',
1, 1, '{}', '{}', '1970-01-01T00:00:00Z',
'1970-01-01T00:00:00Z')",
params![doc_id, asset_id, workspace_path, source_id],
)
.unwrap();
// A single chunk owned by the document above.
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
'1970-01-01T00:00:00Z')",
params![chunk_id, doc_id],
)
.unwrap();
}
// Embedding record for the chunk; its id is the letter `e` followed by the
// first 31 bytes of `chunk_id`.
let embed_row = EmbeddingRecordRow {
embedding_id: format!("e{}", &chunk_id[..31]),
chunk_id: chunk_id.to_string(),
model_id: "m".to_string(),
model_version: "v1".to_string(),
dimensions: 4,
lance_table: "t".to_string(),
created_at: OffsetDateTime::UNIX_EPOCH,
};
// Write the record as pending first, then mark it committed.
store
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
.unwrap();
store
.mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id))
.unwrap();
}
#[test]
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
// Regression test for the non-UTC offset lex-compare bug.

View File

@@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result<Br
/// the LanceDB directory tree. Missing files / dir = 0.
pub fn index_bytes(data_dir: &Path) -> std::io::Result<IndexBytes> {
fn file_size_or_zero(p: &Path) -> u64 {
std::fs::metadata(p).map(|m| m.len()).unwrap_or(0)
std::fs::metadata(p).map_or(0, |m| m.len())
}
fn dir_walk_sum(p: &Path) -> std::io::Result<u64> {
if !p.exists() {

View File

@@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() {
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: Some("en".into()),
source_id: None,
fallback_trust_level: None,
};
let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap();
let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap();

View File

@@ -45,6 +45,7 @@ fn make_metadata() -> Metadata {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
}
}

View File

@@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
CanonicalDocument {
doc_id,

View File

@@ -58,6 +58,7 @@ fn make_doc(
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
let doc = CanonicalDocument {
doc_id,

View File

@@ -598,8 +598,7 @@ fn spawn_ask_worker(state: &mut App) {
fn make_conversation_id() -> String {
let nanos = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_nanos())
.unwrap_or(0);
.map_or(0, |d| d.as_nanos());
format!("conv_{nanos:032x}")
}

View File

@@ -34,11 +34,10 @@ pub fn start_ingest(app: &mut App) -> anyhow::Result<()> {
anyhow::bail!("ingest already running");
}
let cfg = app.config.clone();
let scope = SourceScope {
root: std::path::PathBuf::from(&cfg.workspace.root),
exclude: cfg.workspace.exclude.clone(),
..Default::default()
};
// [[workspace.sources]]: leave `scope.root` empty so the app iterates
// every configured source (`config.resolved_sources()`), mirroring the
// CLI `kebab ingest` path. Each source carries its own merged exclude.
let scope = SourceScope::default();
let (tx, rx) = mpsc::channel::<IngestEvent>();
let cancel = Arc::new(AtomicBool::new(false));
let cancel_for_worker = cancel.clone();

View File

@@ -304,10 +304,11 @@ pub fn handle_key_search(state: &mut App, key: KeyEvent) -> KeyOutcome {
// `terminal.clear()` couldn't happen — leaving the
// previous frame leaking through the new draw.
let editor = std::env::var("EDITOR").unwrap_or_else(|_| "vi".into());
// `~/...` / `${XDG_…}` expansion via `kebab-config::expand_path`
// — same helper used by the markdown / image / PDF ingest
// paths (HOTFIXES 2026-05-02 P9-4 follow-up).
let workspace_root = kebab_config::expand_path(&state.config.workspace.root, "");
// [[workspace.sources]]: resolve the primary workspace root
// (first source / legacy `root`). `resolve_workspace_root` applies
// the same `~` / `${XDG_…}` / relative-to-config expansion as the
// markdown / image / PDF ingest paths (HOTFIXES 2026-05-02 P9-4).
let workspace_root = state.config.resolve_workspace_root();
state.pending_editor = Some(crate::app::EditorRequest {
citation: citation.unwrap(),
editor_env: editor,

View File

@@ -19,7 +19,7 @@ use time::OffsetDateTime;
fn fresh_app() -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-ask-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-ask-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-ask-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = Pane::Ask;
// p9-fb-12 follow-up: mirror the run loop's auto-flip on pane

View File

@@ -12,7 +12,7 @@ use ratatui::layout::Rect;
fn fresh_app(focus: Pane) -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-cheatsheet-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = focus;
app

View File

@@ -23,7 +23,7 @@ use time::OffsetDateTime;
fn fresh_app() -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-inspect-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-inspect-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-inspect-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = Pane::Inspect;
app.inspect = Some(InspectState::default());
@@ -85,6 +85,7 @@ fn make_doc() -> CanonicalDocument {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
},
provenance: Provenance {
events: vec![ProvenanceEvent {

View File

@@ -9,7 +9,7 @@ use kebab_tui::{App, Mode, Pane, mode_intercept};
fn fresh_app(focus: Pane) -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-mode-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-mode-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-mode-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = focus;
app.mode = Mode::auto_for(focus);

View File

@@ -18,7 +18,7 @@ use std::path::Path;
fn fresh_app() -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-search-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-search-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-search-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = Pane::Search;
// p9-fb-12 follow-up: mirror the run loop's auto-flip — Search

View File

@@ -9,7 +9,7 @@ use ratatui::layout::Rect;
fn fresh_app(focus: Pane) -> App {
let mut config = Config::defaults();
config.storage.data_dir = "/tmp/kebab-tui-status-bar-tests-noop".to_string();
config.workspace.root = "/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string();
config.workspace.root = Some("/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string());
let mut app = App::new(config).expect("App::new");
app.focus = focus;
app

View File

@@ -34,6 +34,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab-
| RRF fusion_score | `[0, 1]` 정규화 — `2 / (k_rrf + 1)` 로 나눠 mode 간 비교 가능 (post-merge hotfix) |
| ~~doc-side expansion 별칭 (v0.21.0)~~ | **제거됨 (v0.25.0, HOTFIXES 2026-06-03)** — 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 완전히 제거. 별칭 ROI 음수(cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). V013 마이그레이션이 `chunk_aliases_fts` + `chunks.aliases` DROP. 기존 KB 의 잔존 별칭 벡터는 검색 시 `strip_alias_suffix` 로 본문 chunk 에 매핑(graceful)되거나 `kebab reset` 으로 정리. spec: `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`. |
| 파생물 캐시 `derivation_cache` (V012, v0.21.0) | 비싼 ingest 파생물(embedding 벡터)을 청크 **내용 해시** 키로 SQLite 에 캐싱 → 재색인 시 내용 불변 청크는 재계산 skip. `cache_key = blake3(kind ‖ text_blake3 ‖ version_key)[:32]`; version_key 에 model/dimensions 포함 → §9 cascade 와 정합(버전 bump 시 자동 miss). 위치 기반 `chunk_id` 와 달리 내용이 같으면 문서·위치 무관 동일 키. 순수 가산 — `corpus_revision` bump 안 함, 손상/삭제돼도 정확성 영향 0(miss → 재계산). search/ask 는 `kebab.sqlite`+`lancedb` 만으로 동작하므로 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능 (HOTFIXES 2026-05-31). (별칭 LLM 캐싱 kind 는 v0.25.0 에서 제거 — embedding kind 만 남음.) |
| provenance 출처 필터 (v0.29.0) | 혼합 출처 KB 의 레버 = **질의 시 출처 필터링** (전역 trust 가중 아님). config `[[workspace.sources]]`(각 id/root/trust_level/source_type) → `documents.source_id` 컬럼(V014, additive·DEFAULT `'default'`) stamp + 검색 `--source <id>` / `--source-type <type>`(lexical+vector 두 site, OR). 단일 root 는 implicit `default` source 로 정규화(config v3→v4 `step_3_to_4` 미러). per-source trust/type 는 frontmatter 부재 시 기본값(우선순위 frontmatter > source 기본값 > Primary). **전역 trust 곱셈가중(weighted-RRF)은 반증** — A/B 에서 θ=0.85 만으로 incident MRR 0.918→0.340 절벽(점수 압축), 작은 오염 잡으려다 큰 개선 버리는 see-saw 라 빌드 안 함. 필터는 see-saw 없음. (HOTFIXES 2026-06-21) |
| layout | XDG (`~/.local/share/kebab/`, `~/.config/kebab/`, …) |
전체 frozen 설계는 [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](superpowers/specs/2026-04-27-kebab-final-form-design.md) 12 sections 참조.
@@ -219,7 +220,7 @@ kebab/
│ ├── kebab-tui/ # Ratatui shell + Library 패널 (P9-1)
│ ├── kebab-mcp/ # stdio MCP server — tools: schema, doctor, search, ask (P9-FB-30)
│ └── kebab-cli/ # binary (P0 → 핫픽스로 --config flag wiring 강화)
├── migrations/ # SQLite refinery V001..V012 (V012 = derivation_cache, v0.21.0)
├── migrations/ # SQLite refinery V001..V014 (V012 = derivation_cache v0.21.0, V013 = drop chunk_aliases v0.25.0, V014 = documents.source_id v0.29.0)
└── fixtures/ # 테스트 fixture 트리
```

View File

@@ -397,6 +397,22 @@ $KB search 'tokenizer' --mode lexical --json | jq '.hits | length' # ≥ 1 if co
- `--lang` ISO code.
- `--path-glob` workspace_path glob.
### §2.4bis Source / provenance filters (`--source` / `--source-type`, v0.29.0)
```bash
# 출처 id 필터 ([[workspace.sources]] 의 id; 단일 root 는 "default").
"$RELEASE_BIN" search --config "$DOGFOOD/config.toml" "query" --source jira --json | jq '.hits | length'
# source_type 필터 (markdown/note/paper/reference/inbox).
"$RELEASE_BIN" search --config "$DOGFOOD/config.toml" "query" --source-type reference,markdown --json
```
**verify**:
- `--source` / `--source-type` repeatable + comma-sep, OR within.
- lexical · vector · hybrid 모든 모드에 동일 적용 (직접 인덱스 컬럼 `documents.source_id` / `source_type`).
- 모르는 값 → silently empty (no error).
- 멀티소스 KB 측정: `--source wiki` 가 개념 질의 오염 회복(MRR 0.780→0.810), `--source jira` 가 incident 0.918→0.975 (HOTFIXES 2026-06-21).
- trust precedence: `[[workspace.sources]]` 의 per-source `trust_level` 가 frontmatter 부재 시 적용 → `--trust-min primary` 와 조합 시 secondary source 배제.
### §2.5 Search pagination (p9-fb-34)
```bash

View File

@@ -290,6 +290,21 @@ kebab search "rust" --doc-id "<doc-id>" --tag rust --json
Bad `--ingested-after` → `error.v1.code = config_invalid`, exit 2.
Unknown `--media` value → silently empty (no error).
### Source filters (`--source` / `--source-type`)
````bash
# 단일 root 워크스페이스는 implicit `default` source 로 정규화되므로
# 모든 문서가 source_id="default" — 이 필터는 전체와 동일하다.
kebab search "rust" --source default --json | jq '.hits | length'
# source_type 필터 (frontmatter 의 source_type: 또는 source 기본값).
kebab search "rust" --source-type markdown,reference --json | jq '.hits | length'
````
멀티소스 KB 는 `[[workspace.sources]]` 로 명명 source 를 선언하면
`--source <id>` 로 출처를 좁힌다 (예: `--source jira` → jira 문서만).
빈 값 = 무필터, 콤마/반복 = OR. 모르는 값 → silently empty (no error).
### Trace + stats (fb-37)
Re-run a search with `--trace` to see per-stage candidate lists + timing:

View File

@@ -0,0 +1,104 @@
---
title: kebab v0.29.0 release notes (draft)
created: 2026-06-21
status: draft
release_trigger:
- 신규 CLI flag `kebab search --source` / `--source-type` — pre-1.0 minor bump
- 신규 config `[[workspace.sources]]` + V014 migration (documents.source_id) — 인터페이스 추가
- config schema v3→v4 (단일 root → implicit default source 미러)
---
# kebab v0.29.0 — provenance 출처 필터: 멀티소스 + `--source`
v0.28.0(config 스키마 재편) 후속 minor release. 혼합 출처 KB — 예컨대 위키
문서와 jira 이슈를 한 KB 에 같이 넣은 경우 — 에서 **색인은 전부 하되 질의 시
출처로 좁히는** provenance 레버를 추가한다. **기존 단일-폴더 사용자는 아무것도
손대지 않아도 된다** — 단일 `workspace.root` 는 자동으로 `default` 라는 하나의
source 로 정규화되고, 새 DB 컬럼은 기존 문서에 `default` 를 자동 채우는 additive
마이그레이션이라 재색인이 발생하지 않는다.
---
## 변경 사실
**1) 검색 출처 필터 2종.** `kebab search` 에 두 필터가 생겼다.
| flag | 의미 |
|---|---|
| `--source <id>` | `[[workspace.sources]]` 에 선언한 source 의 id 로 필터 (예: `--source jira`) |
| `--source-type <type>` | `markdown` / `note` / `paper` / `reference` / `inbox` 로 필터 |
둘 다 반복(`--source a --source b`) 또는 콤마(`--source a,b`)로 여러 값을 줄 수
있고 OR 로 묶인다. 빈 값 = 무필터. lexical · vector · hybrid **모든 검색 모드**에
동일하게 적용된다(직접 인덱스 컬럼 — 추가 비용 거의 없음).
**2) `[[workspace.sources]]` 멀티소스 config.** 단일 `[workspace] root` 대신
여러 폴더를 명명 source 로 선언할 수 있다.
```toml
[[workspace.sources]]
id = "notes"
root = "~/KnowledgeBase/notes"
[[workspace.sources]]
id = "jira"
root = "~/exports/jira"
trust_level = "secondary" # frontmatter 가 없을 때의 출처 기본 신뢰도
source_type = "reference" # frontmatter 가 없을 때의 출처 기본 타입
```
각 source 의 `id` 는 그 폴더에서 색인된 모든 문서에 stamp 되고
(`documents.source_id` 컬럼), `--source <id>` 필터의 대상이 된다.
`trust_level` / `source_type` 은 **출처 기본값**으로, 문서 frontmatter 가 해당
필드를 지정하지 않을 때만 적용된다 (우선순위: **frontmatter > source 기본값 >
하드코딩 Primary/Markdown**). `kebab ingest` 는 `--root` 를 주지 않으면 선언된
모든 source 를 각자의 root + exclude 로 순회한다.
## Trade-off — 왜 필터인가 (전역 trust 가중은 반증됨)
"출처가 섞이면 신뢰도로 가중하면 되지 않나?" 를 통제 실험으로 검증했고,
**전역 trust 곱셈가중은 반증됐다**. jira 를 docs KB 에 섞으면 개념 질의는 약하게
오염(top-3 정답은 유지, rank1→2 강등)되지만 운영/이슈 질의는 크게 개선된다
(jira_only hit@10 0/10 → 10/10). 그런데 jira 점수에 θ=0.85 만 곱해도 RAG 점수
압축 때문에 incident MRR 이 0.918→0.340 으로 절벽 하락한다 — 작은 오염을 잡으려다
큰 개선을 버리는 see-saw. 그래서 전역 가중은 **빌드하지 않았다**.
올바른 레버는 **질의 시 출처 필터링**이다: 색인은 전부 해 두고(운영 질의는
jira 가 답하게), 개념 질의에서만 `--source wiki` 로 좁힌다. see-saw 없이 양쪽을
다 얻는다.
## Mitigation — 기존 사용자 무영향 (재색인 0)
- **단일 root 그대로 동작**: `[[workspace.sources]]` 를 선언하지 않으면 기존
`workspace.root` 가 implicit `default` source 로 정규화된다. 모든 문서가
`source_id = "default"`.
- **V014 는 additive**: `documents.source_id` 컬럼은 `DEFAULT 'default'` 라 기존
row 가 자동으로 채워진다. 데이터 재작성·재색인·`corpus_revision` bump 없음.
- **config v3→v4 자동 변환**: load 시 메모리에서 자동 변환(디스크 미변경),
`kebab config migrate` 로 파일 갱신 시 값·주석 보존 + 멱등. 단일 `root` 는
`[[workspace.sources]]` id=default 로 **미러**되며 기존 `root` 키도 그대로 남는다.
도그푸딩(v0.29.0 release 빌드, 실험 corpus): 620 문서 / 0 error 색인,
`source_id = {jira: 400, wiki: 220}`. trust precedence 실측 — jira 는 source
기본값 secondary 라 `--trust-min primary` 시 0/6 노출, wiki 는 primary 유지.
출처 필터 실측 — `--source wiki` 로 개념 질의 MRR 0.780→0.810(오염 회복),
`--source jira` 로 incident 0.918→0.975.
## Upgrade 절차
1. 아무것도 안 해도 된다 — 기존 `config.toml` 과 KB 는 그대로 동작한다(단일 root
= `default` source, V014 자동 backfill).
2. 출처를 나누고 싶으면 `kebab config migrate` 로 config 를 v4 로 갱신한 뒤
`[[workspace.sources]]` 블록을 손으로 추가하고 `kebab ingest --force-reingest`
로 각 문서에 새 `source_id` 를 stamp 한다. (단순 `ingest` 는 내용이 안 바뀐
문서를 skip 하므로, 기존 문서의 source_id 를 `default` 외 값으로 바꾸려면
`--force-reingest` 필요.)
3. 검색에서 `--source <id>` / `--source-type <type>` 로 출처를 좁힌다.
## Known limitations / 다음
- **MCP search 도구**는 아직 `--source` / `--source-type` 를 노출하지 않는다
(CLI 전용). agent 용 MCP 필터 노출은 다음 additive 후보.
- **`kebab list`** 출력(`doc_summary.v1`)에 `source_id` 가 아직 안 실린다.
- **`kebab ask`** citation 에 provenance 라벨이 아직 없다 — 검색 필터는 되지만
답변 근거의 출처 표기는 다음 단계.

View File

@@ -0,0 +1,15 @@
-- V014: [[workspace.sources]] multi-source support.
--
-- Adds `documents.source_id`: the id of the `[[workspace.sources]]` entry a
-- document was ingested from. Single-root workspaces (and every pre-existing
-- row) get the implicit `default` id via the column DEFAULT — so this is a
-- backward-compatible additive migration (no data rewrite, no corpus_revision
-- bump required for existing chunks/embeddings).
--
-- The DEFAULT 'default' literal is kept in sync with
-- `kebab_config::DEFAULT_SOURCE_ID`. The index backs the `--source <id>`
-- search filter (SearchFilters.source_id → `d.source_id IN (...)`).
ALTER TABLE documents ADD COLUMN source_id TEXT NOT NULL DEFAULT 'default';
CREATE INDEX idx_docs_source_id ON documents(source_id);

Some files were not shown because too many files have changed in this diff Show More