diff --git a/Cargo.lock b/Cargo.lock index 231c2c6..60477fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4799,7 +4799,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -4817,7 +4817,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "clap", @@ -4838,7 +4838,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4854,7 +4854,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -4868,7 +4868,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -4882,7 +4882,7 @@ dependencies = [ [[package]] name = "kebab-embed-candle" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "candle-core", @@ -4902,7 +4902,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "fastembed", @@ -4915,7 +4915,7 @@ dependencies = [ [[package]] name = "kebab-embed-ollama" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-config", @@ -4930,7 +4930,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-app", @@ -4949,7 +4949,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-core", @@ -4958,7 +4958,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-config", @@ -4975,7 +4975,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-app", @@ -4993,7 +4993,7 @@ dependencies = [ [[package]] name = "kebab-nli" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "hf-hub", @@ -5008,7 +5008,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "gix", @@ -5031,7 +5031,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.28.0" +version = "0.29.0" dependencies = [ "ab_glyph", "anyhow", @@ -5059,7 +5059,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "kebab-core", @@ -5076,7 +5076,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -5091,7 +5091,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -5113,7 +5113,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "globset", @@ -5132,7 +5132,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -5150,7 +5150,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "blake3", @@ -5170,7 +5170,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "arrow", @@ -5194,7 +5194,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.28.0" +version = "0.29.0" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index 5e1ca25..bc350a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.28.0" # v0.28.0 — config 스키마 v2→v3 재편: 미디어 형식 설정을 `[ingest.*]` 우산으로 통합(`[indexing]`→`[ingest]` 스칼라, `[chunking]`/`[image.ocr]`/`[image.caption]`/`[pdf.ocr]`→`[ingest.*]`). 기존 v2 파일은 load 시 메모리 자동 변환(디스크 미변경), 파일 갱신은 `kebab config migrate`(값·주석 보존). env 이름(LHS) 100% 보존 + RHS 만 새 경로, 신규 `KEBAB_PDF_OCR_{DET_MODEL,REC_MODEL,DICT,SCORE_THRESH,UNCLIP_RATIO,MAX_BOXES}`. `ingest_config_signature` 바이트 불변(재색인 0). PdfOcrCfg paddle 대칭 키. 신규 인터페이스(config 레이아웃 rename + env 추가) → minor. — CLAUDE.md §Release +version = "0.29.0" # v0.29.0 — provenance 출처 필터: `[[workspace.sources]]` 멀티소스 + 검색 `--source ` / `--source-type `(lexical+vector 두 site, OR). `documents.source_id` 컬럼(V014, additive·DEFAULT 'default'·재색인 0) + config v3→v4 migration(`step_3_to_4`, 단일 root→implicit `default` source 미러, 멱등). per-source `trust_level`/`source_type` 기본값(우선순위 frontmatter > source 기본값 > Primary). 단일 root 사용자 무영향. 설계 근거: 전역 trust 곱셈가중(weighted-RRF)은 A/B 반증(incident MRR 절벽), 출처 필터가 see-saw 없는 레버. 신규 CLI flag + config 키 + migration → minor. — CLAUDE.md §Release # pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with # intentional allow-list. The allowed lints are either cosmetic (doc style), diff --git a/HANDOFF.md b/HANDOFF.md index 92a7638..cc64166 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만: +- **2026-06-21 provenance 출처 필터: `[[workspace.sources]]` 멀티소스 + `--source`/`--source-type`** — v0.29.0. 혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 레버. config `[[workspace.sources]]`(각 id/root/trust_level/source_type) + `documents.source_id` 컬럼(V014, additive, 재색인 0) + config v3→v4 migration(`step_3_to_4`, 단일 root→implicit `default` source, 멱등) + 검색 `--source ` / `--source-type `(lexical+vector 두 site, OR). trust precedence = frontmatter > per-source 기본값 > Primary. **설계 근거**: 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽) — 필터가 see-saw 없는 올바른 레버. 도그푸딩(620 doc, jira400+wiki220): `--source wiki` concept 0.780→0.810, `--source jira` incident 0.918→0.975. **follow-up**: MCP search 필터 미노출 · `kebab list` source_id 미표시 · RAG provenance 라벨 미구현. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-21). - **2026-06-04 PP-OCRv5 ONNX Rust 네이티브 OCR** — v0.27.0. `[image.ocr] engine = "paddle-onnx"` 로 PP-OCRv5(검출+인식) ONNX 를 in-process(`ort` =2.0.0-rc.9) 실행 — Python 런타임/원격 호출 없이 큰 페이지 CPU <4초(Ollama vision ~50초 대비). default 는 여전히 `"ollama-vision"`. 후처리(min-area rect/unclip)는 pure-Rust. **함정**: unclip 은 corner 를 centroid 에서 방사 확장하면 안 되고 edge 별 polygon offset 이어야 함(방사 확장 시 wide/short 텍스트 박스 높이가 안 커져 글자 윗부분 잘림 → ㄷ→ㄴ, e2e CER 0.26). 수정 후 CER 0.005. 모델 ONNX 는 `crates/kebab-parse-image/assets/paddleocr-onnx/`(LFS). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-04 PP-OCRv5 ONNX), spec/plan `docs/superpowers/{specs,plans}/2026-06-04-rust-native-ocr-*.md`. - **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`. - **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`. diff --git a/README.md b/README.md index 89b09af..10ddc17 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go | `kebab ingest []` | 워크스페이스 스캔 후 새/변경 문서 색인 (idempotent · incremental, `--force-reingest` 로 강제 재처리). 미지원 확장자는 자동 skip. 진행바는 현재 **파일명** · 느린 **phase(ocr/caption/embed)+모델명** · **경과초**`(Ns)` · 문서별 청크 수 · phase별 소요시간(parse/chunk/ocr/caption/embed/store)을 표시하고, 종료 시 **최장 소요 파일 top-5** 를 요약한다 (`--json` 은 `asset_phase`/`asset_chunked`/`asset_timings` 이벤트로, 사람용 요약은 미출력) | | `kebab ingest-file ` | 단일 파일 ingest (workspace 외부 가능 — `_external/` 로 deterministic copy) | | `kebab ingest-stdin --title ` | stdin 의 markdown 본문 ingest | -| `kebab search --mode {lexical,vector,hybrid} "" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 필터/budget flag 는 `--help` | +| `kebab search --mode {lexical,vector,hybrid} "" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 출처 필터 `--source ` (`[[workspace.sources]]` id) · `--source-type {markdown,note,paper,reference,inbox}` (둘 다 repeatable/comma-sep, OR). 그 외 필터/budget flag 는 `--help` | | `kebab ask "" [flags]` | RAG 답변 + 근거 인용 (Ollama 필요). `--session` (multi-turn) · `--stream` · `--multi-hop` | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `inspect chunk ` | raw record 보기 | @@ -108,6 +108,19 @@ Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go [workspace] root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경로 가능. # 상대 경로의 base 는 config.toml 위치 (cwd 무관). + # 단일 root 는 implicit `default` source 로 정규화된다. + +# 멀티소스 (선택) — 출처별로 검색을 좁히려면 root 대신 명명 source 를 선언한다. +# 각 source 의 id 가 모든 문서에 stamp 되고, `kebab search --source ` 로 필터. +# trust_level / source_type 은 frontmatter 가 없을 때의 source 기본값 +# (우선순위: frontmatter > source 기본값 > 하드코딩 Primary/Markdown). +# [[workspace.sources]] +# id = "notes" +# root = "~/KnowledgeBase/notes" +# [[workspace.sources]] +# id = "jira" +# root = "~/exports/jira" +# trust_level = "secondary" # 낮은 신뢰 출처 — `--trust-min primary` 로 배제 가능. [models.embedding] provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust) diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 4112478..8741c89 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -727,8 +727,7 @@ impl App { // Load (or create) the session header. let now_unix = SystemTime::now() .duration_since(UNIX_EPOCH) - .map(|d| d.as_secs() as i64) - .unwrap_or(0); + .map_or(0, |d| d.as_secs() as i64); let existing = self.sqlite.get_session(session_id)?; let prior_turns = match &existing { Some(_) => self.sqlite.list_turns(session_id)?, @@ -1111,7 +1110,7 @@ fn trim_to_chars(s: &str, n: usize) -> String { /// terminates early) rather than panic in the budget loop. fn estimate_chars(hits: &[SearchHit]) -> usize { hits.iter() - .map(|h| serde_json::to_string(h).map(|s| s.len()).unwrap_or(0)) + .map(|h| serde_json::to_string(h).map_or(0, |s| s.len())) .sum() } diff --git a/crates/kebab-app/src/bulk.rs b/crates/kebab-app/src/bulk.rs index 675d6f0..1491c7d 100644 --- a/crates/kebab-app/src/bulk.rs +++ b/crates/kebab-app/src/bulk.rs @@ -206,6 +206,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> { doc_id, repo: vec![], code_lang: vec![], + source_type: vec![], + source_id: vec![], }; let opts = SearchOpts { diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 8dcb638..d896c3e 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -49,7 +49,8 @@ use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset, - SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore, + SearchHit, SearchQuery, SourceScope, SourceType, SourceUri, TrustLevel, VectorRecord, + VectorStore, }; use kebab_llm_local::OllamaLanguageModel; use kebab_parse_image::{ @@ -304,7 +305,12 @@ pub fn ingest_with_config_opts( 0 }); - // Walk the workspace. + // Walk the workspace. `[[workspace.sources]]`: when the caller did not + // pin an explicit `scope.root` (the normal `kebab ingest` path), iterate + // over every configured source — each scanned with its own root + exclude + // and tagged with its `id` + default trust. When `scope.root` IS pinned + // (single-file ingest, `--root` override), scan that one root as the + // implicit `default` source — preserving pre-multi-source behavior. crate::ingest_progress::emit( progress, crate::ingest_progress::IngestEvent::ScanStarted { @@ -313,9 +319,50 @@ pub fn ingest_with_config_opts( ); let connector = FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?; - let (assets, fs_skips) = connector - .scan_with_skips(&scope) - .context("kb-app::ingest: scan workspace")?; + + // Per-source scan plan: (source_id, source_trust, scan_scope). + let scan_plan: Vec<(String, Option, SourceScope)> = + if scope.root.as_os_str().is_empty() && scope.include.is_empty() { + app.config + .resolved_sources() + .into_iter() + .map(|s| { + let scan_scope = SourceScope { + root: s.root, + include: scope.include.clone(), + exclude: s.exclude, + }; + (s.id, s.trust_level, scan_scope) + }) + .collect() + } else { + // Explicit-root / single-file / include-restricted ingest: one + // ad-hoc `default` source rooted at the pinned scope. + vec![( + kebab_config::DEFAULT_SOURCE_ID.to_string(), + None, + scope.clone(), + )] + }; + + // Accumulate assets across sources + a per-path lookup of which source + // (id + trust) each asset came from. workspace_path is unique per asset + // within a scan; on the rare overlap across sources, last-write-wins + // (sources should not share roots — a config smell, not enforced). + let mut assets: Vec = Vec::new(); + let mut source_by_path: std::collections::HashMap)> = + std::collections::HashMap::new(); + let mut fs_skips = kebab_source_fs::FsScanSkips::default(); + for (sid, strust, scan_scope) in &scan_plan { + let (src_assets, src_skips) = connector + .scan_with_skips(scan_scope) + .with_context(|| format!("kb-app::ingest: scan source `{sid}`"))?; + for a in &src_assets { + source_by_path.insert(a.workspace_path.0.clone(), (sid.clone(), *strust)); + } + assets.extend(src_assets); + fs_skips.merge(src_skips); + } crate::ingest_progress::emit( progress, crate::ingest_progress::IngestEvent::ScanCompleted { @@ -468,6 +515,14 @@ pub fn ingest_with_config_opts( media: crate::ingest_progress::media_label(&asset.media_type).to_string(), }, ); + // `[[workspace.sources]]`: resolve which source this asset came from. + // Missing only if an asset slipped in outside the scan plan (defensive + // — fall back to the implicit `default` source). + let (source_id, source_trust) = source_by_path + .get(&asset.workspace_path.0) + .map_or((kebab_config::DEFAULT_SOURCE_ID, None), |(id, trust)| { + (id.as_str(), *trust) + }); let item = ingest_one_asset( &app, &asset, @@ -478,6 +533,8 @@ pub fn ingest_with_config_opts( embedder.as_ref(), vector_store.as_ref(), &existing_doc_ids, + source_id, + source_trust, &image_pipeline, force_reingest, pdf_ocr_engine.as_deref(), @@ -738,8 +795,8 @@ pub fn ingest_with_config_opts( if let Ok(mut w) = lw.lock() { let run_id = w.run_id().to_string(); let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default(); - let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0); - let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0); + let pages = ocr_pages_cnt.lock().map_or(0, |v| *v); + let failures = ocr_failures_cnt.lock().map_or(0, |v| *v); let summary = crate::ingest_log::IngestSummary::new( crate::ingest_log::now_ts(), run_id, @@ -1173,6 +1230,11 @@ fn ingest_one_asset( embedder: Option<&Arc>, vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, + // `[[workspace.sources]]`: id of the source this asset belongs to (stamped + // onto `documents.source_id`) + that source's default trust level + // (markdown frontmatter overrides it). + source_id: &str, + source_trust: Option, image_pipeline: &ImagePipeline<'_>, force_reingest: bool, pdf_ocr_engine: Option<&dyn OcrEngine>, @@ -1206,6 +1268,7 @@ fn ingest_one_asset( embedder, vector_store, existing_doc_ids, + source_id, image_pipeline, force_reingest, progress, @@ -1221,6 +1284,7 @@ fn ingest_one_asset( embedder, vector_store, existing_doc_ids, + source_id, force_reingest, pdf_ocr_engine, progress, @@ -1263,6 +1327,7 @@ fn ingest_one_asset( existing_doc_ids, force_reingest, lang.as_str(), + source_id, ); } // p10-1A-2: non-Rust Code, Audio, and Other are not yet wired; @@ -1338,7 +1403,7 @@ fn ingest_one_asset( let bytes = std::fs::read(&path) .with_context(|| format!("read asset bytes from {}", path.display()))?; - let body_hints = build_body_hints(asset); + let body_hints = build_body_hints(asset, Some(source_id), source_trust); // Frontmatter — `parse_frontmatter` returns Ok even on malformed // frontmatter (warnings are surfaced through the `Vec`). @@ -1572,6 +1637,7 @@ fn ingest_one_image_asset( embedder: Option<&Arc>, vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, + source_id: &str, image_pipeline: &ImagePipeline<'_>, force_reingest: bool, progress: Option<&std::sync::mpsc::Sender>, @@ -1646,6 +1712,9 @@ fn ingest_one_image_asset( // `image-meta-v1`, which already fixed doc_id). Skip compare + stored // field must agree for next-run detection. canonical.parser_version = eff_parser_version.clone(); + // `[[workspace.sources]]`: stamp the owning source id (image extractor + // leaves it None). + canonical.metadata.source_id = Some(source_id.to_string()); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); // 2 + 3. Apply OCR / caption when their adapters exist. Both are @@ -2157,6 +2226,7 @@ fn ingest_one_pdf_asset( embedder: Option<&Arc>, vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, + source_id: &str, force_reingest: bool, pdf_ocr_engine: Option<&dyn OcrEngine>, progress: Option<&std::sync::mpsc::Sender>, @@ -2224,6 +2294,9 @@ fn ingest_one_pdf_asset( // v0.26.2: store the composite parser_version (base `pdf-text-v1` already // fixed doc_id) so the next run's skip compare matches. canonical.parser_version = eff_parser_version.clone(); + // `[[workspace.sources]]`: stamp the owning source id (pdf extractor + // leaves it None). + canonical.metadata.source_id = Some(source_id.to_string()); let parse_ms = u64::try_from(t_parse.elapsed().as_millis()).unwrap_or(u64::MAX); // v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry @@ -2523,6 +2596,7 @@ fn ingest_one_code_asset( existing_doc_ids: &std::collections::HashSet, force_reingest: bool, code_lang: &str, // <-- NEW (p10-1b Task D) + source_id: &str, ) -> anyhow::Result { let path = match &asset.source_uri { SourceUri::File(p) => p.clone(), @@ -2679,6 +2753,11 @@ fn ingest_one_code_asset( } }; + // `[[workspace.sources]]`: stamp the owning source id on the synthesized / + // extracted code doc (covers both Tier 1 extract_for and Tier 2/3 + // synthesize paths — neither knows the source id). + canonical.metadata.source_id = Some(source_id.to_string()); + // p10-1b Task D/G/J/L: chunker per-lang. // p10-3: track whether the extract stage already fell back to Tier 3. // Tier 2 langs already have "none-v1" parser_version normally, so exclude them @@ -2898,7 +2977,7 @@ fn synthesize_tier2_document( use anyhow::Context as _; use kebab_core::{ BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent, - ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc, + ProvenanceKind, SourceSpan, id_for_block, id_for_doc, }; let text = std::str::from_utf8(bytes) @@ -2986,6 +3065,10 @@ fn synthesize_tier2_document( git_branch, git_commit, code_lang: Some(code_lang.to_string()), + // `[[workspace.sources]]`: stamped by the caller + // (`ingest_one_code_asset`) post-build so Tier 1 (extract_for) and + // Tier 2/3 (this synthesizer) share one code path. + source_id: None, }; tracing::debug!( @@ -3044,12 +3127,20 @@ fn count_lines_in(bytes: &[u8]) -> u32 { /// overhead for large workspaces and the source-of-truth timestamps /// are written into the document's frontmatter when the user wants /// authoritative values. -fn build_body_hints(asset: &RawAsset) -> BodyHints { +fn build_body_hints( + asset: &RawAsset, + source_id: Option<&str>, + source_trust: Option, +) -> BodyHints { BodyHints { first_h1: None, fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: None, + // `[[workspace.sources]]`: stamp the owning source id + inject the + // per-source default trust level (frontmatter still overrides it). + source_id: source_id.map(str::to_string), + fallback_trust_level: source_trust, } } diff --git a/crates/kebab-app/src/reset.rs b/crates/kebab-app/src/reset.rs index 87f8b4a..831979b 100644 --- a/crates/kebab-app/src/reset.rs +++ b/crates/kebab-app/src/reset.rs @@ -114,7 +114,7 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 { if ft.is_dir() { total += walk(&e.path()); } else if ft.is_file() { - total += e.metadata().map(|m| m.len()).unwrap_or(0); + total += e.metadata().map_or(0, |m| m.len()); } } total diff --git a/crates/kebab-app/tests/common/mod.rs b/crates/kebab-app/tests/common/mod.rs index 56f4dc5..ec8b588 100644 --- a/crates/kebab-app/tests/common/mod.rs +++ b/crates/kebab-app/tests/common/mod.rs @@ -51,7 +51,7 @@ impl TestEnv { std::fs::create_dir_all(&model_dir).unwrap(); let mut config = Config::defaults(); - config.workspace.root = workspace_root.to_string_lossy().into_owned(); + config.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); // Drop the ".obsidian" / "node_modules" excludes — they bring // in nothing useful for fixtures and just hide debugging. config.workspace.exclude.clear(); diff --git a/crates/kebab-app/tests/ingest_file.rs b/crates/kebab-app/tests/ingest_file.rs index f87a9f3..60552a9 100644 --- a/crates/kebab-app/tests/ingest_file.rs +++ b/crates/kebab-app/tests/ingest_file.rs @@ -14,7 +14,7 @@ fn ingest_file_copies_external_md_and_reports_new() { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; @@ -53,7 +53,7 @@ fn ingest_file_idempotent_on_second_call() { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; @@ -78,7 +78,7 @@ fn ingest_file_errors_on_missing_path() { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; @@ -97,7 +97,7 @@ fn ingest_file_errors_on_unsupported_extension() { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-app/tests/ingest_log_smoke.rs b/crates/kebab-app/tests/ingest_log_smoke.rs index d4454b6..a843764 100644 --- a/crates/kebab-app/tests/ingest_log_smoke.rs +++ b/crates/kebab-app/tests/ingest_log_smoke.rs @@ -17,7 +17,7 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con std::fs::create_dir_all(&model_dir).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = model_dir.to_string_lossy().into_owned(); @@ -130,7 +130,7 @@ fn ingest_log_disabled_emits_no_file() { std::fs::create_dir_all(&model_dir).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = model_dir.to_string_lossy().into_owned(); diff --git a/crates/kebab-app/tests/ingest_progress.rs b/crates/kebab-app/tests/ingest_progress.rs index a2f83ea..294324e 100644 --- a/crates/kebab-app/tests/ingest_progress.rs +++ b/crates/kebab-app/tests/ingest_progress.rs @@ -192,7 +192,7 @@ fn pdf_ocr_progress_emits_started_finished_events() { std::fs::create_dir_all(&data_dir).expect("create data dir"); let mut config = kebab_config::Config::defaults(); - config.workspace.root = workspace.to_string_lossy().into_owned(); + config.workspace.root = Some(workspace.to_string_lossy().into_owned()); config.storage.data_dir = data_dir.to_string_lossy().into_owned(); config.models.embedding.provider = "none".to_string(); config.models.embedding.dimensions = 0; diff --git a/crates/kebab-app/tests/ingest_stdin.rs b/crates/kebab-app/tests/ingest_stdin.rs index 3e2e478..3e2eabf 100644 --- a/crates/kebab-app/tests/ingest_stdin.rs +++ b/crates/kebab-app/tests/ingest_stdin.rs @@ -12,7 +12,7 @@ fn fresh_cfg(dir: &std::path::Path) -> Config { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; @@ -34,7 +34,7 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() { assert_eq!(report.new, 1, "{report:?}"); // _external/ contains exactly one .md file with frontmatter. - let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external"); + let ext_dir = cfg.resolve_workspace_root().join("_external"); let entries: Vec<_> = fs::read_dir(&ext_dir) .unwrap() .filter_map(std::result::Result::ok) @@ -56,7 +56,7 @@ fn ingest_stdin_without_source_uri() { kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap(); assert_eq!(report.new, 1); - let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external"); + let ext_dir = cfg.resolve_workspace_root().join("_external"); let entries: Vec<_> = fs::read_dir(&ext_dir) .unwrap() .filter_map(std::result::Result::ok) diff --git a/crates/kebab-app/tests/schema_active_versions.rs b/crates/kebab-app/tests/schema_active_versions.rs index 6b4f65f..e9f1582 100644 --- a/crates/kebab-app/tests/schema_active_versions.rs +++ b/crates/kebab-app/tests/schema_active_versions.rs @@ -6,7 +6,7 @@ use kebab_core::SourceScope; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut cfg = Config::defaults(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); diff --git a/crates/kebab-app/tests/schema_report.rs b/crates/kebab-app/tests/schema_report.rs index 3f99a3a..9320576 100644 --- a/crates/kebab-app/tests/schema_report.rs +++ b/crates/kebab-app/tests/schema_report.rs @@ -8,7 +8,7 @@ use kebab_core::SourceScope; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut config = Config::defaults(); - config.workspace.root = workspace_root.to_string_lossy().into_owned(); + config.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); config.workspace.exclude.clear(); config.storage.data_dir = data_dir.to_string_lossy().into_owned(); config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); diff --git a/crates/kebab-app/tests/skip_reason.rs b/crates/kebab-app/tests/skip_reason.rs index 3db5613..9fba7e5 100644 --- a/crates/kebab-app/tests/skip_reason.rs +++ b/crates/kebab-app/tests/skip_reason.rs @@ -9,7 +9,7 @@ use common::TestEnv; #[test] fn unsupported_extension_skip_carries_warning_and_is_aggregated() { let env = TestEnv::lexical_only(); - let workspace_root = std::path::PathBuf::from(&env.config.workspace.root); + let workspace_root = env.config.resolve_workspace_root(); std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap(); std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap(); diff --git a/crates/kebab-chunk/src/code_c_ast_v1.rs b/crates/kebab-chunk/src/code_c_ast_v1.rs index 642f9d3..725791d 100644 --- a/crates/kebab-chunk/src/code_c_ast_v1.rs +++ b/crates/kebab-chunk/src/code_c_ast_v1.rs @@ -242,6 +242,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("c".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_cpp_ast_v1.rs b/crates/kebab-chunk/src/code_cpp_ast_v1.rs index f9ca1a1..d3f852e 100644 --- a/crates/kebab-chunk/src/code_cpp_ast_v1.rs +++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_go_ast_v1.rs b/crates/kebab-chunk/src/code_go_ast_v1.rs index 22e9310..35d6651 100644 --- a/crates/kebab-chunk/src/code_go_ast_v1.rs +++ b/crates/kebab-chunk/src/code_go_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("go".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_java_ast_v1.rs b/crates/kebab-chunk/src/code_java_ast_v1.rs index 07e0ab8..6dba47e 100644 --- a/crates/kebab-chunk/src/code_java_ast_v1.rs +++ b/crates/kebab-chunk/src/code_java_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("java".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_js_ast_v1.rs b/crates/kebab-chunk/src/code_js_ast_v1.rs index 8ae1fc5..8a3d8d5 100644 --- a/crates/kebab-chunk/src/code_js_ast_v1.rs +++ b/crates/kebab-chunk/src/code_js_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs index 1c1a386..10ec314 100644 --- a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs +++ b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_python_ast_v1.rs b/crates/kebab-chunk/src/code_python_ast_v1.rs index ac62678..37b5757 100644 --- a/crates/kebab-chunk/src/code_python_ast_v1.rs +++ b/crates/kebab-chunk/src/code_python_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("python".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_rust_ast_v1.rs b/crates/kebab-chunk/src/code_rust_ast_v1.rs index 365ed87..3018515 100644 --- a/crates/kebab-chunk/src/code_rust_ast_v1.rs +++ b/crates/kebab-chunk/src/code_rust_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/code_ts_ast_v1.rs b/crates/kebab-chunk/src/code_ts_ast_v1.rs index 42dd4ac..b001bbc 100644 --- a/crates/kebab-chunk/src/code_ts_ast_v1.rs +++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs @@ -244,6 +244,7 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs index 0265d1f..ae19ad9 100644 --- a/crates/kebab-chunk/src/md_heading_v1.rs +++ b/crates/kebab-chunk/src/md_heading_v1.rs @@ -450,6 +450,7 @@ mod tests { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: kebab_core::ParserVersion("test-parser-0".into()), diff --git a/crates/kebab-chunk/src/pdf_page_v1.rs b/crates/kebab-chunk/src/pdf_page_v1.rs index e615163..dfcdac9 100644 --- a/crates/kebab-chunk/src/pdf_page_v1.rs +++ b/crates/kebab-chunk/src/pdf_page_v1.rs @@ -355,6 +355,7 @@ mod tests { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version, @@ -533,6 +534,7 @@ mod tests { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version, diff --git a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs index e403ac9..fabb228 100644 --- a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs @@ -111,6 +111,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("c".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs index d7ce320..9f9ef83 100644 --- a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs @@ -118,6 +118,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_go_ast_snapshot.rs b/crates/kebab-chunk/tests/code_go_ast_snapshot.rs index a6be7b8..f53255e 100644 --- a/crates/kebab-chunk/tests/code_go_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_go_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("go".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_java_ast_snapshot.rs b/crates/kebab-chunk/tests/code_java_ast_snapshot.rs index 42a1ea9..daa883e 100644 --- a/crates/kebab-chunk/tests/code_java_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_java_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("java".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs index 6171827..c199a9e 100644 --- a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs b/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs index ede55c1..f51a96a 100644 --- a/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs index c4d7642..112978f 100644 --- a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("python".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs b/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs index af85e66..0aa00da 100644 --- a/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_text_paragraph_v1.rs b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs index a3ef17a..4cf8ec2 100644 --- a/crates/kebab-chunk/tests/code_text_paragraph_v1.rs +++ b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs @@ -68,6 +68,7 @@ fn text_doc(lang: &str, text: &str) -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some(lang.into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs index 0eedcea..3fae5e8 100644 --- a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs @@ -136,6 +136,7 @@ fn fixed_doc() -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/dockerfile_file_v1.rs b/crates/kebab-chunk/tests/dockerfile_file_v1.rs index f0a1a5d..92a267b 100644 --- a/crates/kebab-chunk/tests/dockerfile_file_v1.rs +++ b/crates/kebab-chunk/tests/dockerfile_file_v1.rs @@ -67,6 +67,7 @@ fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("dockerfile".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs index 4682821..a3b981f 100644 --- a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs +++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs @@ -67,6 +67,7 @@ fn yaml_doc(yaml_text: &str) -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some("yaml".into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-chunk/tests/long_section_snapshot.rs b/crates/kebab-chunk/tests/long_section_snapshot.rs index ceb48b9..73cdbda 100644 --- a/crates/kebab-chunk/tests/long_section_snapshot.rs +++ b/crates/kebab-chunk/tests/long_section_snapshot.rs @@ -58,6 +58,8 @@ fn long_section_chunks_snapshot() { fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), + source_id: None, + fallback_trust_level: None, }; let (metadata, fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).expect("frontmatter parses"); @@ -133,6 +135,8 @@ fn long_section_chunks_are_deterministic() { fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), + source_id: None, + fallback_trust_level: None, }; let policy = ChunkPolicy { diff --git a/crates/kebab-chunk/tests/manifest_file_v1.rs b/crates/kebab-chunk/tests/manifest_file_v1.rs index c2bf1cc..98a437d 100644 --- a/crates/kebab-chunk/tests/manifest_file_v1.rs +++ b/crates/kebab-chunk/tests/manifest_file_v1.rs @@ -67,6 +67,7 @@ fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument { git_branch: Some("main".into()), git_commit: Some("0".repeat(40)), code_lang: Some(lang.into()), + source_id: None, }, provenance: Provenance { events: vec![] }, parser_version: pv, diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 5235c25..bb93c31 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -193,6 +193,31 @@ enum Cmd { )] code_lang: Vec, + /// Phase-2: filter by document source_type + /// (`markdown`, `note`, `paper`, `reference`, `inbox`). + /// Repeatable or comma-separated. Empty = no filter. + /// The clean source/provenance lever for mixed-source KBs. + #[arg( + long = "source-type", + value_name = "TYPE", + num_args = 1, + value_delimiter = ',' + )] + source_type: Vec, + + /// [[workspace.sources]]: filter by source id — the `id` of the + /// `[[workspace.sources]]` entry a document was ingested from + /// (e.g. `default`, `notes`, `code`). Repeatable or + /// comma-separated. Empty = no filter. The named-source + /// provenance lever for multi-source KBs. + #[arg( + long = "source", + value_name = "ID", + num_args = 1, + value_delimiter = ',' + )] + source: Vec, + /// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate /// lists + per-stage timing in the response. Bypasses cache /// (debug intent — fresh run guaranteed). Requires embeddings @@ -615,12 +640,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> { force_reingest, } => { let cfg = kebab_config::Config::load(cli.config.as_deref())?; - let scope = kebab_core::SourceScope { - root: root - .clone() - .unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)), - exclude: cfg.workspace.exclude.clone(), - ..Default::default() + // [[workspace.sources]]: when the user passes `--root ` we pin + // that single root (one ad-hoc `default` source). Otherwise we + // leave `scope.root` EMPTY so the app iterates every configured + // source (`config.resolved_sources()`); a bare empty scope.exclude + // is fine because each source carries its own merged exclude. + let scope = match root.clone() { + Some(r) => kebab_core::SourceScope { + root: r, + exclude: cfg.workspace.exclude.clone(), + ..Default::default() + }, + None => kebab_core::SourceScope::default(), }; // p9-fb-02: spawn the progress display on a background @@ -629,8 +660,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> { // call returns, the `Sender` drops and the display thread // sees `recv()` return Err — exits cleanly. let plain_env = std::env::var("KEBAB_PROGRESS") - .map(|v| v.eq_ignore_ascii_case("plain")) - .unwrap_or(false); + .is_ok_and(|v| v.eq_ignore_ascii_case("plain")); let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env); // Surface the active embedding backend/device on the terminal so the @@ -828,6 +858,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { doc_id, repo, code_lang, + source_type, + source, trace, bulk, } => { @@ -967,6 +999,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())), repo: repo.clone(), code_lang: code_lang.clone(), + source_type: source_type.clone(), + source_id: source.clone(), }; let q = kebab_core::SearchQuery { diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index b90b171..723488e 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -12,6 +12,12 @@ mod paths; pub mod migrate; pub use paths::{expand_path, expand_path_with_base}; +/// Implicit source id used when a single-root `[workspace]` config (no +/// `[[workspace.sources]]`) is normalized into the multi-source model, and +/// the `DEFAULT` value of the `documents.source_id` column. Kept in sync +/// with the migration default in `migrations/V0XX__documents_source_id.sql`. +pub const DEFAULT_SOURCE_ID: &str = "default"; + /// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다. /// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다. /// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고 @@ -88,8 +94,67 @@ pub struct Config { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct WorkspaceCfg { - pub root: String, + /// Single-root workspace (legacy / common case). `Option` so that a + /// config that declares only `[[workspace.sources]]` (no bare `root`) + /// parses — and, symmetrically, a legacy single-`root` config (no + /// `sources`) still parses unchanged. The load-time normalizer + /// ([`Config::normalize_sources`]) reconciles the two into a single + /// non-empty `sources` list (`id = "default"` synthesized from `root`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub root: Option, pub exclude: Vec, + /// `[[workspace.sources]]`: named multi-source declaration. When empty + /// and `root` is set, the load path normalizes to a single implicit + /// `default` source. Each entry stamps its `id` onto every document it + /// ingests and supplies per-source `trust_level` / `source_type` + /// defaults (frontmatter still wins per the §0 Q9 derive table). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub sources: Vec, +} + +/// One named source under `[[workspace.sources]]`. +/// +/// `trust_level` / `source_type` are the **source-level defaults**: they +/// apply when a document's frontmatter does not specify the field. The +/// precedence is `frontmatter > source default > hardcoded` +/// (`TrustLevel::Primary` / `SourceType::Markdown`) — implemented in the +/// markdown derive via `BodyHints::fallback_trust_level`. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SourceCfg { + /// Stable identifier stamped onto `documents.source_id` for every + /// document ingested from this source. Must be unique and non-empty + /// across the workspace (enforced in [`Config::validate`]). + pub id: String, + /// Root directory to walk for this source. Accepts the same + /// absolute / `~` / `${VAR}` / relative(=config-dir-based) forms as + /// the legacy `workspace.root`. + pub root: String, + /// Per-source denylist globs, merged on top of `workspace.exclude`. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub exclude: Vec, + /// Per-source default `trust_level` (frontmatter overrides it). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub trust_level: Option, + /// Per-source default `source_type` (frontmatter overrides it). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_type: Option, +} + +/// A source with its `root` resolved to an absolute path and its `exclude` +/// merged with `workspace.exclude`. Produced by [`Config::resolved_sources`] +/// — the single entry point the ingest pipeline iterates over. +#[derive(Clone, Debug, PartialEq)] +pub struct ResolvedSource { + /// Stamped onto `documents.source_id`. + pub id: String, + /// Absolute walk root (tilde / `${VAR}` / relative-to-config resolved). + pub root: PathBuf, + /// `workspace.exclude` ∪ per-source `exclude`. + pub exclude: Vec, + /// Per-source default trust level (None → fall back to `Primary`). + pub trust_level: Option, + /// Per-source default source type (None → fall back to `Markdown`). + pub source_type: Option, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -782,12 +847,13 @@ impl Config { Self { schema_version: crate::migrate::CURRENT_SCHEMA_VERSION, workspace: WorkspaceCfg { - root: "~/KnowledgeBase".to_string(), + root: Some("~/KnowledgeBase".to_string()), exclude: vec![ ".git/**".to_string(), "node_modules/**".to_string(), ".obsidian/**".to_string(), ], + sources: vec![], }, storage: StorageCfg { data_dir: "${XDG_DATA_HOME:-~/.local/share}/kebab".to_string(), @@ -906,7 +972,78 @@ impl Config { PathBuf::from(".") }) }); - paths::expand_path_with_base(&self.workspace.root, "", &base) + paths::expand_path_with_base(&self.primary_root_raw(), "", &base) + } + + /// The raw (unexpanded) string for the *primary* workspace root, used by + /// [`resolve_workspace_root`](Self::resolve_workspace_root) and any + /// single-root code path. Order: first `[[workspace.sources]]` entry's + /// `root` → bare `workspace.root` → `~/KnowledgeBase` default. This keeps + /// every pre-existing single-root call site working when only `sources` + /// is declared. + fn primary_root_raw(&self) -> String { + if let Some(s) = self.workspace.sources.first() { + return s.root.clone(); + } + self.workspace + .root + .clone() + .unwrap_or_else(|| "~/KnowledgeBase".to_string()) + } + + /// The base directory for resolving relative source roots: the config + /// file's directory when loaded from disk, else the current dir (mirrors + /// [`resolve_workspace_root`](Self::resolve_workspace_root)). + fn root_resolution_base(&self) -> PathBuf { + self.source_dir.clone().unwrap_or_else(|| { + std::env::current_dir().unwrap_or_else(|e| { + tracing::warn!( + target: "kebab-config", + error = %e, + "current_dir() failed; falling back to '.' for source root resolution" + ); + PathBuf::from(".") + }) + }) + } + + /// Normalized, resolved list of sources to ingest. Always non-empty: + /// + /// - If `[[workspace.sources]]` is declared, each entry is returned with + /// its `root` expanded and `exclude` merged with `workspace.exclude`. + /// - Otherwise a single implicit source `id = "default"` is synthesized + /// from `workspace.root` (the legacy single-root path). + /// + /// This is the single entry point the ingest pipeline iterates over, so + /// single-root and multi-source configs share one code path. + pub fn resolved_sources(&self) -> Vec { + let base = self.root_resolution_base(); + if self.workspace.sources.is_empty() { + let root = paths::expand_path_with_base(&self.primary_root_raw(), "", &base); + return vec![ResolvedSource { + id: DEFAULT_SOURCE_ID.to_string(), + root, + exclude: self.workspace.exclude.clone(), + trust_level: None, + source_type: None, + }]; + } + self.workspace + .sources + .iter() + .map(|s| { + let root = paths::expand_path_with_base(&s.root, "", &base); + let mut exclude = self.workspace.exclude.clone(); + exclude.extend(s.exclude.iter().cloned()); + ResolvedSource { + id: s.id.clone(), + root, + exclude, + trust_level: s.trust_level, + source_type: s.source_type, + } + }) + .collect() } /// Read config from disk and merge env overrides on top of it. If the @@ -1019,10 +1156,41 @@ impl Config { cause: format!("parse_failed: {e}"), }) })?; + cfg.validate_sources().map_err(|cause| { + anyhow::Error::new(ConfigInvalid { + path: path.to_path_buf(), + cause, + }) + })?; cfg.source_dir = path.parent().map(Path::to_path_buf); Ok(cfg) } + /// Validate `[[workspace.sources]]`: every `id` must be non-empty and + /// unique across the workspace. Empty `sources` (legacy single-root) is + /// always valid. Returns the failure cause string for `ConfigInvalid`. + fn validate_sources(&self) -> Result<(), String> { + let mut seen = std::collections::HashSet::new(); + for s in &self.workspace.sources { + if s.id.trim().is_empty() { + return Err("workspace.sources: an entry has an empty `id`".to_string()); + } + if s.root.trim().is_empty() { + return Err(format!( + "workspace.sources: source `{}` has an empty `root`", + s.id + )); + } + if !seen.insert(s.id.as_str()) { + return Err(format!( + "workspace.sources: duplicate source id `{}` (ids must be unique)", + s.id + )); + } + } + Ok(()) + } + /// Apply `KEBAB_
_` env overrides. Unknown keys are ignored. /// /// The mapping is an explicit grep-friendly whitelist — one match arm @@ -1037,7 +1205,7 @@ impl Config { } match k.as_str() { // workspace - "KEBAB_WORKSPACE_ROOT" => self.workspace.root = v.clone(), + "KEBAB_WORKSPACE_ROOT" => self.workspace.root = Some(v.clone()), // storage "KEBAB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(), @@ -2034,7 +2202,7 @@ max_context_tokens = 8000 #[test] fn legacy_include_field_is_ignored_silently() { let mut cfg = Config::defaults(); - cfg.workspace.root = "/tmp/kebab-legacy".to_string(); + cfg.workspace.root = Some("/tmp/kebab-legacy".to_string()); let mut toml_text = toml::to_string(&cfg).expect("default round-trips"); // Inject a legacy `include = [...]` line into the [workspace] block. toml_text = toml_text.replace( @@ -2048,20 +2216,105 @@ max_context_tokens = 8000 parsed.err() ); let cfg = parsed.unwrap(); - assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy"); + assert_eq!(cfg.workspace.root.as_deref(), Some("/tmp/kebab-legacy")); } /// p9-fb-25: `WorkspaceCfg` must NOT have an `include` field. /// Compile-time proof: exhaustive destructure. #[test] - fn workspace_cfg_has_only_root_and_exclude_fields() { + fn workspace_cfg_has_only_root_exclude_sources_fields() { let ws = Config::defaults().workspace; let WorkspaceCfg { root: _, exclude: _, + sources: _, } = &ws; } + #[test] + fn legacy_single_root_normalizes_to_default_source() { + // A single-root config (no [[workspace.sources]]) must resolve to + // exactly one source `id = "default"` rooted at workspace.root. + let mut cfg = Config::defaults(); + cfg.workspace.root = Some("/tmp/kb-notes".to_string()); + let resolved = cfg.resolved_sources(); + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].id, DEFAULT_SOURCE_ID); + assert_eq!(resolved[0].root, std::path::PathBuf::from("/tmp/kb-notes")); + assert_eq!(resolved[0].trust_level, None); + } + + #[test] + fn multi_source_config_resolves_each_with_merged_exclude() { + let mut cfg = Config::defaults(); + cfg.workspace.root = None; + cfg.workspace.exclude = vec![".git/**".to_string()]; + cfg.workspace.sources = vec![ + SourceCfg { + id: "notes".to_string(), + root: "/tmp/notes".to_string(), + exclude: vec![], + trust_level: Some(kebab_core::TrustLevel::Primary), + source_type: None, + }, + SourceCfg { + id: "refs".to_string(), + root: "/tmp/refs".to_string(), + exclude: vec!["draft/**".to_string()], + trust_level: Some(kebab_core::TrustLevel::Secondary), + source_type: Some(kebab_core::SourceType::Reference), + }, + ]; + // A multi-source config (no bare root) must round-trip through TOML. + let toml_text = toml::to_string(&cfg).expect("multi-source serializes"); + let cfg: Config = toml::from_str(&toml_text).expect("multi-source parses"); + cfg.validate_sources().expect("valid sources"); + let resolved = cfg.resolved_sources(); + assert_eq!(resolved.len(), 2); + assert_eq!(resolved[0].id, "notes"); + assert_eq!(resolved[0].root, std::path::PathBuf::from("/tmp/notes")); + assert_eq!(resolved[0].exclude, vec![".git/**".to_string()]); + assert_eq!(resolved[0].trust_level, Some(kebab_core::TrustLevel::Primary)); + assert_eq!(resolved[1].id, "refs"); + // workspace.exclude ∪ per-source exclude. + assert_eq!( + resolved[1].exclude, + vec![".git/**".to_string(), "draft/**".to_string()] + ); + assert_eq!( + resolved[1].source_type, + Some(kebab_core::SourceType::Reference) + ); + assert_eq!( + resolved[1].trust_level, + Some(kebab_core::TrustLevel::Secondary) + ); + } + + fn source_cfg(id: &str, root: &str) -> SourceCfg { + SourceCfg { + id: id.to_string(), + root: root.to_string(), + exclude: vec![], + trust_level: None, + source_type: None, + } + } + + #[test] + fn duplicate_source_ids_rejected() { + let mut cfg = Config::defaults(); + cfg.workspace.sources = vec![source_cfg("dup", "/a"), source_cfg("dup", "/b")]; + assert!(cfg.validate_sources().is_err(), "duplicate ids must fail"); + } + + #[test] + fn empty_source_id_rejected() { + let mut cfg = Config::defaults(); + cfg.workspace.sources = vec![source_cfg("", "/a")]; + assert!(cfg.validate_sources().is_err(), "empty id must fail"); + } + #[test] fn default_stale_threshold_is_30() { let c = Config::defaults(); diff --git a/crates/kebab-config/src/migrate.rs b/crates/kebab-config/src/migrate.rs index 10af302..7c1fbdd 100644 --- a/crates/kebab-config/src/migrate.rs +++ b/crates/kebab-config/src/migrate.rs @@ -9,7 +9,7 @@ use toml_edit::{DocumentMut, Item}; /// 현재 바이너리가 이해하는 config 스키마 버전. 마이그레이션 완료 시 /// 사용자 파일의 `schema_version` 을 이 값으로 stamp 한다. -pub const CURRENT_SCHEMA_VERSION: u32 = 3; +pub const CURRENT_SCHEMA_VERSION: u32 = 4; /// 한 번의 마이그레이션에서 발생한 개별 변경. #[derive(Clone, Debug, PartialEq, serde::Serialize)] @@ -68,6 +68,7 @@ const HEADER: &str = "\ fn section_comment(path: &str) -> Option<&'static str> { Some(match path { "workspace" => "# 색인 대상 워크스페이스.", + "workspace.sources" => "# named multi-source (각 source 의 id 가 documents.source_id 로 stamp).", "storage" => "# XDG 저장 경로(데이터/sqlite/벡터/에셋/모델).", "indexing" => "# 병렬도 + 파일시스템 watch.", "chunking" => "# 청크 크기·오버랩·heading 존중.", @@ -376,6 +377,39 @@ pub fn step_2_to_3(doc: &mut DocumentMut, changes: &mut Vec) { copy_image_paddle_to_pdf(doc); } +/// v3 → v4: 단일 `workspace.root` 를 `[[workspace.sources]]` 의 implicit +/// `default` source 로 미러링한다(`id = "default"`, `root = <기존 root>`). +/// 기존 `workspace.root` 키는 그대로 둔다 — `resolved_sources()` 가 sources +/// 가 있으면 그쪽을 우선하므로 무해하고, defaults reconcile 이 root 를 다시 +/// 추가하려 하지 않게 한다. 멱등: `[[workspace.sources]]` 가 이미 있으면 no-op. +pub fn step_3_to_4(doc: &mut DocumentMut, changes: &mut Vec) { + let Some(ws) = doc.get_mut("workspace").and_then(Item::as_table_mut) else { + return; + }; + // 이미 sources 가 선언돼 있으면(array-of-tables 든 inline 이든) 손대지 않음. + if ws.contains_key("sources") { + return; + } + // root 가 없으면 만들 게 없음(defaults 에는 항상 있지만 방어). + let Some(root_val) = ws.get("root").and_then(Item::as_str).map(str::to_string) else { + return; + }; + + let mut entry = toml_edit::Table::new(); + entry.insert("id", toml_edit::value("default")); + entry.insert("root", toml_edit::value(root_val)); + + let mut aot = toml_edit::ArrayOfTables::new(); + aot.push(entry); + ws.insert("sources", Item::ArrayOfTables(aot)); + + changes.push(MigrationChange { + kind: ChangeKind::AddedSection, + path: "workspace.sources".to_string(), + detail: "workspace.root → [[workspace.sources]] id=default".to_string(), + }); +} + /// 파일의 schema_version(없으면 1) 부터 CURRENT 까지 step 적용. fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec) { if from < 2 { @@ -384,6 +418,9 @@ fn run_steps(doc: &mut DocumentMut, from: u32, changes: &mut Vec, + + /// `[[workspace.sources]]`: id of the named source this document was + /// ingested from (the `id` of the matching `[[workspace.sources]]` + /// entry; `"default"` for single-root workspaces normalized to the + /// implicit `default` source). null on documents ingested before the + /// multi-source feature; the store column defaults to `"default"`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_id: Option, } #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] @@ -105,12 +113,14 @@ mod tests { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }; let v = serde_json::to_value(&m).unwrap(); assert!(v.get("repo").is_none()); assert!(v.get("git_branch").is_none()); assert!(v.get("git_commit").is_none()); assert!(v.get("code_lang").is_none()); + assert!(v.get("source_id").is_none()); } #[test] @@ -128,8 +138,10 @@ mod tests { git_branch: Some("main".into()), git_commit: Some("a".repeat(40)), code_lang: Some("rust".into()), + source_id: Some("notes".into()), }; let v = serde_json::to_value(&m).unwrap(); + assert_eq!(v["source_id"], "notes"); assert_eq!(v["repo"], "kebab"); assert_eq!(v["git_branch"], "main"); assert_eq!(v["git_commit"].as_str().unwrap().len(), 40); diff --git a/crates/kebab-core/src/search.rs b/crates/kebab-core/src/search.rs index 9297372..c10b1a8 100644 --- a/crates/kebab-core/src/search.rs +++ b/crates/kebab-core/src/search.rs @@ -69,6 +69,20 @@ pub struct SearchFilters { /// Unknown values produce empty hits (consistent with `media` policy). #[serde(default)] pub code_lang: Vec, + /// Phase-2 (jira-contamination experiment): filter by `documents.source_type` + /// (`markdown` | `note` | `paper` | `reference` | `inbox`). Empty = no filter; + /// multi-value = OR. Direct indexed column — the clean provenance/source lever: + /// filtering recovers concept-query precision without the see-saw of global + /// trust-weighting (see tasks/HOTFIXES.md A/B evidence). + #[serde(default)] + pub source_type: Vec, + /// `[[workspace.sources]]`: filter by `documents.source_id` (the `id` of + /// the `[[workspace.sources]]` entry a document was ingested from; e.g. + /// `default`, `notes`, `code`). Empty = no filter; multi-value = OR. + /// Direct indexed column (idx_docs_source_id) — the named-source + /// provenance lever for multi-source KBs. + #[serde(default)] + pub source_id: Vec, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs index 00e67be..fe1120d 100644 --- a/crates/kebab-mcp/src/tools/search.rs +++ b/crates/kebab-mcp/src/tools/search.rs @@ -107,6 +107,8 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult { doc_id: input.doc_id.clone().map(kebab_core::DocumentId), repo: vec![], code_lang: vec![], + source_type: vec![], + source_id: vec![], }; let query = kebab_core::SearchQuery { diff --git a/crates/kebab-mcp/tests/error_mapping.rs b/crates/kebab-mcp/tests/error_mapping.rs index 7763972..977d031 100644 --- a/crates/kebab-mcp/tests/error_mapping.rs +++ b/crates/kebab-mcp/tests/error_mapping.rs @@ -10,7 +10,7 @@ async fn schema_tool_emits_error_v1_when_db_missing() { let dir = tempfile::tempdir().unwrap(); let mut cfg = Config::defaults(); cfg.storage.data_dir = dir.path().to_string_lossy().into_owned(); - cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned(); + cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned()); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; // Note: NO ingest call — kebab.sqlite is absent → schema_with_config diff --git a/crates/kebab-mcp/tests/tools_call_ask.rs b/crates/kebab-mcp/tests/tools_call_ask.rs index 5a5ea0c..a30fcbe 100644 --- a/crates/kebab-mcp/tests/tools_call_ask.rs +++ b/crates/kebab-mcp/tests/tools_call_ask.rs @@ -10,7 +10,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs b/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs index c121cd2..0cbbc7f 100644 --- a/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs +++ b/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs @@ -27,7 +27,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_bulk_search.rs b/crates/kebab-mcp/tests/tools_call_bulk_search.rs index 348a439..87cf571 100644 --- a/crates/kebab-mcp/tests/tools_call_bulk_search.rs +++ b/crates/kebab-mcp/tests/tools_call_bulk_search.rs @@ -12,7 +12,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_doctor.rs b/crates/kebab-mcp/tests/tools_call_doctor.rs index a7eb53e..1d29651 100644 --- a/crates/kebab-mcp/tests/tools_call_doctor.rs +++ b/crates/kebab-mcp/tests/tools_call_doctor.rs @@ -9,10 +9,10 @@ async fn doctor_tool_returns_doctor_v1_json() { let dir = tempfile::tempdir().unwrap(); let mut cfg = Config::defaults(); cfg.storage.data_dir = dir.path().join("data").to_string_lossy().into_owned(); - cfg.workspace.root = dir.path().join("notes").to_string_lossy().into_owned(); + cfg.workspace.root = Some(dir.path().join("notes").to_string_lossy().into_owned()); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; - std::fs::create_dir_all(&cfg.workspace.root).unwrap(); + std::fs::create_dir_all(cfg.resolve_workspace_root()).unwrap(); // Pass None for config_path — doctor falls back to XDG default probe // (path won't exist in the tempdir, which is fine; doctor reports it diff --git a/crates/kebab-mcp/tests/tools_call_fetch.rs b/crates/kebab-mcp/tests/tools_call_fetch.rs index fff3b72..8b5c8c9 100644 --- a/crates/kebab-mcp/tests/tools_call_fetch.rs +++ b/crates/kebab-mcp/tests/tools_call_fetch.rs @@ -16,7 +16,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_ingest_file.rs b/crates/kebab-mcp/tests/tools_call_ingest_file.rs index 5afe75f..fe1feaa 100644 --- a/crates/kebab-mcp/tests/tools_call_ingest_file.rs +++ b/crates/kebab-mcp/tests/tools_call_ingest_file.rs @@ -15,7 +15,7 @@ async fn ingest_file_tool_returns_ingest_report_v1() { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; @@ -61,7 +61,7 @@ async fn ingest_file_tool_idempotent_on_second_call() { std::fs::create_dir_all(&data).unwrap(); let mut cfg = kebab_config::Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_ingest_stdin.rs b/crates/kebab-mcp/tests/tools_call_ingest_stdin.rs index 0e9967e..c9e521c 100644 --- a/crates/kebab-mcp/tests/tools_call_ingest_stdin.rs +++ b/crates/kebab-mcp/tests/tools_call_ingest_stdin.rs @@ -14,7 +14,7 @@ fn fresh_state(dir: &std::path::Path) -> KebabAppState { fs::create_dir_all(&data).unwrap(); let mut cfg = Config::defaults(); - cfg.workspace.root = workspace.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace.to_string_lossy().into_owned()); cfg.storage.data_dir = data.to_string_lossy().into_owned(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_schema.rs b/crates/kebab-mcp/tests/tools_call_schema.rs index 135bbc3..25fb776 100644 --- a/crates/kebab-mcp/tests/tools_call_schema.rs +++ b/crates/kebab-mcp/tests/tools_call_schema.rs @@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_search.rs b/crates/kebab-mcp/tests/tools_call_search.rs index 8ca7b55..0edc021 100644 --- a/crates/kebab-mcp/tests/tools_call_search.rs +++ b/crates/kebab-mcp/tests/tools_call_search.rs @@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-mcp/tests/tools_call_search_trace.rs b/crates/kebab-mcp/tests/tools_call_search_trace.rs index f1e5835..5be77ed 100644 --- a/crates/kebab-mcp/tests/tools_call_search_trace.rs +++ b/crates/kebab-mcp/tests/tools_call_search_trace.rs @@ -11,7 +11,7 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); - cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.root = Some(workspace_root.to_string_lossy().into_owned()); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); cfg.models.embedding.dimensions = 0; diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs index 5a834a2..abbf98d 100644 --- a/crates/kebab-parse-code/src/c.rs +++ b/crates/kebab-parse-code/src/c.rs @@ -131,6 +131,7 @@ impl Extractor for CAstExtractor { git_branch, git_commit, code_lang: Some("c".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/cpp.rs b/crates/kebab-parse-code/src/cpp.rs index 7a8f838..f66de22 100644 --- a/crates/kebab-parse-code/src/cpp.rs +++ b/crates/kebab-parse-code/src/cpp.rs @@ -155,6 +155,7 @@ impl Extractor for CppAstExtractor { git_branch, git_commit, code_lang: Some("cpp".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/go.rs b/crates/kebab-parse-code/src/go.rs index f6ac8cf..72cde9a 100644 --- a/crates/kebab-parse-code/src/go.rs +++ b/crates/kebab-parse-code/src/go.rs @@ -133,6 +133,7 @@ impl Extractor for GoAstExtractor { git_branch, git_commit, code_lang: Some("go".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/java.rs b/crates/kebab-parse-code/src/java.rs index be28326..79b0377 100644 --- a/crates/kebab-parse-code/src/java.rs +++ b/crates/kebab-parse-code/src/java.rs @@ -144,6 +144,7 @@ impl Extractor for JavaAstExtractor { git_branch, git_commit, code_lang: Some("java".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/javascript.rs b/crates/kebab-parse-code/src/javascript.rs index 321cdc8..c804a92 100644 --- a/crates/kebab-parse-code/src/javascript.rs +++ b/crates/kebab-parse-code/src/javascript.rs @@ -151,6 +151,7 @@ impl Extractor for JavascriptAstExtractor { git_branch, git_commit, code_lang: Some("javascript".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/kotlin.rs b/crates/kebab-parse-code/src/kotlin.rs index 1e95632..4b8fa12 100644 --- a/crates/kebab-parse-code/src/kotlin.rs +++ b/crates/kebab-parse-code/src/kotlin.rs @@ -149,6 +149,7 @@ impl Extractor for KotlinAstExtractor { git_branch, git_commit, code_lang: Some("kotlin".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/python.rs b/crates/kebab-parse-code/src/python.rs index 1c7956a..78a1512 100644 --- a/crates/kebab-parse-code/src/python.rs +++ b/crates/kebab-parse-code/src/python.rs @@ -133,6 +133,7 @@ impl Extractor for PythonAstExtractor { git_branch, git_commit, code_lang: Some("python".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/rust.rs b/crates/kebab-parse-code/src/rust.rs index d1302a0..39c4b5b 100644 --- a/crates/kebab-parse-code/src/rust.rs +++ b/crates/kebab-parse-code/src/rust.rs @@ -136,6 +136,7 @@ impl Extractor for RustAstExtractor { git_branch, git_commit, code_lang: Some("rust".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-code/src/typescript.rs b/crates/kebab-parse-code/src/typescript.rs index de07866..88aa281 100644 --- a/crates/kebab-parse-code/src/typescript.rs +++ b/crates/kebab-parse-code/src/typescript.rs @@ -144,6 +144,7 @@ impl Extractor for TypescriptAstExtractor { git_branch, git_commit, code_lang: Some("typescript".to_string()), + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index 422d010..b310a18 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -203,6 +203,7 @@ impl Extractor for ImageExtractor { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-parse-md/src/frontmatter.rs b/crates/kebab-parse-md/src/frontmatter.rs index 04b6efd..f743123 100644 --- a/crates/kebab-parse-md/src/frontmatter.rs +++ b/crates/kebab-parse-md/src/frontmatter.rs @@ -42,6 +42,16 @@ pub struct BodyHints { /// Optional language fallback used when neither frontmatter nor lingua /// detection produce a value. If `None` the final fallback is `"und"`. pub fallback_lang: Option, + /// `[[workspace.sources]]`: id of the source this document is being + /// ingested from. Copied verbatim into `Metadata.source_id` (frontmatter + /// does not override the source id — it is an ingest-time provenance + /// stamp, not a user-authored field). `None` when single-root / + /// unspecified, in which case `Metadata.source_id` stays `None`. + pub source_id: Option, + /// `[[workspace.sources]]`: per-source default `trust_level`. Consulted + /// only when the frontmatter does not specify `trust_level`. Precedence: + /// frontmatter > this source default > hardcoded `Primary`. + pub fallback_trust_level: Option, } /// Byte range of the frontmatter region inside the input slice. @@ -444,8 +454,12 @@ fn derive_metadata( }; // ---- trust_level ---- + // Precedence: frontmatter > per-source default (hints.fallback_trust_level) + // > hardcoded Primary. An *unknown* frontmatter value warns and also falls + // through to the source default (then Primary), so a typo doesn't silently + // promote past the source's intended trust. let trust_level = match raw.trust_level.as_deref() { - None => TrustLevel::Primary, + None => hints.fallback_trust_level.unwrap_or(TrustLevel::Primary), Some(s) => { if let Some(tl) = parse_trust_level(s) { tl @@ -454,7 +468,7 @@ fn derive_metadata( kind: WarningKind::MalformedFrontmatter, note: format!("unknown trust_level={s}, defaulted to primary"), }); - TrustLevel::Primary + hints.fallback_trust_level.unwrap_or(TrustLevel::Primary) } } }; @@ -477,6 +491,10 @@ fn derive_metadata( git_branch: None, git_commit: None, code_lang: None, + // `[[workspace.sources]]`: ingest-time provenance stamp. Frontmatter + // does not override the source id — it is supplied by the caller + // (kebab-app) from the matching source's config `id`. + source_id: hints.source_id.clone(), } } @@ -604,6 +622,8 @@ mod tests { fs_ctime: datetime!(2024-01-01 00:00:00 UTC), fs_mtime: datetime!(2024-01-02 00:00:00 UTC), fallback_lang: None, + source_id: None, + fallback_trust_level: None, } } @@ -695,6 +715,47 @@ source_type: alien\n\ assert!(warns.iter().any(|w| w.note.contains("source_type=alien"))); } + fn hints_with_source(id: &str, trust: Option) -> BodyHints { + BodyHints { + source_id: Some(id.to_string()), + fallback_trust_level: trust, + ..hints() + } + } + + #[test] + fn source_default_trust_applied_when_frontmatter_absent() { + // No `trust_level:` in frontmatter → the per-source default wins + // over the hardcoded Primary. + let md = b"---\ntitle: Doc\n---\nbody\n"; + let (meta, _span, warns) = + parse_frontmatter(md, &hints_with_source("notes", Some(TrustLevel::Secondary))) + .unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + assert_eq!(meta.trust_level, TrustLevel::Secondary); + assert_eq!(meta.source_id.as_deref(), Some("notes")); + } + + #[test] + fn frontmatter_trust_overrides_source_default() { + // Explicit frontmatter trust beats the per-source default. + let md = b"---\ntrust_level: generated\n---\nbody\n"; + let (meta, _span, _warns) = + parse_frontmatter(md, &hints_with_source("notes", Some(TrustLevel::Secondary))) + .unwrap(); + assert_eq!(meta.trust_level, TrustLevel::Generated); + assert_eq!(meta.source_id.as_deref(), Some("notes")); + } + + #[test] + fn no_source_id_leaves_metadata_source_id_none() { + let md = b"---\ntitle: Doc\n---\nbody\n"; + let (meta, _span, _warns) = parse_frontmatter(md, &hints()).unwrap(); + assert_eq!(meta.source_id, None); + // Without a source default, hardcoded Primary still applies. + assert_eq!(meta.trust_level, TrustLevel::Primary); + } + #[test] fn malformed_yaml_emits_warning_and_defaults() { // Unclosed quote → YAML parse fails. diff --git a/crates/kebab-parse-md/src/normalize.rs b/crates/kebab-parse-md/src/normalize.rs index 9adf4be..1141fa9 100644 --- a/crates/kebab-parse-md/src/normalize.rs +++ b/crates/kebab-parse-md/src/normalize.rs @@ -469,6 +469,7 @@ mod tests { git_branch: None, git_commit: None, code_lang: None, + source_id: None, } } diff --git a/crates/kebab-parse-md/tests/frontmatter_snapshots.rs b/crates/kebab-parse-md/tests/frontmatter_snapshots.rs index cf9c473..214ae6a 100644 --- a/crates/kebab-parse-md/tests/frontmatter_snapshots.rs +++ b/crates/kebab-parse-md/tests/frontmatter_snapshots.rs @@ -37,6 +37,8 @@ fn pinned_hints() -> BodyHints { fs_ctime: datetime!(2024-01-01 00:00:00 UTC), fs_mtime: datetime!(2024-01-02 00:00:00 UTC), fallback_lang: None, + source_id: None, + fallback_trust_level: None, } } diff --git a/crates/kebab-parse-md/tests/normalize_snapshot.rs b/crates/kebab-parse-md/tests/normalize_snapshot.rs index 3c84997..31b76e3 100644 --- a/crates/kebab-parse-md/tests/normalize_snapshot.rs +++ b/crates/kebab-parse-md/tests/normalize_snapshot.rs @@ -86,6 +86,8 @@ fn code_and_table_canonical_snapshot() { fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), + source_id: None, + fallback_trust_level: None, }; let (metadata, fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).expect("frontmatter parses"); diff --git a/crates/kebab-parse-pdf/src/lib.rs b/crates/kebab-parse-pdf/src/lib.rs index 0ecd6e2..630f048 100644 --- a/crates/kebab-parse-pdf/src/lib.rs +++ b/crates/kebab-parse-pdf/src/lib.rs @@ -203,6 +203,7 @@ impl Extractor for PdfTextExtractor { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }; tracing::debug!( diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 8101e5b..630e6a6 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -419,6 +419,31 @@ fn run_query( } } + // Phase-2: source_type filter (IN-list on the direct `documents.source_type` + // column). Empty Vec = no filter; multi-value = OR. Mirrors filters.rs. + if !filters.source_type.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.source_type.len()) + .collect::>() + .join(","); + sql.push_str(&format!(" AND d.source_type IN ({placeholders})")); + for st in &filters.source_type { + params.push(Box::new(st.clone())); + } + } + + // [[workspace.sources]]: source_id filter (IN-list on the direct + // `documents.source_id` column). Empty Vec = no filter; multi-value = OR. + // Mirrors filters.rs. + if !filters.source_id.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.source_id.len()) + .collect::>() + .join(","); + sql.push_str(&format!(" AND d.source_id IN ({placeholders})")); + for sid in &filters.source_id { + params.push(Box::new(sid.clone())); + } + } + // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per // fb-32 ingest path), so lexicographic >= compare is correct — but only diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index 7c6d2fb..293e6c8 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -231,6 +231,47 @@ pub struct FsScanSkips { pub events: Vec, } +impl FsScanSkips { + /// `[[workspace.sources]]`: fold another source's scan skips into `self`, + /// so a multi-source ingest reports aggregate counts. Counters add; + /// per-category sample vecs concatenate and re-cap at 5 (spec §5.5); + /// events concatenate. + pub fn merge(&mut self, other: FsScanSkips) { + self.skipped_gitignore = self.skipped_gitignore.saturating_add(other.skipped_gitignore); + self.skipped_kebabignore = self + .skipped_kebabignore + .saturating_add(other.skipped_kebabignore); + self.skipped_builtin_blacklist = self + .skipped_builtin_blacklist + .saturating_add(other.skipped_builtin_blacklist); + self.skipped_generated = self.skipped_generated.saturating_add(other.skipped_generated); + self.skipped_size_exceeded = self + .skipped_size_exceeded + .saturating_add(other.skipped_size_exceeded); + + fn merge_samples(dst: &mut Vec, src: Vec) { + for s in src { + if dst.len() >= 5 { + break; + } + dst.push(s); + } + } + merge_samples(&mut self.skip_examples.generated, other.skip_examples.generated); + merge_samples( + &mut self.skip_examples.size_exceeded, + other.skip_examples.size_exceeded, + ); + merge_samples( + &mut self.skip_examples.builtin_blacklist, + other.skip_examples.builtin_blacklist, + ); + merge_samples(&mut self.skip_examples.gitignore, other.skip_examples.gitignore); + + self.events.extend(other.events); + } +} + /// A single per-file skip event for structured ingest log (v0.20.x). #[derive(Debug)] pub struct FsSkipEvent { @@ -326,7 +367,7 @@ mod tests { fn cfg_with_root(root: &str) -> Config { let mut c = Config::defaults(); - c.workspace.root = root.to_string(); + c.workspace.root = Some(root.to_string()); c.workspace.exclude.clear(); c } diff --git a/crates/kebab-source-fs/tests/include_allowlist.rs b/crates/kebab-source-fs/tests/include_allowlist.rs index dacf4f8..524761a 100644 --- a/crates/kebab-source-fs/tests/include_allowlist.rs +++ b/crates/kebab-source-fs/tests/include_allowlist.rs @@ -20,7 +20,7 @@ use kebab_source_fs::FsSourceConnector; fn cfg_with_root(root: &str) -> Config { let mut c = Config::defaults(); - c.workspace.root = root.to_string(); + c.workspace.root = Some(root.to_string()); c.workspace.exclude.clear(); // Disable size / generated caps so small test files always pass. c.ingest.code.max_file_bytes = u64::MAX; diff --git a/crates/kebab-source-fs/tests/snapshot_tree1.rs b/crates/kebab-source-fs/tests/snapshot_tree1.rs index 08d89d3..5f4b9b4 100644 --- a/crates/kebab-source-fs/tests/snapshot_tree1.rs +++ b/crates/kebab-source-fs/tests/snapshot_tree1.rs @@ -50,7 +50,7 @@ fn baseline_path() -> PathBuf { fn cfg_for_fixture(root: &str) -> Config { let mut c = Config::defaults(); - c.workspace.root = root.to_string(); + c.workspace.root = Some(root.to_string()); // Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`) // so the snapshot is purely a function of the fixture + .kebabignore + // baked-in default-excludes. diff --git a/crates/kebab-source-fs/tests/symlink_cycle.rs b/crates/kebab-source-fs/tests/symlink_cycle.rs index 8250c26..c1b0d53 100644 --- a/crates/kebab-source-fs/tests/symlink_cycle.rs +++ b/crates/kebab-source-fs/tests/symlink_cycle.rs @@ -23,7 +23,7 @@ use kebab_source_fs::FsSourceConnector; fn cfg_with_root(root: &str) -> Config { let mut c = Config::defaults(); - c.workspace.root = root.to_string(); + c.workspace.root = Some(root.to_string()); c.workspace.exclude.clear(); c } diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index 904c4fb..365ecad 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -745,6 +745,14 @@ fn upsert_document( // `markdown` for the column). let source_type = source_type_label(&doc.metadata.source_type); let trust_level = trust_level_label(&doc.metadata.trust_level); + // `[[workspace.sources]]`: id of the source this doc came from. Falls back + // to the column default `"default"` for docs without an explicit source + // (single-root workspaces / pre-multi-source ingests). + let source_id = doc + .metadata + .source_id + .as_deref() + .unwrap_or(kebab_config::DEFAULT_SOURCE_ID); let created_at = doc .metadata .created_at @@ -757,11 +765,11 @@ fn upsert_document( tx.execute( "INSERT INTO documents ( doc_id, asset_id, workspace_path, title, lang, - source_type, trust_level, parser_version, + source_type, trust_level, source_id, parser_version, doc_version, schema_version, metadata_json, provenance_json, created_at, updated_at, last_chunker_version, last_embedding_version - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(doc_id) DO UPDATE SET asset_id = excluded.asset_id, workspace_path = excluded.workspace_path, @@ -769,6 +777,7 @@ fn upsert_document( lang = excluded.lang, source_type = excluded.source_type, trust_level = excluded.trust_level, + source_id = excluded.source_id, parser_version = excluded.parser_version, -- doc_version: bump on update. excluded.doc_version is the -- caller's submitted value; we ignore it and add 1 to the @@ -788,6 +797,7 @@ fn upsert_document( doc.lang.0, source_type, trust_level, + source_id, doc.parser_version.0, i64::from(doc.doc_version), i64::from(doc.schema_version), diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs index ff3b899..d6690c7 100644 --- a/crates/kebab-store-sqlite/src/filters.rs +++ b/crates/kebab-store-sqlite/src/filters.rs @@ -191,6 +191,31 @@ impl SqliteStore { } } + // Phase-2: source_type filter (IN-list on the direct `documents.source_type` + // column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR. + if !filters.source_type.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.source_type.len()) + .collect::>() + .join(","); + sql.push_str(&format!(" AND d.source_type IN ({placeholders})")); + for st in &filters.source_type { + bind.push(Box::new(st.clone())); + } + } + + // [[workspace.sources]]: source_id filter (IN-list on the direct + // `documents.source_id` column, idx_docs_source_id). Empty Vec = no + // filter; multi-value = OR. Mirrors the source_type filter above. + if !filters.source_id.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.source_id.len()) + .collect::>() + .join(","); + sql.push_str(&format!(" AND d.source_id IN ({placeholders})")); + for sid in &filters.source_id { + bind.push(Box::new(sid.clone())); + } + } + // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32); // lexicographic >= compare is correct — but only when the filter @@ -1000,6 +1025,121 @@ mod tests { ); } + /// [[workspace.sources]]: the `source_id` filter keeps only chunks whose + /// owning document's `documents.source_id` column is in the IN-list. + #[test] + fn filter_chunks_source_id_keeps_matching_source() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + let c2 = "22222222222222222222222222222222"; + let c3 = "33333333333333333333333333333333"; + // Three docs, each with a distinct source_id column value. + seed_with_source_id(&store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "notes"); + seed_with_source_id(&store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "code/b.rs", "code"); + seed_with_source_id( + &store, + c3, + "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "x.md", + "default", + ); + + // Single value. + let f = SearchFilters { + source_id: vec!["notes".to_string()], + ..Default::default() + }; + let out = store + .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) + .unwrap(); + assert_eq!(out, vec![cid(c1)], "only the `notes` source chunk survives"); + + // Multi-value OR. + let f = SearchFilters { + source_id: vec!["notes".to_string(), "code".to_string()], + ..Default::default() + }; + let out = store + .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) + .unwrap(); + assert_eq!(out, vec![cid(c1), cid(c2)], "notes OR code survive"); + + // Empty filter = no filtering. + let f = SearchFilters::default(); + let out = store + .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) + .unwrap(); + assert_eq!(out, vec![cid(c1), cid(c2), cid(c3)]); + } + + /// Seed one committed doc + chunk + embedding with an explicit + /// `documents.source_id` column value (the DEFAULT is `'default'`). + fn seed_with_source_id( + store: &SqliteStore, + chunk_id: &str, + doc_id: &str, + workspace_path: &str, + source_id: &str, + ) { + let asset_id = format!("a{}", &doc_id[..31]); + { + let conn = store.lock_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?, + '1970-01-01T00:00:00Z')", + params![ + asset_id, + format!("file://{workspace_path}"), + workspace_path, + workspace_path, + workspace_path, + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, source_id, parser_version, doc_version, + schema_version, metadata_json, provenance_json, + created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1', + 1, 1, '{}', '{}', '1970-01-01T00:00:00Z', + '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, workspace_path, source_id], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]', + '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id], + ) + .unwrap(); + } + let embed_row = EmbeddingRecordRow { + embedding_id: format!("e{}", &chunk_id[..31]), + chunk_id: chunk_id.to_string(), + model_id: "m".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "t".to_string(), + created_at: OffsetDateTime::UNIX_EPOCH, + }; + store + .put_embedding_records_pending(std::slice::from_ref(&embed_row)) + .unwrap(); + store + .mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id)) + .unwrap(); + } + #[test] fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() { // Regression test for the non-UTC offset lex-compare bug. diff --git a/crates/kebab-store-sqlite/src/stats_ext.rs b/crates/kebab-store-sqlite/src/stats_ext.rs index 8cd8ce5..33ec6b9 100644 --- a/crates/kebab-store-sqlite/src/stats_ext.rs +++ b/crates/kebab-store-sqlite/src/stats_ext.rs @@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result
std::io::Result { fn file_size_or_zero(p: &Path) -> u64 { - std::fs::metadata(p).map(|m| m.len()).unwrap_or(0) + std::fs::metadata(p).map_or(0, |m| m.len()) } fn dir_walk_sum(p: &Path) -> std::io::Result { if !p.exists() { diff --git a/crates/kebab-store-sqlite/tests/contract_roundtrip.rs b/crates/kebab-store-sqlite/tests/contract_roundtrip.rs index bfa6f6f..dc40910 100644 --- a/crates/kebab-store-sqlite/tests/contract_roundtrip.rs +++ b/crates/kebab-store-sqlite/tests/contract_roundtrip.rs @@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() { fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), + source_id: None, + fallback_trust_level: None, }; let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap(); let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap(); diff --git a/crates/kebab-store-sqlite/tests/idempotency.rs b/crates/kebab-store-sqlite/tests/idempotency.rs index 1171c0a..b57fe46 100644 --- a/crates/kebab-store-sqlite/tests/idempotency.rs +++ b/crates/kebab-store-sqlite/tests/idempotency.rs @@ -45,6 +45,7 @@ fn make_metadata() -> Metadata { git_branch: None, git_commit: None, code_lang: None, + source_id: None, } } diff --git a/crates/kebab-store-sqlite/tests/incremental_ingest.rs b/crates/kebab-store-sqlite/tests/incremental_ingest.rs index ef67706..20abc66 100644 --- a/crates/kebab-store-sqlite/tests/incremental_ingest.rs +++ b/crates/kebab-store-sqlite/tests/incremental_ingest.rs @@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }; CanonicalDocument { doc_id, diff --git a/crates/kebab-store-sqlite/tests/list_docs.rs b/crates/kebab-store-sqlite/tests/list_docs.rs index 8a536dc..d8ccb4e 100644 --- a/crates/kebab-store-sqlite/tests/list_docs.rs +++ b/crates/kebab-store-sqlite/tests/list_docs.rs @@ -58,6 +58,7 @@ fn make_doc( git_branch: None, git_commit: None, code_lang: None, + source_id: None, }; let doc = CanonicalDocument { doc_id, diff --git a/crates/kebab-tui/src/ask.rs b/crates/kebab-tui/src/ask.rs index 287b7c2..919da87 100644 --- a/crates/kebab-tui/src/ask.rs +++ b/crates/kebab-tui/src/ask.rs @@ -598,8 +598,7 @@ fn spawn_ask_worker(state: &mut App) { fn make_conversation_id() -> String { let nanos = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) - .map(|d| d.as_nanos()) - .unwrap_or(0); + .map_or(0, |d| d.as_nanos()); format!("conv_{nanos:032x}") } diff --git a/crates/kebab-tui/src/ingest_progress.rs b/crates/kebab-tui/src/ingest_progress.rs index f3ae475..b47e0d8 100644 --- a/crates/kebab-tui/src/ingest_progress.rs +++ b/crates/kebab-tui/src/ingest_progress.rs @@ -34,11 +34,10 @@ pub fn start_ingest(app: &mut App) -> anyhow::Result<()> { anyhow::bail!("ingest already running"); } let cfg = app.config.clone(); - let scope = SourceScope { - root: std::path::PathBuf::from(&cfg.workspace.root), - exclude: cfg.workspace.exclude.clone(), - ..Default::default() - }; + // [[workspace.sources]]: leave `scope.root` empty so the app iterates + // every configured source (`config.resolved_sources()`), mirroring the + // CLI `kebab ingest` path. Each source carries its own merged exclude. + let scope = SourceScope::default(); let (tx, rx) = mpsc::channel::(); let cancel = Arc::new(AtomicBool::new(false)); let cancel_for_worker = cancel.clone(); diff --git a/crates/kebab-tui/src/search.rs b/crates/kebab-tui/src/search.rs index 7cf5d17..3888b4e 100644 --- a/crates/kebab-tui/src/search.rs +++ b/crates/kebab-tui/src/search.rs @@ -304,10 +304,11 @@ pub fn handle_key_search(state: &mut App, key: KeyEvent) -> KeyOutcome { // `terminal.clear()` couldn't happen — leaving the // previous frame leaking through the new draw. let editor = std::env::var("EDITOR").unwrap_or_else(|_| "vi".into()); - // `~/...` / `${XDG_…}` expansion via `kebab-config::expand_path` - // — same helper used by the markdown / image / PDF ingest - // paths (HOTFIXES 2026-05-02 P9-4 follow-up). - let workspace_root = kebab_config::expand_path(&state.config.workspace.root, ""); + // [[workspace.sources]]: resolve the primary workspace root + // (first source / legacy `root`). `resolve_workspace_root` applies + // the same `~` / `${XDG_…}` / relative-to-config expansion as the + // markdown / image / PDF ingest paths (HOTFIXES 2026-05-02 P9-4). + let workspace_root = state.config.resolve_workspace_root(); state.pending_editor = Some(crate::app::EditorRequest { citation: citation.unwrap(), editor_env: editor, diff --git a/crates/kebab-tui/tests/ask.rs b/crates/kebab-tui/tests/ask.rs index 487c8a3..3fdd63d 100644 --- a/crates/kebab-tui/tests/ask.rs +++ b/crates/kebab-tui/tests/ask.rs @@ -19,7 +19,7 @@ use time::OffsetDateTime; fn fresh_app() -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-ask-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-ask-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-ask-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = Pane::Ask; // p9-fb-12 follow-up: mirror the run loop's auto-flip on pane diff --git a/crates/kebab-tui/tests/cheatsheet.rs b/crates/kebab-tui/tests/cheatsheet.rs index 1d1a0bb..5aca8d6 100644 --- a/crates/kebab-tui/tests/cheatsheet.rs +++ b/crates/kebab-tui/tests/cheatsheet.rs @@ -12,7 +12,7 @@ use ratatui::layout::Rect; fn fresh_app(focus: Pane) -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-cheatsheet-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-cheatsheet-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = focus; app diff --git a/crates/kebab-tui/tests/inspect.rs b/crates/kebab-tui/tests/inspect.rs index 4e0525f..de87577 100644 --- a/crates/kebab-tui/tests/inspect.rs +++ b/crates/kebab-tui/tests/inspect.rs @@ -23,7 +23,7 @@ use time::OffsetDateTime; fn fresh_app() -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-inspect-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-inspect-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-inspect-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = Pane::Inspect; app.inspect = Some(InspectState::default()); @@ -85,6 +85,7 @@ fn make_doc() -> CanonicalDocument { git_branch: None, git_commit: None, code_lang: None, + source_id: None, }, provenance: Provenance { events: vec![ProvenanceEvent { diff --git a/crates/kebab-tui/tests/mode.rs b/crates/kebab-tui/tests/mode.rs index 13bbe3e..f74aa34 100644 --- a/crates/kebab-tui/tests/mode.rs +++ b/crates/kebab-tui/tests/mode.rs @@ -9,7 +9,7 @@ use kebab_tui::{App, Mode, Pane, mode_intercept}; fn fresh_app(focus: Pane) -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-mode-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-mode-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-mode-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = focus; app.mode = Mode::auto_for(focus); diff --git a/crates/kebab-tui/tests/search.rs b/crates/kebab-tui/tests/search.rs index e3e8b9a..bade14e 100644 --- a/crates/kebab-tui/tests/search.rs +++ b/crates/kebab-tui/tests/search.rs @@ -18,7 +18,7 @@ use std::path::Path; fn fresh_app() -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-search-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-search-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-search-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = Pane::Search; // p9-fb-12 follow-up: mirror the run loop's auto-flip — Search diff --git a/crates/kebab-tui/tests/status_bar.rs b/crates/kebab-tui/tests/status_bar.rs index ec49ab3..d00db4b 100644 --- a/crates/kebab-tui/tests/status_bar.rs +++ b/crates/kebab-tui/tests/status_bar.rs @@ -9,7 +9,7 @@ use ratatui::layout::Rect; fn fresh_app(focus: Pane) -> App { let mut config = Config::defaults(); config.storage.data_dir = "/tmp/kebab-tui-status-bar-tests-noop".to_string(); - config.workspace.root = "/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string(); + config.workspace.root = Some("/tmp/kebab-tui-status-bar-tests-noop/workspace".to_string()); let mut app = App::new(config).expect("App::new"); app.focus = focus; app diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index f586925..029e0c6 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -34,6 +34,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab- | RRF fusion_score | `[0, 1]` 정규화 — `2 / (k_rrf + 1)` 로 나눠 mode 간 비교 가능 (post-merge hotfix) | | ~~doc-side expansion 별칭 (v0.21.0)~~ | **제거됨 (v0.25.0, HOTFIXES 2026-06-03)** — 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 완전히 제거. 별칭 ROI 음수(cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). V013 마이그레이션이 `chunk_aliases_fts` + `chunks.aliases` DROP. 기존 KB 의 잔존 별칭 벡터는 검색 시 `strip_alias_suffix` 로 본문 chunk 에 매핑(graceful)되거나 `kebab reset` 으로 정리. spec: `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`. | | 파생물 캐시 `derivation_cache` (V012, v0.21.0) | 비싼 ingest 파생물(embedding 벡터)을 청크 **내용 해시** 키로 SQLite 에 캐싱 → 재색인 시 내용 불변 청크는 재계산 skip. `cache_key = blake3(kind ‖ text_blake3 ‖ version_key)[:32]`; version_key 에 model/dimensions 포함 → §9 cascade 와 정합(버전 bump 시 자동 miss). 위치 기반 `chunk_id` 와 달리 내용이 같으면 문서·위치 무관 동일 키. 순수 가산 — `corpus_revision` bump 안 함, 손상/삭제돼도 정확성 영향 0(miss → 재계산). search/ask 는 `kebab.sqlite`+`lancedb` 만으로 동작하므로 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능 (HOTFIXES 2026-05-31). (별칭 LLM 캐싱 kind 는 v0.25.0 에서 제거 — embedding kind 만 남음.) | +| provenance 출처 필터 (v0.29.0) | 혼합 출처 KB 의 레버 = **질의 시 출처 필터링** (전역 trust 가중 아님). config `[[workspace.sources]]`(각 id/root/trust_level/source_type) → `documents.source_id` 컬럼(V014, additive·DEFAULT `'default'`) stamp + 검색 `--source ` / `--source-type `(lexical+vector 두 site, OR). 단일 root 는 implicit `default` source 로 정규화(config v3→v4 `step_3_to_4` 미러). per-source trust/type 는 frontmatter 부재 시 기본값(우선순위 frontmatter > source 기본값 > Primary). **전역 trust 곱셈가중(weighted-RRF)은 반증** — A/B 에서 θ=0.85 만으로 incident MRR 0.918→0.340 절벽(점수 압축), 작은 오염 잡으려다 큰 개선 버리는 see-saw 라 빌드 안 함. 필터는 see-saw 없음. (HOTFIXES 2026-06-21) | | layout | XDG (`~/.local/share/kebab/`, `~/.config/kebab/`, …) | 전체 frozen 설계는 [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](superpowers/specs/2026-04-27-kebab-final-form-design.md) 12 sections 참조. @@ -219,7 +220,7 @@ kebab/ │ ├── kebab-tui/ # Ratatui shell + Library 패널 (P9-1) │ ├── kebab-mcp/ # stdio MCP server — tools: schema, doctor, search, ask (P9-FB-30) │ └── kebab-cli/ # binary (P0 → 핫픽스로 --config flag wiring 강화) -├── migrations/ # SQLite refinery V001..V012 (V012 = derivation_cache, v0.21.0) +├── migrations/ # SQLite refinery V001..V014 (V012 = derivation_cache v0.21.0, V013 = drop chunk_aliases v0.25.0, V014 = documents.source_id v0.29.0) └── fixtures/ # 테스트 fixture 트리 ``` diff --git a/docs/DOGFOOD.md b/docs/DOGFOOD.md index 4df400e..21b016a 100644 --- a/docs/DOGFOOD.md +++ b/docs/DOGFOOD.md @@ -397,6 +397,22 @@ $KB search 'tokenizer' --mode lexical --json | jq '.hits | length' # ≥ 1 if co - `--lang` ISO code. - `--path-glob` workspace_path glob. +### §2.4bis Source / provenance filters (`--source` / `--source-type`, v0.29.0) + +```bash +# 출처 id 필터 ([[workspace.sources]] 의 id; 단일 root 는 "default"). +"$RELEASE_BIN" search --config "$DOGFOOD/config.toml" "query" --source jira --json | jq '.hits | length' +# source_type 필터 (markdown/note/paper/reference/inbox). +"$RELEASE_BIN" search --config "$DOGFOOD/config.toml" "query" --source-type reference,markdown --json +``` + +**verify**: +- `--source` / `--source-type` repeatable + comma-sep, OR within. +- lexical · vector · hybrid 모든 모드에 동일 적용 (직접 인덱스 컬럼 `documents.source_id` / `source_type`). +- 모르는 값 → silently empty (no error). +- 멀티소스 KB 측정: `--source wiki` 가 개념 질의 오염 회복(MRR 0.780→0.810), `--source jira` 가 incident 0.918→0.975 (HOTFIXES 2026-06-21). +- trust precedence: `[[workspace.sources]]` 의 per-source `trust_level` 가 frontmatter 부재 시 적용 → `--trust-min primary` 와 조합 시 secondary source 배제. + ### §2.5 Search pagination (p9-fb-34) ```bash diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 692d51c..7a4c9bf 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -290,6 +290,21 @@ kebab search "rust" --doc-id "" --tag rust --json Bad `--ingested-after` → `error.v1.code = config_invalid`, exit 2. Unknown `--media` value → silently empty (no error). +### Source filters (`--source` / `--source-type`) + +````bash +# 단일 root 워크스페이스는 implicit `default` source 로 정규화되므로 +# 모든 문서가 source_id="default" — 이 필터는 전체와 동일하다. +kebab search "rust" --source default --json | jq '.hits | length' + +# source_type 필터 (frontmatter 의 source_type: 또는 source 기본값). +kebab search "rust" --source-type markdown,reference --json | jq '.hits | length' +```` + +멀티소스 KB 는 `[[workspace.sources]]` 로 명명 source 를 선언하면 +`--source ` 로 출처를 좁힌다 (예: `--source jira` → jira 문서만). +빈 값 = 무필터, 콤마/반복 = OR. 모르는 값 → silently empty (no error). + ### Trace + stats (fb-37) Re-run a search with `--trace` to see per-stage candidate lists + timing: diff --git a/docs/release-notes/v0.29.0-draft.md b/docs/release-notes/v0.29.0-draft.md new file mode 100644 index 0000000..5de09c7 --- /dev/null +++ b/docs/release-notes/v0.29.0-draft.md @@ -0,0 +1,104 @@ +--- +title: kebab v0.29.0 release notes (draft) +created: 2026-06-21 +status: draft +release_trigger: + - 신규 CLI flag `kebab search --source` / `--source-type` — pre-1.0 minor bump + - 신규 config `[[workspace.sources]]` + V014 migration (documents.source_id) — 인터페이스 추가 + - config schema v3→v4 (단일 root → implicit default source 미러) +--- + +# kebab v0.29.0 — provenance 출처 필터: 멀티소스 + `--source` + +v0.28.0(config 스키마 재편) 후속 minor release. 혼합 출처 KB — 예컨대 위키 +문서와 jira 이슈를 한 KB 에 같이 넣은 경우 — 에서 **색인은 전부 하되 질의 시 +출처로 좁히는** provenance 레버를 추가한다. **기존 단일-폴더 사용자는 아무것도 +손대지 않아도 된다** — 단일 `workspace.root` 는 자동으로 `default` 라는 하나의 +source 로 정규화되고, 새 DB 컬럼은 기존 문서에 `default` 를 자동 채우는 additive +마이그레이션이라 재색인이 발생하지 않는다. + +--- + +## 변경 사실 + +**1) 검색 출처 필터 2종.** `kebab search` 에 두 필터가 생겼다. + +| flag | 의미 | +|---|---| +| `--source ` | `[[workspace.sources]]` 에 선언한 source 의 id 로 필터 (예: `--source jira`) | +| `--source-type ` | `markdown` / `note` / `paper` / `reference` / `inbox` 로 필터 | + +둘 다 반복(`--source a --source b`) 또는 콤마(`--source a,b`)로 여러 값을 줄 수 +있고 OR 로 묶인다. 빈 값 = 무필터. lexical · vector · hybrid **모든 검색 모드**에 +동일하게 적용된다(직접 인덱스 컬럼 — 추가 비용 거의 없음). + +**2) `[[workspace.sources]]` 멀티소스 config.** 단일 `[workspace] root` 대신 +여러 폴더를 명명 source 로 선언할 수 있다. + +```toml +[[workspace.sources]] +id = "notes" +root = "~/KnowledgeBase/notes" + +[[workspace.sources]] +id = "jira" +root = "~/exports/jira" +trust_level = "secondary" # frontmatter 가 없을 때의 출처 기본 신뢰도 +source_type = "reference" # frontmatter 가 없을 때의 출처 기본 타입 +``` + +각 source 의 `id` 는 그 폴더에서 색인된 모든 문서에 stamp 되고 +(`documents.source_id` 컬럼), `--source ` 필터의 대상이 된다. +`trust_level` / `source_type` 은 **출처 기본값**으로, 문서 frontmatter 가 해당 +필드를 지정하지 않을 때만 적용된다 (우선순위: **frontmatter > source 기본값 > +하드코딩 Primary/Markdown**). `kebab ingest` 는 `--root` 를 주지 않으면 선언된 +모든 source 를 각자의 root + exclude 로 순회한다. + +## Trade-off — 왜 필터인가 (전역 trust 가중은 반증됨) + +"출처가 섞이면 신뢰도로 가중하면 되지 않나?" 를 통제 실험으로 검증했고, +**전역 trust 곱셈가중은 반증됐다**. jira 를 docs KB 에 섞으면 개념 질의는 약하게 +오염(top-3 정답은 유지, rank1→2 강등)되지만 운영/이슈 질의는 크게 개선된다 +(jira_only hit@10 0/10 → 10/10). 그런데 jira 점수에 θ=0.85 만 곱해도 RAG 점수 +압축 때문에 incident MRR 이 0.918→0.340 으로 절벽 하락한다 — 작은 오염을 잡으려다 +큰 개선을 버리는 see-saw. 그래서 전역 가중은 **빌드하지 않았다**. + +올바른 레버는 **질의 시 출처 필터링**이다: 색인은 전부 해 두고(운영 질의는 +jira 가 답하게), 개념 질의에서만 `--source wiki` 로 좁힌다. see-saw 없이 양쪽을 +다 얻는다. + +## Mitigation — 기존 사용자 무영향 (재색인 0) + +- **단일 root 그대로 동작**: `[[workspace.sources]]` 를 선언하지 않으면 기존 + `workspace.root` 가 implicit `default` source 로 정규화된다. 모든 문서가 + `source_id = "default"`. +- **V014 는 additive**: `documents.source_id` 컬럼은 `DEFAULT 'default'` 라 기존 + row 가 자동으로 채워진다. 데이터 재작성·재색인·`corpus_revision` bump 없음. +- **config v3→v4 자동 변환**: load 시 메모리에서 자동 변환(디스크 미변경), + `kebab config migrate` 로 파일 갱신 시 값·주석 보존 + 멱등. 단일 `root` 는 + `[[workspace.sources]]` id=default 로 **미러**되며 기존 `root` 키도 그대로 남는다. + +도그푸딩(v0.29.0 release 빌드, 실험 corpus): 620 문서 / 0 error 색인, +`source_id = {jira: 400, wiki: 220}`. trust precedence 실측 — jira 는 source +기본값 secondary 라 `--trust-min primary` 시 0/6 노출, wiki 는 primary 유지. +출처 필터 실측 — `--source wiki` 로 개념 질의 MRR 0.780→0.810(오염 회복), +`--source jira` 로 incident 0.918→0.975. + +## Upgrade 절차 + +1. 아무것도 안 해도 된다 — 기존 `config.toml` 과 KB 는 그대로 동작한다(단일 root + = `default` source, V014 자동 backfill). +2. 출처를 나누고 싶으면 `kebab config migrate` 로 config 를 v4 로 갱신한 뒤 + `[[workspace.sources]]` 블록을 손으로 추가하고 `kebab ingest --force-reingest` + 로 각 문서에 새 `source_id` 를 stamp 한다. (단순 `ingest` 는 내용이 안 바뀐 + 문서를 skip 하므로, 기존 문서의 source_id 를 `default` 외 값으로 바꾸려면 + `--force-reingest` 필요.) +3. 검색에서 `--source ` / `--source-type ` 로 출처를 좁힌다. + +## Known limitations / 다음 + +- **MCP search 도구**는 아직 `--source` / `--source-type` 를 노출하지 않는다 + (CLI 전용). agent 용 MCP 필터 노출은 다음 additive 후보. +- **`kebab list`** 출력(`doc_summary.v1`)에 `source_id` 가 아직 안 실린다. +- **`kebab ask`** citation 에 provenance 라벨이 아직 없다 — 검색 필터는 되지만 + 답변 근거의 출처 표기는 다음 단계. diff --git a/migrations/V014__documents_source_id.sql b/migrations/V014__documents_source_id.sql new file mode 100644 index 0000000..88372cb --- /dev/null +++ b/migrations/V014__documents_source_id.sql @@ -0,0 +1,15 @@ +-- V014: [[workspace.sources]] multi-source support. +-- +-- Adds `documents.source_id`: the id of the `[[workspace.sources]]` entry a +-- document was ingested from. Single-root workspaces (and every pre-existing +-- row) get the implicit `default` id via the column DEFAULT — so this is a +-- backward-compatible additive migration (no data rewrite, no corpus_revision +-- bump required for existing chunks/embeddings). +-- +-- The DEFAULT 'default' literal is kept in sync with +-- `kebab_config::DEFAULT_SOURCE_ID`. The index backs the `--source ` +-- search filter (SearchFilters.source_id → `d.source_id IN (...)`). + +ALTER TABLE documents ADD COLUMN source_id TEXT NOT NULL DEFAULT 'default'; + +CREATE INDEX idx_docs_source_id ON documents(source_id); diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 0564fc3..33901dc 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,85 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-06-21 — provenance 출처 필터: `[[workspace.sources]]` 멀티소스 + `--source` / `--source-type` (v0.29.0) + +**무엇을 추가했나.** 혼합 출처 KB(예: 위키 문서 + jira 이슈)에서 "출처별로 +검색을 좁히는" provenance 레버를 두 층으로 붙였다. 검색에 `--source-type` +(Phase-2) 와 `--source ` (Phase-3) 필터, config 에 `[[workspace.sources]]` +명명 멀티소스 선언, 저장소에 `documents.source_id` 컬럼(V014)을 추가했다. +기존 단일-root 사용자는 **아무것도 손대지 않아도 된다** — load 시 단일 `root` +가 implicit `default` source 로 정규화되고, V014 는 additive(컬럼 DEFAULT +`'default'`)라 재색인이 발생하지 않는다. + +**왜 필터인가 — 전역 trust 가중(weighted-RRF)은 반증됨.** Phase-1 통제 실험 +(`/home/user/large_data/out/kebab-ab/`, MongoDB 도메인 정합 corpus)에서 jira 를 +docs KB 에 섞으면 **개념 질의는 약하게 오염**(concept ΔMRR −0.072, +CI[−0.159,−0.004]; top-3 정답은 유지, rank1→2 강등)되지만 **운영/이슈 질의는 +크게 개선**(incident ΔMRR +0.972, jira_only hit@10 0/10 → 10/10)됨을 측정했다. +Phase-2 에서 골든셋을 142 쿼리로 확장(`golden_v2.json`: 워크플로 12토픽 병렬생성 +96 + 원본 46)해 재현(concept 70 −0.03 유의, incident 66 +0.92~0.97)한 뒤, θ +sweep 시뮬(`eval_phase2.py`)로 **전역 trust 곱셈가중을 반증**했다 — jira 에 +θ=0.85 만 곱해도 RAG 점수 압축 때문에 incident MRR 0.918→0.340 으로 절벽 하락. +작은 오염을 잡으려다 큰 개선을 버리는 see-saw 라 **빌드하지 않았다**. 올바른 +레버는 see-saw 없는 **출처 필터링**: 색인은 전부 하되 질의 시 출처로 좁힌다. + +**구현 표면.** + +- **config 스키마** (`kebab-config`): `WorkspaceCfg.root` 가 `Option` 으로, + 신규 `WorkspaceCfg.sources: Vec` 추가. `SourceCfg { id, root, + exclude?, trust_level?, source_type? }`. `Config::resolved_sources()` 가 + 단일 entry point — `sources` 가 비면 `workspace.root` 를 implicit `default` + source 로 합성, 있으면 각 entry 의 root 확장 + `workspace.exclude` ∪ per-source + exclude. `validate_sources()` 가 id 비어있음/중복을 `ConfigInvalid` 로 거절. +- **config v3→v4 migration** (`migrate.rs::step_3_to_4`): 단일 `workspace.root` + 를 `[[workspace.sources]]` id=default 로 **미러**(기존 root 키는 보존 — 둘 다 + default 를 가리켜 무해). `[[workspace.sources]]` 가 이미 있으면 no-op. 멱등. + `CURRENT_SCHEMA_VERSION` 3→4. load 시 메모리 자동 변환 + `kebab config migrate` + 로 디스크 갱신(값·주석 보존) — v0.28.0 v2→v3 패턴 동일. +- **저장소** (V014 `documents_source_id.sql`): `documents.source_id TEXT NOT NULL + DEFAULT 'default'` + `idx_docs_source_id`. additive — 기존 row 는 DEFAULT 로 + `'default'`, 재색인/`corpus_revision` bump 불요. DEFAULT 리터럴은 + `kebab_config::DEFAULT_SOURCE_ID` 와 동기. +- **도메인/파서**: `Metadata.source_id: Option` 추가(`skip_serializing_if + = Option::is_none`). `BodyHints` 에 `source_id` + `fallback_trust_level` 추가 — + markdown derive 의 trust precedence 가 **frontmatter > per-source 기본값 > + 하드코딩 Primary**. source_id 는 frontmatter 가 덮지 않는 ingest-time + provenance stamp. +- **ingest** (`kebab-app`): `--root` 미지정 시 `resolved_sources()` 를 순회하며 + 각 source 를 own root+exclude 로 스캔하고 asset→source_id 매핑을 만든 뒤 doc + 마다 source_id + source 기본 trust 를 stamp. `--root`/single-file/include 지정 + 시는 ad-hoc `default` source 한 개(기존 동작 보존). `FsScanSkips::merge` 로 + 멀티소스 스킵 집계. +- **검색 필터**: `SearchFilters` 에 `source_type: Vec` + `source_id: + Vec`(빈 vec = 무필터, multi-value = OR). lexical(FTS5 + `kebab-search/lexical.rs`)·vector(`kebab-store-sqlite/filters.rs`) **두 site + 모두** `d.source_type IN (...)` / `d.source_id IN (...)` 직접 인덱스 컬럼 필터. + CLI `kebab search --source-type ` + `--source `(repeatable/comma-sep). + +**검증(도그푸딩, v0.29.0 release 빌드, 실험 corpus xdg_sources KB).** ingest +620 문서 / 0 error, `source_id = {jira: 400, wiki: 220}`. **trust precedence +실측**: jira source 기본값 secondary(frontmatter 없어도) → `--trust-min primary` +시 jira 0/6 노출, wiki primary 유지. **출처 필터 실측**: `--source wiki` → 개념 +질의 MRR 0.780→0.810 (KB_wiki 수준 오염 회복), `--source jira` → incident +0.918→0.975. Phase-2 `--source-type reference`/`markdown` 도 동일 효과(concept +0.810, incident 0.975). weighted-RRF 절벽과 대비해 필터는 see-saw 없음. + +**Known limitations / follow-up.** + +- **MCP search 도구 미노출**: `kebab-mcp/tools/search.rs` 는 `source_type` / + `source_id` 를 빈 vec 로 채워 컴파일만 맞춤 — agent 가 MCP 로 출처 필터를 못 + 건다. `SearchInput` 에 두 필드 추가가 다음 additive 후보. +- **`kebab list` / `doc_summary.v1` 에 source_id 미노출**: `documents.source_id` + 는 stamp/필터되지만 list 출력(`DocSummary`)에는 안 실린다 — 사용자가 "이 문서가 + 어느 source 냐" 를 list 로 못 본다. additive 후보. +- **RAG provenance 라벨 미구현**: `kebab ask` citation 에 source 라벨 없음. 검색은 + 필터 가능하나 답변 근거의 출처 표기는 다음 단계. +- **구현 교훈**: `Metadata`(Default 없음 · 60+ 곳에서 literal 구성)에 필수 필드 + 추가는 churn 이 큼(이 PR 의 90+ 파일 대부분이 `source_id: None` 추가). 차라리 + store 레이어에서 stamp 하면 저-churn — 다음 리팩터 후보. + +관련 메모리: jira-contamination-ab-experiment(Phase-1/2/3 측정), jira-wiki-dogfood-kb. + ## 2026-06-04 — config 스키마 v2→v3 재편: 미디어 ingest 통합 (v0.28.0) **무엇을 바꿨나.** `config.toml` 의 미디어 형식 설정을 `[ingest.*]` 우산 아래로 통합했다. 첫 non-additive rename 마이그레이션.