feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type

혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance 레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type); 단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810, --source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).

follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
2026-06-21 08:35:19 +00:00
parent 403e162ac0
commit 58ac62d53a
101 changed files with 1201 additions and 111 deletions
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -745,6 +745,14 @@ fn upsert_document(
    // `markdown` for the column).
    let source_type = source_type_label(&doc.metadata.source_type);
    let trust_level = trust_level_label(&doc.metadata.trust_level);
+    // `[[workspace.sources]]`: id of the source this doc came from. Falls back
+    // to the column default `"default"` for docs without an explicit source
+    // (single-root workspaces / pre-multi-source ingests).
+    let source_id = doc
+        .metadata
+        .source_id
+        .as_deref()
+        .unwrap_or(kebab_config::DEFAULT_SOURCE_ID);
    let created_at = doc
        .metadata
        .created_at
@@ -757,11 +765,11 @@ fn upsert_document(
    tx.execute(
        "INSERT INTO documents (
            doc_id, asset_id, workspace_path, title, lang,
-            source_type, trust_level, parser_version,
+            source_type, trust_level, source_id, parser_version,
            doc_version, schema_version, metadata_json,
            provenance_json, created_at, updated_at,
            last_chunker_version, last_embedding_version
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(doc_id) DO UPDATE SET
            asset_id              = excluded.asset_id,
            workspace_path        = excluded.workspace_path,
@@ -769,6 +777,7 @@ fn upsert_document(
            lang                  = excluded.lang,
            source_type           = excluded.source_type,
            trust_level           = excluded.trust_level,
+            source_id             = excluded.source_id,
            parser_version        = excluded.parser_version,
            -- doc_version: bump on update. excluded.doc_version is the
            -- caller's submitted value; we ignore it and add 1 to the
@@ -788,6 +797,7 @@ fn upsert_document(
            doc.lang.0,
            source_type,
            trust_level,
+            source_id,
            doc.parser_version.0,
            i64::from(doc.doc_version),
            i64::from(doc.schema_version),
--- a/crates/kebab-store-sqlite/src/filters.rs
+++ b/crates/kebab-store-sqlite/src/filters.rs
@@ -191,6 +191,31 @@ impl SqliteStore {
            }
        }

+        // Phase-2: source_type filter (IN-list on the direct `documents.source_type`
+        // column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR.
+        if !filters.source_type.is_empty() {
+            let placeholders = std::iter::repeat_n("?", filters.source_type.len())
+                .collect::<Vec<_>>()
+                .join(",");
+            sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
+            for st in &filters.source_type {
+                bind.push(Box::new(st.clone()));
+            }
+        }
+
+        // [[workspace.sources]]: source_id filter (IN-list on the direct
+        // `documents.source_id` column, idx_docs_source_id). Empty Vec = no
+        // filter; multi-value = OR. Mirrors the source_type filter above.
+        if !filters.source_id.is_empty() {
+            let placeholders = std::iter::repeat_n("?", filters.source_id.len())
+                .collect::<Vec<_>>()
+                .join(",");
+            sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
+            for sid in &filters.source_id {
+                bind.push(Box::new(sid.clone()));
+            }
+        }
+
        // p9-fb-36: ingested_after filter.
        // `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
        // lexicographic >= compare is correct — but only when the filter
@@ -1000,6 +1025,121 @@ mod tests {
        );
    }

+    /// [[workspace.sources]]: a non-empty `source_id` filter keeps only chunks whose
+    /// document has a listed `documents.source_id`; an empty filter keeps every chunk.
+    #[test]
+    fn filter_chunks_source_id_keeps_matching_source() {
+        let tmp = TempDir::new().unwrap();
+        let store = open_store(&tmp);
+        let c1 = "11111111111111111111111111111111";
+        let c2 = "22222222222222222222222222222222";
+        let c3 = "33333333333333333333333333333333";
+        // Three docs (one chunk each), with source_id `notes`, `code` and `default`.
+        seed_with_source_id(&store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "notes");
+        seed_with_source_id(&store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "code/b.rs", "code");
+        seed_with_source_id(
+            &store,
+            c3,
+            "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
+            "x.md",
+            "default",
+        );
+
+        // Single value: only the chunk of the `notes` document is returned.
+        let f = SearchFilters {
+            source_id: vec!["notes".to_string()],
+            ..Default::default()
+        };
+        let out = store
+            .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
+            .unwrap();
+        assert_eq!(out, vec![cid(c1)], "only the `notes` source chunk survives");
+
+        // Multi-value = OR: chunks from either listed source survive; `default` is dropped.
+        let f = SearchFilters {
+            source_id: vec!["notes".to_string(), "code".to_string()],
+            ..Default::default()
+        };
+        let out = store
+            .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
+            .unwrap();
+        assert_eq!(out, vec![cid(c1), cid(c2)], "notes OR code survive");
+
+        // Default `SearchFilters` (no source_id values) = no filtering: all three survive.
+        let f = SearchFilters::default();
+        let out = store
+            .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
+            .unwrap();
+        assert_eq!(out, vec![cid(c1), cid(c2), cid(c3)]);
+    }
+
+    /// Test helper: seeds one asset + document + chunk, then an embedding record put as pending
+    /// and marked committed. `source_id` is written to `documents.source_id` (DEFAULT `'default'`).
+    fn seed_with_source_id(
+        store: &SqliteStore,
+        chunk_id: &str,
+        doc_id: &str,
+        workspace_path: &str,
+        source_id: &str,
+    ) {
+        let asset_id = format!("a{}", &doc_id[..31]); // "a" + first 31 bytes of doc_id
+        { // NOTE(review): presumably scopes the `lock_conn` guard so it drops before the store calls below
+            let conn = store.lock_conn();
+            conn.execute(
+                "INSERT INTO assets (
+                    asset_id, source_uri, workspace_path, media_type, byte_len,
+                    checksum, storage_kind, storage_path, discovered_at
+                 ) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?,
+                           '1970-01-01T00:00:00Z')",
+                params![
+                    asset_id,
+                    format!("file://{workspace_path}"),
+                    workspace_path, // workspace_path column
+                    workspace_path, // checksum column (placeholder value)
+                    workspace_path, // storage_path column
+                ],
+            )
+            .unwrap();
+            conn.execute(
+                "INSERT INTO documents (
+                    doc_id, asset_id, workspace_path, title, lang, source_type,
+                    trust_level, source_id, parser_version, doc_version,
+                    schema_version, metadata_json, provenance_json,
+                    created_at, updated_at
+                 ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1',
+                           1, 1, '{}', '{}', '1970-01-01T00:00:00Z',
+                           '1970-01-01T00:00:00Z')",
+                params![doc_id, asset_id, workspace_path, source_id],
+            )
+            .unwrap();
+            conn.execute(
+                "INSERT INTO chunks (
+                    chunk_id, doc_id, text, heading_path_json, section_label,
+                    source_spans_json, token_estimate, chunker_version,
+                    policy_hash, block_ids_json, created_at
+                 ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
+                           '1970-01-01T00:00:00Z')",
+                params![chunk_id, doc_id],
+            )
+            .unwrap();
+        }
+        let embed_row = EmbeddingRecordRow {
+            embedding_id: format!("e{}", &chunk_id[..31]), // "e" + first 31 bytes of chunk_id
+            chunk_id: chunk_id.to_string(),
+            model_id: "m".to_string(),
+            model_version: "v1".to_string(),
+            dimensions: 4,
+            lance_table: "t".to_string(),
+            created_at: OffsetDateTime::UNIX_EPOCH,
+        };
+        store
+            .put_embedding_records_pending(std::slice::from_ref(&embed_row))
+            .unwrap();
+        store
+            .mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id))
+            .unwrap();
+    }
+
    #[test]
    fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
        // Regression test for the non-UTC offset lex-compare bug.
--- a/crates/kebab-store-sqlite/src/stats_ext.rs
+++ b/crates/kebab-store-sqlite/src/stats_ext.rs
@@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result<Br
 /// the LanceDB directory tree. Missing files / dir = 0.
 pub fn index_bytes(data_dir: &Path) -> std::io::Result<IndexBytes> {
    fn file_size_or_zero(p: &Path) -> u64 {
-        std::fs::metadata(p).map(|m| m.len()).unwrap_or(0)
+        std::fs::metadata(p).map_or(0, |m| m.len())
    }
    fn dir_walk_sum(p: &Path) -> std::io::Result<u64> {
        if !p.exists() {
--- a/crates/kebab-store-sqlite/tests/contract_roundtrip.rs
+++ b/crates/kebab-store-sqlite/tests/contract_roundtrip.rs
@@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() {
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
+        source_id: None,
+        fallback_trust_level: None,
    };
    let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap();
    let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap();
--- a/crates/kebab-store-sqlite/tests/idempotency.rs
+++ b/crates/kebab-store-sqlite/tests/idempotency.rs
@@ -45,6 +45,7 @@ fn make_metadata() -> Metadata {
        git_branch: None,
        git_commit: None,
        code_lang: None,
+        source_id: None,
    }
 }

--- a/crates/kebab-store-sqlite/tests/incremental_ingest.rs
+++ b/crates/kebab-store-sqlite/tests/incremental_ingest.rs
@@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument {
        git_branch: None,
        git_commit: None,
        code_lang: None,
+        source_id: None,
    };
    CanonicalDocument {
        doc_id,
--- a/crates/kebab-store-sqlite/tests/list_docs.rs
+++ b/crates/kebab-store-sqlite/tests/list_docs.rs
@@ -58,6 +58,7 @@ fn make_doc(
        git_branch: None,
        git_commit: None,
        code_lang: None,
+        source_id: None,
    };
    let doc = CanonicalDocument {
        doc_id,