feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type

혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
2026-06-21 08:35:19 +00:00
parent 403e162ac0
commit 58ac62d53a
101 changed files with 1201 additions and 111 deletions

View File

@@ -745,6 +745,14 @@ fn upsert_document(
// `markdown` for the column).
let source_type = source_type_label(&doc.metadata.source_type);
let trust_level = trust_level_label(&doc.metadata.trust_level);
// `[[workspace.sources]]`: id of the source this doc came from. Falls back
// to the column default `"default"` for docs without an explicit source
// (single-root workspaces / pre-multi-source ingests).
let source_id = doc
.metadata
.source_id
.as_deref()
.unwrap_or(kebab_config::DEFAULT_SOURCE_ID);
let created_at = doc
.metadata
.created_at
@@ -757,11 +765,11 @@ fn upsert_document(
tx.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang,
source_type, trust_level, parser_version,
source_type, trust_level, source_id, parser_version,
doc_version, schema_version, metadata_json,
provenance_json, created_at, updated_at,
last_chunker_version, last_embedding_version
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(doc_id) DO UPDATE SET
asset_id = excluded.asset_id,
workspace_path = excluded.workspace_path,
@@ -769,6 +777,7 @@ fn upsert_document(
lang = excluded.lang,
source_type = excluded.source_type,
trust_level = excluded.trust_level,
source_id = excluded.source_id,
parser_version = excluded.parser_version,
-- doc_version: bump on update. excluded.doc_version is the
-- caller's submitted value; we ignore it and add 1 to the
@@ -788,6 +797,7 @@ fn upsert_document(
doc.lang.0,
source_type,
trust_level,
source_id,
doc.parser_version.0,
i64::from(doc.doc_version),
i64::from(doc.schema_version),

View File

@@ -191,6 +191,31 @@ impl SqliteStore {
}
}
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
// column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR.
if !filters.source_type.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
for st in &filters.source_type {
bind.push(Box::new(st.clone()));
}
}
// [[workspace.sources]]: source_id filter (IN-list on the direct
// `documents.source_id` column, idx_docs_source_id). Empty Vec = no
// filter; multi-value = OR. Mirrors the source_type filter above.
if !filters.source_id.is_empty() {
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
.collect::<Vec<_>>()
.join(",");
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
for sid in &filters.source_id {
bind.push(Box::new(sid.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
// lexicographic >= compare is correct — but only when the filter
@@ -1000,6 +1025,121 @@ mod tests {
);
}
/// [[workspace.sources]]: filtering by `source_id` retains a chunk only when
/// the `documents.source_id` value of its owning document appears in the
/// filter's list. Covers a single value, several values (OR semantics), and
/// the empty list (no filtering at all).
#[test]
fn filter_chunks_source_id_keeps_matching_source() {
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);

    let notes_chunk = "11111111111111111111111111111111";
    let code_chunk = "22222222222222222222222222222222";
    let default_chunk = "33333333333333333333333333333333";

    // One document per chunk, each stamped with a different source_id.
    seed_with_source_id(
        &store,
        notes_chunk,
        "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
        "notes/a.md",
        "notes",
    );
    seed_with_source_id(
        &store,
        code_chunk,
        "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
        "code/b.rs",
        "code",
    );
    seed_with_source_id(
        &store,
        default_chunk,
        "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
        "x.md",
        "default",
    );

    // Every case below filters the same three candidates, in the same order.
    let candidates = [cid(notes_chunk), cid(code_chunk), cid(default_chunk)];
    let surviving = |filters: &SearchFilters| store.filter_chunks(&candidates, filters).unwrap();

    // A single source id keeps just that source's chunk.
    let only_notes = SearchFilters {
        source_id: vec!["notes".to_string()],
        ..Default::default()
    };
    assert_eq!(
        surviving(&only_notes),
        vec![cid(notes_chunk)],
        "only the `notes` source chunk survives"
    );

    // Several source ids are OR-ed together.
    let notes_or_code = SearchFilters {
        source_id: vec!["notes".to_string(), "code".to_string()],
        ..Default::default()
    };
    assert_eq!(
        surviving(&notes_or_code),
        vec![cid(notes_chunk), cid(code_chunk)],
        "notes OR code survive"
    );

    // An empty source_id list leaves every candidate in place.
    let unfiltered = SearchFilters::default();
    assert_eq!(
        surviving(&unfiltered),
        vec![cid(notes_chunk), cid(code_chunk), cid(default_chunk)]
    );
}
/// Test helper: seed one asset, one document, one chunk and one committed
/// embedding record, writing `source_id` explicitly into the
/// `documents.source_id` column (the column DEFAULT is `'default'`).
///
/// * `chunk_id` - id of the single chunk; its first 31 bytes also form the
///   embedding id, so it must be at least 31 bytes long or the slice panics.
/// * `doc_id` - id of the document; its first 31 bytes also form the asset
///   id, so the same length requirement applies.
/// * `workspace_path` - stored as the asset/document path and reused for the
///   asset's `source_uri`, `checksum` and `storage_path` values.
/// * `source_id` - value written to `documents.source_id`.
///
/// Every insert is unwrapped, so any SQL failure panics the calling test.
fn seed_with_source_id(
store: &SqliteStore,
chunk_id: &str,
doc_id: &str,
workspace_path: &str,
source_id: &str,
) {
// Asset id is "a" followed by the first 31 bytes of the doc id.
let asset_id = format!("a{}", &doc_id[..31]);
// The inner block bounds the lifetime of `conn`, so the guard is dropped
// before the `store` methods further down are called.
// NOTE(review): presumably those methods acquire the same connection lock,
// which would make this scoping mandatory; confirm against `SqliteStore`.
{
let conn = store.lock_conn();
// Rows are inserted parent-first: asset, then document, then chunk.
// NOTE(review): presumably required by foreign keys; verify against the schema.
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?,
'1970-01-01T00:00:00Z')",
params![
asset_id,
format!("file://{workspace_path}"),
workspace_path,
workspace_path,
workspace_path,
],
)
.unwrap();
// Only the ids, the path and `source_id` are bound; every other document
// column is a fixed literal ('markdown' / 'primary' / epoch timestamps).
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, source_id, parser_version, doc_version,
schema_version, metadata_json, provenance_json,
created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1',
1, 1, '{}', '{}', '1970-01-01T00:00:00Z',
'1970-01-01T00:00:00Z')",
params![doc_id, asset_id, workspace_path, source_id],
)
.unwrap();
// A single minimal chunk owned by the document above.
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
'1970-01-01T00:00:00Z')",
params![chunk_id, doc_id],
)
.unwrap();
}
// Embedding record for the chunk; its id is "e" followed by the first 31
// bytes of the chunk id.
let embed_row = EmbeddingRecordRow {
embedding_id: format!("e{}", &chunk_id[..31]),
chunk_id: chunk_id.to_string(),
model_id: "m".to_string(),
model_version: "v1".to_string(),
dimensions: 4,
lance_table: "t".to_string(),
created_at: OffsetDateTime::UNIX_EPOCH,
};
// Register the record as pending first, then mark it committed.
store
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
.unwrap();
store
.mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id))
.unwrap();
}
#[test]
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
// Regression test for the non-UTC offset lex-compare bug.

View File

@@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result<Br
/// the LanceDB directory tree. Missing files / dir = 0.
pub fn index_bytes(data_dir: &Path) -> std::io::Result<IndexBytes> {
fn file_size_or_zero(p: &Path) -> u64 {
std::fs::metadata(p).map(|m| m.len()).unwrap_or(0)
std::fs::metadata(p).map_or(0, |m| m.len())
}
fn dir_walk_sum(p: &Path) -> std::io::Result<u64> {
if !p.exists() {

View File

@@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() {
fs_ctime: asset.discovered_at,
fs_mtime: asset.discovered_at,
fallback_lang: Some("en".into()),
source_id: None,
fallback_trust_level: None,
};
let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap();
let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap();

View File

@@ -45,6 +45,7 @@ fn make_metadata() -> Metadata {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
}
}

View File

@@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument {
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
CanonicalDocument {
doc_id,

View File

@@ -58,6 +58,7 @@ fn make_doc(
git_branch: None,
git_commit: None,
code_lang: None,
source_id: None,
};
let doc = CanonicalDocument {
doc_id,