feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance 레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type); 단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810, --source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).

follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
This commit is contained in:
@@ -745,6 +745,14 @@ fn upsert_document(
|
||||
// `markdown` for the column).
|
||||
let source_type = source_type_label(&doc.metadata.source_type);
|
||||
let trust_level = trust_level_label(&doc.metadata.trust_level);
|
||||
// `[[workspace.sources]]`: id of the source this doc came from. Falls back
|
||||
// to the column default `"default"` for docs without an explicit source
|
||||
// (single-root workspaces / pre-multi-source ingests).
|
||||
let source_id = doc
|
||||
.metadata
|
||||
.source_id
|
||||
.as_deref()
|
||||
.unwrap_or(kebab_config::DEFAULT_SOURCE_ID);
|
||||
let created_at = doc
|
||||
.metadata
|
||||
.created_at
|
||||
@@ -757,11 +765,11 @@ fn upsert_document(
|
||||
tx.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
source_type, trust_level, source_id, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at,
|
||||
last_chunker_version, last_embedding_version
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(doc_id) DO UPDATE SET
|
||||
asset_id = excluded.asset_id,
|
||||
workspace_path = excluded.workspace_path,
|
||||
@@ -769,6 +777,7 @@ fn upsert_document(
|
||||
lang = excluded.lang,
|
||||
source_type = excluded.source_type,
|
||||
trust_level = excluded.trust_level,
|
||||
source_id = excluded.source_id,
|
||||
parser_version = excluded.parser_version,
|
||||
-- doc_version: bump on update. excluded.doc_version is the
|
||||
-- caller's submitted value; we ignore it and add 1 to the
|
||||
@@ -788,6 +797,7 @@ fn upsert_document(
|
||||
doc.lang.0,
|
||||
source_type,
|
||||
trust_level,
|
||||
source_id,
|
||||
doc.parser_version.0,
|
||||
i64::from(doc.doc_version),
|
||||
i64::from(doc.schema_version),
|
||||
|
||||
@@ -191,6 +191,31 @@ impl SqliteStore {
|
||||
}
|
||||
}
|
||||
|
||||
// Phase-2: source_type filter (IN-list on the direct `documents.source_type`
|
||||
// column, idx_docs_source_type). Empty Vec = no filter; multi-value = OR.
|
||||
if !filters.source_type.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_type.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_type IN ({placeholders})"));
|
||||
for st in &filters.source_type {
|
||||
bind.push(Box::new(st.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// [[workspace.sources]]: source_id filter (IN-list on the direct
|
||||
// `documents.source_id` column, idx_docs_source_id). Empty Vec = no
|
||||
// filter; multi-value = OR. Mirrors the source_type filter above.
|
||||
if !filters.source_id.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.source_id.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(" AND d.source_id IN ({placeholders})"));
|
||||
for sid in &filters.source_id {
|
||||
bind.push(Box::new(sid.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
|
||||
// lexicographic >= compare is correct — but only when the filter
|
||||
@@ -1000,6 +1025,121 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// [[workspace.sources]]: the `source_id` filter keeps only chunks whose
|
||||
/// owning document's `documents.source_id` column is in the IN-list.
///
/// Seeds three documents with the source ids `notes`, `code` and `default`
/// (one chunk each) and then exercises three cases against `filter_chunks`:
/// a single-value filter, a multi-value filter, and the default filter whose
/// `source_id` list is left unset.
|
||||
#[test]
|
||||
fn filter_chunks_source_id_keeps_matching_source() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let store = open_store(&tmp);
|
||||
// One 32-character chunk id per seeded document.
let c1 = "11111111111111111111111111111111";
|
||||
let c2 = "22222222222222222222222222222222";
|
||||
let c3 = "33333333333333333333333333333333";
|
||||
// Three docs, each with a distinct source_id column value.
|
||||
seed_with_source_id(&store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "notes");
|
||||
seed_with_source_id(&store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "code/b.rs", "code");
|
||||
seed_with_source_id(
|
||||
&store,
|
||||
c3,
|
||||
"d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
|
||||
"x.md",
|
||||
"default",
|
||||
);
|
||||
|
||||
// Single value.
// All three chunk ids are offered as candidates; only the chunk whose
// document was seeded with source id `notes` is expected back.
|
||||
let f = SearchFilters {
|
||||
source_id: vec!["notes".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let out = store
|
||||
.filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
|
||||
.unwrap();
|
||||
assert_eq!(out, vec![cid(c1)], "only the `notes` source chunk survives");
|
||||
|
||||
// Multi-value OR.
// A chunk survives when its document's source id equals any listed value;
// the assertion expects the survivors in the order c1, c2.
|
||||
let f = SearchFilters {
|
||||
source_id: vec!["notes".to_string(), "code".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let out = store
|
||||
.filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
|
||||
.unwrap();
|
||||
assert_eq!(out, vec![cid(c1), cid(c2)], "notes OR code survive");
|
||||
|
||||
// Empty filter = no filtering.
// `SearchFilters::default()` is expected to let every candidate through,
// including the document seeded with the `default` source id.
|
||||
let f = SearchFilters::default();
|
||||
let out = store
|
||||
.filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f)
|
||||
.unwrap();
|
||||
assert_eq!(out, vec![cid(c1), cid(c2), cid(c3)]);
|
||||
}
|
||||
|
||||
/// Seed one committed doc + chunk + embedding with an explicit
|
||||
/// `documents.source_id` column value (the DEFAULT is `'default'`).
///
/// Inserts one row each into `assets`, `documents` and `chunks` through raw
/// SQL, then registers an embedding record for the chunk as pending and marks
/// it committed through the store API.
///
/// * `chunk_id` - id of the chunk row; its first 31 bytes also form the
///   embedding id (prefixed with `e`).
/// * `doc_id` - id of the document row; its first 31 bytes also form the
///   asset id (prefixed with `a`).
/// * `workspace_path` - stored on both the asset and the document rows.
/// * `source_id` - value written to `documents.source_id`.
///
/// # Panics
///
/// Panics if `doc_id` or `chunk_id` cannot be sliced at byte 31 (shorter
/// than 31 bytes, or byte 31 is not a char boundary), or if any insert or
/// store call fails (every result is unwrapped).
|
||||
fn seed_with_source_id(
|
||||
store: &SqliteStore,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
source_id: &str,
|
||||
) {
|
||||
// Asset id derived from the doc id: "a" + first 31 bytes = 32 characters
// for the 32-character ids used by the tests.
let asset_id = format!("a{}", &doc_id[..31]);
|
||||
// Inner scope: `conn` is dropped at the closing brace, before the store
// methods below are called.
// NOTE(review): presumably those methods take the same connection lock,
// so holding `conn` across them would block; confirm against SqliteStore.
{
|
||||
let conn = store.lock_conn();
|
||||
// Asset row. The five `?` placeholders bind, in order: asset_id,
// source_uri, workspace_path, checksum, storage_path.
conn.execute(
|
||||
"INSERT INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, '\"markdown\"', 1, ?, 'reference', ?,
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file://{workspace_path}"),
|
||||
// `workspace_path` is reused for the workspace_path, checksum and
// storage_path columns; the values only need to be present.
workspace_path,
|
||||
workspace_path,
|
||||
workspace_path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
// Document row: the fourth placeholder is the `source_id` column under
// test; every other column is a fixed literal.
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, source_id, parser_version, doc_version,
|
||||
schema_version, metadata_json, provenance_json,
|
||||
created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', ?, 'v1',
|
||||
1, 1, '{}', '{}', '1970-01-01T00:00:00Z',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, workspace_path, source_id],
|
||||
)
|
||||
.unwrap();
|
||||
// Single chunk row owned by the document above, with placeholder text.
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
// Embedding record for the chunk; the id is "e" + first 31 bytes of the
// chunk id, the remaining fields are fixed dummy values.
let embed_row = EmbeddingRecordRow {
|
||||
embedding_id: format!("e{}", &chunk_id[..31]),
|
||||
chunk_id: chunk_id.to_string(),
|
||||
model_id: "m".to_string(),
|
||||
model_version: "v1".to_string(),
|
||||
dimensions: 4,
|
||||
lance_table: "t".to_string(),
|
||||
created_at: OffsetDateTime::UNIX_EPOCH,
|
||||
};
|
||||
// Two-step registration: record the embedding as pending, then mark it
// committed by id.
store
|
||||
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
|
||||
.unwrap();
|
||||
store
|
||||
.mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
|
||||
// Regression test for the non-UTC offset lex-compare bug.
|
||||
|
||||
@@ -80,7 +80,7 @@ pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result<Br
|
||||
/// the LanceDB directory tree. Missing files / dir = 0.
|
||||
pub fn index_bytes(data_dir: &Path) -> std::io::Result<IndexBytes> {
|
||||
fn file_size_or_zero(p: &Path) -> u64 {
|
||||
std::fs::metadata(p).map(|m| m.len()).unwrap_or(0)
|
||||
std::fs::metadata(p).map_or(0, |m| m.len())
|
||||
}
|
||||
fn dir_walk_sum(p: &Path) -> std::io::Result<u64> {
|
||||
if !p.exists() {
|
||||
|
||||
@@ -57,6 +57,8 @@ fn document_and_chunks_round_trip_through_sqlite() {
|
||||
fs_ctime: asset.discovered_at,
|
||||
fs_mtime: asset.discovered_at,
|
||||
fallback_lang: Some("en".into()),
|
||||
source_id: None,
|
||||
fallback_trust_level: None,
|
||||
};
|
||||
let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap();
|
||||
let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
|
||||
@@ -45,6 +45,7 @@ fn make_metadata() -> Metadata {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ fn make_doc() -> CanonicalDocument {
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
|
||||
@@ -58,6 +58,7 @@ fn make_doc(
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
source_id: None,
|
||||
};
|
||||
let doc = CanonicalDocument {
|
||||
doc_id,
|
||||
|
||||
Reference in New Issue
Block a user