혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance 레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR 0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버. - config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type); 단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음. - config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등) - V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0) - Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary) - ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp - 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR) - CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep) 도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810, --source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값). version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor). follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨. 자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
261 lines
8.5 KiB
Rust
261 lines
8.5 KiB
Rust
//! Behavioural tests for `ManifestFileV1Chunker`.
|
|
//!
|
|
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
|
//! placing the raw manifest text into a single `Block::Code`, mirroring the
|
|
//! pattern used in `dockerfile_file_v1.rs`.
|
|
|
|
use std::path::PathBuf;
|
|
|
|
use kebab_chunk::ManifestFileV1Chunker;
|
|
use kebab_core::{
|
|
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
|
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
|
WorkspacePath, id_for_block, id_for_doc,
|
|
};
|
|
use time::OffsetDateTime;
|
|
|
|
// ── helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
fn fixtures_dir() -> PathBuf {
|
|
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
|
.join("tests")
|
|
.join("fixtures")
|
|
}
|
|
|
|
/// Build a `CanonicalDocument` with a single `Block::Code` containing manifest text.
|
|
fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument {
|
|
let wp = WorkspacePath(format!("build/{}", manifest_filename(lang)));
|
|
let aid = AssetId("m".repeat(64));
|
|
let pv = ParserVersion("code-manifest-v1".into());
|
|
let doc_id = id_for_doc(&wp, &aid, &pv);
|
|
|
|
let line_count = manifest_text.lines().count() as u32;
|
|
let span = SourceSpan::Code {
|
|
line_start: 1,
|
|
line_end: line_count.max(1),
|
|
symbol: None,
|
|
lang: Some(lang.into()),
|
|
};
|
|
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
|
let block = Block::Code(CodeBlock {
|
|
common: CommonBlock {
|
|
block_id: bid,
|
|
heading_path: vec![],
|
|
source_span: span,
|
|
},
|
|
lang: Some(lang.into()),
|
|
code: manifest_text.to_string(),
|
|
});
|
|
|
|
CanonicalDocument {
|
|
doc_id,
|
|
source_asset_id: aid,
|
|
workspace_path: wp,
|
|
title: format!("Manifest ({lang})"),
|
|
lang: Lang("und".into()),
|
|
blocks: vec![block],
|
|
metadata: Metadata {
|
|
aliases: vec![],
|
|
tags: vec![],
|
|
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
|
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
|
source_type: SourceType::Note,
|
|
trust_level: TrustLevel::Primary,
|
|
user_id_alias: None,
|
|
user: Default::default(),
|
|
repo: Some("kebab".into()),
|
|
git_branch: Some("main".into()),
|
|
git_commit: Some("0".repeat(40)),
|
|
code_lang: Some(lang.into()),
|
|
source_id: None,
|
|
},
|
|
provenance: Provenance { events: vec![] },
|
|
parser_version: pv,
|
|
schema_version: 1,
|
|
doc_version: 1,
|
|
last_chunker_version: None,
|
|
last_embedding_version: None,
|
|
}
|
|
}
|
|
|
|
fn manifest_filename(lang: &str) -> &'static str {
|
|
match lang {
|
|
"toml" => "Cargo.toml",
|
|
"json" => "package.json",
|
|
"xml" => "pom.xml",
|
|
"go-mod" => "go.mod",
|
|
_ => "manifest",
|
|
}
|
|
}
|
|
|
|
fn policy() -> ChunkPolicy {
|
|
ChunkPolicy {
|
|
target_tokens: 500,
|
|
overlap_tokens: 80,
|
|
respect_markdown_headings: false,
|
|
chunker_version: ChunkerVersion("manifest-file-v1".into()),
|
|
}
|
|
}
|
|
|
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
|
|
|
/// A Cargo.toml fixture must emit exactly 1 chunk with the correct symbol,
|
|
/// lang, and line range.
|
|
#[test]
|
|
fn cargo_toml_single_chunk_with_toml_lang() {
|
|
let fixture_path = fixtures_dir().join("sample_cargo.toml");
|
|
let text = std::fs::read_to_string(&fixture_path)
|
|
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
|
|
|
let doc = manifest_doc("toml", &text);
|
|
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
|
|
|
assert_eq!(
|
|
chunks.len(),
|
|
1,
|
|
"expected 1 chunk, got {}: {chunks:#?}",
|
|
chunks.len()
|
|
);
|
|
|
|
let span = chunks[0].source_spans.first().expect("at least one span");
|
|
match span {
|
|
SourceSpan::Code {
|
|
line_start,
|
|
line_end: _,
|
|
symbol,
|
|
lang,
|
|
} => {
|
|
assert_eq!(*line_start, 1, "line_start must be 1");
|
|
assert_eq!(
|
|
symbol.as_deref(),
|
|
Some("<manifest>"),
|
|
"symbol must be '<manifest>'"
|
|
);
|
|
assert_eq!(lang.as_deref(), Some("toml"), "lang must be 'toml'");
|
|
}
|
|
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
|
}
|
|
|
|
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
|
|
}
|
|
|
|
/// A package.json fixture must emit exactly 1 chunk with the correct symbol,
|
|
/// lang, and line range.
|
|
#[test]
|
|
fn package_json_single_chunk_with_json_lang() {
|
|
let fixture_path = fixtures_dir().join("sample_package.json");
|
|
let text = std::fs::read_to_string(&fixture_path)
|
|
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
|
|
|
let doc = manifest_doc("json", &text);
|
|
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
|
|
|
assert_eq!(
|
|
chunks.len(),
|
|
1,
|
|
"expected 1 chunk, got {}: {chunks:#?}",
|
|
chunks.len()
|
|
);
|
|
|
|
let span = chunks[0].source_spans.first().expect("at least one span");
|
|
match span {
|
|
SourceSpan::Code {
|
|
line_start,
|
|
line_end: _,
|
|
symbol,
|
|
lang,
|
|
} => {
|
|
assert_eq!(*line_start, 1, "line_start must be 1");
|
|
assert_eq!(
|
|
symbol.as_deref(),
|
|
Some("<manifest>"),
|
|
"symbol must be '<manifest>'"
|
|
);
|
|
assert_eq!(lang.as_deref(), Some("json"), "lang must be 'json'");
|
|
}
|
|
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
|
}
|
|
|
|
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
|
|
}
|
|
|
|
/// A pom.xml fixture must emit exactly 1 chunk with the correct symbol,
|
|
/// lang, and line range.
|
|
#[test]
|
|
fn pom_xml_single_chunk_with_xml_lang() {
|
|
let fixture_path = fixtures_dir().join("sample_pom.xml");
|
|
let text = std::fs::read_to_string(&fixture_path)
|
|
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
|
|
|
let doc = manifest_doc("xml", &text);
|
|
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
|
|
|
assert_eq!(
|
|
chunks.len(),
|
|
1,
|
|
"expected 1 chunk, got {}: {chunks:#?}",
|
|
chunks.len()
|
|
);
|
|
|
|
let span = chunks[0].source_spans.first().expect("at least one span");
|
|
match span {
|
|
SourceSpan::Code {
|
|
line_start,
|
|
line_end: _,
|
|
symbol,
|
|
lang,
|
|
} => {
|
|
assert_eq!(*line_start, 1, "line_start must be 1");
|
|
assert_eq!(
|
|
symbol.as_deref(),
|
|
Some("<manifest>"),
|
|
"symbol must be '<manifest>'"
|
|
);
|
|
assert_eq!(lang.as_deref(), Some("xml"), "lang must be 'xml'");
|
|
}
|
|
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
|
}
|
|
|
|
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
|
|
}
|
|
|
|
/// A go.mod fixture must emit exactly 1 chunk with the correct symbol,
|
|
/// lang, and line range.
|
|
#[test]
|
|
fn go_mod_single_chunk_with_go_mod_lang() {
|
|
let fixture_path = fixtures_dir().join("sample_go.mod");
|
|
let text = std::fs::read_to_string(&fixture_path)
|
|
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
|
|
|
let doc = manifest_doc("go-mod", &text);
|
|
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
|
|
|
assert_eq!(
|
|
chunks.len(),
|
|
1,
|
|
"expected 1 chunk, got {}: {chunks:#?}",
|
|
chunks.len()
|
|
);
|
|
|
|
let span = chunks[0].source_spans.first().expect("at least one span");
|
|
match span {
|
|
SourceSpan::Code {
|
|
line_start,
|
|
line_end: _,
|
|
symbol,
|
|
lang,
|
|
} => {
|
|
assert_eq!(*line_start, 1, "line_start must be 1");
|
|
assert_eq!(
|
|
symbol.as_deref(),
|
|
Some("<manifest>"),
|
|
"symbol must be '<manifest>'"
|
|
);
|
|
assert_eq!(lang.as_deref(), Some("go-mod"), "lang must be 'go-mod'");
|
|
}
|
|
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
|
}
|
|
|
|
assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1");
|
|
}
|