Files
kebab/crates/kebab-chunk/tests/code_text_paragraph_v1.rs
altair823 58ac62d53a feat(search): provenance 출처 필터 — [[workspace.sources]] 멀티소스 + --source/--source-type
혼합 출처 KB(위키+jira 등)에서 색인은 전부 하되 질의 시 출처로 좁히는 provenance
레버. 전역 trust 곱셈가중(weighted-RRF)은 A/B 에서 반증(θ=0.85 만으로 incident MRR
0.918→0.340 절벽, 점수 압축) — 필터가 see-saw 없는 올바른 레버.

- config [[workspace.sources]] (각 id/root/exclude/trust_level/source_type);
  단일 root 는 implicit `default` source 로 정규화. validate: id 유일·비어있지 않음.
- config schema v3→v4 (step_3_to_4, root→[[workspace.sources]] id=default 미러, 멱등)
- V014 documents.source_id 컬럼+인덱스 (additive, DEFAULT 'default', 재색인 0)
- Metadata.source_id + BodyHints trust precedence(frontmatter > source 기본값 > Primary)
- ingest: --root 미지정 시 resolved_sources() 순회 + doc 마다 source_id/trust stamp
- 검색 SearchFilters.source_type/source_id → lexical + vector 두 site (IN, OR)
- CLI kebab search --source <id> / --source-type <type> (repeatable/comma-sep)

도그푸딩(620 doc, jira400+wiki220): --source wiki 로 개념 질의 MRR 0.780→0.810,
--source jira 로 incident 0.918→0.975. trust precedence 실측(jira=secondary 기본값).

version bump 0.28.0 → 0.29.0 (신규 CLI flag + config 키 + V014 migration → minor).
follow-up: MCP search 필터 미노출 · kebab list source_id 미표시 · RAG provenance 라벨.

자세한 내용: tasks/HOTFIXES.md (2026-06-21), docs/release-notes/v0.29.0-draft.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_012Mc6W1fgsrbFKTsqA6P8La
2026-06-21 08:35:19 +00:00

272 lines
9.1 KiB
Rust

//! Behavioural tests for `CodeTextParagraphV1Chunker`.
//!
//! Documents are constructed manually (no kebab-parse-code dependency) by
//! placing raw text into a single `Block::Code`, mirroring the pattern used
//! in `k8s_manifest_resource_v1.rs`.
use std::path::PathBuf;
use kebab_chunk::CodeTextParagraphV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
// ── helpers ──────────────────────────────────────────────────────────────────
/// Path of the `tests/fixtures` directory located under the directory named
/// by `CARGO_MANIFEST_DIR` (resolved at compile time through `env!`).
fn fixtures_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    for segment in ["tests", "fixtures"] {
        dir.push(segment);
    }
    dir
}
/// Build a `CanonicalDocument` with a single `Block::Code` containing `text`
/// and the supplied `lang` label.
fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
let wp = WorkspacePath("scripts/sample.sh".into());
let aid = AssetId("d".repeat(64));
let pv = ParserVersion("code-text-paragraph-v1".into());
let doc_id = id_for_doc(&wp, &aid, &pv);
let line_count = text.lines().count() as u32;
let span = SourceSpan::Code {
line_start: 1,
line_end: line_count.max(1),
symbol: None,
lang: Some(lang.into()),
};
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
let block = Block::Code(CodeBlock {
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some(lang.into()),
code: text.to_string(),
});
CanonicalDocument {
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "sample.sh".into(),
lang: Lang("und".into()),
blocks: vec![block],
metadata: Metadata {
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some(lang.into()),
source_id: None,
},
provenance: Provenance { events: vec![] },
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
/// Chunk policy used by every test in this file: 500 target tokens, 80
/// overlap tokens, markdown headings not respected, and the
/// `code-text-paragraph-v1` chunker version.
fn policy() -> ChunkPolicy {
    let chunker_version = ChunkerVersion("code-text-paragraph-v1".into());
    let (target_tokens, overlap_tokens) = (500, 80);
    ChunkPolicy {
        target_tokens,
        overlap_tokens,
        respect_markdown_headings: false,
        chunker_version,
    }
}
// ── tests ─────────────────────────────────────────────────────────────────────
/// The `sample_shell.sh` fixture holds 4 paragraphs separated by 3 blank
/// lines:
/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail)
/// - paragraph 2: lines 4-7 (env setup block)
/// - paragraph 3: lines 9-11 (ingest block)
/// - paragraph 4: lines 13-15 (report block)
///
/// Checked here, in this order:
/// - the chunker emits exactly 4 chunks, one per paragraph
/// - every chunk's first span has `symbol == None` (Tier 3 spec §9.3)
/// - every chunk's first span has `lang == "shell"`
/// - the `(line_start, line_end)` pairs equal the paragraph ranges above, so
///   the blank lines 3, 8 and 12 belong to no chunk
#[test]
fn shell_multi_paragraph_splits_on_blank_lines() {
    let path = fixtures_dir().join("sample_shell.sh");
    let source = match std::fs::read_to_string(&path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", path.display()),
    };
    let doc = text_doc("shell", &source);
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    let produced = chunks.len();
    assert_eq!(
        produced, 4,
        "expected 4 chunks (one per paragraph), got {}: {chunks:#?}",
        produced
    );

    // Pass 1: no chunk may carry a symbol (Tier 3 requirement).
    for (i, chunk) in chunks.iter().enumerate() {
        let other = &chunk.source_spans[0];
        let SourceSpan::Code { symbol, .. } = other else {
            panic!("chunk[{i}]: expected Code span, got {other:?}");
        };
        assert!(
            symbol.is_none(),
            "chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}"
        );
    }

    // Pass 2: every chunk keeps the "shell" language label.
    for (i, chunk) in chunks.iter().enumerate() {
        let other = &chunk.source_spans[0];
        let SourceSpan::Code { lang, .. } = other else {
            panic!("chunk[{i}]: expected Code span, got {other:?}");
        };
        assert_eq!(
            lang.as_deref(),
            Some("shell"),
            "chunk[{i}] lang must be 'shell', got {lang:?}"
        );
    }

    // Pass 3: the line ranges match the paragraphs exactly; blank lines
    // (3, 8, 12) fall between ranges and are never covered.
    let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        let other = &chunk.source_spans[0];
        let SourceSpan::Code {
            line_start,
            line_end,
            ..
        } = other
        else {
            panic!("expected Code span, got {other:?}");
        };
        actual_ranges.push((*line_start, *line_end));
    }
    assert_eq!(
        actual_ranges, expected_ranges,
        "line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );
}
/// The `sample_long_paragraph.txt` fixture is 200 non-blank lines with no
/// blank line anywhere, i.e. a single paragraph. Because 200 exceeds 80
/// (FALLBACK_LINES_PER_CHUNK), the oversize window split applies with a
/// stride of 60:
/// - window 1: lines 1-80
/// - window 2: lines 61-140
/// - window 3: lines 121-200
///
/// The chunk_ids of the windows must all differ (the #L{window_start}
/// split_key suffix).
#[test]
fn single_long_paragraph_line_window_split() {
    let path = fixtures_dir().join("sample_long_paragraph.txt");
    let text = match std::fs::read_to_string(&path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", path.display()),
    };
    // Guard against the fixture drifting away from what the test assumes.
    let fixture_lines = text.lines().count();
    assert_eq!(fixture_lines, 200, "fixture must have exactly 200 lines");

    let doc = text_doc("shell", &text);
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    let produced = chunks.len();
    assert_eq!(
        produced, 3,
        "expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}",
        produced
    );

    let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in &chunks {
        let other = &chunk.source_spans[0];
        let SourceSpan::Code {
            line_start,
            line_end,
            ..
        } = other
        else {
            panic!("expected Code span, got {other:?}");
        };
        actual_ranges.push((*line_start, *line_end));
    }
    assert_eq!(
        actual_ranges, expected_ranges,
        "window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );

    // Distinctness check: the set of ids is as large as the chunk list only
    // when no two windows share an id.
    let mut ids = std::collections::HashSet::new();
    for chunk in &chunks {
        ids.insert(&chunk.chunk_id);
    }
    assert_eq!(
        ids.len(),
        chunks.len(),
        "oversize window chunks must have distinct chunk_ids"
    );
}
/// Chunking a document whose code text is the empty string must succeed and
/// produce no chunks at all.
#[test]
fn empty_file_emits_zero_chunks() {
    let empty_doc = text_doc("shell", "");
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&empty_doc, &chunk_policy)
        .expect("chunk");

    let produced = chunks.len();
    assert_eq!(
        produced, 0,
        "empty file must yield 0 chunks, got {}: {chunks:#?}",
        produced
    );
}
/// The first emitted chunk must carry the same `lang` that was handed to
/// `text_doc` (here "yaml"), and its `symbol` must be `None` (Tier 3 spec).
#[test]
fn lang_field_preserved_from_input_doc() {
    let yaml_source = "key1: value1\nkey2: value2\n";
    let doc = text_doc("yaml", yaml_source);
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");
    assert!(!chunks.is_empty(), "expected at least one chunk");

    // Only the first span of the first chunk is inspected.
    let other = &chunks[0].source_spans[0];
    let SourceSpan::Code { lang, symbol, .. } = other else {
        panic!("expected Code span, got {other:?}");
    };
    assert_eq!(
        lang.as_deref(),
        Some("yaml"),
        "lang must be 'yaml', got {lang:?}"
    );
    assert!(
        symbol.is_none(),
        "symbol must be None for Tier 3 chunker, got {symbol:?}"
    );
}