From 0b7d8af759d13490d34f6af3743ef9794e3ccd85 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 11:22:48 +0000 Subject: [PATCH] =?UTF-8?q?feat(p10-3):=20code-text-paragraph-v1=20chunker?= =?UTF-8?q?=20=E2=80=94=20paragraph=20+=20line-window=20fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blank-line paragraph segmentation (whitespace-only lines as boundaries, blank lines themselves never in any chunk's range). Paragraphs > 80 lines split into 80-line windows with 20-line overlap (stride 60), sharing the input lang and symbol=None per spec §9.3. tier2_shared exposes a new build_chunk_no_symbol helper so Chunk id/hash/token semantics stay identical with Tier 1/2. Extracts build_chunk_from_span as private core so build_chunk and build_chunk_no_symbol share mechanics without drift. 4 unit tests cover multi-paragraph shell (4 paragraphs, blank-line boundaries verified), 200-line oversize line-window split (chunks 1-80 / 61-140 / 121-200), empty file, and lang preservation when input is yaml. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kebab-chunk/src/code_text_paragraph_v1.rs | 167 +++++++++++ crates/kebab-chunk/src/lib.rs | 2 + crates/kebab-chunk/src/tier2_shared.rs | 44 ++- .../tests/code_text_paragraph_v1.rs | 270 ++++++++++++++++++ .../tests/fixtures/sample_long_paragraph.txt | 200 +++++++++++++ .../tests/fixtures/sample_shell.sh | 15 + 6 files changed, 697 insertions(+), 1 deletion(-) create mode 100644 crates/kebab-chunk/src/code_text_paragraph_v1.rs create mode 100644 crates/kebab-chunk/tests/code_text_paragraph_v1.rs create mode 100644 crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt create mode 100644 crates/kebab-chunk/tests/fixtures/sample_shell.sh diff --git a/crates/kebab-chunk/src/code_text_paragraph_v1.rs b/crates/kebab-chunk/src/code_text_paragraph_v1.rs new file mode 100644 index 0000000..aa41b52 --- /dev/null +++ b/crates/kebab-chunk/src/code_text_paragraph_v1.rs @@ -0,0 +1,167 @@ +//! p10-3: Tier 3 paragraph + line-window fallback chunker. +//! +//! Splits code/text files on blank-line paragraph boundaries. Paragraphs +//! with more than 80 lines are further split into 80-line windows with a +//! 20-line overlap (stride 60) — the same oversize pattern used by Tier 1/2 +//! chunkers but without AST structure, hence no symbol. +//! +//! Per spec §9.3: all emitted chunks carry `symbol: None`. + +use crate::tier2_shared::{build_chunk_no_symbol, policy_hash}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "code-text-paragraph-v1"; + +/// Lines-per-window for the oversize fallback (Tier 3). +const FALLBACK_LINES_PER_CHUNK: usize = 80; +/// Overlap between consecutive windows. +const FALLBACK_LINES_OVERLAP: usize = 20; +// stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP = 60. 
+ +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeTextParagraphV1Chunker; + +impl Chunker for CodeTextParagraphV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> { + // Expect a single Block::Code carrying the full source text. + let (text, lang_str) = match doc.blocks.first() { + Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")), + _ => return Ok(vec![]), + }; + + let mut chunks = Vec::new(); + for para in split_paragraphs(text) { + push_paragraph(&mut chunks, doc, policy, &para, lang_str)?; + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "code-text-paragraph-v1 chunked", + ); + + Ok(chunks) + } +} + +/// A contiguous run of non-blank lines from the source text. +struct Paragraph { + /// Lines joined with `\n` (no trailing newline). + text: String, + /// 1-indexed line number of the first line in the source file. + line_start: u32, + /// 1-indexed line number of the last line in the source file. + line_end: u32, +} + +/// Split `text` into `Paragraph`s separated by blank (all-whitespace) lines. +/// +/// Blank lines are treated as boundaries and are NOT included in any +/// paragraph's line range. Paragraphs that would consist entirely of blank +/// lines are skipped. 
+fn split_paragraphs(text: &str) -> Vec<Paragraph> { + let mut paragraphs = Vec::new(); + let mut current: Vec<&str> = Vec::new(); + let mut current_start: Option<u32> = None; + + for (idx, line) in text.lines().enumerate() { + let line_no = (idx + 1) as u32; + let is_blank = line.trim().is_empty(); + if is_blank { + if let Some(start) = current_start.take() { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + }); + current.clear(); + } + } else { + if current_start.is_none() { + current_start = Some(line_no); + } + current.push(line); + } + } + // Flush any trailing paragraph not terminated by a blank line. + if let Some(start) = current_start { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + }); + } + paragraphs +} + +/// Emit one or more chunks for a single paragraph. +/// +/// Paragraphs with ≤ `FALLBACK_LINES_PER_CHUNK` lines become a single chunk. +/// Larger paragraphs are split into overlapping windows of +/// `FALLBACK_LINES_PER_CHUNK` lines with stride `FALLBACK_LINES_PER_CHUNK - +/// FALLBACK_LINES_OVERLAP`. The last window may be shorter. Window starts +/// are passed as `split_key` so `id_for_chunk` can produce distinct ids +/// across windows. +fn push_paragraph( + out: &mut Vec<Chunk>, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + para: &Paragraph, + lang: &str, +) -> Result<()> { + let n_lines = (para.line_end - para.line_start + 1) as usize; + + if n_lines <= FALLBACK_LINES_PER_CHUNK { + // Single chunk — no split_key needed. + out.push(build_chunk_no_symbol( + doc, + policy, + &para.text, + para.line_start, + para.line_end, + lang, + VERSION_LABEL, + None, + )); + return Ok(()); + } + + // Oversize: line-window split with overlap. 
+ let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP; + let lines: Vec<&str> = para.text.lines().collect(); + let mut i = 0usize; + loop { + let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len()); + let window_text = lines[i..end].join("\n"); + let window_start = para.line_start + i as u32; + let window_end = para.line_start + (end as u32) - 1; + // Use window_start as split_key so chunk_ids are unique across windows. + out.push(build_chunk_no_symbol( + doc, + policy, + &window_text, + window_start, + window_end, + lang, + VERSION_LABEL, + Some(window_start), + )); + if end == lines.len() { + break; + } + i += stride; + } + Ok(()) +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 9b65e05..eee3f69 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -28,6 +28,7 @@ mod tier2_shared; pub mod k8s_manifest_resource_v1; pub mod dockerfile_file_v1; pub mod manifest_file_v1; +pub mod code_text_paragraph_v1; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; @@ -41,3 +42,4 @@ pub use pdf_page_v1::PdfPageV1Chunker; pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; pub use dockerfile_file_v1::DockerfileFileV1Chunker; pub use manifest_file_v1::ManifestFileV1Chunker; +pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker; diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs index 3709384..c80b863 100644 --- a/crates/kebab-chunk/src/tier2_shared.rs +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -105,7 +105,49 @@ pub(crate) fn build_chunk( symbol: Some(symbol.to_string()), lang: Some(lang.to_string()), }; + build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key) +} +/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3). 
+/// +/// Accepts `policy: &ChunkPolicy` and `chunker_version: &str` (string slice) +/// so callers don't need to pre-compute the hash and version wrapper. +/// `split_key` is `Some(window_start)` for oversize line-window splits. +#[allow(clippy::too_many_arguments)] +pub(crate) fn build_chunk_no_symbol( + doc: &CanonicalDocument, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + lang: &str, + chunker_version: &str, + split_key: Option<u32>, +) -> Chunk { + let cv = ChunkerVersion(chunker_version.to_string()); + let base_policy_hash = policy_hash(policy); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: None, + lang: Some(lang.to_string()), + }; + build_chunk_from_span(doc, &cv, &base_policy_hash, text, span, split_key) +} + +/// Core chunk-building logic shared by `build_chunk` and `build_chunk_no_symbol`. +/// +/// Takes a pre-built `SourceSpan` so the only difference between the two +/// public helpers is whether `symbol` is `Some` or `None`. All id/hash/ +/// token mechanics are identical. +fn build_chunk_from_span( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + base_policy_hash: &str, + text: &str, + span: SourceSpan, + split_key: Option<u32>, +) -> Chunk { // id_hash mirrors code_rust_ast_v1's make_chunk logic: // split_key Some(k) => "{base_policy_hash}#L{k}" // split_key None => base_policy_hash @@ -114,7 +156,7 @@ pub(crate) fn build_chunk( None => base_policy_hash.to_string(), }; - // block_ids: Tier 2 chunkers have no per-block structure (the whole file + // block_ids: Tier 2/3 chunkers have no per-block structure (the whole file // is one Block::Code), so we pass an empty slice — same as using the doc- // level slice without explicit block granularity. 
let block_ids: Vec = vec![]; diff --git a/crates/kebab-chunk/tests/code_text_paragraph_v1.rs b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs new file mode 100644 index 0000000..a3ef17a --- /dev/null +++ b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs @@ -0,0 +1,270 @@ +//! Behavioural tests for `CodeTextParagraphV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing raw text into a single `Block::Code`, mirroring the pattern used +//! in `k8s_manifest_resource_v1.rs`. + +use std::path::PathBuf; + +use kebab_chunk::CodeTextParagraphV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing `text` +/// and the supplied `lang` label. 
+fn text_doc(lang: &str, text: &str) -> CanonicalDocument { + let wp = WorkspacePath("scripts/sample.sh".into()); + let aid = AssetId("d".repeat(64)); + let pv = ParserVersion("code-text-paragraph-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some(lang.into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some(lang.into()), + code: text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "sample.sh".into(), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some(lang.into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-text-paragraph-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// `sample_shell.sh` has 4 paragraphs separated by 3 blank lines: +/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail) +/// - paragraph 2: lines 4-7 (env setup block) +/// - paragraph 3: lines 9-11 (ingest block) +/// - 
paragraph 4: lines 13-15 (report block) +/// +/// We assert: +/// - exactly 4 chunks (one per paragraph) +/// - all symbols are None (Tier 3 spec §9.3) +/// - all langs are "shell" +/// - line ranges are strictly ascending and do NOT include the blank lines +/// (lines 3, 8, 12 must not appear in any range) +#[test] +fn shell_multi_paragraph_splits_on_blank_lines() { + let fixture_path = fixtures_dir().join("sample_shell.sh"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 4, + "expected 4 chunks (one per paragraph), got {}: {chunks:#?}", + chunks.len() + ); + + // All symbols must be None (Tier 3 requirement). + for (i, chunk) in chunks.iter().enumerate() { + match &chunk.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!( + symbol.is_none(), + "chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}" + ); + } + other => panic!("chunk[{i}]: expected Code span, got {other:?}"), + } + } + + // All langs must be "shell". + for (i, chunk) in chunks.iter().enumerate() { + match &chunk.source_spans[0] { + SourceSpan::Code { lang, .. } => { + assert_eq!( + lang.as_deref(), + Some("shell"), + "chunk[{i}] lang must be 'shell', got {lang:?}" + ); + } + other => panic!("chunk[{i}]: expected Code span, got {other:?}"), + } + } + + // Line ranges must be strictly ascending with no overlap, + // and blank lines (3, 8, 12) must not be included in any range. + let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)]; + let actual_ranges: Vec<(u32, u32)> = chunks + .iter() + .map(|c| match &c.source_spans[0] { + SourceSpan::Code { + line_start, + line_end, + .. 
+ } => (*line_start, *line_end), + other => panic!("expected Code span, got {other:?}"), + }) + .collect(); + + assert_eq!( + actual_ranges, expected_ranges, + "line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}" + ); +} + +/// `sample_long_paragraph.txt` has exactly 200 non-blank lines and no blank +/// lines, so the entire file is one paragraph. 200 > 80 (FALLBACK_LINES_PER_CHUNK), +/// so the oversize window split fires with stride 60: +/// - window 1: lines 1-80 +/// - window 2: lines 61-140 +/// - window 3: lines 121-200 +/// +/// All chunk_ids must be distinct (the #L{window_start} split_key suffix). +#[test] +fn single_long_paragraph_line_window_split() { + let fixture_path = fixtures_dir().join("sample_long_paragraph.txt"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + assert_eq!( + text.lines().count(), + 200, + "fixture must have exactly 200 lines" + ); + + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 3, + "expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}", + chunks.len() + ); + + let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)]; + let actual_ranges: Vec<(u32, u32)> = chunks + .iter() + .map(|c| match &c.source_spans[0] { + SourceSpan::Code { + line_start, + line_end, + .. + } => (*line_start, *line_end), + other => panic!("expected Code span, got {other:?}"), + }) + .collect(); + + assert_eq!( + actual_ranges, expected_ranges, + "window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}" + ); + + // All chunk_ids must be distinct (#L{window_start} suffix differentiates them). 
+ let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect(); + assert_eq!( + ids.len(), + chunks.len(), + "oversize window chunks must have distinct chunk_ids" + ); +} + +/// An empty source file (no non-blank lines) must yield zero chunks. +#[test] +fn empty_file_emits_zero_chunks() { + let doc = text_doc("shell", ""); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 0, + "empty file must yield 0 chunks, got {}: {chunks:#?}", + chunks.len() + ); +} + +/// The `lang` field on each emitted chunk must match the `lang` passed to +/// `text_doc`, regardless of content. `symbol` must be `None` (Tier 3 spec). +#[test] +fn lang_field_preserved_from_input_doc() { + let doc = text_doc("yaml", "key1: value1\nkey2: value2\n"); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert!(!chunks.is_empty(), "expected at least one chunk"); + + match &chunks[0].source_spans[0] { + SourceSpan::Code { lang, symbol, .. 
} => { + assert_eq!( + lang.as_deref(), + Some("yaml"), + "lang must be 'yaml', got {lang:?}" + ); + assert!( + symbol.is_none(), + "symbol must be None for Tier 3 chunker, got {symbol:?}" + ); + } + other => panic!("expected Code span, got {other:?}"), + } +} diff --git a/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt b/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt new file mode 100644 index 0000000..192ff33 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt @@ -0,0 +1,200 @@ +line 001 +line 002 +line 003 +line 004 +line 005 +line 006 +line 007 +line 008 +line 009 +line 010 +line 011 +line 012 +line 013 +line 014 +line 015 +line 016 +line 017 +line 018 +line 019 +line 020 +line 021 +line 022 +line 023 +line 024 +line 025 +line 026 +line 027 +line 028 +line 029 +line 030 +line 031 +line 032 +line 033 +line 034 +line 035 +line 036 +line 037 +line 038 +line 039 +line 040 +line 041 +line 042 +line 043 +line 044 +line 045 +line 046 +line 047 +line 048 +line 049 +line 050 +line 051 +line 052 +line 053 +line 054 +line 055 +line 056 +line 057 +line 058 +line 059 +line 060 +line 061 +line 062 +line 063 +line 064 +line 065 +line 066 +line 067 +line 068 +line 069 +line 070 +line 071 +line 072 +line 073 +line 074 +line 075 +line 076 +line 077 +line 078 +line 079 +line 080 +line 081 +line 082 +line 083 +line 084 +line 085 +line 086 +line 087 +line 088 +line 089 +line 090 +line 091 +line 092 +line 093 +line 094 +line 095 +line 096 +line 097 +line 098 +line 099 +line 100 +line 101 +line 102 +line 103 +line 104 +line 105 +line 106 +line 107 +line 108 +line 109 +line 110 +line 111 +line 112 +line 113 +line 114 +line 115 +line 116 +line 117 +line 118 +line 119 +line 120 +line 121 +line 122 +line 123 +line 124 +line 125 +line 126 +line 127 +line 128 +line 129 +line 130 +line 131 +line 132 +line 133 +line 134 +line 135 +line 136 +line 137 +line 138 +line 139 +line 140 +line 141 +line 142 +line 143 +line 144 +line 145 +line 146 
+line 147 +line 148 +line 149 +line 150 +line 151 +line 152 +line 153 +line 154 +line 155 +line 156 +line 157 +line 158 +line 159 +line 160 +line 161 +line 162 +line 163 +line 164 +line 165 +line 166 +line 167 +line 168 +line 169 +line 170 +line 171 +line 172 +line 173 +line 174 +line 175 +line 176 +line 177 +line 178 +line 179 +line 180 +line 181 +line 182 +line 183 +line 184 +line 185 +line 186 +line 187 +line 188 +line 189 +line 190 +line 191 +line 192 +line 193 +line 194 +line 195 +line 196 +line 197 +line 198 +line 199 +line 200 diff --git a/crates/kebab-chunk/tests/fixtures/sample_shell.sh b/crates/kebab-chunk/tests/fixtures/sample_shell.sh new file mode 100644 index 0000000..2fc2911 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_shell.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# First paragraph: env setup +export KEBAB_HOME="${KEBAB_HOME:-$HOME/.local/share/kebab}" +mkdir -p "$KEBAB_HOME" +cd "$KEBAB_HOME" + +# Second paragraph: ingest +echo "ingesting workspace..." +kebab ingest --config /etc/kebab/config.toml + +# Third paragraph: report +echo "done" +kebab schema --json | jq '.stats'