feat(app): App::fetch span mode + PDF/audio rejection (fb-35)
Line-based slice over fmt_canonical_to_markdown output. PDF / audio media_type (looked up on the source RawAsset, not the user-facing SourceType) → span_not_supported StructuredError. Out-of-range line_end clamps to total; effective_end reflects post-budget trim. invalid_input on zero / inverted bounds. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -168,14 +168,104 @@ fn trim_to_chars(s: &str, n: usize) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_span(
|
fn fetch_span(
|
||||||
_app: &App,
|
app: &App,
|
||||||
_id: DocumentId,
|
id: DocumentId,
|
||||||
_line_start: u32,
|
line_start: u32,
|
||||||
_line_end: u32,
|
line_end: u32,
|
||||||
_opts: FetchOpts,
|
opts: FetchOpts,
|
||||||
) -> Result<FetchResult> {
|
) -> Result<FetchResult> {
|
||||||
// Implemented in Task 5.
|
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||||
anyhow::bail!("fetch_span not yet implemented")
|
.ok_or_else(|| {
|
||||||
|
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||||
|
schema_version: ERROR_V1_ID.to_string(),
|
||||||
|
code: "doc_not_found".to_string(),
|
||||||
|
message: format!("doc_id '{}' not found", id.0),
|
||||||
|
details: serde_json::Value::Null,
|
||||||
|
hint: None,
|
||||||
|
}))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Reject line-incompatible media types (PDF / audio). `SourceType`
|
||||||
|
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||||
|
// category, not the rendering format — the actual byte-level format
|
||||||
|
// lives on the source `RawAsset.media_type`. Look it up via
|
||||||
|
// workspace_path (unique key per asset).
|
||||||
|
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset_by_workspace_path(
|
||||||
|
&app.sqlite,
|
||||||
|
&doc.workspace_path,
|
||||||
|
)? {
|
||||||
|
if matches!(
|
||||||
|
asset.media_type,
|
||||||
|
kebab_core::MediaType::Pdf | kebab_core::MediaType::Audio(_)
|
||||||
|
) {
|
||||||
|
return Err(anyhow::Error::new(StructuredError(ErrorV1 {
|
||||||
|
schema_version: ERROR_V1_ID.to_string(),
|
||||||
|
code: "span_not_supported".to_string(),
|
||||||
|
message: format!(
|
||||||
|
"doc '{}' has media_type {:?}; line-based span fetch unsupported. \
|
||||||
|
Use `fetch chunk` or `fetch doc` instead.",
|
||||||
|
id.0, asset.media_type
|
||||||
|
),
|
||||||
|
details: serde_json::Value::Null,
|
||||||
|
hint: Some("kind = chunk or kind = doc instead".to_string()),
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if line_start == 0 || line_end == 0 || line_end < line_start {
|
||||||
|
return Err(anyhow::Error::new(StructuredError(ErrorV1 {
|
||||||
|
schema_version: ERROR_V1_ID.to_string(),
|
||||||
|
code: "invalid_input".to_string(),
|
||||||
|
message: format!(
|
||||||
|
"line_start ({line_start}) and line_end ({line_end}) must be 1-based with start <= end"
|
||||||
|
),
|
||||||
|
details: serde_json::Value::Null,
|
||||||
|
hint: None,
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
|
||||||
|
let full = fmt_canonical_to_markdown(&doc);
|
||||||
|
let lines: Vec<&str> = full.lines().collect();
|
||||||
|
let total = lines.len() as u32;
|
||||||
|
let effective_end_raw = line_end.min(total).max(line_start);
|
||||||
|
let lo = (line_start - 1) as usize;
|
||||||
|
let hi = effective_end_raw as usize;
|
||||||
|
let mut text = lines[lo..hi].join("\n");
|
||||||
|
|
||||||
|
let mut truncated = effective_end_raw != line_end;
|
||||||
|
let mut effective_end = effective_end_raw;
|
||||||
|
if let Some(max_tokens) = opts.max_tokens {
|
||||||
|
let max_chars = max_tokens.saturating_mul(4);
|
||||||
|
if text.chars().count() > max_chars {
|
||||||
|
text = trim_to_chars(&text, max_chars);
|
||||||
|
truncated = true;
|
||||||
|
let kept = text.lines().count() as u32;
|
||||||
|
effective_end = (line_start - 1) + kept;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let now = OffsetDateTime::now_utc();
|
||||||
|
let stale = compute_stale(
|
||||||
|
doc_metadata_updated_at(&doc),
|
||||||
|
now,
|
||||||
|
app.config.search.stale_threshold_days,
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(FetchResult {
|
||||||
|
kind: FetchKind::Span,
|
||||||
|
doc_id: doc.doc_id.clone(),
|
||||||
|
doc_path: doc.workspace_path.clone(),
|
||||||
|
indexed_at: doc_metadata_updated_at(&doc),
|
||||||
|
stale,
|
||||||
|
chunk: None,
|
||||||
|
context_before: Vec::new(),
|
||||||
|
context_after: Vec::new(),
|
||||||
|
text: Some(text),
|
||||||
|
line_start: Some(line_start),
|
||||||
|
line_end: Some(line_end),
|
||||||
|
effective_end: Some(effective_end),
|
||||||
|
truncated,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// p9-fb-35: list chunks for a document in ordinal order, return
|
/// p9-fb-35: list chunks for a document in ordinal order, return
|
||||||
|
|||||||
@@ -153,3 +153,100 @@ fn fetch_doc_with_max_tokens_truncates() {
|
|||||||
let text = result.text.expect("doc text");
|
let text = result.text.expect("doc text");
|
||||||
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
|
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Requesting lines 2..=4 of a five-item list yields exactly three lines,
/// echoes the requested bounds, and is not flagged as truncated.
#[test]
fn fetch_span_returns_line_range() {
    let env = common::TestEnv::new();
    // A bullet list is used on purpose: the canonical-to-markdown roundtrip
    // renders each item on its own line joined by `\n`. Separate paragraphs
    // would be joined by `\n\n`, and CommonMark soft breaks within a single
    // paragraph collapse to spaces — see
    // crates/kebab-parse-md/src/blocks.rs `Event::SoftBreak`.
    common::ingest_md(
        &env,
        "lines.md",
        "- Line one.\n- Line two.\n- Line three.\n- Line four.\n- Line five.\n",
    );
    let app = env.app();

    // Resolve the ingested document's id through a lexical search.
    let hits = app
        .search(kebab_core::SearchQuery {
            text: "Line".to_string(),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let span = FetchQuery::Span {
        doc_id,
        line_start: 2,
        line_end: 4,
    };
    let result = app.fetch(span, FetchOpts::default()).unwrap();

    assert_eq!(result.kind, FetchKind::Span);
    let text = result.text.expect("span text");
    assert_eq!(text.lines().count(), 3, "span should be 3 lines: {text:?}");
    assert_eq!(result.line_start, Some(2));
    assert_eq!(result.line_end, Some(4));
    assert_eq!(result.effective_end, Some(4));
    assert!(!result.truncated);
}
|
||||||
|
|
||||||
|
/// A `line_end` far past the document's last line is clamped:
/// `effective_end` equals the number of lines actually returned.
#[test]
fn fetch_span_clamps_line_end_when_out_of_range() {
    let env = common::TestEnv::new();
    common::ingest_md(&env, "short.md", "Line one.\nLine two.\n");
    let app = env.app();

    // Resolve the ingested document's id through a lexical search.
    let hits = app
        .search(kebab_core::SearchQuery {
            text: "Line".to_string(),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let span = FetchQuery::Span {
        doc_id,
        line_start: 1,
        line_end: 999,
    };
    let result = app.fetch(span, FetchOpts::default()).unwrap();

    let returned = result.text.expect("span text").lines().count();
    assert_eq!(result.effective_end, Some(returned as u32));
    assert!(returned < 999);
}
|
||||||
|
|
||||||
|
/// Zero is not a valid 1-based line number: a span with `line_start` and
/// `line_end` both 0 must fail with an error mentioning `invalid_input`.
#[test]
fn fetch_span_invalid_input_when_zero_lines() {
    let env = common::TestEnv::new();
    common::ingest_md(&env, "a.md", "Line one.\n");
    let app = env.app();

    // Resolve the ingested document's id through a lexical search.
    let hits = app
        .search(kebab_core::SearchQuery {
            text: "Line".to_string(),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let zero_span = FetchQuery::Span {
        doc_id,
        line_start: 0,
        line_end: 0,
    };
    let err = app.fetch(zero_span, FetchOpts::default()).unwrap_err();

    assert!(err.to_string().contains("invalid_input"), "got: {err}");
}
|
||||||
|
|||||||
Reference in New Issue
Block a user