feat(kebab-app): p9-fb-25 task 5 — Skipped warnings + skipped_by_extension aggregation
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -384,7 +384,7 @@ pub fn ingest_with_config_opts(
|
||||
let mut chunks_indexed: u32 = 0;
|
||||
let mut embeddings_indexed: u32 = 0;
|
||||
// p9-fb-25: per-extension skip count, populated in the Skipped arm below.
|
||||
let skipped_by_extension: std::collections::BTreeMap<String, u32> =
|
||||
let mut skipped_by_extension: std::collections::BTreeMap<String, u32> =
|
||||
std::collections::BTreeMap::new();
|
||||
let scanned_count: u32 = u32::try_from(assets.len()).unwrap_or(u32::MAX);
|
||||
|
||||
@@ -475,7 +475,9 @@ pub fn ingest_with_config_opts(
|
||||
}
|
||||
}
|
||||
kebab_core::IngestItemKind::Skipped => {
|
||||
skipped_count = skipped_count.saturating_add(1)
|
||||
skipped_count = skipped_count.saturating_add(1);
|
||||
let ext = ext_for_skip_warning(&item.doc_path.0);
|
||||
*skipped_by_extension.entry(ext).or_insert(0) += 1;
|
||||
}
|
||||
kebab_core::IngestItemKind::Unchanged => {
|
||||
unchanged_count = unchanged_count.saturating_add(1)
|
||||
@@ -826,6 +828,31 @@ fn try_skip_unchanged(
|
||||
}))
|
||||
}
|
||||
|
||||
/// p9-fb-25: extract the lowercase extension (no leading dot) from a
/// workspace path for use in the `unsupported media type: .X` warning
/// and `IngestReport.skipped_by_extension` key.
///
/// Returns the `"<no-ext>"` sentinel when the path has no usable
/// extension: no file name, no `.` in the file name, a dotfile such as
/// `.gitignore`, or a trailing dot such as `notes.` (for which
/// `Path::extension` reports an empty string).
///
/// ASCII letters are lowercased so `Foo.DOCX` and `bar.docx` aggregate
/// under the same key; non-ASCII characters are left untouched.
fn ext_for_skip_warning(path: &str) -> String {
    std::path::Path::new(path)
        .extension()
        .and_then(|ext| ext.to_str())
        // A trailing dot yields `Some("")`; an empty key would render as
        // `unsupported media type: .`, so fold it into the sentinel.
        .filter(|ext| !ext.is_empty())
        .map(str::to_ascii_lowercase)
        .unwrap_or_else(|| "<no-ext>".to_string())
}
|
||||
|
||||
/// p9-fb-25: render the `IngestItem.warnings` line for a Skipped
|
||||
/// asset. `<no-ext>` sentinel renders without a leading dot;
|
||||
/// everything else gets `.ext` form.
|
||||
fn unsupported_media_warning(path: &str) -> String {
|
||||
let ext = ext_for_skip_warning(path);
|
||||
if ext == "<no-ext>" {
|
||||
"unsupported media type: <no-ext>".to_string()
|
||||
} else {
|
||||
format!("unsupported media type: .{ext}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Process a single asset: read bytes, parse, normalize, chunk,
|
||||
/// persist, embed. Per-asset failures bubble up to the caller for
|
||||
/// labelling as `IngestItemKind::Error` — they do NOT abort the
|
||||
@@ -889,7 +916,7 @@ fn ingest_one_asset(
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: Vec::new(),
|
||||
warnings: vec![unsupported_media_warning(&asset.workspace_path.0)],
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -908,9 +935,7 @@ fn ingest_one_asset(
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// source URIs are not supported by the fs ingester".into(),
|
||||
],
|
||||
warnings: vec!["kb:// URI not yet supported".to_string()],
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1103,7 +1128,7 @@ fn ingest_one_image_asset(
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// source URIs are not supported by the fs ingester".into(),
|
||||
"kb:// URI not yet supported".to_string(),
|
||||
],
|
||||
error: None,
|
||||
});
|
||||
@@ -1438,7 +1463,7 @@ fn ingest_one_pdf_asset(
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// source URIs are not supported by the fs ingester".into(),
|
||||
"kb:// URI not yet supported".to_string(),
|
||||
],
|
||||
error: None,
|
||||
});
|
||||
|
||||
43
crates/kebab-app/tests/skip_reason.rs
Normal file
43
crates/kebab-app/tests/skip_reason.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! p9-fb-25 task 5: skipped per-asset items must carry a human-readable
|
||||
//! reason in `warnings`, and the report's `skipped_by_extension` must
|
||||
//! aggregate by lowercase extension.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Ingests a workspace containing three files with unsupported media
/// types and checks that:
///
/// * each one is reported as `Skipped` with exactly one
///   `unsupported media type: …` warning, and
/// * `skipped_by_extension` folds `legacy.docx` and `REPORT.DOCX` into
///   a single lowercase `docx` key, while the extension-less `Makefile`
///   is counted under `<no-ext>`.
#[test]
fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
    let env = TestEnv::lexical_only();
    let workspace_root = std::path::PathBuf::from(&env.config.workspace.root);

    // (file name, expected warning). The upper-case fixture is what
    // actually exercises the case-folding of the aggregation key.
    let fixtures = [
        ("legacy.docx", "unsupported media type: .docx"),
        ("REPORT.DOCX", "unsupported media type: .docx"),
        ("Makefile", "unsupported media type: <no-ext>"),
    ];
    for (name, _) in fixtures {
        std::fs::write(workspace_root.join(name), b"unsupported")
            .unwrap_or_else(|e| panic!("write fixture {name}: {e}"));
    }

    let report = kebab_app::ingest_with_config(
        env.config.clone(),
        env.scope(),
        false,
    ).unwrap();

    let items = report.items.as_ref().expect("items array populated");
    for (name, expected_warning) in fixtures {
        let item = items
            .iter()
            .find(|i| i.doc_path.0.ends_with(name))
            .unwrap_or_else(|| panic!("{name} in items"));
        assert_eq!(
            item.kind,
            kebab_core::IngestItemKind::Skipped,
            "{name} should be skipped",
        );
        assert_eq!(
            item.warnings,
            vec![expected_warning.to_string()],
            "{name} should carry exactly one skip warning",
        );
    }

    // Both spellings of the extension land on the same lowercase key.
    assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(2));
    assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
}
|
||||
Reference in New Issue
Block a user