Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 16dc02cfa2 | |||
| 74f1b0571b | |||
| 918ee6c0be |
46
Cargo.lock
generated
46
Cargo.lock
generated
@@ -4127,7 +4127,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -4172,7 +4172,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-chunk"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4187,7 +4187,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-cli"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -4208,7 +4208,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-config"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs 5.0.1",
|
||||
@@ -4223,7 +4223,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-core"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4237,7 +4237,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4251,7 +4251,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-local"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fastembed",
|
||||
@@ -4264,7 +4264,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-eval"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4283,7 +4283,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4292,7 +4292,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm-local"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4309,7 +4309,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-mcp"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4327,7 +4327,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-normalize"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4342,7 +4342,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-code"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"gix",
|
||||
@@ -4360,7 +4360,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
@@ -4384,7 +4384,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4401,7 +4401,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4414,7 +4414,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-types"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"kebab-core",
|
||||
"serde",
|
||||
@@ -4422,7 +4422,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-rag"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4443,7 +4443,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-search"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
@@ -4462,7 +4462,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-source-fs"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4480,7 +4480,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-sqlite"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4501,7 +4501,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-vector"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrow",
|
||||
@@ -4525,7 +4525,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-tui"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossterm",
|
||||
|
||||
@@ -31,7 +31,7 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.8.0"
|
||||
version = "0.8.1"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
|
||||
@@ -346,6 +346,34 @@ fn run_query(
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
|
||||
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
|
||||
if !filters.code_lang.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
|
||||
));
|
||||
for lang in &filters.code_lang {
|
||||
params.push(Box::new(lang.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
|
||||
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
|
||||
if !filters.repo.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.repo.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
|
||||
));
|
||||
for repo in &filters.repo {
|
||||
params.push(Box::new(repo.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
|
||||
// fb-32 ingest path), so lexicographic >= compare is correct — but only
|
||||
|
||||
@@ -785,6 +785,19 @@ impl TestEnv {
|
||||
body: &str,
|
||||
media: MediaType,
|
||||
updated_at: OffsetDateTime,
|
||||
) -> DocumentId {
|
||||
self.insert_doc_full_with_metadata(path, body, media, updated_at, "{}")
|
||||
}
|
||||
|
||||
/// Like `insert_doc_full` but accepts an explicit `metadata_json` string
|
||||
/// so p10-1A-1 filter tests can set `metadata.code_lang` / `metadata.repo`.
|
||||
fn insert_doc_full_with_metadata(
|
||||
&self,
|
||||
path: &str,
|
||||
body: &str,
|
||||
media: MediaType,
|
||||
updated_at: OffsetDateTime,
|
||||
metadata_json: &str,
|
||||
) -> DocumentId {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let doc_id = self.next_id("doc");
|
||||
@@ -810,10 +823,10 @@ impl TestEnv {
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'pv1', 1, 1,
|
||||
?, '{\"events\":[]}',
|
||||
'2024-01-01T00:00:00Z', ?)",
|
||||
rusqlite::params![doc_id, asset_id, path, updated_at_str],
|
||||
rusqlite::params![doc_id, asset_id, path, metadata_json, updated_at_str],
|
||||
)
|
||||
.expect("insert document");
|
||||
|
||||
@@ -834,6 +847,21 @@ impl TestEnv {
|
||||
DocumentId(doc_id)
|
||||
}
|
||||
|
||||
/// Insert a code doc with explicit `code_lang` and optional `repo` in metadata.
|
||||
fn insert_code_doc(&self, path: &str, body: &str, code_lang: &str, repo: Option<&str>) -> DocumentId {
|
||||
let metadata_json = match repo {
|
||||
Some(r) => format!(r#"{{"code_lang":"{code_lang}","repo":"{r}"}}"#),
|
||||
None => format!(r#"{{"code_lang":"{code_lang}"}}"#),
|
||||
};
|
||||
self.insert_doc_full_with_metadata(
|
||||
path,
|
||||
body,
|
||||
MediaType::Markdown,
|
||||
OffsetDateTime::now_utc(),
|
||||
&metadata_json,
|
||||
)
|
||||
}
|
||||
|
||||
fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
|
||||
let r = self.inner.retriever();
|
||||
let q = SearchQuery {
|
||||
@@ -934,6 +962,52 @@ fn lexical_empty_filters_match_default_behavior() {
|
||||
assert!(!with_default.is_empty());
|
||||
}
|
||||
|
||||
// ── p10-1A-1 filter tests ────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn lexical_filter_by_code_lang() {
    // Corpus: a python code doc, a rust code doc, and a plain doc added via
    // `insert_doc` (this test expects it to carry no code_lang). All three
    // share one body, so only the code_lang=["python"] filter separates them.
    let env = TestEnv::new();
    let shared_body = "AsyncClient session";
    env.insert_code_doc("src/main.py", shared_body, "python", None);
    env.insert_code_doc("src/lib.rs", shared_body, "rust", None);
    env.insert_doc("docs/guide.md", shared_body);

    let python_only = SearchFilters {
        code_lang: vec![String::from("python")],
        ..Default::default()
    };
    let hits = env.run_search("AsyncClient", &python_only);

    assert_eq!(hits.len(), 1, "only python doc should match code_lang filter");
    let hit_path = &hits[0].doc_path.0;
    assert!(
        hit_path.ends_with(".py"),
        "expected python path, got: {}",
        hit_path
    );
}
|
||||
|
||||
#[test]
fn lexical_filter_by_repo() {
    // Corpus: one doc tagged repo "httpx", one tagged repo "requests", and one
    // with no repo key. All share one body, so only the repo=["httpx"] filter
    // separates them.
    let env = TestEnv::new();
    let shared_body = "session send request";
    env.insert_code_doc("httpx/client.py", shared_body, "python", Some("httpx"));
    env.insert_code_doc("requests/api.py", shared_body, "python", Some("requests"));
    env.insert_code_doc("standalone.py", shared_body, "python", None);

    let httpx_only = SearchFilters {
        repo: vec![String::from("httpx")],
        ..Default::default()
    };
    let hits = env.run_search("session", &httpx_only);

    assert_eq!(hits.len(), 1, "only httpx doc should match repo filter");
    let hit_path = &hits[0].doc_path.0;
    assert!(
        hit_path.starts_with("httpx/"),
        "expected httpx path, got: {}",
        hit_path
    );
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snapshot_run_1() {
|
||||
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
|
||||
|
||||
@@ -153,6 +153,34 @@ impl SqliteStore {
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
|
||||
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
|
||||
if !filters.code_lang.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
|
||||
));
|
||||
for lang in &filters.code_lang {
|
||||
bind.push(Box::new(lang.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
|
||||
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
|
||||
if !filters.repo.is_empty() {
|
||||
let placeholders = std::iter::repeat_n("?", filters.repo.len())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
|
||||
));
|
||||
for repo in &filters.repo {
|
||||
bind.push(Box::new(repo.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
|
||||
// lexicographic >= compare is correct — but only when the filter
|
||||
@@ -408,6 +436,78 @@ mod tests {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// Variant of `seed_committed_full` that additionally accepts a
|
||||
/// `metadata_json` string so p10-1A-1 filter tests can set
|
||||
/// `metadata.code_lang` / `metadata.repo` without going through the
|
||||
/// full ingest pipeline.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn seed_committed_with_metadata(
|
||||
store: &SqliteStore,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
media_type_json: &str,
|
||||
metadata_json: &str,
|
||||
) {
|
||||
let asset_id = format!("a{}", &doc_id[..31]);
|
||||
{
|
||||
let conn = store.lock_conn();
|
||||
conn.execute(
|
||||
"INSERT INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef',
|
||||
'reference', ?, '1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file://{workspace_path}"),
|
||||
workspace_path,
|
||||
media_type_json,
|
||||
workspace_path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, parser_version, doc_version, schema_version,
|
||||
metadata_json, provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'v1', 1, 1,
|
||||
?, '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, workspace_path, metadata_json],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, 'code snippet', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
|
||||
'1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let embed_row = EmbeddingRecordRow {
|
||||
embedding_id: format!("e{}", &chunk_id[..31]),
|
||||
chunk_id: chunk_id.to_string(),
|
||||
model_id: "m".to_string(),
|
||||
model_version: "v1".to_string(),
|
||||
dimensions: 4,
|
||||
lance_table: "t".to_string(),
|
||||
created_at: OffsetDateTime::UNIX_EPOCH,
|
||||
};
|
||||
store
|
||||
.put_embedding_records_pending(std::slice::from_ref(&embed_row))
|
||||
.unwrap();
|
||||
store
|
||||
.mark_embedding_records_committed(std::slice::from_ref(
|
||||
&embed_row.embedding_id,
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn cid(s: &str) -> ChunkId {
|
||||
ChunkId(s.to_string())
|
||||
}
|
||||
@@ -671,6 +771,78 @@ mod tests {
|
||||
assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
|
||||
}
|
||||
|
||||
// ── p10-1A-1 new filter arms ─────────────────────────────────────────
|
||||
|
||||
#[test]
fn filter_chunks_code_lang_keeps_matching_lang() {
    // Seeds a python chunk (c1), a rust chunk (c2) and a markdown chunk with
    // empty metadata (c3); filtering on code_lang=["python"] must leave c1 only.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);
    let c1 = "11111111111111111111111111111111";
    let c2 = "22222222222222222222222222222222";
    let c3 = "33333333333333333333333333333333";

    // (chunk id, doc id, workspace path, media type JSON, metadata JSON)
    let fixtures = [
        (
            c1,
            "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
            "src/main.py",
            r#""code""#,
            r#"{"code_lang":"python"}"#,
        ),
        (
            c2,
            "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
            "src/lib.rs",
            r#""code""#,
            r#"{"code_lang":"rust"}"#,
        ),
        (
            c3,
            "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
            "README.md",
            r#""markdown""#,
            r#"{}"#,
        ),
    ];
    for (chunk, doc, path, media, metadata) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, metadata);
    }

    let python_only = SearchFilters {
        code_lang: vec![String::from("python")],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &python_only).unwrap();
    assert_eq!(out, vec![cid(c1)], "only python chunk should survive code_lang filter");
}
|
||||
|
||||
#[test]
fn filter_chunks_repo_keeps_matching_repo() {
    // Seeds a chunk in repo "httpx" (c1), one in repo "requests" (c2) and one
    // without a repo key (c3); filtering on repo=["httpx"] must leave c1 only.
    let tmp = TempDir::new().unwrap();
    let store = open_store(&tmp);
    let c1 = "11111111111111111111111111111111";
    let c2 = "22222222222222222222222222222222";
    let c3 = "33333333333333333333333333333333";

    // (chunk id, doc id, workspace path, media type JSON, metadata JSON)
    let fixtures = [
        (
            c1,
            "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
            "httpx/client.py",
            r#""code""#,
            r#"{"repo":"httpx","code_lang":"python"}"#,
        ),
        (
            c2,
            "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
            "requests/api.py",
            r#""code""#,
            r#"{"repo":"requests","code_lang":"python"}"#,
        ),
        (
            c3,
            "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3",
            "standalone.py",
            r#""code""#,
            r#"{"code_lang":"python"}"#,
        ),
    ];
    for (chunk, doc, path, media, metadata) in fixtures {
        seed_committed_with_metadata(&store, chunk, doc, path, media, metadata);
    }

    let httpx_only = SearchFilters {
        repo: vec![String::from("httpx")],
        ..Default::default()
    };
    let candidates = [cid(c1), cid(c2), cid(c3)];
    let out = store.filter_chunks(&candidates, &httpx_only).unwrap();
    assert_eq!(out, vec![cid(c1)], "only httpx chunk should survive repo filter");
}
|
||||
|
||||
#[test]
|
||||
fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
|
||||
// Regression test for the non-UTC offset lex-compare bug.
|
||||
|
||||
Reference in New Issue
Block a user