Files
kebab/crates/kebab-app/tests/code_ingest_smoke.rs
altair823 ebc4ef2eea feat(p10-1c-jk): activate Java in ingest_one_code_asset dispatch
Replaces Java bail! arms with JavaAstExtractor + CodeJavaAstV1Chunker. Adds
java_file_ingests_and_searches_as_code_citation integration test — asserts
citation.lang=java, symbol=com.foo.Foo.bar, code_lang=java.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 10:44:05 +00:00

574 lines
18 KiB
Rust

//! p10-1A-2 Task 8: smoke tests for code ingest dispatch (Rust, Python,
//! TypeScript, JavaScript, Go, and Java).
//!
//! Each language test writes a single source file into a TempDir workspace,
//! ingests it via `kebab_app::ingest_with_config`, then searches for the
//! symbol name and asserts that the resulting `SearchHit` carries a
//! `Citation::Code` with the expected `lang`, `symbol`, and `line_start`.
//!
//! Mirrors the `pdf_pipeline.rs` harness: lexical-only (no AVX/fastembed),
//! no OCR / caption adapters needed.
mod common;
use common::{TestEnv, lexical_query};
use kebab_core::{Citation, IngestItemKind};
/// Smoke test for the Rust ingest path.
///
/// A `.rs` file containing a single `pub fn add` is ingested, then a
/// lexical search for "add" is run. The ingest report item must be `New`
/// with at least one block and one chunk and the `code-rust-v1` /
/// `code-rust-ast-v1` versions; the first `Citation::Code` hit must report
/// `lang == "rust"`, `symbol == Some("add")`, and `line_start >= 1`.
#[test]
fn rust_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    // Drop a minimal Rust source file at the workspace root.
    let source_path = env.workspace_root.join("demo.rs");
    std::fs::write(
        source_path,
        "/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no errors expected: {report:?}");

    // Locate the per-document report entry for the file we just wrote.
    let code_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|item| item.doc_path.0.ends_with("demo.rs"))
        .expect("demo.rs item present");
    assert_eq!(
        code_item.kind,
        IngestItemKind::New,
        "first ingest must be New: {code_item:?}"
    );

    // A missing count is treated as zero, which fails the check.
    let blocks = code_item.block_count.unwrap_or(0);
    assert!(blocks >= 1, "at least one block expected: {code_item:?}");
    let chunks = code_item.chunk_count.unwrap_or(0);
    assert!(chunks >= 1, "at least one chunk expected: {code_item:?}");

    let parser = code_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser,
        Some("code-rust-v1"),
        "parser_version must be code-rust-v1"
    );
    let chunker = code_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker,
        Some("code-rust-ast-v1"),
        "chunker_version must be code-rust-ast-v1"
    );

    // Lexical search for the symbol name "add".
    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("add"))
        .expect("search must succeed");
    let hit = hits
        .iter()
        .find(|candidate| matches!(&candidate.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'add'");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &hit.citation
    else {
        unreachable!()
    };
    assert_eq!(
        lang.as_deref(),
        Some("rust"),
        "citation.lang must be 'rust'"
    );
    assert_eq!(
        symbol.as_deref(),
        Some("add"),
        "citation.symbol must be 'add'"
    );
    assert!(*line_start >= 1, "line_start must be ≥1");

    assert_eq!(
        hit.code_lang.as_deref(),
        Some("rust"),
        "SearchHit.code_lang must be 'rust'"
    );
}
/// p10-1A-2 Task 8b: a code search hit must carry `SearchHit.repo` filled
/// from the document's `Metadata.repo` (which is set by `detect_repo` during
/// ingest). `detect_repo` returns the name of the directory that contains
/// `.git/`, so we `git init` the workspace root before ingesting and then
/// assert that `h.repo` equals the workspace root's own directory name.
///
/// Requires a `git` executable on `PATH`; the test panics if it cannot be
/// spawned or if `git init` exits unsuccessfully.
#[test]
fn rust_code_search_hit_has_repo() {
    let env = TestEnv::lexical_only();

    // `detect_repo` walks up from the file looking for `.git/`.
    // Initialise a regular (non-bare) git repo at the workspace root so that
    // a `.git/` directory exists there. No commits are required.
    let git_status = std::process::Command::new("git")
        .args(["init", "--quiet"])
        .arg(env.workspace_root.as_os_str())
        .status()
        .expect("git init");
    assert!(git_status.success(), "git init must succeed");

    std::fs::write(
        env.workspace_root.join("repo_demo.rs"),
        "/// multiplies two integers\npub fn mul(a: i32, b: i32) -> i32 {\n a * b\n}\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul"))
        .expect("search must succeed");
    let h = hits
        .iter()
        .find(|h| matches!(&h.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'mul'");

    // The expected repo name is the final component of the workspace root.
    let expected_repo = env
        .workspace_root
        .file_name()
        .and_then(|n| n.to_str())
        .map(str::to_owned);
    // Guard against a vacuous pass: without this, a root with no usable
    // UTF-8 name would compare `None == None` and hide a missing repo.
    assert!(
        expected_repo.is_some(),
        "workspace root must have a UTF-8 directory name"
    );
    assert_eq!(
        h.repo,
        expected_repo,
        "SearchHit.repo must match the workspace dir name (detect_repo result)"
    );

    // Also sanity-check code_lang is still filled.
    assert_eq!(
        h.code_lang.as_deref(),
        Some("rust"),
        "SearchHit.code_lang must be 'rust'"
    );
}
/// p10-1b Task G: end-to-end check of the Python ingest path.
///
/// Writes `kebab_eval/metrics.py` defining `compute_mrr`, ingests the
/// workspace, and searches lexically for "compute_mrr". The first
/// `Citation::Code` hit must carry `lang="python"`,
/// `symbol="kebab_eval.metrics.compute_mrr"`, and `line_start >= 1`.
/// The file lives in a sub-directory so that the symbol carries a
/// non-empty module prefix (per the original note, produced by
/// `module_path_for_python`), exercising the prefix wiring end-to-end.
#[test]
fn python_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    // Create the package directory and the single module inside it.
    let module_dir = env.workspace_root.join("kebab_eval");
    std::fs::create_dir_all(&module_dir).unwrap();
    let module_file = module_dir.join("metrics.py");
    std::fs::write(
        module_file,
        "\"\"\"compute metrics.\"\"\"\ndef compute_mrr(scores):\n return sum(scores) / max(len(scores), 1)\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert!(report.new >= 1, "python file ingested: {report:?}");

    let py_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|item| item.doc_path.0.ends_with("metrics.py"))
        .expect("metrics.py item");
    let parser = py_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser,
        Some("code-python-v1"),
        "parser_version must be code-python-v1"
    );
    let chunker = py_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker,
        Some("code-python-ast-v1"),
        "chunker_version must be code-python-ast-v1"
    );

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("compute_mrr"))
        .expect("search must succeed");
    let hit = hits
        .iter()
        .find(|candidate| matches!(&candidate.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'compute_mrr'");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &hit.citation
    else {
        unreachable!()
    };
    assert_eq!(
        lang.as_deref(),
        Some("python"),
        "citation.lang must be 'python'"
    );
    assert_eq!(
        symbol.as_deref(),
        Some("kebab_eval.metrics.compute_mrr"),
        "citation.symbol must be 'kebab_eval.metrics.compute_mrr'"
    );
    assert!(*line_start >= 1, "line_start must be >=1");

    assert_eq!(
        hit.code_lang.as_deref(),
        Some("python"),
        "SearchHit.code_lang must be 'python'"
    );
}
/// p10-1b Task J: end-to-end check of the TypeScript ingest path.
///
/// Writes `src/Foo.ts` containing class `Foo` with method `bar`, ingests
/// the workspace, and searches lexically for "bar". The first
/// `Citation::Code` hit must carry `lang="typescript"`,
/// `symbol="src/Foo.Foo.bar"`, and `line_start >= 1`.
/// The file lives under `src/` so that the symbol carries a non-empty
/// module prefix (per the original note, produced by
/// `module_path_for_tsjs`), exercising the prefix wiring end-to-end.
#[test]
fn typescript_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    let src_dir = env.workspace_root.join("src");
    std::fs::create_dir_all(&src_dir).unwrap();
    let ts_file = src_dir.join("Foo.ts");
    std::fs::write(
        ts_file,
        "export class Foo {\n bar(): number { return 42; }\n}\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert!(report.new >= 1, "ts file ingested: {report:?}");

    let ts_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|item| item.doc_path.0.ends_with("Foo.ts"))
        .expect("Foo.ts item");
    let parser = ts_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser,
        Some("code-ts-v1"),
        "parser_version must be code-ts-v1"
    );
    let chunker = ts_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker,
        Some("code-ts-ast-v1"),
        "chunker_version must be code-ts-ast-v1"
    );

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("bar"))
        .expect("search must succeed");
    let hit = hits
        .iter()
        .find(|candidate| matches!(&candidate.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'bar'");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &hit.citation
    else {
        unreachable!()
    };
    assert_eq!(
        lang.as_deref(),
        Some("typescript"),
        "citation.lang must be 'typescript'"
    );
    assert_eq!(
        symbol.as_deref(),
        Some("src/Foo.Foo.bar"),
        "citation.symbol must be 'src/Foo.Foo.bar'"
    );
    assert!(*line_start >= 1, "line_start must be >=1");

    assert_eq!(
        hit.code_lang.as_deref(),
        Some("typescript"),
        "SearchHit.code_lang must be 'typescript'"
    );
}
/// p10-1b Task L: end-to-end check of the JavaScript ingest path.
///
/// Writes `src/Bar.js` containing class `Bar` with method `baz`, ingests
/// the workspace, and searches lexically for "baz". The first
/// `Citation::Code` hit must carry `lang="javascript"`,
/// `symbol="src/Bar.Bar.baz"`, and `line_start >= 1`.
/// The file lives under `src/` so that the symbol carries a non-empty
/// module prefix (per the original note, produced by
/// `module_path_for_tsjs`), exercising the prefix wiring end-to-end.
#[test]
fn javascript_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    let src_dir = env.workspace_root.join("src");
    std::fs::create_dir_all(&src_dir).unwrap();
    let js_file = src_dir.join("Bar.js");
    std::fs::write(js_file, "export class Bar {\n baz() { return 7; }\n}\n").unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert!(report.new >= 1, "js file ingested: {report:?}");

    let js_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|item| item.doc_path.0.ends_with("Bar.js"))
        .expect("Bar.js item");
    let parser = js_item.parser_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        parser,
        Some("code-js-v1"),
        "parser_version must be code-js-v1"
    );
    let chunker = js_item.chunker_version.as_ref().map(|v| v.0.as_str());
    assert_eq!(
        chunker,
        Some("code-js-ast-v1"),
        "chunker_version must be code-js-ast-v1"
    );

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("baz"))
        .expect("search must succeed");
    let hit = hits
        .iter()
        .find(|candidate| matches!(&candidate.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'baz'");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &hit.citation
    else {
        unreachable!()
    };
    assert_eq!(
        lang.as_deref(),
        Some("javascript"),
        "citation.lang must be 'javascript'"
    );
    assert_eq!(
        symbol.as_deref(),
        Some("src/Bar.Bar.baz"),
        "citation.symbol must be 'src/Bar.Bar.baz'"
    );
    assert!(*line_start >= 1, "line_start must be >=1");

    assert_eq!(
        hit.code_lang.as_deref(),
        Some("javascript"),
        "SearchHit.code_lang must be 'javascript'"
    );
}
/// p10-1c-go Task F: a `.go` file in a sub-directory is ingested and the
/// resulting `Citation::Code` hit must carry `lang="go"`,
/// `symbol="chunk.ParseDoc"`, and `line_start >= 1`.
/// The sub-directory (`chunk/`) ensures the Go package-prefix wiring
/// produces a non-empty module prefix so the fully-qualified symbol assertion
/// exercises that path end-to-end.
///
/// The ingest report item for `ast.go` must also carry the `code-go-v1`
/// parser version and the `code-go-ast-v1` chunker version.
#[test]
fn go_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    let pkg_dir = env.workspace_root.join("chunk");
    std::fs::create_dir_all(&pkg_dir).unwrap();
    std::fs::write(
        pkg_dir.join("ast.go"),
        "package chunk\n\nfunc ParseDoc(input string) string {\n return input\n}\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    // Dump the whole report on failure so the offending item is visible.
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
    assert!(report.new >= 1, "go file ingested: {report:?}");

    let go_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|i| i.doc_path.0.ends_with("ast.go"))
        .expect("ast.go item present");
    assert_eq!(
        go_item.parser_version.as_ref().map(|p| p.0.as_str()),
        Some("code-go-v1"),
        "parser_version must be code-go-v1"
    );
    assert_eq!(
        go_item.chunker_version.as_ref().map(|c| c.0.as_str()),
        Some("code-go-ast-v1"),
        "chunker_version must be code-go-ast-v1"
    );

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("ParseDoc"))
        .expect("search must succeed");
    let h = hits
        .iter()
        .find(|h| matches!(&h.citation, kebab_core::Citation::Code { .. }))
        .expect("Citation::Code hit");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let kebab_core::Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &h.citation
    else {
        unreachable!()
    };
    assert_eq!(lang.as_deref(), Some("go"), "citation.lang must be 'go'");
    assert_eq!(
        symbol.as_deref(),
        Some("chunk.ParseDoc"),
        "citation.symbol must be 'chunk.ParseDoc'"
    );
    assert!(*line_start >= 1, "line_start must be >=1");

    assert_eq!(
        h.code_lang.as_deref(),
        Some("go"),
        "SearchHit.code_lang must be 'go'"
    );
}
/// p10-1c-jk Task F: a `.java` file in a package directory is ingested and the
/// resulting `Citation::Code` hit must carry `lang="java"`,
/// `symbol="com.foo.Foo.bar"`, and `line_start >= 1`.
/// The sub-directory (`com/foo/`) ensures the Java package-prefix wiring
/// produces a non-empty module prefix so the fully-qualified symbol assertion
/// exercises that path end-to-end.
///
/// The ingest report item for `Foo.java` must also carry the `code-java-v1`
/// parser version and the `code-java-ast-v1` chunker version.
#[test]
fn java_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    let pkg_dir = env.workspace_root.join("com").join("foo");
    std::fs::create_dir_all(&pkg_dir).unwrap();
    std::fs::write(
        pkg_dir.join("Foo.java"),
        "package com.foo;\n\npublic class Foo {\n public String bar() { return \"x\"; }\n}\n",
    )
    .unwrap();

    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    // Dump the whole report on failure so the offending item is visible.
    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
    assert!(report.new >= 1, "java file ingested: {report:?}");

    let java_item = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|i| i.doc_path.0.ends_with("Foo.java"))
        .expect("Foo.java item present");
    assert_eq!(
        java_item.parser_version.as_ref().map(|p| p.0.as_str()),
        Some("code-java-v1"),
        "parser_version must be code-java-v1"
    );
    assert_eq!(
        java_item.chunker_version.as_ref().map(|c| c.0.as_str()),
        Some("code-java-ast-v1"),
        "chunker_version must be code-java-ast-v1"
    );

    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("bar"))
        .expect("search must succeed");
    let h = hits
        .iter()
        .find(|h| matches!(&h.citation, kebab_core::Citation::Code { .. }))
        .expect("Citation::Code hit");

    // The `find` above only yields `Citation::Code`, so the else arm cannot run.
    let kebab_core::Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &h.citation
    else {
        unreachable!()
    };
    assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'");
    assert_eq!(
        symbol.as_deref(),
        Some("com.foo.Foo.bar"),
        "citation.symbol must be 'com.foo.Foo.bar'"
    );
    assert!(*line_start >= 1, "line_start must be >=1");

    assert_eq!(
        h.code_lang.as_deref(),
        Some("java"),
        "SearchHit.code_lang must be 'java'"
    );
}
/// Ingesting the same unmodified `.rs` file twice: the first pass must
/// report the document as `New`, the second as `Unchanged`, and both
/// passes must report the same `doc_id` (incremental-skip path exercised).
#[test]
fn rust_file_re_ingest_is_unchanged() {
    let env = TestEnv::lexical_only();

    let source_path = env.workspace_root.join("stable.rs");
    std::fs::write(source_path, "pub fn noop() {}\n").unwrap();

    // Both passes issue the exact same ingest call over the same workspace.
    let run_ingest =
        || kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();

    // First pass: the file has never been seen, so it must be `New`.
    let first_report = run_ingest();
    let first_item = first_report
        .items
        .as_ref()
        .unwrap()
        .iter()
        .find(|entry| entry.doc_path.0.ends_with("stable.rs"))
        .cloned()
        .unwrap();
    assert_eq!(first_item.kind, IngestItemKind::New);

    // Second pass: nothing on disk changed between the two calls.
    let second_report = run_ingest();
    let second_item = second_report
        .items
        .unwrap()
        .into_iter()
        .find(|entry| entry.doc_path.0.ends_with("stable.rs"))
        .unwrap();
    assert_eq!(
        second_item.kind,
        IngestItemKind::Unchanged,
        "identical bytes → Unchanged"
    );
    assert_eq!(second_item.doc_id, first_item.doc_id);
}