From ebc4ef2eeab4894d6f7b751182aaddba8f81b58e Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 10:44:05 +0000 Subject: [PATCH] feat(p10-1c-jk): activate Java in ingest_one_code_asset dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces Java bail! arms with JavaAstExtractor + CodeJavaAstV1Chunker. Adds java_file_ingests_and_searches_as_code_citation integration test — asserts citation.lang=java, symbol=com.foo.Foo.bar, code_lang=java. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 16 +++-- crates/kebab-app/tests/code_ingest_smoke.rs | 71 +++++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index a260f6e..de043bc 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJsAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::{GoAstExtractor, JavascriptAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; +use kebab_parse_code::{GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -1829,7 +1829,7 @@ fn ingest_one_code_asset( "typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()), "javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()), "go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()), - "java" => anyhow::bail!("java ingest not yet wired (p10-1c-jk Task F)"), + "java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()), "kotlin" => anyhow::bail!("kotlin ingest not yet wired (p10-1c-jk Task I)"), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1841,7 +1841,7 @@ fn ingest_one_code_asset( "typescript" => CodeTsAstV1Chunker.chunker_version(), "javascript" => CodeJsAstV1Chunker.chunker_version(), "go" => CodeGoAstV1Chunker.chunker_version(), - "java" => anyhow::bail!("java ingest not yet wired (p10-1c-jk Task F)"), + "java" => CodeJavaAstV1Chunker.chunker_version(), "kotlin" => anyhow::bail!("kotlin ingest not yet wired (p10-1c-jk Task I)"), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1884,7 +1884,9 @@ fn ingest_one_code_asset( "go" => GoAstExtractor::new() .extract(&ctx, &bytes) .context("kb-parse-code::GoAstExtractor::extract (code:go)")?, - "java" => anyhow::bail!("java ingest not yet wired (p10-1c-jk Task F)"), + "java" => JavaAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::JavaAstExtractor::extract (code:java)")?, "kotlin" => anyhow::bail!("kotlin ingest not yet wired (p10-1c-jk Task I)"), other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1906,7 +1908,9 @@ fn ingest_one_code_asset( "go" => CodeGoAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeGoAstV1Chunker::chunk (code:go)")?, - "java" => anyhow::bail!("java ingest not yet wired (p10-1c-jk Task F)"), + "java" => CodeJavaAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)")?, "kotlin" => anyhow::bail!("kotlin ingest not yet wired (p10-1c-jk Task I)"), other => anyhow::bail!("unreachable (chunk): {other}"), }; diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index ee852f2..e484885 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -461,6 +461,77 @@ fn go_file_ingests_and_searches_as_code_citation() { ); } +/// p10-1c-jk Task F: a `.java` file in a package directory is ingested and the +/// resulting `Citation::Code` hit must carry `lang="java"`, +/// `symbol="com.foo.Foo.bar"`, and `line_start >= 1`. +/// The sub-directory (`com/foo/`) ensures the Java package-prefix wiring +/// produces a non-empty module prefix so the fully-qualified symbol assertion +/// exercises that path end-to-end. +#[test] +fn java_file_ingests_and_searches_as_code_citation() { + let env = TestEnv::lexical_only(); + + let pkg_dir = env.workspace_root.join("com").join("foo"); + std::fs::create_dir_all(&pkg_dir).unwrap(); + std::fs::write( + pkg_dir.join("Foo.java"), + "package com.foo;\n\npublic class Foo {\n public String bar() { return \"x\"; }\n}\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0); + assert!(report.new >= 1); + + let java_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("Foo.java")) + .expect("Foo.java item present"); + assert_eq!( + java_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-java-v1"), + "parser_version must be code-java-v1" + ); + assert_eq!( + java_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-java-ast-v1"), + "chunker_version must be code-java-ast-v1" + ); + + let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("bar")) + .expect("search must succeed"); + let h = hits + .iter() + .find(|h| matches!(&h.citation, kebab_core::Citation::Code { .. })) + .expect("Citation::Code hit"); + match &h.citation { + kebab_core::Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'"); + assert_eq!( + symbol.as_deref(), + Some("com.foo.Foo.bar"), + "citation.symbol must be 'com.foo.Foo.bar'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + assert_eq!( + h.code_lang.as_deref(), + Some("java"), + "SearchHit.code_lang must be 'java'" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test]