diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 17bc1d9..b4fb0c1 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeRustAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodePythonAstV1Chunker, CodeRustAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::RustAstExtractor; +use kebab_parse_code::{PythonAstExtractor, RustAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -1677,19 +1677,20 @@ fn ingest_one_code_asset( } }; - // p10-1b Task D: parser_version per-lang. + // p10-1b Task D/G: parser_version per-lang. let parser_version = match code_lang { - "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), - "python" => anyhow::bail!("python ingest not yet wired (p10-1b Task G)"), + "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), + "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()), "typescript" => anyhow::bail!("typescript ingest not yet wired (p10-1b Task J)"), "javascript" => anyhow::bail!("javascript ingest not yet wired (p10-1b Task L)"), other => anyhow::bail!("unsupported code_lang: {other}"), }; - // p10-1b Task D: chunker_version per-lang (Python/TS/JS are unreachable here; - // they bail above and get real chunkers in Tasks G/J/L). + // p10-1b Task D/G: chunker_version per-lang (TS/JS are unreachable here; + // they bail above and get real chunkers in Tasks J/L). let chunker_version = match code_lang { - "rust" => CodeRustAstV1Chunker.chunker_version(), + "rust" => CodeRustAstV1Chunker.chunker_version(), + "python" => CodePythonAstV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1714,19 +1715,25 @@ fn ingest_one_code_asset( config: &extract_config, }; - // p10-1b Task D: extractor per-lang. + // p10-1b Task D/G: extractor per-lang. let mut canonical = match code_lang { "rust" => RustAstExtractor::new() .extract(&ctx, &bytes) .context("kb-parse-code::RustAstExtractor::extract (code:rust)")?, + "python" => PythonAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::PythonAstExtractor::extract (code:python)")?, other => anyhow::bail!("unreachable (extract): {other}"), }; - // p10-1b Task D: chunker per-lang. + // p10-1b Task D/G: chunker per-lang. let chunks = match code_lang { "rust" => CodeRustAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?, + "python" => CodePythonAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)")?, other => anyhow::bail!("unreachable (chunk): {other}"), }; diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index d6611f1..e84a8e4 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -159,6 +159,83 @@ fn rust_code_search_hit_has_repo() { ); } +/// p10-1b Task G: a `.py` file in a sub-directory is ingested and the +/// resulting `Citation::Code` hit must carry `lang="python"`, +/// `symbol="kebab_eval.metrics.compute_mrr"`, and `line_start >= 1`. +/// The sub-directory (`kebab_eval/`) ensures `module_path_for_python` +/// produces a non-empty prefix so the fully-qualified symbol assertion +/// exercises the prefix wiring end-to-end. +#[test] +fn python_file_ingests_and_searches_as_code_citation() { + let env = TestEnv::lexical_only(); + + let module_dir = env.workspace_root.join("kebab_eval"); + std::fs::create_dir_all(&module_dir).unwrap(); + std::fs::write( + module_dir.join("metrics.py"), + "\"\"\"compute metrics.\"\"\"\ndef compute_mrr(scores):\n return sum(scores) / max(len(scores), 1)\n", + ) + .unwrap(); + + let report = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + + assert!(report.new >= 1, "python file ingested: {report:?}"); + + let items = report.items.as_ref().expect("items present"); + let py_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("metrics.py")) + .expect("metrics.py item"); + assert_eq!( + py_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-python-v1"), + "parser_version must be code-python-v1" + ); + assert_eq!( + py_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-python-ast-v1"), + "chunker_version must be code-python-ast-v1" + ); + + let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("compute_mrr")) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'compute_mrr'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("python"), + "citation.lang must be 'python'" + ); + assert_eq!( + symbol.as_deref(), + Some("kebab_eval.metrics.compute_mrr"), + "citation.symbol must be 'kebab_eval.metrics.compute_mrr'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("python"), + "SearchHit.code_lang must be 'python'" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test]