feat(p10-1b): activate Python in ingest_one_code_asset dispatch
Replaces the Python bail!() arms with PythonAstExtractor + CodePythonAstV1Chunker. Adds the python_file_ingests_and_searches_as_code_citation integration test, which asserts citation.lang=python, citation.symbol=kebab_eval.metrics.compute_mrr, and code_lang=python. The TS/JS arms remain bail!() (Tasks J/L).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -39,7 +39,7 @@ use std::sync::Arc;
|
||||
use anyhow::{Context, anyhow};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_chunk::{CodeRustAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker};
|
||||
use kebab_chunk::{CodePythonAstV1Chunker, CodeRustAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker};
|
||||
use kebab_core::{
|
||||
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
||||
@@ -50,7 +50,7 @@ use kebab_core::{
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_normalize::build_canonical_document;
|
||||
use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
|
||||
use kebab_parse_code::RustAstExtractor;
|
||||
use kebab_parse_code::{PythonAstExtractor, RustAstExtractor};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
@@ -1677,19 +1677,20 @@ fn ingest_one_code_asset(
|
||||
}
|
||||
};
|
||||
|
||||
// p10-1b Task D: parser_version per-lang.
|
||||
// p10-1b Task D/G: parser_version per-lang.
|
||||
let parser_version = match code_lang {
|
||||
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
|
||||
"python" => anyhow::bail!("python ingest not yet wired (p10-1b Task G)"),
|
||||
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
|
||||
"python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
|
||||
"typescript" => anyhow::bail!("typescript ingest not yet wired (p10-1b Task J)"),
|
||||
"javascript" => anyhow::bail!("javascript ingest not yet wired (p10-1b Task L)"),
|
||||
other => anyhow::bail!("unsupported code_lang: {other}"),
|
||||
};
|
||||
|
||||
// p10-1b Task D: chunker_version per-lang (Python/TS/JS are unreachable here;
|
||||
// they bail above and get real chunkers in Tasks G/J/L).
|
||||
// p10-1b Task D/G: chunker_version per-lang (TS/JS are unreachable here;
|
||||
// they bail above and get real chunkers in Tasks J/L).
|
||||
let chunker_version = match code_lang {
|
||||
"rust" => CodeRustAstV1Chunker.chunker_version(),
|
||||
"rust" => CodeRustAstV1Chunker.chunker_version(),
|
||||
"python" => CodePythonAstV1Chunker.chunker_version(),
|
||||
other => anyhow::bail!("unreachable chunker_version: {other}"),
|
||||
};
|
||||
|
||||
@@ -1714,19 +1715,25 @@ fn ingest_one_code_asset(
|
||||
config: &extract_config,
|
||||
};
|
||||
|
||||
// p10-1b Task D: extractor per-lang.
|
||||
// p10-1b Task D/G: extractor per-lang.
|
||||
let mut canonical = match code_lang {
|
||||
"rust" => RustAstExtractor::new()
|
||||
.extract(&ctx, &bytes)
|
||||
.context("kb-parse-code::RustAstExtractor::extract (code:rust)")?,
|
||||
"python" => PythonAstExtractor::new()
|
||||
.extract(&ctx, &bytes)
|
||||
.context("kb-parse-code::PythonAstExtractor::extract (code:python)")?,
|
||||
other => anyhow::bail!("unreachable (extract): {other}"),
|
||||
};
|
||||
|
||||
// p10-1b Task D: chunker per-lang.
|
||||
// p10-1b Task D/G: chunker per-lang.
|
||||
let chunks = match code_lang {
|
||||
"rust" => CodeRustAstV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?,
|
||||
"python" => CodePythonAstV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)")?,
|
||||
other => anyhow::bail!("unreachable (chunk): {other}"),
|
||||
};
|
||||
|
||||
|
||||
@@ -159,6 +159,83 @@ fn rust_code_search_hit_has_repo() {
|
||||
);
|
||||
}
|
||||
|
||||
/// p10-1b Task G: end-to-end check that a `.py` file is ingested and then
/// surfaces through lexical search as a `Citation::Code` hit.
///
/// The fixture is written to `kebab_eval/metrics.py` below the workspace
/// root. Placing it in a sub-directory is deliberate: it makes
/// `module_path_for_python` yield a non-empty prefix, so the assertion on
/// the fully-qualified symbol `kebab_eval.metrics.compute_mrr` exercises
/// the prefix wiring rather than only the bare function name.
///
/// Asserted along the way:
/// - the ingest report item for `metrics.py` records
///   `parser_version = "code-python-v1"` and
///   `chunker_version = "code-python-ast-v1"`;
/// - the citation carries `lang = "python"`, the qualified symbol, and
///   `line_start >= 1`;
/// - the search hit itself reports `code_lang = "python"`.
#[test]
fn python_file_ingests_and_searches_as_code_citation() {
    let env = TestEnv::lexical_only();

    // Fixture source: a module docstring followed by one function to index.
    let fixture_source = "\"\"\"compute metrics.\"\"\"\ndef compute_mrr(scores):\n return sum(scores) / max(len(scores), 1)\n";
    let package_dir = env.workspace_root.join("kebab_eval");
    std::fs::create_dir_all(&package_dir).unwrap();
    std::fs::write(package_dir.join("metrics.py"), fixture_source).unwrap();

    // Ingest the workspace and confirm the Python file was picked up.
    let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("ingest must succeed");
    assert!(report.new >= 1, "python file ingested: {report:?}");

    let metrics_entry = report
        .items
        .as_ref()
        .expect("items present")
        .iter()
        .find(|entry| entry.doc_path.0.ends_with("metrics.py"))
        .expect("metrics.py item");

    // The per-language parser/chunker versions must be the Python ones.
    let recorded_parser = metrics_entry.parser_version.as_ref().map(|p| p.0.as_str());
    assert_eq!(
        recorded_parser,
        Some("code-python-v1"),
        "parser_version must be code-python-v1"
    );
    let recorded_chunker = metrics_entry.chunker_version.as_ref().map(|c| c.0.as_str());
    assert_eq!(
        recorded_chunker,
        Some("code-python-ast-v1"),
        "chunker_version must be code-python-ast-v1"
    );

    // Search for the function name and take the first code citation.
    let results = kebab_app::search_with_config(env.config.clone(), lexical_query("compute_mrr"))
        .expect("search must succeed");
    let code_hit = results
        .iter()
        .find(|candidate| matches!(&candidate.citation, Citation::Code { .. }))
        .expect("at least one Citation::Code hit for 'compute_mrr'");

    if let Citation::Code {
        lang,
        symbol,
        line_start,
        ..
    } = &code_hit.citation
    {
        assert_eq!(
            lang.as_deref(),
            Some("python"),
            "citation.lang must be 'python'"
        );
        assert_eq!(
            symbol.as_deref(),
            Some("kebab_eval.metrics.compute_mrr"),
            "citation.symbol must be 'kebab_eval.metrics.compute_mrr'"
        );
        assert!(*line_start >= 1, "line_start must be >=1");
    } else {
        // The `find` above only yields `Citation::Code` hits.
        unreachable!();
    }

    assert_eq!(
        code_hit.code_lang.as_deref(),
        Some("python"),
        "SearchHit.code_lang must be 'python'"
    );
}
|
||||
|
||||
/// Re-ingesting the same `.rs` file without changes must report
|
||||
/// `Unchanged` (incremental-skip path exercised).
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user