From b5d1fe8c1eaa07772cc30be5c007477798f2a9ec Mon Sep 17 00:00:00 2001 From: altair823 Date: Tue, 19 May 2026 21:13:01 +0000 Subject: [PATCH] feat(p10-1a-2): backfill SearchHit.repo from doc metadata (Task 8b) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/app.rs | 51 +++++++++++++++++- crates/kebab-app/tests/code_ingest_smoke.rs | 58 +++++++++++++++++++++ 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 2202cd4..792016c 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -40,8 +40,8 @@ use anyhow::{Context, Result, anyhow}; use lru::LruCache; use kebab_core::{ - Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode, - SearchOpts, SearchQuery, VectorStore, + Answer, DocumentStore, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, + SearchMode, SearchOpts, SearchQuery, VectorStore, }; use kebab_embed_local::FastembedEmbedder; use kebab_llm_local::OllamaLanguageModel; @@ -301,6 +301,10 @@ impl App { // `code_lang: None`; we own the post-processing here in kebab-app // and can fill it cheaply from data already present in the hit. backfill_code_lang(&mut hits); + // p10-1A-2 Task 8b: backfill `repo` from the document's + // `Metadata.repo`. Unlike `code_lang`, this cannot be derived from + // the Citation alone — it requires a store lookup by `doc_id`. + self.backfill_repo(&mut hits); Ok(hits) } @@ -394,6 +398,8 @@ impl App { ); // p10-1A-2: backfill code_lang — same as search_uncached. backfill_code_lang(&mut traced_hits); + // p10-1A-2 Task 8b: backfill repo — same as search_uncached. + self.backfill_repo(&mut traced_hits); // Apply offset + k_effective truncation (mirrors non-trace path). 
let drop_n = offset.min(traced_hits.len());
@@ -784,6 +790,47 @@ impl App {
         }
     }
 
+    /// p10-1A-2 Task 8b: back-fill `SearchHit.repo` from the originating
+    /// document's `Metadata.repo` for every hit whose `repo` field is
+    /// currently `None`. The search layer (kebab-search) constructs hits
+    /// with `repo: None` because it has no store access; we fill it here
+    /// in kebab-app post-retrieval via a per-distinct-`doc_id` store lookup.
+    ///
+    /// Deduplication: a small `HashMap<DocumentId, Option<String>>` accumulates the
+    /// `(doc_id → Option<String>)` mapping so each unique document is
+    /// fetched at most once. Search result sets are small (default k ≤ 20),
+    /// so the map overhead is negligible. A `None` entry is cached too
+    /// (lookup failed, document not found, or no repo in metadata) to avoid re-querying.
+    ///
+    /// Non-repo documents (markdown, PDF, plain text, code files outside a
+    /// git tree) correctly keep `repo: None` — `Metadata.repo` is already
+    /// `None` for those, so the assignment is a no-op.
+    fn backfill_repo(&self, hits: &mut [SearchHit]) {
+        use std::collections::HashMap;
+        use kebab_core::DocumentId;
+
+        // doc_id → Option<String> where None means "lookup failed / not found / no repo"
+        let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
+
+        for hit in hits.iter_mut() {
+            if hit.repo.is_some() {
+                continue;
+            }
+            let repo_val = cache
+                .entry(hit.doc_id.clone())
+                .or_insert_with(|| {
+                    self.sqlite
+                        .get_document(&hit.doc_id)
+                        .ok()
+                        .flatten()
+                        .and_then(|doc| doc.metadata.repo)
+                });
+            if let Some(r) = repo_val {
+                hit.repo = Some(r.clone());
+            }
+        }
+    }
+
     /// Resolve the embedder + vector store, surfacing the user-friendly
     /// "switch to --mode lexical" error when embeddings are disabled.
fn require_embeddings(
diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs
index baf0ae4..d6611f1 100644
--- a/crates/kebab-app/tests/code_ingest_smoke.rs
+++ b/crates/kebab-app/tests/code_ingest_smoke.rs
@@ -101,6 +101,64 @@ fn rust_file_ingests_and_searches_as_code_citation() {
     );
 }
 
+/// p10-1A-2 Task 8b: a code search hit must carry `SearchHit.repo` filled
+/// from the document's `Metadata.repo` (which is set by `detect_repo` during
+/// ingest). `detect_repo` returns the name of the directory that contains
+/// `.git/`, so we `git init` the workspace root before ingesting and then
+/// assert that `h.repo` equals the workspace root's directory name.
+#[test]
+fn rust_code_search_hit_has_repo() {
+    let env = TestEnv::lexical_only();
+
+    // `detect_repo` walks up from the file looking for `.git/`.
+    // Initialise a (non-bare) git repo at the workspace root so it is
+    // discoverable. We only need the `.git/` directory — no commits
+    // required.
+    let git_status = std::process::Command::new("git")
+        .args(["init", "--quiet"])
+        .arg(env.workspace_root.as_os_str())
+        .status()
+        .expect("git init");
+    assert!(git_status.success(), "git init must succeed");
+
+    std::fs::write(
+        env.workspace_root.join("repo_demo.rs"),
+        "/// multiplies two integers\npub fn mul(a: i32, b: i32) -> i32 {\n a * b\n}\n",
+    )
+    .unwrap();
+
+    let report =
+        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
+            .expect("ingest must succeed");
+    assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
+
+    let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul"))
+        .expect("search must succeed");
+
+    let h = hits
+        .iter()
+        .find(|h| matches!(&h.citation, Citation::Code { .. }))
+        .expect("at least one Citation::Code hit for 'mul'");
+
+    // Derive the expected repo name from the workspace path instead of hard-coding it.
+    let expected_repo = env
+        .workspace_root
+        .file_name()
+        .and_then(|n| n.to_str())
+        .map(str::to_owned);
+    assert_eq!(
+        h.repo,
+        expected_repo,
+        "SearchHit.repo must match the workspace dir name (detect_repo result)"
+    );
+    // Also sanity-check code_lang is still filled.
+    assert_eq!(
+        h.code_lang.as_deref(),
+        Some("rust"),
+        "SearchHit.code_lang must be 'rust'"
+    );
+}
+
 /// Re-ingesting the same `.rs` file without changes must report
 /// `Unchanged` (incremental-skip path exercised).
 #[test]