diff --git a/HANDOFF.md b/HANDOFF.md index f9cee6b..c2c4b57 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -47,6 +47,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. - **2026-05-02 P9 도그푸딩 후속 (spec PR #59 + p9-fb-15)** — RAG multi-turn 도입. frozen design §3.8 갱신 — `Answer` 에 `conversation_id` / `turn_index` optional field, 신규 `Turn` struct, `RefusalReason::LlmStreamAborted` variant. `kebab-rag::AskOpts` 에 `history: Vec` / `conversation_id` / `turn_index` 3 field 추가, 기존 caller 는 `Vec::new() / None` (single-shot 동작 동일). `RagPipeline::ask_with_history(query, history, conversation_id, turn_index, opts)` helper. prompt 빌드: `[이전 대화]` 블록을 user prompt 위에 prepend, newest-first, char budget (`cfg.rag.max_context_tokens * 4`) 안에서 oldest 부터 drop. retrieval query expansion: 직전 answer 첫 200 자 concat. wire schema `answer.v1` 에 두 필드 + `format: date-time` 추가. p9-fb-16 (TUI conversation UI) + p9-fb-17/18 (V004 storage + CLI session) 가 같은 facade 위에 build. spec: `tasks/p9/p9-fb-15-rag-multi-turn-core.md`. - **2026-05-02 P9 도그푸딩 후속 (p9-fb-16)** — TUI Ask conversation UI. `AskState` 가 `turns: Vec` + `current_question` + `conversation_id` + `last_answer` 로 재설계. answer area 가 transcript (`Q1/A1`, `Q2/A2`, ...) 로 갈음, 매 Enter 가 이전 turns 를 `history` 로 worker 에 전달 (`ask_with_history`). conversation_id 는 첫 submit 시 timestamp-based 자동 생성 (`conv_`). `Ctrl-L` 가 turns + conversation_id 초기화 (in-flight worker 는 그대로 finish, 결과는 새 conversation 의 stale turn 으로 silently 폐기). spec: `tasks/p9/p9-fb-16-tui-ask-conversation.md`. - **2026-05-03 P9 도그푸딩 후속 (p9-fb-20)** — `kebab ask` 의 CLI citation block. 답변 출력 후 `근거:` 절 — `[N] # (score=)` 한 줄씩. `--show-citations` (default ON) / `--hide-citations` (pipe 시 답변 본문만) flag. `--json` 모드는 무영향 (citations 가 항상 wire payload 에 포함). spec p9-fb-20 의 \"TUI citation pane + jump\" 부분은 P9-3 의 기존 `render_citations_or_explain` 가 일부 cover — 추가 기능 (turn 별 fold + Enter/o jump + i inspect) 은 후속 task 로 미룸 (사용자 도그푸딩 priority 5위 의 핵심 = full path 가독성 = CLI block 으로 충족). 
spec: `tasks/p9/p9-fb-20-citation-surface.md`. +- **2026-05-03 P9 도그푸딩 후속 (p9-fb-07)** — Markdown title fallback chain. `kebab-normalize::derive_title(frontmatter_title, &[Block], file_stem)` — 1) frontmatter title → 2) 첫 H1 → 3) 첫 H2 → 4) 첫 paragraph 80 chars → 5) 파일 stem (모든 단계 NFC 정규화, 빈 문자열 절대 반환 안 함, 마지막 sentinel `"untitled"`). `build_canonical_document` 가 lift 후 helper 호출. parser_version 상수 `pulldown-cmark-0.x` → `md-frontmatter-v2` bump — 기존 doc 은 `doc_id` 가 갱신되므로 다음 ingest 가 자동 재처리 (idempotent upsert, design §9 cascade). spec: `tasks/p9/p9-fb-07-md-title-fallback.md`. ## 다음 task 후보 diff --git a/README.md b/README.md index 249f917..459daeb 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ kebab doctor | 명령 | 동작 | |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit | +| `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신 | | `kebab search --mode {lexical,vector,hybrid} ""` | 검색. 
hybrid는 RRF fusion, citation 포함 | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 5f28bee..bf29d1f 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -69,7 +69,15 @@ pub use reset::{ResetReport, ResetScope}; /// Kept in lock-step with the literal used in the `kb-store-sqlite` /// idempotency / round-trip tests so the version label written by the /// app and the one used in cross-crate fixtures match. -const KEBAB_PARSE_MD_VERSION: &str = "pulldown-cmark-0.x"; +/// +/// p9-fb-07 bumped this from `pulldown-cmark-0.x` to `md-frontmatter-v2` +/// because `kebab-normalize::derive_title` now applies a fallback chain +/// (frontmatter → H1 → H2 → first paragraph → file stem) when the +/// frontmatter title is blank. The bump invalidates `doc_id` for every +/// pre-existing Markdown document, so a re-ingest is required for the +/// new titles to land — this is the documented cascade behavior per +/// design §9. +const KEBAB_PARSE_MD_VERSION: &str = "md-frontmatter-v2"; /// Caller-supplied knobs for one [`ask`] invocation. /// diff --git a/crates/kebab-normalize/src/lib.rs b/crates/kebab-normalize/src/lib.rs index b65cb18..4d6b095 100644 --- a/crates/kebab-normalize/src/lib.rs +++ b/crates/kebab-normalize/src/lib.rs @@ -18,6 +18,7 @@ //! the shared `kb-parse-types` crate. use std::collections::HashMap; +use std::path::Path; use anyhow::Result; use kebab_core::{ @@ -95,6 +96,15 @@ pub fn build_canonical_document( .filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings)) .collect(); + // p9-fb-07: title fallback chain. `title` so far holds the + // frontmatter `title` (step 1). If empty / whitespace, walk the + // lifted blocks for an H1 → H2 → first paragraph excerpt → file + // stem. 
NFC-normalize the chosen string so the on-wire title is + // canonically equivalent to whatever the user stored, regardless + // of source NFD/NFC form. + let file_stem = workspace_path_stem(&asset.workspace_path.0); + let title = derive_title(&title, &lifted_blocks, &file_stem); + tracing::debug!( target: "kebab-normalize", "built canonical document doc_id={} blocks={}", @@ -326,6 +336,90 @@ fn flatten_inline(i: &Inline, out: &mut String) { } } +/// p9-fb-07: derive a usable title from the frontmatter, lifted blocks, +/// and the source filename, using a documented fallback chain. +/// +/// Priority (first non-blank wins): +/// +/// 1. `frontmatter_title` — verbatim, after trimming whitespace. +/// 2. First `Heading` block at level 1 with non-blank text. +/// 3. First `Heading` block at level 2 with non-blank text. +/// 4. First `Paragraph` block (NOT `Quote`, `List`, `Code`, `Table`, +/// `ImageRef`, `AudioRef`) with non-blank text — first 80 chars. +/// 5. `file_stem` (filename minus extension — returned verbatim, no +/// case transformation; whatever the on-disk filename is becomes +/// the title text). +/// +/// The chosen string is NFC-normalized so the on-wire title is +/// canonically equivalent to the source content. Never returns an +/// empty string — if every step is blank (e.g. an empty file), the +/// `file_stem` fallback ensures a non-empty result. If `file_stem` is +/// also blank (pathological), returns `"untitled"` as a last resort. 
+pub fn derive_title(frontmatter_title: &str, blocks: &[Block], file_stem: &str) -> String { + let trimmed = frontmatter_title.trim(); + if !trimmed.is_empty() { + return trimmed.nfc().collect(); + } + if let Some(text) = first_heading_text(blocks, 1) { + return text; + } + if let Some(text) = first_heading_text(blocks, 2) { + return text; + } + if let Some(excerpt) = first_paragraph_excerpt(blocks, 80) { + return excerpt; + } + // `file_stem` originates from `WorkspacePath`, which `to_posix` + // already NFC-normalizes (§6.6). No second NFC pass needed — pass + // through verbatim after a defensive `trim`. + let stem = file_stem.trim(); + if !stem.is_empty() { + return stem.to_string(); + } + "untitled".to_string() +} + +fn first_heading_text(blocks: &[Block], level: u8) -> Option<String> { + blocks.iter().find_map(|b| match b { + Block::Heading(h) if h.level == level => { + let trimmed = h.text.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.nfc().collect()) + } + } + _ => None, + }) +} + +fn first_paragraph_excerpt(blocks: &[Block], max_chars: usize) -> Option<String> { + blocks.iter().find_map(|b| match b { + Block::Paragraph(t) => { + let trimmed = t.text.trim(); + if trimmed.is_empty() { + None + } else { + let nfc: String = trimmed.nfc().collect(); + Some(nfc.chars().take(max_chars).collect()) + } + } + _ => None, + }) +} + +/// Extract the filename stem (no extension) from a workspace path +/// string. Returns the empty string if no filename can be derived +/// (e.g. trailing slash). Multi-extension cases (`foo.tar.gz`) follow +/// `Path::file_stem` semantics — only the last extension is stripped. 
+fn workspace_path_stem(workspace_path: &str) -> String { + Path::new(workspace_path) + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_string() +} + #[cfg(test)] mod tests { use super::*; @@ -796,11 +890,12 @@ mod tests { assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal"); } - /// M7 — `metadata.user["title"] = ""` is stringy and lifts to an - /// empty `CanonicalDocument.title`. This pins the policy: an - /// explicit empty string is *not* dropped, it's lifted as-is. + /// M7 (revised by p9-fb-07) — `metadata.user["title"] = ""` lifts + /// as an empty string but the new derive_title fallback chain + /// promotes the file stem so the resulting title is non-empty. + /// spec p9-fb-07: "빈 문자열 반환 금지". #[test] - fn title_empty_string_in_user_map_falls_back_to_default() { + fn title_empty_string_in_user_map_falls_back_to_file_stem() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata @@ -809,13 +904,16 @@ mod tests { let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); - assert_eq!(doc.title, ""); + // workspace_path = "notes/example.md" → stem "example". + assert_eq!(doc.title, "example"); } - /// M7 — `metadata.user["title"] = 42` is non-stringy and silently - /// drops; the fallback default (empty title) is used. + /// M7 (revised by p9-fb-07) — `metadata.user["title"] = 42` is + /// non-stringy and silently drops at the lift stage; derive_title + /// then falls back through the chain to the file stem. + /// spec p9-fb-07: "빈 문자열 반환 금지". 
#[test] - fn title_non_string_in_user_map_silently_drops() { + fn title_non_string_in_user_map_falls_back_to_file_stem() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata @@ -824,7 +922,7 @@ mod tests { let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); - assert_eq!(doc.title, ""); + assert_eq!(doc.title, "example"); } /// M7 — non-stringy `lang` (e.g. an array) silently drops. This is @@ -840,4 +938,154 @@ mod tests { build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.lang, Lang(String::new())); } + + // ── p9-fb-07: derive_title fallback chain ─────────────────────────── + + fn span() -> SourceSpan { + SourceSpan::Line { start: 1, end: 1 } + } + + fn common_for_test() -> CommonBlock { + CommonBlock { + block_id: BlockId("0".repeat(32)), + heading_path: vec![], + source_span: span(), + } + } + + fn heading(level: u8, text: &str) -> Block { + Block::Heading(HeadingBlock { + common: common_for_test(), + level, + text: text.to_string(), + }) + } + + fn paragraph(text: &str) -> Block { + Block::Paragraph(TextBlock { + common: common_for_test(), + text: text.to_string(), + inlines: vec![], + }) + } + + /// Step 1 — frontmatter title wins, NFC-normalized. + #[test] + fn derive_title_uses_frontmatter_first() { + let blocks = vec![heading(1, "H1 Title"), paragraph("body")]; + assert_eq!( + derive_title("Frontmatter Title", &blocks, "fallback-stem"), + "Frontmatter Title" + ); + } + + /// Whitespace-only frontmatter title falls through to the next step. + #[test] + fn derive_title_blank_frontmatter_falls_through_to_h1() { + let blocks = vec![heading(1, "First H1")]; + assert_eq!(derive_title(" ", &blocks, "stem"), "First H1"); + } + + /// Step 2 — first H1 wins when frontmatter empty. 
+ #[test] + fn derive_title_uses_h1_when_no_frontmatter() { + let blocks = vec![paragraph("intro"), heading(1, "Real Title"), heading(2, "Sub")]; + assert_eq!(derive_title("", &blocks, "stem"), "Real Title"); + } + + /// Step 3 — first H2 wins when no H1. + #[test] + fn derive_title_uses_h2_when_no_h1() { + let blocks = vec![heading(2, "First H2"), heading(2, "Second H2"), heading(1, "")]; + assert_eq!(derive_title("", &blocks, "stem"), "First H2"); + } + + /// Step 4 — first non-blank Paragraph wins; truncated to 80 chars. + /// Quotes / Lists / Code / Tables / ImageRefs do not qualify. + #[test] + fn derive_title_uses_first_paragraph_excerpt() { + let blocks = vec![ + Block::Quote(TextBlock { + common: common_for_test(), + text: "blockquote should be skipped".into(), + inlines: vec![], + }), + Block::Code(CodeBlock { + common: common_for_test(), + lang: None, + code: "code should be skipped".into(), + }), + paragraph("This paragraph wins. Long text that would exceed eighty characters once concatenated end-to-end here."), + ]; + let title = derive_title("", &blocks, "stem"); + assert_eq!(title.chars().count(), 80); + assert!(title.starts_with("This paragraph wins.")); + } + + /// Step 5 — file stem is the final fallback when there are no + /// usable blocks (e.g. table-only doc with no paragraphs). + #[test] + fn derive_title_falls_back_to_file_stem() { + let blocks = vec![Block::Table(TableBlock { + common: common_for_test(), + headers: vec!["a".into()], + rows: vec![vec!["1".into()]], + })]; + assert_eq!(derive_title("", &blocks, "table-only-doc"), "table-only-doc"); + } + + /// Step 5 sentinel — empty file_stem AND no usable blocks falls back + /// to the literal `"untitled"`. Pathological case (workspace_path + /// with no filename component). 
+ #[test] + fn derive_title_returns_untitled_when_everything_blank() { + assert_eq!(derive_title("", &[], ""), "untitled"); + assert_eq!(derive_title(" ", &[], " "), "untitled"); + } + + /// Korean H1 in NFD form is normalized to NFC before being chosen + /// as the title. Mirrors the heading_path NFC pin elsewhere. + #[test] + fn derive_title_nfc_normalizes_korean_h1() { + let nfd = "\u{1100}\u{1161}".to_string(); // 가 (NFD) + let nfc = "\u{AC00}".to_string(); // 가 (NFC) + let blocks = vec![heading(1, &nfd)]; + assert_eq!(derive_title("", &blocks, "stem"), nfc); + } + + /// `build_canonical_document` integrates the derive_title chain — + /// when frontmatter title is empty, the first H1 is used. + #[test] + fn build_canonical_document_falls_back_to_first_h1() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata.user.remove("title"); + let blocks = vec![ParsedBlock { + kind: kebab_parse_types::ParsedBlockKind::Heading, + heading_path: vec![], + source_span: span(), + payload: ParsedPayload::Heading { + level: 1, + text: "Lifted From H1".into(), + }, + }]; + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + assert_eq!(doc.title, "Lifted From H1"); + } + + /// `build_canonical_document` integrates the file_stem fallback — + /// no frontmatter title, no headings, no paragraphs → filename + /// (stripped of extension). 
+ #[test] + fn build_canonical_document_falls_back_to_file_stem() { + let asset = fixture_asset(); + // workspace_path = "notes/example.md" → stem "example" + let mut metadata = fixture_metadata(); + metadata.user.remove("title"); + let pv = parser_version(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, "example"); + } } diff --git a/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json b/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json index 6a3e90a..429e633 100644 --- a/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json +++ b/crates/kebab-store-sqlite/snapshots/ingest_report.snapshot.json @@ -12,7 +12,7 @@ "doc_path": "notes/alpha.md", "error": null, "kind": "new", - "parser_version": "pulldown-cmark-0.x", + "parser_version": "md-frontmatter-v2", "warnings": [] }, { @@ -25,7 +25,7 @@ "doc_path": "notes/beta.md", "error": null, "kind": "updated", - "parser_version": "pulldown-cmark-0.x", + "parser_version": "md-frontmatter-v2", "warnings": [ "malformed frontmatter" ] diff --git a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs index acfd72d..2bc2b7b 100644 --- a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs +++ b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs @@ -42,7 +42,7 @@ fn fixture_report() -> IngestReport { byte_len: Some(1234), block_count: Some(7), chunk_count: Some(3), - parser_version: Some(ParserVersion("pulldown-cmark-0.x".into())), + parser_version: Some(ParserVersion("md-frontmatter-v2".into())), chunker_version: Some(ChunkerVersion("md-heading-v1".into())), warnings: vec![], error: None, @@ -55,7 +55,7 @@ fn fixture_report() -> IngestReport { byte_len: Some(2048), block_count: Some(12), chunk_count: Some(5), - parser_version: Some(ParserVersion("pulldown-cmark-0.x".into())), + parser_version: Some(ParserVersion("md-frontmatter-v2".into())), 
chunker_version: Some(ChunkerVersion("md-heading-v1".into())), warnings: vec!["malformed frontmatter".into()], error: None, diff --git a/tasks/p9/p9-fb-07-md-title-fallback.md b/tasks/p9/p9-fb-07-md-title-fallback.md index 321175d..51a0e9d 100644 --- a/tasks/p9/p9-fb-07-md-title-fallback.md +++ b/tasks/p9/p9-fb-07-md-title-fallback.md @@ -3,7 +3,7 @@ phase: P9 component: kebab-parse-md + kebab-normalize task_id: p9-fb-07 title: "Markdown title fallback chain (frontmatter → H1 → H2 → first paragraph → filename)" -status: planned +status: in_progress depends_on: [] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md