Merge pull request 'feat(kebab-normalize): p9-fb-07 markdown title fallback chain' (#66) from feat/p9-fb-07-title into main
This commit was merged in pull request #66.
This commit is contained in:
@@ -47,6 +47,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
- **2026-05-02 P9 도그푸딩 후속 (spec PR #59 + p9-fb-15)** — RAG multi-turn 도입. frozen design §3.8 갱신 — `Answer` 에 `conversation_id` / `turn_index` optional field, 신규 `Turn` struct, `RefusalReason::LlmStreamAborted` variant. `kebab-rag::AskOpts` 에 `history: Vec<Turn>` / `conversation_id` / `turn_index` 3 field 추가, 기존 caller 는 `Vec::new() / None` (single-shot 동작 동일). `RagPipeline::ask_with_history(query, history, conversation_id, turn_index, opts)` helper. prompt 빌드: `[이전 대화]` 블록을 user prompt 위에 prepend, newest-first, char budget (`cfg.rag.max_context_tokens * 4`) 안에서 oldest 부터 drop. retrieval query expansion: 직전 answer 첫 200 자 concat. wire schema `answer.v1` 에 두 필드 + `format: date-time` 추가. p9-fb-16 (TUI conversation UI) + p9-fb-17/18 (V004 storage + CLI session) 가 같은 facade 위에 build. spec: `tasks/p9/p9-fb-15-rag-multi-turn-core.md`.
|
||||
- **2026-05-02 P9 도그푸딩 후속 (p9-fb-16)** — TUI Ask conversation UI. `AskState` 가 `turns: Vec<Turn>` + `current_question` + `conversation_id` + `last_answer` 로 재설계. answer area 가 transcript (`Q1/A1`, `Q2/A2`, ...) 로 갈음, 매 Enter 가 이전 turns 를 `history` 로 worker 에 전달 (`ask_with_history`). conversation_id 는 첫 submit 시 timestamp-based 자동 생성 (`conv_<unix_nanos_hex>`). `Ctrl-L` 가 turns + conversation_id 초기화 (in-flight worker 는 그대로 finish, 결과는 새 conversation 의 stale turn 으로 silently 폐기). spec: `tasks/p9/p9-fb-16-tui-ask-conversation.md`.
|
||||
- **2026-05-03 P9 도그푸딩 후속 (p9-fb-20)** — `kebab ask` 의 CLI citation block. 답변 출력 후 `근거:` 절 — `[N] <full path>#<fragment> (score=<s>)` 한 줄씩. `--show-citations` (default ON) / `--hide-citations` (pipe 시 답변 본문만) flag. `--json` 모드는 무영향 (citations 가 항상 wire payload 에 포함). spec p9-fb-20 의 "TUI citation pane + jump" 부분은 P9-3 의 기존 `render_citations_or_explain` 가 일부 cover — 추가 기능 (turn 별 fold + Enter/o jump + i inspect) 은 후속 task 로 미룸 (사용자 도그푸딩 priority 5위 의 핵심 = full path 가독성 = CLI block 으로 충족). spec: `tasks/p9/p9-fb-20-citation-surface.md`.
|
||||
- **2026-05-03 P9 도그푸딩 후속 (p9-fb-07)** — Markdown title fallback chain. `kebab-normalize::derive_title(frontmatter_title, &[Block], file_stem)` — 1) frontmatter title → 2) 첫 H1 → 3) 첫 H2 → 4) 첫 paragraph 80 chars → 5) 파일 stem (모든 단계 NFC 정규화, 빈 문자열 절대 반환 안 함, 마지막 sentinel `"untitled"`). `build_canonical_document` 가 lift 후 helper 호출. parser_version 상수 `pulldown-cmark-0.x` → `md-frontmatter-v2` bump — 기존 doc 은 `doc_id` 가 갱신되므로 다음 ingest 가 자동 재처리 (idempotent upsert, design §9 cascade). spec: `tasks/p9/p9-fb-07-md-title-fallback.md`.
|
||||
|
||||
## 다음 task 후보
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ kebab doctor
|
||||
| 명령 | 동작 |
|
||||
|------|------|
|
||||
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
|
||||
| `kebab ingest [<path>]` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit |
|
||||
| `kebab ingest [<path>]` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신 |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>"` | 검색. hybrid는 RRF fusion, citation 포함 |
|
||||
| `kebab list docs` | 색인된 문서 목록 |
|
||||
| `kebab inspect doc <id>` / `kebab inspect chunk <id>` | raw record 보기 |
|
||||
|
||||
@@ -69,7 +69,15 @@ pub use reset::{ResetReport, ResetScope};
|
||||
/// Kept in lock-step with the literal used in the `kb-store-sqlite`
|
||||
/// idempotency / round-trip tests so the version label written by the
|
||||
/// app and the one used in cross-crate fixtures match.
|
||||
const KEBAB_PARSE_MD_VERSION: &str = "pulldown-cmark-0.x";
|
||||
///
|
||||
/// p9-fb-07 bumped this from `pulldown-cmark-0.x` to `md-frontmatter-v2`
|
||||
/// because `kebab-normalize::derive_title` now applies a fallback chain
|
||||
/// (frontmatter → H1 → H2 → first paragraph → file stem) when the
|
||||
/// frontmatter title is blank. The bump invalidates `doc_id` for every
|
||||
/// pre-existing Markdown document, so a re-ingest is required for the
|
||||
/// new titles to land — this is the documented cascade behavior per
|
||||
/// design §9.
|
||||
const KEBAB_PARSE_MD_VERSION: &str = "md-frontmatter-v2";
|
||||
|
||||
/// Caller-supplied knobs for one [`ask`] invocation.
|
||||
///
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
//! the shared `kb-parse-types` crate.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{
|
||||
@@ -95,6 +96,15 @@ pub fn build_canonical_document(
|
||||
.filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings))
|
||||
.collect();
|
||||
|
||||
// p9-fb-07: title fallback chain. `title` so far holds the
|
||||
// frontmatter `title` (step 1). If empty / whitespace, walk the
|
||||
// lifted blocks for an H1 → H2 → first paragraph excerpt → file
|
||||
// stem. NFC-normalize the chosen string so the on-wire title is
|
||||
// canonically equivalent to whatever the user stored, regardless
|
||||
// of source NFD/NFC form.
|
||||
let file_stem = workspace_path_stem(&asset.workspace_path.0);
|
||||
let title = derive_title(&title, &lifted_blocks, &file_stem);
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-normalize",
|
||||
"built canonical document doc_id={} blocks={}",
|
||||
@@ -326,6 +336,90 @@ fn flatten_inline(i: &Inline, out: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-07: derive a usable title from the frontmatter, lifted blocks,
|
||||
/// and the source filename, using a documented fallback chain.
|
||||
///
|
||||
/// Priority (first non-blank wins):
|
||||
///
|
||||
/// 1. `frontmatter_title` — verbatim, after trimming whitespace.
|
||||
/// 2. First `Heading` block at level 1 with non-blank text.
|
||||
/// 3. First `Heading` block at level 2 with non-blank text.
|
||||
/// 4. First `Paragraph` block (NOT `Quote`, `List`, `Code`, `Table`,
|
||||
/// `ImageRef`, `AudioRef`) with non-blank text — first 80 chars.
|
||||
/// 5. `file_stem` (filename minus extension — returned verbatim, no
|
||||
/// case transformation; whatever the on-disk filename is becomes
|
||||
/// the title text).
|
||||
///
|
||||
/// The chosen string is NFC-normalized so the on-wire title is
|
||||
/// canonically equivalent to the source content. Never returns an
|
||||
/// empty string — if every step is blank (e.g. an empty file), the
|
||||
/// `file_stem` fallback ensures a non-empty result. If `file_stem` is
|
||||
/// also blank (pathological), returns `"untitled"` as a last resort.
|
||||
pub fn derive_title(frontmatter_title: &str, blocks: &[Block], file_stem: &str) -> String {
|
||||
let trimmed = frontmatter_title.trim();
|
||||
if !trimmed.is_empty() {
|
||||
return trimmed.nfc().collect();
|
||||
}
|
||||
if let Some(text) = first_heading_text(blocks, 1) {
|
||||
return text;
|
||||
}
|
||||
if let Some(text) = first_heading_text(blocks, 2) {
|
||||
return text;
|
||||
}
|
||||
if let Some(excerpt) = first_paragraph_excerpt(blocks, 80) {
|
||||
return excerpt;
|
||||
}
|
||||
// `file_stem` originates from `WorkspacePath`, which `to_posix`
|
||||
// already NFC-normalizes (§6.6). No second NFC pass needed — pass
|
||||
// through verbatim after a defensive `trim`.
|
||||
let stem = file_stem.trim();
|
||||
if !stem.is_empty() {
|
||||
return stem.to_string();
|
||||
}
|
||||
"untitled".to_string()
|
||||
}
|
||||
|
||||
fn first_heading_text(blocks: &[Block], level: u8) -> Option<String> {
|
||||
blocks.iter().find_map(|b| match b {
|
||||
Block::Heading(h) if h.level == level => {
|
||||
let trimmed = h.text.trim();
|
||||
if trimmed.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(trimmed.nfc().collect())
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
fn first_paragraph_excerpt(blocks: &[Block], max_chars: usize) -> Option<String> {
|
||||
blocks.iter().find_map(|b| match b {
|
||||
Block::Paragraph(t) => {
|
||||
let trimmed = t.text.trim();
|
||||
if trimmed.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let nfc: String = trimmed.nfc().collect();
|
||||
Some(nfc.chars().take(max_chars).collect())
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract the filename stem (final path component minus its last
/// extension) from a workspace path string.
///
/// Follows `Path::file_stem` semantics:
///
/// * Only the last extension is stripped (`foo.tar.gz` → `foo.tar`).
/// * A name that starts with `.` and has no other dot is returned
///   whole (`.hidden` → `.hidden`).
/// * Trailing separators are ignored, so `notes/` yields `notes`, not
///   an empty string.
///
/// Returns the empty string only when the path has no final normal
/// component at all: the empty path, the root `/`, or a path ending in
/// `..`.
fn workspace_path_stem(workspace_path: &str) -> String {
    match Path::new(workspace_path).file_stem() {
        // The path was built from a `&str`, so the stem is always valid
        // UTF-8; the empty-string fallback is purely defensive.
        Some(stem) => stem.to_str().unwrap_or("").to_string(),
        None => String::new(),
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -796,11 +890,12 @@ mod tests {
|
||||
assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal");
|
||||
}
|
||||
|
||||
/// M7 — `metadata.user["title"] = ""` is stringy and lifts to an
|
||||
/// empty `CanonicalDocument.title`. This pins the policy: an
|
||||
/// explicit empty string is *not* dropped, it's lifted as-is.
|
||||
/// M7 (revised by p9-fb-07) — `metadata.user["title"] = ""` lifts
|
||||
/// as an empty string but the new derive_title fallback chain
|
||||
/// promotes the file stem so the resulting title is non-empty.
|
||||
/// spec p9-fb-07: "빈 문자열 반환 금지".
|
||||
#[test]
|
||||
fn title_empty_string_in_user_map_falls_back_to_default() {
|
||||
fn title_empty_string_in_user_map_falls_back_to_file_stem() {
|
||||
let asset = fixture_asset();
|
||||
let mut metadata = fixture_metadata();
|
||||
metadata
|
||||
@@ -809,13 +904,16 @@ mod tests {
|
||||
let pv = parser_version();
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
|
||||
assert_eq!(doc.title, "");
|
||||
// workspace_path = "notes/example.md" → stem "example".
|
||||
assert_eq!(doc.title, "example");
|
||||
}
|
||||
|
||||
/// M7 — `metadata.user["title"] = 42` is non-stringy and silently
|
||||
/// drops; the fallback default (empty title) is used.
|
||||
/// M7 (revised by p9-fb-07) — `metadata.user["title"] = 42` is
|
||||
/// non-stringy and silently drops at the lift stage; derive_title
|
||||
/// then falls back through the chain to the file stem.
|
||||
/// spec p9-fb-07: "빈 문자열 반환 금지".
|
||||
#[test]
|
||||
fn title_non_string_in_user_map_silently_drops() {
|
||||
fn title_non_string_in_user_map_falls_back_to_file_stem() {
|
||||
let asset = fixture_asset();
|
||||
let mut metadata = fixture_metadata();
|
||||
metadata
|
||||
@@ -824,7 +922,7 @@ mod tests {
|
||||
let pv = parser_version();
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
|
||||
assert_eq!(doc.title, "");
|
||||
assert_eq!(doc.title, "example");
|
||||
}
|
||||
|
||||
/// M7 — non-stringy `lang` (e.g. an array) silently drops. This is
|
||||
@@ -840,4 +938,154 @@ mod tests {
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
|
||||
assert_eq!(doc.lang, Lang(String::new()));
|
||||
}
|
||||
|
||||
// ── p9-fb-07: derive_title fallback chain ───────────────────────────
|
||||
|
||||
fn span() -> SourceSpan {
|
||||
SourceSpan::Line { start: 1, end: 1 }
|
||||
}
|
||||
|
||||
fn common_for_test() -> CommonBlock {
|
||||
CommonBlock {
|
||||
block_id: BlockId("0".repeat(32)),
|
||||
heading_path: vec![],
|
||||
source_span: span(),
|
||||
}
|
||||
}
|
||||
|
||||
fn heading(level: u8, text: &str) -> Block {
|
||||
Block::Heading(HeadingBlock {
|
||||
common: common_for_test(),
|
||||
level,
|
||||
text: text.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(text: &str) -> Block {
|
||||
Block::Paragraph(TextBlock {
|
||||
common: common_for_test(),
|
||||
text: text.to_string(),
|
||||
inlines: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
/// Step 1 — a non-blank frontmatter title beats every block-derived
/// candidate and the file stem.
#[test]
fn derive_title_uses_frontmatter_first() {
    let blocks = vec![heading(1, "H1 Title"), paragraph("body")];
    let title = derive_title("Frontmatter Title", &blocks, "fallback-stem");
    assert_eq!(title, "Frontmatter Title");
}
|
||||
|
||||
/// A frontmatter title made only of whitespace counts as blank, so the
/// chain moves on to the first H1.
#[test]
fn derive_title_blank_frontmatter_falls_through_to_h1() {
    let blocks = vec![heading(1, "First H1")];
    let title = derive_title("   ", &blocks, "stem");
    assert_eq!(title, "First H1");
}
|
||||
|
||||
/// Step 2 — with an empty frontmatter title the first H1 is chosen,
/// even when a paragraph precedes it and an H2 follows it.
#[test]
fn derive_title_uses_h1_when_no_frontmatter() {
    let blocks = vec![
        paragraph("intro"),
        heading(1, "Real Title"),
        heading(2, "Sub"),
    ];
    let title = derive_title("", &blocks, "stem");
    assert_eq!(title, "Real Title");
}
|
||||
|
||||
/// Step 3 — when the only H1 is blank, the first H2 in document order
/// is chosen (not a later H2).
#[test]
fn derive_title_uses_h2_when_no_h1() {
    let blocks = vec![
        heading(2, "First H2"),
        heading(2, "Second H2"),
        heading(1, ""),
    ];
    let title = derive_title("", &blocks, "stem");
    assert_eq!(title, "First H2");
}
|
||||
|
||||
/// Step 4 — with no frontmatter title and no headings, the first
/// non-blank `Paragraph` supplies the title, cut to 80 chars. The
/// preceding `Quote` and `Code` blocks must not be picked up.
#[test]
fn derive_title_uses_first_paragraph_excerpt() {
    let quote = Block::Quote(TextBlock {
        common: common_for_test(),
        text: "blockquote should be skipped".into(),
        inlines: vec![],
    });
    let code = Block::Code(CodeBlock {
        common: common_for_test(),
        lang: None,
        code: "code should be skipped".into(),
    });
    let winner = paragraph("This paragraph wins. Long text that would exceed eighty characters once concatenated end-to-end here.");
    let blocks = vec![quote, code, winner];

    let title = derive_title("", &blocks, "stem");

    assert_eq!(title.chars().count(), 80);
    assert!(title.starts_with("This paragraph wins."));
}
|
||||
|
||||
/// Step 5 — a document containing only a table offers no heading or
/// paragraph candidate, so the file stem becomes the title.
#[test]
fn derive_title_falls_back_to_file_stem() {
    let table = Block::Table(TableBlock {
        common: common_for_test(),
        headers: vec!["a".into()],
        rows: vec![vec!["1".into()]],
    });
    let blocks = vec![table];
    let title = derive_title("", &blocks, "table-only-doc");
    assert_eq!(title, "table-only-doc");
}
|
||||
|
||||
/// Last-resort sentinel — with no blocks and a frontmatter title and
/// file stem that are both empty or whitespace-only, the literal
/// `"untitled"` is returned.
#[test]
fn derive_title_returns_untitled_when_everything_blank() {
    let empty = derive_title("", &[], "");
    assert_eq!(empty, "untitled");

    let whitespace_only = derive_title("   ", &[], "  ");
    assert_eq!(whitespace_only, "untitled");
}
|
||||
|
||||
/// An H1 whose Korean text is stored in decomposed (NFD) form is
/// returned in composed (NFC) form when it is chosen as the title.
#[test]
fn derive_title_nfc_normalizes_korean_h1() {
    let decomposed = "\u{1100}\u{1161}".to_string(); // 가 (NFD)
    let composed = "\u{AC00}".to_string(); // 가 (NFC)
    let blocks = vec![heading(1, &decomposed)];
    let title = derive_title("", &blocks, "stem");
    assert_eq!(title, composed);
}
|
||||
|
||||
/// Integration check: when the metadata carries no `title` entry,
/// `build_canonical_document` takes the title from the first lifted H1.
#[test]
fn build_canonical_document_falls_back_to_first_h1() {
    let asset = fixture_asset();
    let mut metadata = fixture_metadata();
    metadata.user.remove("title");

    let h1 = ParsedBlock {
        kind: kebab_parse_types::ParsedBlockKind::Heading,
        heading_path: vec![],
        source_span: span(),
        payload: ParsedPayload::Heading {
            level: 1,
            text: "Lifted From H1".into(),
        },
    };
    let blocks = vec![h1];

    let pv = parser_version();
    let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();

    assert_eq!(doc.title, "Lifted From H1");
}
|
||||
|
||||
/// Integration check: with no `title` entry in the metadata and no
/// blocks at all, `build_canonical_document` uses the filename without
/// its extension as the title.
#[test]
fn build_canonical_document_falls_back_to_file_stem() {
    // The fixture asset's workspace_path is "notes/example.md", whose
    // stem is "example".
    let asset = fixture_asset();
    let mut metadata = fixture_metadata();
    metadata.user.remove("title");

    let pv = parser_version();
    let no_blocks = vec![];
    let doc = build_canonical_document(&asset, metadata, no_blocks, &pv, vec![]).unwrap();

    assert_eq!(doc.title, "example");
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"doc_path": "notes/alpha.md",
|
||||
"error": null,
|
||||
"kind": "new",
|
||||
"parser_version": "pulldown-cmark-0.x",
|
||||
"parser_version": "md-frontmatter-v2",
|
||||
"warnings": []
|
||||
},
|
||||
{
|
||||
@@ -25,7 +25,7 @@
|
||||
"doc_path": "notes/beta.md",
|
||||
"error": null,
|
||||
"kind": "updated",
|
||||
"parser_version": "pulldown-cmark-0.x",
|
||||
"parser_version": "md-frontmatter-v2",
|
||||
"warnings": [
|
||||
"malformed frontmatter"
|
||||
]
|
||||
|
||||
@@ -42,7 +42,7 @@ fn fixture_report() -> IngestReport {
|
||||
byte_len: Some(1234),
|
||||
block_count: Some(7),
|
||||
chunk_count: Some(3),
|
||||
parser_version: Some(ParserVersion("pulldown-cmark-0.x".into())),
|
||||
parser_version: Some(ParserVersion("md-frontmatter-v2".into())),
|
||||
chunker_version: Some(ChunkerVersion("md-heading-v1".into())),
|
||||
warnings: vec![],
|
||||
error: None,
|
||||
@@ -55,7 +55,7 @@ fn fixture_report() -> IngestReport {
|
||||
byte_len: Some(2048),
|
||||
block_count: Some(12),
|
||||
chunk_count: Some(5),
|
||||
parser_version: Some(ParserVersion("pulldown-cmark-0.x".into())),
|
||||
parser_version: Some(ParserVersion("md-frontmatter-v2".into())),
|
||||
chunker_version: Some(ChunkerVersion("md-heading-v1".into())),
|
||||
warnings: vec!["malformed frontmatter".into()],
|
||||
error: None,
|
||||
|
||||
@@ -3,7 +3,7 @@ phase: P9
|
||||
component: kebab-parse-md + kebab-normalize
|
||||
task_id: p9-fb-07
|
||||
title: "Markdown title fallback chain (frontmatter → H1 → H2 → first paragraph → filename)"
|
||||
status: planned
|
||||
status: in_progress
|
||||
depends_on: []
|
||||
unblocks: []
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
|
||||
Reference in New Issue
Block a user