# Manifest for the `kebab-parse-pdf` crate. Version, edition, MSRV,
# license, and repository are all inherited from the workspace root.
[package]
name = "kebab-parse-pdf"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Text PDF extractor + scanned-page image extract helpers for the kebab pipeline (P7-1 + v0.20.0 sub-item 1)"

[dependencies]
kebab-core = { path = "../kebab-core" }
anyhow = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
# is the only stable per-page API across the pdf-extract / lopdf
# pair (pdf-extract 0.7 still exposes only whole-document calls).
# pdf-extract is intentionally NOT pulled in here — its ~150 transitive
# crates (pom, postscript, type1-encoding-parser, …) buy us nothing
# at v1 (we don't call its whole-doc API), and the future scanned-PDF
# OCR fallback can re-add it when it actually needs it.
lopdf = { workspace = true }

# Only compiled for tests, examples, and benchmarks.
[dev-dependencies]
anyhow = { workspace = true }
blake3 = { workspace = true }
kebab-parse-image = { path = "../kebab-parse-image" }
strsim = "0.11"

[lints]
workspace = true