Files
pdf2cad/src/pdf2imos/extract/text.py
2026-03-03 21:24:02 +00:00

105 lines
3.0 KiB
Python

"""PDF text extraction using PyMuPDF."""
import logging
import pymupdf
from pdf2imos.models import RawText
logger = logging.getLogger(__name__)
def extract_text(page: pymupdf.Page) -> list[RawText]:
    """Extract structured text spans from a PDF page.

    Uses ``page.get_text("dict")`` to get rich text with font/size/color
    info. Blocks whose ``type`` is not 0 (i.e. non-text blocks) are skipped,
    and spans whose text is empty or whitespace-only are dropped. The text
    stored on each result has leading/trailing whitespace stripped.

    Args:
        page: PyMuPDF Page object.

    Returns:
        List of RawText objects with position and formatting info.
        Coordinates are in PDF space (y increases downward — NOT flipped).
        Callers can flip as needed.
    """
    result: list[RawText] = []
    text_dict = page.get_text("dict")

    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # type 0 = text block
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "").strip()
                if not text:
                    continue

                # Missing keys fall back to neutral defaults rather than
                # raising, so a sparse span still yields a RawText.
                bbox = span.get("bbox", (0, 0, 0, 0))
                result.append(
                    RawText(
                        text=text,
                        # Coerce each coordinate to float explicitly.
                        bbox=(
                            float(bbox[0]),
                            float(bbox[1]),
                            float(bbox[2]),
                            float(bbox[3]),
                        ),
                        font=span.get("font", ""),
                        size=float(span.get("size", 0)),
                        color=span.get("color", 0),  # packed int
                    )
                )

    # Lazy %-style arguments: the message is only formatted when DEBUG
    # logging is actually enabled for this logger.
    logger.debug("Extracted %d text spans from page", len(result))
    return result
def extract_words(page: pymupdf.Page) -> list[RawText]:
    """Extract words from a PDF page using the simpler word-level extraction.

    Uses ``page.get_text("words")`` for word-level extraction. Simpler and
    more reliable for finding dimension values like "600", "720", "18".
    Entries with fewer than five fields, and words that are empty after
    stripping whitespace, are skipped.

    Args:
        page: PyMuPDF Page object.

    Returns:
        List of RawText objects. ``font`` is "" and ``size`` is 0.0 (not
        available from word extraction); ``color`` is always 0.
    """
    result: list[RawText] = []

    # Each word tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no)
    for word_tuple in page.get_text("words"):
        if len(word_tuple) < 5:
            continue

        # Only the first five fields are used; the trailing block/line/word
        # indices are ignored.
        x0, y0, x1, y1, word = word_tuple[:5]
        word = str(word).strip()
        if not word:
            continue

        result.append(
            RawText(
                text=word,
                bbox=(float(x0), float(y0), float(x1), float(y1)),
                font="",  # word extraction doesn't provide font info
                size=0.0,  # word extraction doesn't provide size info
                color=0,
            )
        )

    # Lazy %-style arguments: the message is only formatted when DEBUG
    # logging is actually enabled for this logger.
    logger.debug("Extracted %d words from page", len(result))
    return result