105 lines
3.0 KiB
Python
105 lines
3.0 KiB
Python
"""PDF text extraction using PyMuPDF."""
|
|
import logging
|
|
|
|
import pymupdf
|
|
|
|
from pdf2imos.models import RawText
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def extract_text(page: pymupdf.Page) -> list[RawText]:
    """Collect every non-blank text span on a PDF page as RawText records.

    The page is read through get_text("dict"), which exposes each span's
    bounding box together with its font name, font size and color.
    Non-text blocks (any block whose "type" is not 0) are ignored, and
    spans whose text is empty after stripping whitespace are dropped.

    Args:
        page: PyMuPDF Page object to read from.

    Returns:
        List of RawText objects, one per retained span, in the order the
        spans appear in the dictionary. The text is whitespace-stripped;
        the bbox is the span's own bbox converted to a 4-tuple of floats.
        Coordinates stay in PDF space (y increases downward); no flip is
        applied here, so callers must flip if they need to.
    """
    page_dict = page.get_text("dict")

    # Lazily flatten blocks -> lines -> spans; only type-0 (text) blocks
    # contribute spans.
    text_spans = (
        span
        for block in page_dict.get("blocks", [])
        if block.get("type") == 0
        for line in block.get("lines", [])
        for span in line.get("spans", [])
    )

    collected = []
    for span in text_spans:
        content = span.get("text", "").strip()
        if content:
            box = span.get("bbox", (0, 0, 0, 0))
            font_name = span.get("font", "")
            font_size = float(span.get("size", 0))
            packed_color = span.get("color", 0)  # packed int
            corners = (
                float(box[0]),
                float(box[1]),
                float(box[2]),
                float(box[3]),
            )
            record = RawText(
                text=content,
                bbox=corners,
                font=font_name,
                size=font_size,
                color=packed_color,
            )
            collected.append(record)

    logger.debug(f"Extracted {len(collected)} text spans from page")
    return collected
|
|
|
|
|
|
def extract_words(page: pymupdf.Page) -> list[RawText]:
    """Collect the individual words of a PDF page as RawText records.

    Relies on get_text("words"), the simpler word-level extraction, which
    is convenient for picking out dimension values such as "600", "720"
    or "18".

    Args:
        page: PyMuPDF Page object to read from.

    Returns:
        List of RawText objects, one per non-blank word. Word extraction
        carries no formatting info, so every record is created with
        font="", size=0.0 and color=0.
    """
    collected = []

    # Entry layout: (x0, y0, x1, y1, word, block_no, line_no, word_no).
    for entry in page.get_text("words"):
        # Entries too short to hold the four coordinates plus the word
        # are skipped.
        if len(entry) < 5:
            continue

        left, top, right, bottom, raw_word = entry[:5]
        token = str(raw_word).strip()
        if token:
            record = RawText(
                text=token,
                bbox=(float(left), float(top), float(right), float(bottom)),
                font="",  # word extraction doesn't provide font info
                size=0.0,  # word extraction doesn't provide size info
                color=0,
            )
            collected.append(record)

    logger.debug(f"Extracted {len(collected)} words from page")
    return collected
|