feat: pdf2cad
This commit is contained in:
104
src/pdf2imos/extract/text.py
Normal file
104
src/pdf2imos/extract/text.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""PDF text extraction using PyMuPDF."""
|
||||
import logging
|
||||
|
||||
import pymupdf
|
||||
|
||||
from pdf2imos.models import RawText
|
||||
|
||||
# Module-level logger named after this module; used for the debug summaries below.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_text(page: pymupdf.Page) -> list[RawText]:
    """Collect every non-blank text span on a page as a RawText record.

    The page is read through ``get_text("dict")``, which exposes a
    blocks -> lines -> spans hierarchy carrying font, size and color for
    each span. Only blocks whose ``type`` is 0 (text blocks) are visited,
    and spans whose text is empty after stripping whitespace are dropped.

    Args:
        page: PyMuPDF Page object

    Returns:
        One RawText per surviving span, in the order the spans are reported.
        The bounding box is passed through unchanged apart from conversion
        to float, so y still increases downward (no flip is applied);
        callers that need a different orientation must convert themselves.
    """
    layout = page.get_text("dict")

    # Flatten the hierarchy lazily; iteration order matches nested loops.
    text_blocks = (
        blk for blk in layout.get("blocks", []) if blk.get("type") == 0
    )
    text_lines = (ln for blk in text_blocks for ln in blk.get("lines", []))
    spans = (sp for ln in text_lines for sp in ln.get("spans", []))

    collected = []
    for sp in spans:
        content = sp.get("text", "").strip()
        if content:
            box = sp.get("bbox", (0, 0, 0, 0))
            font_name = sp.get("font", "")
            font_size = float(sp.get("size", 0))
            packed_color = sp.get("color", 0)  # color arrives as a packed int
            record = RawText(
                text=content,
                bbox=(
                    float(box[0]),
                    float(box[1]),
                    float(box[2]),
                    float(box[3]),
                ),
                font=font_name,
                size=font_size,
                color=packed_color,
            )
            collected.append(record)

    logger.debug(f"Extracted {len(collected)} text spans from page")
    return collected
|
||||
|
||||
|
||||
def extract_words(page: pymupdf.Page) -> list[RawText]:
    """Collect the individual words on a page as RawText records.

    Reads the page through ``get_text("words")``. Word-level output is a
    simpler way to pick up short standalone tokens such as the dimension
    values "600", "720" or "18".

    Args:
        page: PyMuPDF Page object

    Returns:
        One RawText per non-blank word. Word extraction carries no
        formatting details, so every record has font="", size=0.0 and
        color=0.
    """
    collected = []

    # Entry layout: (x0, y0, x1, y1, word, block_no, line_no, word_no)
    for entry in page.get_text("words"):
        # Entries too short to hold the four coordinates plus the word are ignored.
        if len(entry) >= 5:
            left, top, right, bottom, raw_word = entry[:5]
            token = str(raw_word).strip()
            if token:
                record = RawText(
                    text=token,
                    bbox=(float(left), float(top), float(right), float(bottom)),
                    font="",  # not available at word level
                    size=0.0,  # not available at word level
                    color=0,
                )
                collected.append(record)

    logger.debug(f"Extracted {len(collected)} words from page")
    return collected
|
||||
Reference in New Issue
Block a user