"""PDF text extraction using PyMuPDF.""" import logging import pymupdf from pdf2imos.models import RawText logger = logging.getLogger(__name__) def extract_text(page: pymupdf.Page) -> list[RawText]: """Extract structured text spans from a PDF page. Uses get_text("dict") to get rich text with font/size/color info. Filters out empty/whitespace-only spans. Args: page: PyMuPDF Page object Returns: List of RawText objects with position and formatting info. Coordinates are in PDF space (y increases downward — NOT flipped). Callers can flip as needed. """ result = [] text_dict = page.get_text("dict") for block in text_dict.get("blocks", []): if block.get("type") != 0: # type 0 = text block continue for line in block.get("lines", []): for span in line.get("spans", []): text = span.get("text", "").strip() if not text: continue bbox = span.get("bbox", (0, 0, 0, 0)) font = span.get("font", "") size = float(span.get("size", 0)) color = span.get("color", 0) # packed int result.append( RawText( text=text, bbox=( float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]), ), font=font, size=size, color=color, ) ) logger.debug(f"Extracted {len(result)} text spans from page") return result def extract_words(page: pymupdf.Page) -> list[RawText]: """Extract words from a PDF page using the simpler word-level extraction. Uses get_text("words") for word-level extraction. Simpler and more reliable for finding dimension values like "600", "720", "18". Args: page: PyMuPDF Page object Returns: List of RawText objects. font="" and size=0.0 (not available from word extraction). """ result = [] words = page.get_text("words") # Each word tuple: (x0, y0, x1, y1, word, block_no, line_no, word_no) for word_tuple in words: if len(word_tuple) < 5: continue x0, y0, x1, y1, word = ( word_tuple[0], word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4], ) word = str(word).strip() if not word: continue result.append( RawText( text=word, bbox=(float(x0), float(y0), float(x1), float(y1)), font="", # word extraction doesn't provide font info size=0.0, # word extraction doesn't provide size info color=0, ) ) logger.debug(f"Extracted {len(result)} words from page") return result