pdf2cad/src/pdf2imos/extract/text.py

"""PDF text extraction using PyMuPDF."""
import logging

import pymupdf

from pdf2imos.models import RawText

logger = logging.getLogger(__name__)


def extract_text(page: pymupdf.Page) -> list[RawText]:
    """Collect every non-blank text span on a page as a RawText record.

    The page is read through ``page.get_text("dict")``, which exposes the
    font name, font size and packed color of each span alongside its text
    and bounding box. Only text blocks (block type 0) are visited, and any
    span whose text is empty after stripping whitespace is dropped.

    Args:
        page: PyMuPDF Page object to read from.

    Returns:
        One RawText per surviving span, in the order PyMuPDF reports them.
        Bounding boxes stay in PDF page space (y grows downward); no axis
        flip is applied here, so callers must flip if they need to.
    """
    page_dict = page.get_text("dict")

    # Flatten block -> line -> span, keeping only text blocks (type 0).
    spans = (
        span
        for block in page_dict.get("blocks", [])
        if block.get("type") == 0
        for line in block.get("lines", [])
        for span in line.get("spans", [])
    )

    result = []
    for span in spans:
        content = span.get("text", "").strip()
        if content:
            corners = span.get("bbox", (0, 0, 0, 0))
            font_name = span.get("font", "")
            font_size = float(span.get("size", 0))
            packed_color = span.get("color", 0)  # packed int
            # Coerce each corner coordinate to a plain float.
            x0, y0, x1, y1 = (float(corners[i]) for i in range(4))

            record = RawText(
                text=content,
                bbox=(x0, y0, x1, y1),
                font=font_name,
                size=font_size,
                color=packed_color,
            )
            result.append(record)

    logger.debug(f"Extracted {len(result)} text spans from page")
    return result


def extract_words(page: pymupdf.Page) -> list[RawText]:
    """Collect the individual words on a page as RawText records.

    Reads the page through ``page.get_text("words")``. Word-level output is
    simpler than span-level output and is handy for picking out standalone
    dimension values such as "600", "720" or "18".

    Args:
        page: PyMuPDF Page object to read from.

    Returns:
        One RawText per non-blank word. Word extraction carries no
        formatting data, so every record has font="", size=0.0 and color=0.
    """
    result = []

    # Each entry: (x0, y0, x1, y1, word, block_no, line_no, word_no)
    for entry in page.get_text("words"):
        # Entries too short to hold a word at index 4 are ignored.
        if len(entry) >= 5:
            token = str(entry[4]).strip()
            if token:
                left, top, right, bottom = map(float, entry[:4])
                record = RawText(
                    text=token,
                    bbox=(left, top, right, bottom),
                    font="",  # not provided by word extraction
                    size=0.0,  # not provided by word extraction
                    color=0,
                )
                result.append(record)

    logger.debug(f"Extracted {len(result)} words from page")
    return result