feat: pdf2cad

2026-03-03 21:24:02 +00:00
commit 112213da6e
61 changed files with 7290 additions and 0 deletions
--- a/src/pdf2imos/extract/text.py
+++ b/src/pdf2imos/extract/text.py
@@ -0,0 +1,104 @@
+"""PDF text extraction using PyMuPDF."""
+import logging
+
+import pymupdf
+
+from pdf2imos.models import RawText
+
+logger = logging.getLogger(__name__)
+
+
+def extract_text(page: pymupdf.Page) -> list[RawText]:
+    """Collect the non-blank text spans of a PDF page as RawText records.
+
+    Reads the page through get_text("dict") so that each span carries its
+    font name, font size and packed color. Spans whose text is empty once
+    surrounding whitespace is stripped are skipped.
+
+    Args:
+        page: PyMuPDF Page object
+
+    Returns:
+        List of RawText objects holding the stripped text, bbox, font, size
+        and color of each span. Bbox values are converted to float but are
+        otherwise passed through in PDF space (y increases downward — NOT
+        flipped); callers can flip as needed.
+    """
+    page_dict = page.get_text("dict")
+
+    # Lazily flatten block -> line -> span, visiting text blocks (type 0) only.
+    candidate_spans = (
+        span
+        for block in page_dict.get("blocks", [])
+        if block.get("type") == 0
+        for line in block.get("lines", [])
+        for span in line.get("spans", [])
+    )
+
+    spans = []
+    for span in candidate_spans:
+        # Only spans that still have text after stripping are kept.
+        content = span.get("text", "").strip()
+        if content:
+            box = span.get("bbox", (0, 0, 0, 0))
+            font_name = span.get("font", "")
+            font_size = float(span.get("size", 0))
+            packed_color = span.get("color", 0)  # packed int
+
+            spans.append(
+                RawText(
+                    text=content,
+                    bbox=(float(box[0]), float(box[1]), float(box[2]), float(box[3])),
+                    font=font_name,
+                    size=font_size,
+                    color=packed_color,
+                )
+            )
+
+    logger.debug(f"Extracted {len(spans)} text spans from page")
+    return spans
+
+
+def extract_words(page: pymupdf.Page) -> list[RawText]:
+    """Collect the individual words of a PDF page as RawText records.
+
+    Relies on get_text("words"), which yields one tuple per word. This is the
+    simpler route when looking for short tokens such as the dimension values
+    "600", "720" or "18".
+
+    Args:
+        page: PyMuPDF Page object
+
+    Returns:
+        List of RawText objects, one per word that is non-empty after
+        stripping. Word extraction carries no formatting, so every entry
+        has font="", size=0.0 and color=0.
+    """
+    word_tuples = page.get_text("words")
+
+    # Tuple layout: (x0, y0, x1, y1, word, block_no, line_no, word_no)
+    collected = []
+    for entry in word_tuples:
+        if len(entry) < 5:
+            # Too short to hold four coordinates plus the word itself.
+            continue
+
+        left, top, right, bottom, raw_word = entry[:5]
+        token = str(raw_word).strip()
+        if not token:
+            continue
+
+        # Only position and text are available at word level, so the
+        # formatting fields are filled with fixed placeholder values.
+        collected.append(
+            RawText(
+                text=token,
+                bbox=(float(left), float(top), float(right), float(bottom)),
+                font="",
+                size=0.0,
+                color=0,
+            )
+        )
+
+    logger.debug(f"Extracted {len(collected)} words from page")
+    return collected