feat: pdf2cad

2026-03-03 21:24:02 +00:00
commit 112213da6e
61 changed files with 7290 additions and 0 deletions
--- a/tests/test_annotation_extractor.py
+++ b/tests/test_annotation_extractor.py
@@ -0,0 +1,112 @@
+"""Tests for annotation extraction."""
+import pytest
+import pymupdf
+from pathlib import Path
+from pdf2imos.extract.geometry import extract_geometry
+from pdf2imos.extract.text import extract_text
+from pdf2imos.interpret.title_block import detect_title_block, extract_title_block_info
+from pdf2imos.interpret.view_segmenter import segment_views
+from pdf2imos.parse.annotations import extract_annotations
+from pdf2imos.models import PageExtraction, PartMetadata
+
+
+def make_views_and_title(pdf_path):
+    """Run the pipeline on the first page of a PDF, up to annotation extraction.
+
+    Args:
+        pdf_path: Location of the PDF; passed through ``str()`` before being
+            handed to ``pymupdf.open``.
+
+    Returns:
+        A ``(views, title_info)`` pair: the output of ``segment_views`` on the
+        extraction returned by ``detect_title_block``, and the output of
+        ``extract_title_block_info`` (an empty dict when the detected title
+        rectangle is falsy).
+    """
+    # Use the document as a context manager so it is closed even when a
+    # pipeline stage raises; previously the handle was never released.
+    # Every stage runs inside the block in case extracted objects still
+    # reference the open page.
+    with pymupdf.open(str(pdf_path)) as doc:
+        page = doc[0]
+        geo = extract_geometry(page)
+        texts = extract_text(page)
+        extraction = PageExtraction(
+            paths=geo.paths,
+            texts=tuple(texts),
+            page_width=geo.page_width,
+            page_height=geo.page_height,
+        )
+        title_rect, filtered = detect_title_block(extraction)
+        title_info = (
+            extract_title_block_info(extraction, title_rect) if title_rect else {}
+        )
+        views = segment_views(filtered)
+    return views, title_info
+
+
+class TestExtractAnnotations:
+    """Checks on ``extract_annotations`` driven by the fixture PDFs.
+
+    Each test builds its inputs with ``make_views_and_title`` and inspects the
+    ``PartMetadata`` returned by ``extract_annotations``.
+    """
+
+    def test_returns_part_metadata(self, simple_panel_pdf):
+        """The return value is a ``PartMetadata`` instance."""
+        views, title_info = make_views_and_title(simple_panel_pdf)
+        result = extract_annotations(views, title_info)
+        assert isinstance(result, PartMetadata)
+
+    def test_raw_annotations_is_tuple_of_strings(self, simple_panel_pdf):
+        """``raw_annotations`` is a tuple whose items are all ``str``."""
+        views, title_info = make_views_and_title(simple_panel_pdf)
+        result = extract_annotations(views, title_info)
+        assert isinstance(result.raw_annotations, tuple)
+        assert all(isinstance(r, str) for r in result.raw_annotations)
+
+    def test_raw_annotations_not_empty(self, simple_panel_pdf):
+        """simple_panel.pdf has text — some should end up in raw_annotations."""
+        views, title_info = make_views_and_title(simple_panel_pdf)
+        result = extract_annotations(views, title_info)
+        # Should have at least the title block info
+        assert len(result.raw_annotations) > 0
+
+    def test_material_extracted_from_cabinet(self, cabinet_basic_pdf):
+        """cabinet_basic.pdf has material annotation 'white melamine MDF'."""
+        views, title_info = make_views_and_title(cabinet_basic_pdf)
+        result = extract_annotations(views, title_info)
+
+        # Deliberately lenient: pass when a material was parsed OR when a
+        # material-looking string survived into raw_annotations.
+        found_material = (
+            len(result.materials) > 0
+            or any(
+                "melamine" in r.lower() or "mdf" in r.lower() or "18mm" in r
+                for r in result.raw_annotations
+            )
+        )
+        assert found_material, (
+            f"No material info found. Materials: {result.materials}, "
+            f"Raw: {result.raw_annotations[:5]}"
+        )
+
+    def test_drilling_from_drilling_fixture(self, panel_with_drilling_pdf):
+        """panel_with_drilling.pdf should have drilling annotation parsed."""
+        views, title_info = make_views_and_title(panel_with_drilling_pdf)
+        result = extract_annotations(views, title_info)
+
+        # Deliberately lenient: pass when drilling was parsed OR when a
+        # drilling-looking string survived into raw_annotations.
+        found_drilling = (
+            len(result.drilling) > 0
+            or any(
+                "5mm" in r or "12mm" in r
+                or "shelf" in r.lower() or "drill" in r.lower()
+                for r in result.raw_annotations
+            )
+        )
+        assert found_drilling, (
+            f"No drilling info found. Drilling: {result.drilling}, "
+            f"Raw: {result.raw_annotations[:5]}"
+        )
+
+    def test_all_fixtures_processable(self, all_fixture_pdfs):
+        """All fixture PDFs process without error."""
+        for pdf_path in all_fixture_pdfs:
+            views, title_info = make_views_and_title(pdf_path)
+            result = extract_annotations(views, title_info)
+            # Name the offending file so a failure inside the loop is traceable.
+            assert isinstance(result, PartMetadata), (
+                f"Unexpected result type for {pdf_path}: {type(result).__name__}"
+            )
+
+    def test_metadata_is_frozen(self, simple_panel_pdf):
+        """Assigning to a ``PartMetadata`` attribute must raise."""
+        from dataclasses import FrozenInstanceError
+
+        views, title_info = make_views_and_title(simple_panel_pdf)
+        result = extract_annotations(views, title_info)
+        # pytest.raises fails the test by itself when nothing is raised,
+        # replacing the manual try / assert False / except pattern.
+        with pytest.raises((FrozenInstanceError, AttributeError)):
+            result.materials = ()  # type: ignore
+
+    def test_to_dict_serializable(self, simple_panel_pdf):
+        """PartMetadata.to_dict() should be JSON serializable."""
+        import json
+
+        views, title_info = make_views_and_title(simple_panel_pdf)
+        result = extract_annotations(views, title_info)
+        d = result.to_dict()
+        # json.dumps raises TypeError on non-serializable values.
+        json_str = json.dumps(d)
+        assert json_str