feat: pdf2cad

This commit is contained in:
2026-03-03 21:24:02 +00:00
commit 112213da6e
61 changed files with 7290 additions and 0 deletions

View File

@@ -0,0 +1,112 @@
"""Tests for annotation extraction."""
import pytest
import pymupdf
from pathlib import Path
from pdf2imos.extract.geometry import extract_geometry
from pdf2imos.extract.text import extract_text
from pdf2imos.interpret.title_block import detect_title_block, extract_title_block_info
from pdf2imos.interpret.view_segmenter import segment_views
from pdf2imos.parse.annotations import extract_annotations
from pdf2imos.models import PageExtraction, PartMetadata
def make_views_and_title(pdf_path):
    """Run the pipeline on the first page of a PDF up to annotation extraction.

    Args:
        pdf_path: Location of the PDF to open. It is converted with ``str()``
            before being handed to ``pymupdf.open``.

    Returns:
        A ``(views, title_info)`` tuple where ``views`` is the result of
        ``segment_views`` applied to the second value returned by
        ``detect_title_block``, and ``title_info`` is the result of
        ``extract_title_block_info`` (or an empty dict when no title block
        rectangle was detected).
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if one of the pipeline stages raises. The whole pipeline
    # runs while the document is still open.
    # NOTE(review): assumes the returned views/title info are plain data that
    # do not reference the open document — confirm against the extractors.
    with pymupdf.open(str(pdf_path)) as doc:
        page = doc[0]
        geo = extract_geometry(page)
        texts = extract_text(page)
        extraction = PageExtraction(
            paths=geo.paths,
            texts=tuple(texts),
            page_width=geo.page_width,
            page_height=geo.page_height,
        )
        title_rect, filtered = detect_title_block(extraction)
        if title_rect:
            title_info = extract_title_block_info(extraction, title_rect)
        else:
            title_info = {}
        views = segment_views(filtered)
    return views, title_info
class TestExtractAnnotations:
    """Tests for ``extract_annotations`` driven by the fixture PDFs.

    Each test runs ``make_views_and_title`` on a fixture path and feeds the
    resulting views and title-block info into ``extract_annotations``.
    """

    def test_returns_part_metadata(self, simple_panel_pdf):
        """extract_annotations returns a PartMetadata instance."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        assert isinstance(result, PartMetadata)

    def test_raw_annotations_is_tuple_of_strings(self, simple_panel_pdf):
        """raw_annotations is a tuple whose items are all strings."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        assert isinstance(result.raw_annotations, tuple)
        assert all(isinstance(r, str) for r in result.raw_annotations)

    def test_raw_annotations_not_empty(self, simple_panel_pdf):
        """simple_panel.pdf has text — some should end up in raw_annotations."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        # Should have at least the title block info
        assert len(result.raw_annotations) > 0

    def test_material_extracted_from_cabinet(self, cabinet_basic_pdf):
        """cabinet_basic.pdf has material annotation 'white melamine MDF'."""
        views, title_info = make_views_and_title(cabinet_basic_pdf)
        result = extract_annotations(views, title_info)
        # Material should be extracted OR in raw_annotations
        found_material = (
            len(result.materials) > 0
            or any(
                "melamine" in r.lower() or "mdf" in r.lower() or "18mm" in r
                for r in result.raw_annotations
            )
        )
        assert found_material, (
            f"No material info found. Materials: {result.materials}, "
            f"Raw: {result.raw_annotations[:5]}"
        )

    def test_drilling_from_drilling_fixture(self, panel_with_drilling_pdf):
        """panel_with_drilling.pdf should have drilling annotation parsed."""
        views, title_info = make_views_and_title(panel_with_drilling_pdf)
        result = extract_annotations(views, title_info)
        # Drilling should be extracted OR in raw_annotations
        found_drilling = (
            len(result.drilling) > 0
            or any(
                "5mm" in r or "12mm" in r
                or "shelf" in r.lower() or "drill" in r.lower()
                for r in result.raw_annotations
            )
        )
        assert found_drilling, (
            f"No drilling info found. Drilling: {result.drilling}, "
            f"Raw: {result.raw_annotations[:5]}"
        )

    def test_all_fixtures_processable(self, all_fixture_pdfs):
        """All fixture PDFs process without error."""
        for pdf_path in all_fixture_pdfs:
            views, title_info = make_views_and_title(pdf_path)
            result = extract_annotations(views, title_info)
            assert isinstance(result, PartMetadata)

    def test_metadata_is_frozen(self, simple_panel_pdf):
        """PartMetadata should be a frozen dataclass."""
        from dataclasses import FrozenInstanceError

        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        # pytest.raises fails the test when nothing is raised; unlike a bare
        # ``assert False`` guard it is not stripped when running under -O.
        with pytest.raises((FrozenInstanceError, AttributeError)):
            result.materials = ()  # type: ignore

    def test_to_dict_serializable(self, simple_panel_pdf):
        """PartMetadata.to_dict() should be JSON serializable."""
        import json

        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        d = result.to_dict()
        # json.dumps raises on non-serializable values; a non-empty string
        # confirms that serialization produced output.
        json_str = json.dumps(d)
        assert json_str