feat: pdf2cad
This commit is contained in:
112
tests/test_annotation_extractor.py
Normal file
112
tests/test_annotation_extractor.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Tests for annotation extraction."""
|
||||
import pytest
|
||||
import pymupdf
|
||||
from pathlib import Path
|
||||
from pdf2imos.extract.geometry import extract_geometry
|
||||
from pdf2imos.extract.text import extract_text
|
||||
from pdf2imos.interpret.title_block import detect_title_block, extract_title_block_info
|
||||
from pdf2imos.interpret.view_segmenter import segment_views
|
||||
from pdf2imos.parse.annotations import extract_annotations
|
||||
from pdf2imos.models import PageExtraction, PartMetadata
|
||||
|
||||
|
||||
def make_views_and_title(pdf_path):
    """Run the pipeline up to (but not including) annotation extraction.

    Opens the PDF at ``pdf_path``, extracts geometry and text from its first
    page, bundles them into a ``PageExtraction``, detects the title block and
    segments the filtered content into views.

    Args:
        pdf_path: Location of the PDF to process. It is converted with
            ``str()`` before being handed to ``pymupdf.open``.

    Returns:
        A ``(views, title_info)`` pair: the result of ``segment_views`` and
        the title-block info, or an empty dict when ``detect_title_block``
        reported no title rectangle.
    """
    # Use the document as a context manager so the underlying file handle is
    # released deterministically instead of leaking until garbage collection.
    # The whole pipeline runs inside the block so nothing touches the page
    # after the document has been closed.
    with pymupdf.open(str(pdf_path)) as doc:
        page = doc[0]
        geo = extract_geometry(page)
        texts = extract_text(page)
        extraction = PageExtraction(
            paths=geo.paths,
            texts=tuple(texts),
            page_width=geo.page_width,
            page_height=geo.page_height,
        )
        title_rect, filtered = detect_title_block(extraction)
        # Without a detected title block there is nothing to parse.
        title_info = (
            extract_title_block_info(extraction, title_rect) if title_rect else {}
        )
        views = segment_views(filtered)
    return views, title_info
|
||||
|
||||
|
||||
class TestExtractAnnotations:
    """Tests for ``extract_annotations`` run against the fixture PDFs.

    Each test builds its inputs with ``make_views_and_title`` and then checks
    one property of the returned ``PartMetadata``.
    """

    def test_returns_part_metadata(self, simple_panel_pdf):
        """extract_annotations returns a PartMetadata instance."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        assert isinstance(result, PartMetadata)

    def test_raw_annotations_is_tuple_of_strings(self, simple_panel_pdf):
        """raw_annotations is a tuple whose items are all strings."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        assert isinstance(result.raw_annotations, tuple)
        assert all(isinstance(r, str) for r in result.raw_annotations)

    def test_raw_annotations_not_empty(self, simple_panel_pdf):
        """simple_panel.pdf has text — some should end up in raw_annotations."""
        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        # Should have at least the title block info
        assert len(result.raw_annotations) > 0

    def test_material_extracted_from_cabinet(self, cabinet_basic_pdf):
        """cabinet_basic.pdf has material annotation 'white melamine MDF'."""
        views, title_info = make_views_and_title(cabinet_basic_pdf)
        result = extract_annotations(views, title_info)

        # Material should be extracted OR in raw_annotations
        found_material = (
            len(result.materials) > 0
            or any(
                "melamine" in r.lower() or "mdf" in r.lower() or "18mm" in r
                for r in result.raw_annotations
            )
        )
        assert found_material, (
            f"No material info found. Materials: {result.materials}, "
            f"Raw: {result.raw_annotations[:5]}"
        )

    def test_drilling_from_drilling_fixture(self, panel_with_drilling_pdf):
        """panel_with_drilling.pdf should have drilling annotation parsed."""
        views, title_info = make_views_and_title(panel_with_drilling_pdf)
        result = extract_annotations(views, title_info)

        # Drilling should be extracted OR in raw_annotations
        found_drilling = (
            len(result.drilling) > 0
            or any(
                "5mm" in r or "12mm" in r
                or "shelf" in r.lower() or "drill" in r.lower()
                for r in result.raw_annotations
            )
        )
        assert found_drilling, (
            f"No drilling info found. Drilling: {result.drilling}, "
            f"Raw: {result.raw_annotations[:5]}"
        )

    def test_all_fixtures_processable(self, all_fixture_pdfs):
        """All fixture PDFs process without error."""
        for pdf_path in all_fixture_pdfs:
            views, title_info = make_views_and_title(pdf_path)
            result = extract_annotations(views, title_info)
            # Name the offending fixture so a failure in the loop is traceable.
            assert isinstance(result, PartMetadata), (
                f"{pdf_path}: expected PartMetadata, got {type(result).__name__}"
            )

    def test_metadata_is_frozen(self, simple_panel_pdf):
        """PartMetadata should be a frozen dataclass."""
        from dataclasses import FrozenInstanceError

        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        # pytest.raises fails the test by itself when nothing is raised, so no
        # manual "assert False" sentinel is needed.
        with pytest.raises((FrozenInstanceError, AttributeError)):
            result.materials = ()  # type: ignore

    def test_to_dict_serializable(self, simple_panel_pdf):
        """PartMetadata.to_dict() should be JSON serializable."""
        import json

        views, title_info = make_views_and_title(simple_panel_pdf)
        result = extract_annotations(views, title_info)
        d = result.to_dict()
        # json.dumps raises TypeError if the dict holds non-serializable values.
        json_str = json.dumps(d)
        assert json_str
|
||||
Reference in New Issue
Block a user