"""Tests for annotation extraction.""" import pytest import pymupdf from pathlib import Path from pdf2imos.extract.geometry import extract_geometry from pdf2imos.extract.text import extract_text from pdf2imos.interpret.title_block import detect_title_block, extract_title_block_info from pdf2imos.interpret.view_segmenter import segment_views from pdf2imos.parse.annotations import extract_annotations from pdf2imos.models import PageExtraction, PartMetadata def make_views_and_title(pdf_path): """Run pipeline up to annotation extraction.""" doc = pymupdf.open(str(pdf_path)) page = doc[0] geo = extract_geometry(page) texts = extract_text(page) extraction = PageExtraction( paths=geo.paths, texts=tuple(texts), page_width=geo.page_width, page_height=geo.page_height, ) title_rect, filtered = detect_title_block(extraction) title_info = extract_title_block_info(extraction, title_rect) if title_rect else {} views = segment_views(filtered) return views, title_info class TestExtractAnnotations: def test_returns_part_metadata(self, simple_panel_pdf): views, title_info = make_views_and_title(simple_panel_pdf) result = extract_annotations(views, title_info) assert isinstance(result, PartMetadata) def test_raw_annotations_is_tuple_of_strings(self, simple_panel_pdf): views, title_info = make_views_and_title(simple_panel_pdf) result = extract_annotations(views, title_info) assert isinstance(result.raw_annotations, tuple) assert all(isinstance(r, str) for r in result.raw_annotations) def test_raw_annotations_not_empty(self, simple_panel_pdf): """simple_panel.pdf has text — some should end up in raw_annotations.""" views, title_info = make_views_and_title(simple_panel_pdf) result = extract_annotations(views, title_info) # Should have at least the title block info assert len(result.raw_annotations) > 0 def test_material_extracted_from_cabinet(self, cabinet_basic_pdf): """cabinet_basic.pdf has material annotation 'white melamine MDF'.""" views, title_info = make_views_and_title(cabinet_basic_pdf) result = extract_annotations(views, title_info) # Material should be extracted OR in raw_annotations found_material = ( len(result.materials) > 0 or any( "melamine" in r.lower() or "mdf" in r.lower() or "18mm" in r for r in result.raw_annotations ) ) assert found_material, ( f"No material info found. Materials: {result.materials}, " f"Raw: {result.raw_annotations[:5]}" ) def test_drilling_from_drilling_fixture(self, panel_with_drilling_pdf): """panel_with_drilling.pdf should have drilling annotation parsed.""" views, title_info = make_views_and_title(panel_with_drilling_pdf) result = extract_annotations(views, title_info) # Drilling should be extracted OR in raw_annotations found_drilling = ( len(result.drilling) > 0 or any( "5mm" in r or "12mm" in r or "shelf" in r.lower() or "drill" in r.lower() for r in result.raw_annotations ) ) assert found_drilling, ( f"No drilling info found. Drilling: {result.drilling}, " f"Raw: {result.raw_annotations[:5]}" ) def test_all_fixtures_processable(self, all_fixture_pdfs): """All fixture PDFs process without error.""" for pdf_path in all_fixture_pdfs: views, title_info = make_views_and_title(pdf_path) result = extract_annotations(views, title_info) assert isinstance(result, PartMetadata) def test_metadata_is_frozen(self, simple_panel_pdf): """PartMetadata should be a frozen dataclass.""" views, title_info = make_views_and_title(simple_panel_pdf) result = extract_annotations(views, title_info) from dataclasses import FrozenInstanceError try: result.materials = () # type: ignore assert False, "Should have raised FrozenInstanceError" except (FrozenInstanceError, AttributeError): pass # Expected def test_to_dict_serializable(self, simple_panel_pdf): """PartMetadata.to_dict() should be JSON serializable.""" import json views, title_info = make_views_and_title(simple_panel_pdf) result = extract_annotations(views, title_info) d = result.to_dict() json_str = json.dumps(d) assert json_str