"""Tests for PDF vector geometry extraction."""

from pathlib import Path

import pymupdf
import pytest

from pdf2imos.extract.geometry import extract_geometry
from pdf2imos.models import PageExtraction, RawPath

FIXTURES_DIR = Path(__file__).parent / "fixtures" / "input"


class TestExtractGeometry:
    """Checks on the ``PageExtraction`` returned by ``extract_geometry``.

    The ``simple_panel_pdf`` and ``all_fixture_pdfs`` arguments are pytest
    fixtures defined outside this module (presumably in a conftest; verify).
    Every document is opened in a ``with`` block so its handle is released
    even when an assertion fails.
    """

    def test_returns_page_extraction(self, simple_panel_pdf) -> None:
        """The first page yields a ``PageExtraction`` instance."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert isinstance(result, PageExtraction)

    def test_paths_are_raw_path_objects(self, simple_panel_pdf) -> None:
        """Every entry of ``result.paths`` is a ``RawPath``."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert all(isinstance(p, RawPath) for p in result.paths)

    def test_extracts_sufficient_paths(self, simple_panel_pdf) -> None:
        """simple_panel.pdf should have >10 paths."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert len(result.paths) > 10, (
                f"Expected >10 paths, got {len(result.paths)}"
            )

    def test_dashes_extracted_correctly(self, simple_panel_pdf) -> None:
        """At least one extracted path has empty ``dashes`` (a solid line).

        Only the solid case is asserted here; paths with non-empty
        ``dashes`` are not checked by this test.
        """
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            solid = [p for p in result.paths if not p.dashes]
            # Should have at least some solid lines (geometry outline)
            assert len(solid) > 0, "No solid lines found"

    def test_y_coordinates_flipped(self, simple_panel_pdf) -> None:
        """After y-flip, each rect stays within the page's vertical extent.

        ``y0`` must be >= 0 and ``y1`` <= ``page_height``, each with a
        0.1 tolerance.
        """
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            page_h = result.page_height
            for p in result.paths:
                # Only the vertical bounds matter; x0/x1 are ignored.
                _, y0, _, y1 = p.rect
                assert y0 >= -0.1, f"y0 negative: {y0}"
                assert y1 <= page_h + 0.1, f"y1 > page_height: {y1}"

    def test_texts_empty_in_result(self, simple_panel_pdf) -> None:
        """extract_geometry returns empty texts (text extracted separately)."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert result.texts == (), "extract_geometry should return empty texts"

    def test_page_dimensions_stored(self, simple_panel_pdf) -> None:
        """Page width and height stored correctly."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            page = doc[0]
            result = extract_geometry(page)
            # Compare while the document is still open: the page belongs to it.
            assert result.page_width == pytest.approx(page.rect.width)
            assert result.page_height == pytest.approx(page.rect.height)

    def test_all_fixtures_extractable(self, all_fixture_pdfs) -> None:
        """All fixture PDFs can be extracted without error."""
        for pdf_path in all_fixture_pdfs:
            # One handle per fixture, closed before the next one is opened.
            with pymupdf.open(str(pdf_path)) as doc:
                result = extract_geometry(doc[0])
                assert len(result.paths) > 0, f"No paths in {pdf_path.name}"

    def test_width_stored_in_rawpath(self, simple_panel_pdf) -> None:
        """RawPath.width field populated with more than one distinct value."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            widths = {p.width for p in result.paths}
            assert len(widths) > 1, "Expected multiple distinct line widths"