75 lines
3.2 KiB
Python
75 lines
3.2 KiB
Python
"""Tests for PDF vector geometry extraction."""
|
|
import pytest
|
|
import pymupdf
|
|
from pathlib import Path
|
|
|
|
from pdf2imos.extract.geometry import extract_geometry
|
|
from pdf2imos.models import PageExtraction, RawPath
|
|
|
|
# Path to the "fixtures/input" directory next to this test module.
# NOTE(review): not referenced by the tests visible in this file; presumably
# kept for fixtures or helpers defined elsewhere. Confirm before removing.
FIXTURES_DIR = Path(__file__).parent / "fixtures" / "input"
|
|
|
|
|
|
class TestExtractGeometry:
    """Checks on the ``PageExtraction`` returned by ``extract_geometry``.

    Each test opens its PDF with ``pymupdf.open`` used as a context manager so
    the document is closed even when an assertion fails. Page access and the
    assertions stay inside the ``with`` block, so nothing touches a page after
    its document has been closed.
    """

    def test_returns_page_extraction(self, simple_panel_pdf):
        """The first page yields a ``PageExtraction`` instance."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert isinstance(result, PageExtraction)

    def test_paths_are_raw_path_objects(self, simple_panel_pdf):
        """Every element of ``result.paths`` is a ``RawPath``."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert all(isinstance(p, RawPath) for p in result.paths)

    def test_extracts_sufficient_paths(self, simple_panel_pdf):
        """simple_panel.pdf should have >10 paths."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            path_count = len(result.paths)
            assert path_count > 10, f"Expected >10 paths, got {path_count}"

    def test_dashes_extracted_correctly(self, simple_panel_pdf):
        """At least one extracted path has empty ``dashes`` (a solid line).

        Only the solid-line side is asserted here; dashed paths are not
        checked by this test.
        """
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            # Should have at least some solid lines (geometry outline).
            has_solid = any(not p.dashes for p in result.paths)
            assert has_solid, "No solid lines found"

    def test_y_coordinates_flipped(self, simple_panel_pdf):
        """Path rects stay vertically within the page after the y-flip.

        For every path, ``y0`` must be >= -0.1 and ``y1`` must be
        <= ``page_height`` + 0.1 (0.1 is the tolerance on either bound).
        """
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            page_h = result.page_height
            for p in result.paths:
                # Only the vertical extent matters for this check.
                _, y0, _, y1 = p.rect
                assert y0 >= -0.1, f"y0 negative: {y0}"
                assert y1 <= page_h + 0.1, f"y1 > page_height: {y1}"

    def test_texts_empty_in_result(self, simple_panel_pdf):
        """extract_geometry returns empty texts (text extracted separately)."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            assert result.texts == (), "extract_geometry should return empty texts"

    def test_page_dimensions_stored(self, simple_panel_pdf):
        """``page_width`` / ``page_height`` match the page rect's dimensions."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            page = doc[0]
            result = extract_geometry(page)
            # page.rect is read while the document is still open.
            assert result.page_width == pytest.approx(page.rect.width)
            assert result.page_height == pytest.approx(page.rect.height)

    def test_all_fixtures_extractable(self, all_fixture_pdfs):
        """Every fixture PDF yields at least one path from its first page."""
        for pdf_path in all_fixture_pdfs:
            # One ``with`` per file so each document is closed before the next
            # one is opened.
            with pymupdf.open(str(pdf_path)) as doc:
                result = extract_geometry(doc[0])
                assert len(result.paths) > 0, f"No paths in {pdf_path.name}"

    def test_width_stored_in_rawpath(self, simple_panel_pdf):
        """The extracted paths carry more than one distinct ``width`` value."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_geometry(doc[0])
            widths = {p.width for p in result.paths}
            assert len(widths) > 1, "Expected multiple distinct line widths"
|