386 lines
14 KiB
Python
386 lines
14 KiB
Python
"""Tests for view boundary segmentation."""
|
|
|
|
import pymupdf
|
|
import pytest
|
|
|
|
from pdf2imos.extract.geometry import extract_geometry
|
|
from pdf2imos.extract.text import extract_text
|
|
from pdf2imos.interpret.title_block import detect_title_block
|
|
from pdf2imos.interpret.view_segmenter import (
|
|
_cluster_area,
|
|
_cluster_bbox,
|
|
_cluster_paths,
|
|
_clusters_are_close,
|
|
segment_views,
|
|
)
|
|
from pdf2imos.models import PageExtraction, RawPath, RawText, ViewRegion, ViewType
|
|
|
|
|
|
def make_filtered_extraction(pdf_path):
    """Run full pre-processing: extract → filter title block.

    Opens ``pdf_path``, extracts geometry and text from its first page,
    bundles them into a ``PageExtraction`` and returns the second element
    of the pair produced by ``detect_title_block``.

    The document is opened in a ``with`` block so it is closed even if one
    of the extraction steps raises; previously the handle was never closed.
    """
    with pymupdf.open(str(pdf_path)) as doc:
        page = doc[0]
        geo = extract_geometry(page)
        texts = extract_text(page)
        extraction = PageExtraction(
            paths=geo.paths,
            texts=tuple(texts),
            page_width=geo.page_width,
            page_height=geo.page_height,
        )
        # Kept inside the ``with`` so every step that might touch page data
        # runs while the document is still open.
        _, filtered = detect_title_block(extraction)
    return filtered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helper to build synthetic RawPath for unit tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_path(x0, y0, x1, y1, width=1.0):
    """Build a bare-bones RawPath: a single line item spanning the given box."""
    start, end = (x0, y0), (x1, y1)
    line_item = ("l", start, end)
    return RawPath(
        items=(line_item,),
        color=(0.0, 0.0, 0.0),
        fill=None,
        dashes="",
        width=width,
        # Concatenating the two corner points yields (x0, y0, x1, y1).
        rect=start + end,
    )
|
|
|
|
|
|
# ===========================================================================
|
|
# Unit tests for clustering helpers
|
|
# ===========================================================================
|
|
|
|
|
|
class TestClusterPaths:
    """Unit tests for ``_cluster_paths`` using synthetic paths."""

    def test_empty_input(self):
        """No paths in, no clusters out."""
        clusters = _cluster_paths([])
        assert clusters == []

    def test_single_path(self):
        """A lone path forms a cluster containing just itself."""
        only = _make_path(0, 0, 10, 10)
        clusters = _cluster_paths([only])
        assert len(clusters) == 1
        assert clusters[0] == [only]

    def test_close_paths_merge(self):
        """Paths within gap_threshold merge into one cluster."""
        left = _make_path(0, 0, 10, 10)
        right = _make_path(15, 0, 25, 10)  # sits 5pt to the right of `left`
        clusters = _cluster_paths([left, right], gap_threshold=10.0)
        assert len(clusters) == 1

    def test_far_paths_separate(self):
        """Paths beyond gap_threshold stay as separate clusters."""
        near = _make_path(0, 0, 10, 10)
        far = _make_path(100, 0, 110, 10)  # 90pt away from `near`
        clusters = _cluster_paths([near, far], gap_threshold=25.0)
        assert len(clusters) == 2

    def test_chain_merge(self):
        """A-close-to-B and B-close-to-C → all in one cluster."""
        # Three boxes in a row, each 10pt from its neighbour.
        chain = [
            _make_path(0, 0, 10, 10),
            _make_path(20, 0, 30, 10),
            _make_path(40, 0, 50, 10),
        ]
        clusters = _cluster_paths(chain, gap_threshold=15.0)
        assert len(clusters) == 1

    def test_two_separate_clusters(self):
        """Two groups far apart → two clusters."""
        near_origin = [_make_path(0, 0, 10, 10), _make_path(5, 5, 15, 15)]
        far_corner = [
            _make_path(200, 200, 210, 210),
            _make_path(205, 205, 215, 215),
        ]
        clusters = _cluster_paths([*near_origin, *far_corner], gap_threshold=25.0)
        assert len(clusters) == 2
|
|
|
|
|
|
class TestClusterBbox:
    """Unit tests for ``_cluster_bbox``."""

    def test_single_path(self):
        """A one-path cluster has that path's box as its bbox."""
        cluster = [_make_path(5, 10, 20, 30)]
        assert _cluster_bbox(cluster) == (5, 10, 20, 30)

    def test_multiple_paths(self):
        """The bbox of several paths is the box enclosing all of them."""
        lower_left = _make_path(0, 0, 10, 10)
        upper_right = _make_path(20, 20, 30, 30)
        bbox = _cluster_bbox([lower_left, upper_right])
        assert bbox == (0, 0, 30, 30)
|
|
|
|
|
|
class TestClusterArea:
    """Unit tests for ``_cluster_area``."""

    def test_area_computation(self):
        """A 10 x 20 box has area 200."""
        area = _cluster_area([_make_path(0, 0, 10, 20)])
        assert area == pytest.approx(200.0)

    def test_zero_area(self):
        """A degenerate (point-sized) box has zero area."""
        area = _cluster_area([_make_path(5, 5, 5, 5)])
        assert area == pytest.approx(0.0)
|
|
|
|
|
|
class TestClustersAreClose:
    """Unit tests for ``_clusters_are_close``."""

    def test_overlapping(self):
        """Overlapping clusters count as close."""
        first = [_make_path(0, 0, 20, 20)]
        second = [_make_path(10, 10, 30, 30)]
        assert _clusters_are_close(first, second, 5.0)

    def test_adjacent(self):
        """Clusters sharing an edge (zero gap) count as close."""
        first = [_make_path(0, 0, 10, 10)]
        second = [_make_path(10, 0, 20, 10)]
        assert _clusters_are_close(first, second, 5.0)

    def test_small_gap(self):
        """A 3pt gap is close under a 5.0 threshold."""
        first = [_make_path(0, 0, 10, 10)]
        second = [_make_path(13, 0, 23, 10)]
        assert _clusters_are_close(first, second, 5.0)

    def test_large_gap(self):
        """A 40pt gap is not close under a 25.0 threshold."""
        first = [_make_path(0, 0, 10, 10)]
        second = [_make_path(50, 0, 60, 10)]
        assert not _clusters_are_close(first, second, 25.0)
|
|
|
|
|
|
# ===========================================================================
|
|
# Integration tests with real PDFs
|
|
# ===========================================================================
|
|
|
|
|
|
class TestSegmentViews:
    """Integration tests that run ``segment_views`` on fixture PDFs.

    The ``*_pdf`` / ``all_fixture_pdfs`` arguments are pytest fixtures that
    are not defined in this file (presumably in a conftest — confirm).
    """

    def test_returns_list(self, simple_panel_pdf):
        """The segmenter returns a list."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        assert isinstance(result, list)

    def test_views_are_view_regions(self, simple_panel_pdf):
        """Every element of the result is a ViewRegion."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        assert all(isinstance(v, ViewRegion) for v in result)

    def test_detects_at_least_two_views(self, simple_panel_pdf):
        """Must detect at least 2 views (FRONT + one more)."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        assert len(result) >= 2, f"Expected >=2 views, got {len(result)}"

    def test_front_view_present(self, simple_panel_pdf):
        """FRONT view must always be detected."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        view_types = {v.view_type for v in result}
        assert ViewType.FRONT in view_types, f"No FRONT view. Got: {view_types}"

    def test_front_view_is_lowest(self, simple_panel_pdf):
        """FRONT view's y-center must be below every TOP view's y-center."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        if len(result) < 2:
            pytest.skip("Less than 2 views detected")
        front = next((v for v in result if v.view_type == ViewType.FRONT), None)
        assert front is not None
        front_cy = (front.bounds[1] + front.bounds[3]) / 2
        for v in result:
            # Only TOP views are compared; other view types (e.g. SIDE) may
            # share a similar y-center with FRONT and are deliberately skipped.
            if v.view_type != ViewType.TOP:
                continue
            other_cy = (v.bounds[1] + v.bounds[3]) / 2
            assert front_cy < other_cy, (
                f"FRONT cy={front_cy} should be below TOP cy={other_cy}"
            )

    def test_each_view_has_paths(self, simple_panel_pdf):
        """Each detected view has at least one path."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        for view in result:
            assert len(view.paths) > 0, f"{view.view_type} has no paths"

    def test_all_fixtures_segmentable(self, all_fixture_pdfs):
        """All fixture PDFs can be segmented without crashing."""
        for pdf_path in all_fixture_pdfs:
            filtered = make_filtered_extraction(pdf_path)
            result = segment_views(filtered)
            # Name the offending file so a failure inside the loop is traceable.
            assert isinstance(result, list), (
                f"{pdf_path}: expected list, got {type(result).__name__}"
            )

    def test_cabinet_has_multiple_views(self, cabinet_basic_pdf):
        """Cabinet drawing should detect multiple views."""
        filtered = make_filtered_extraction(cabinet_basic_pdf)
        result = segment_views(filtered)
        assert len(result) >= 2

    def test_view_bounds_are_reasonable(self, simple_panel_pdf):
        """View bounds should be within page dimensions (5-unit slack)."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        for view in result:
            x0, y0, x1, y1 = view.bounds
            assert x0 >= -5, f"x0 out of range: {x0}"
            assert y0 >= -5, f"y0 out of range: {y0}"
            assert x1 <= filtered.page_width + 5, f"x1 out of range: {x1}"
            assert y1 <= filtered.page_height + 5, f"y1 out of range: {y1}"

    def test_views_dont_overlap_much(self, simple_panel_pdf):
        """Distinct views should not overlap significantly."""
        filtered = make_filtered_extraction(simple_panel_pdf)
        result = segment_views(filtered)
        if len(result) < 2:
            pytest.skip("Less than 2 views")
        for i, v1 in enumerate(result):
            for v2 in result[i + 1 :]:
                overlap = _bbox_overlap_area(v1.bounds, v2.bounds)
                smaller = min(_bbox_area(v1.bounds), _bbox_area(v2.bounds))
                # Fall back to 1 so a degenerate (zero-area) view does not
                # cause a division by zero.
                min_area = smaller if smaller > 0 else 1
                # Overlap should be < 20% of smaller view
                assert overlap / min_area < 0.2, (
                    f"{v1.view_type} and {v2.view_type} overlap "
                    f"{overlap / min_area:.1%}"
                )
|
|
|
|
|
|
class TestSegmentViewsEmpty:
    """Behaviour of ``segment_views`` when there is nothing to segment."""

    def test_empty_extraction(self):
        """Empty extraction returns empty list."""
        blank_page = PageExtraction(
            paths=(),
            texts=(),
            page_width=595,
            page_height=842,
        )
        assert segment_views(blank_page) == []
|
|
|
|
|
|
class TestSegmentViewsSynthetic:
    """Test with synthetic data mimicking third-angle projection layout."""

    @staticmethod
    def _find_view(views, wanted):
        """Return the first view whose ``view_type`` equals ``wanted``, else None."""
        for candidate in views:
            if candidate.view_type == wanted:
                return candidate
        return None

    def _make_three_view_extraction(self):
        """Create extraction with clear front/top/side layout.

        Layout (CAD coords, y-up):
        Top view: x=100-300, y=400-450 (above front)
        Front view: x=100-300, y=100-350 (bottom-left)
        Side view: x=350-400, y=100-350 (right of front)
        """
        boxes = (
            # Front view (large rectangle)
            (100, 100, 300, 350),
            (120, 120, 280, 330),
            # Top view (above front)
            (100, 400, 300, 450),
            (120, 410, 280, 440),
            # Side view (right of front)
            (350, 100, 400, 350),
            (355, 120, 395, 330),
        )
        return PageExtraction(
            paths=tuple(_make_path(*box) for box in boxes),
            texts=(),
            page_width=595,
            page_height=842,
        )

    def test_detects_three_views(self):
        """The synthetic layout is segmented into exactly three views."""
        views = segment_views(self._make_three_view_extraction())
        assert len(views) == 3

    def test_front_is_bottom_left(self):
        """FRONT is found and starts in the lower part of the layout."""
        views = segment_views(self._make_three_view_extraction())
        front = self._find_view(views, ViewType.FRONT)
        assert front is not None
        # Front should be around y=100-350
        assert front.bounds[1] < 200, f"Front y0={front.bounds[1]} too high"

    def test_top_is_above_front(self):
        """TOP's y-center is greater than FRONT's."""
        views = segment_views(self._make_three_view_extraction())
        front = self._find_view(views, ViewType.FRONT)
        top = self._find_view(views, ViewType.TOP)
        assert front is not None
        assert top is not None
        _, front_y0, _, front_y1 = front.bounds[0:4]
        _, top_y0, _, top_y1 = top.bounds[0:4]
        front_cy = (front_y0 + front_y1) / 2
        top_cy = (top_y0 + top_y1) / 2
        assert top_cy > front_cy, "TOP should be above FRONT"

    def test_side_is_right_of_front(self):
        """SIDE's x-center is greater than FRONT's."""
        views = segment_views(self._make_three_view_extraction())
        front = self._find_view(views, ViewType.FRONT)
        side = self._find_view(views, ViewType.SIDE)
        assert front is not None
        assert side is not None
        front_cx = (front.bounds[0] + front.bounds[2]) / 2
        side_cx = (side.bounds[0] + side.bounds[2]) / 2
        assert side_cx > front_cx, "SIDE should be right of FRONT"

    def test_text_assignment_with_coord_conversion(self):
        """Texts in PDF coords should be assigned to correct views."""
        base = self._make_three_view_extraction()

        def pdf_text(value, bbox):
            # Both labels share font, size and colour; only text and bbox vary.
            return RawText(
                text=value,
                bbox=bbox,
                font="Helvetica",
                size=10.0,
                color=0,
            )

        # Front view spans CAD y=100-350; with y_pdf = page_h - y_cad that is
        # PDF y=492-742, so a label at y=600-612 lands inside it.
        text_in_front = pdf_text("600", (150.0, 600.0, 170.0, 612.0))
        # Top view spans CAD y=400-450, i.e. PDF y=392-442; y=400-412 is inside.
        text_in_top = pdf_text("720", (150.0, 400.0, 170.0, 412.0))

        extraction_with_text = PageExtraction(
            paths=base.paths,
            texts=(text_in_front, text_in_top),
            page_width=595,
            page_height=842,
        )
        views = segment_views(extraction_with_text)

        front = self._find_view(views, ViewType.FRONT)
        top = self._find_view(views, ViewType.TOP)
        assert front is not None

        # "600" should be assigned to front view
        front_text_vals = [t.text for t in front.texts]
        assert "600" in front_text_vals, (
            f"Text '600' not in front view. Front texts: {front_text_vals}"
        )

        # The TOP check only runs when a TOP view was detected.
        if top is None:
            return
        top_text_vals = [t.text for t in top.texts]
        assert "720" in top_text_vals, (
            f"Text '720' not in top view. Top texts: {top_text_vals}"
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _bbox_overlap_area(a, b):
    """Return the area shared by boxes ``a`` and ``b`` (each ``(x0, y0, x1, y1)``).

    Returns ``0.0`` when the boxes are disjoint or merely touch.
    """
    # Horizontal extent of the intersection.
    left = max(a[0], b[0])
    right = min(a[2], b[2])
    if right <= left:
        return 0.0

    # Vertical extent of the intersection.
    low = max(a[1], b[1])
    high = min(a[3], b[3])
    if high <= low:
        return 0.0

    return (right - left) * (high - low)
|
|
|
|
|
|
def _bbox_area(bbox):
    """Return the area of ``bbox`` given as ``(x0, y0, x1, y1)``.

    Side lengths are taken as absolute values, so corner order does not matter.
    """
    width = abs(bbox[2] - bbox[0])
    height = abs(bbox[3] - bbox[1])
    return width * height
|