pdf2cad/src/pdf2imos/parse/dimensions.py

"""Dimension extractor — find dimensional measurements from orthographic views.

Strategy:
1. Collect all text items in the view that look like numbers (parseable as float/int)
2. Convert text coordinates from PDF coords (y-down) to CAD coords (y-up)
3. For each numeric text, find the nearest line segment within range
   (DIMENSION lines preferred; BORDER, HIDDEN and CENTER lines ignored)
4. Determine direction (H/V) from the associated line's orientation
5. Build DimensionAnnotation for each valid (text, line) pair
"""

import logging
import re

from pdf2imos.models import (
    ClassifiedLine,
    DimensionAnnotation,
    DimensionDirection,
    LineRole,
    ViewRegion,
)

logger = logging.getLogger(__name__)

# Pattern for dimension values, anchored at both ends of the string.
# It accepts an unsigned decimal number followed by optional whitespace and an
# optional lower-case "mm" suffix: "600", "600.0", "600.", "600mm", "600 mm".
# Group 1 captures the numeric part only (without the unit).
# Not matched: signs ("-5"), a leading decimal point (".5"), thousands
# separators ("1,200"), upper-case "MM", or any other unit.
_NUMBER_PATTERN = re.compile(r"^(\d+\.?\d*)\s*(?:mm)?$")


def extract_dimensions(
    view: ViewRegion,
    classified_lines: list[ClassifiedLine],
    page_height: float,
) -> list[DimensionAnnotation]:
    """Pair each numeric text of a view with its nearest line.

    Args:
        view: ViewRegion whose ``texts`` and ``bounds`` are read.
        classified_lines: candidate ClassifiedLine objects to match against.
        page_height: page height forwarded to the PDF → CAD text conversion.

    Returns:
        One DimensionAnnotation per numeric text that (a) has a value of at
        least 1.0, (b) is not a repeat of an already-seen text center
        (compared after rounding to one decimal), and (c) has a line nearby.
        An empty list is returned when the view holds no numeric text.
    """
    candidates = _extract_numeric_texts(view, page_height)
    if not candidates:
        logger.debug("No numeric text found in view")
        return []

    logger.debug(
        "Found %d numeric texts: %s",
        len(candidates),
        [entry[0] for entry in candidates],
    )

    # Dimension lines usually sit outside the geometry envelope, so the view
    # bounds are grown by 80 units on every side before filtering the lines.
    left, bottom, right, top = view.bounds
    search_region = (left - 80, bottom - 80, right + 80, top + 80)
    nearby_lines = [
        candidate
        for candidate in classified_lines
        if _line_in_region(candidate, search_region)
    ]

    annotations: list[DimensionAnnotation] = []
    seen_centers: set[tuple[float, float]] = set()

    for value, center, bbox_cad in candidates:
        # Values below 1.0 are not treated as dimensions.
        if value < 1.0:
            continue

        # The same text may be reported more than once; de-duplicate on the
        # center rounded to one decimal place.
        rounded_center = (round(center[0], 1), round(center[1], 1))
        if rounded_center in seen_centers:
            continue
        seen_centers.add(rounded_center)

        anchor = _find_nearest_line(center, nearby_lines)
        if anchor is None:
            logger.debug("No nearby line for text '%.1f' at %s", value, center)
            continue

        # The matched line's orientation decides the dimension direction.
        orientation = _line_direction(anchor)
        annotations.append(
            DimensionAnnotation(
                value_mm=value,
                direction=orientation,
                dim_line_start=anchor.start,
                dim_line_end=anchor.end,
                text_bbox=bbox_cad,
            )
        )

    logger.debug("Extracted %d dimensions from view", len(annotations))
    return annotations


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _extract_numeric_texts(
    view: ViewRegion,
    page_height: float,
) -> list[tuple[float, tuple[float, float], tuple[float, float, float, float]]]:
    """Collect the view's texts that parse as a number, in CAD coordinates.

    ViewRegion.texts are in PDF coords (y-down); each bounding box is flipped
    to CAD coords (y-up) via ``page_height - y`` so it can be compared with
    line geometry.  The x coordinates are left untouched.

    Returns:
        list of (value_mm, text_center_cad, text_bbox_cad), one entry per text
        whose stripped content matches ``_NUMBER_PATTERN``.
    """
    numeric_items: list[
        tuple[float, tuple[float, float], tuple[float, float, float, float]]
    ] = []

    for item in view.texts:
        parsed = _NUMBER_PATTERN.match(item.text.strip())
        if parsed is None:
            continue

        try:
            magnitude = float(parsed.group(1))
        except ValueError:
            continue

        # Flipping y swaps the roles of the two y bounds: the larger PDF y
        # becomes the smaller CAD y and vice versa.
        x0, pdf_y0, x1, pdf_y1 = item.bbox
        flipped_y0 = page_height - pdf_y1
        flipped_y1 = page_height - pdf_y0

        center_cad = ((x0 + x1) / 2, (flipped_y0 + flipped_y1) / 2)
        bbox_cad = (x0, flipped_y0, x1, flipped_y1)
        numeric_items.append((magnitude, center_cad, bbox_cad))

    return numeric_items


def _find_nearest_line(
    text_center: tuple[float, float],
    lines: list[ClassifiedLine],
    max_distance: float = 60.0,
) -> ClassifiedLine | None:
    """Find the line that a numeric text most plausibly annotates.

    BORDER, HIDDEN and CENTER lines are never considered.  Only lines whose
    distance to ``text_center`` is strictly below ``max_distance`` qualify.

    DIMENSION lines are preferred: a line of any other role is returned only
    when no DIMENSION line qualifies, or when it is strictly closer than the
    nearest DIMENSION line and at most half as far away.

    The nearest DIMENSION line and the nearest other line are tracked
    separately and compared once at the end, so the preference does not
    depend on the order of ``lines`` (previously it only took effect when the
    DIMENSION line happened to be iterated first).  Exact distance ties within
    the same group still resolve to the first line encountered.

    Args:
        text_center: (x, y) of the text, in the same coordinates as the lines.
        lines: candidate lines to search.
        max_distance: exclusive upper bound on the accepted distance.

    Returns:
        The selected line, or None when no line qualifies.
    """
    ignored_roles = (LineRole.BORDER, LineRole.HIDDEN, LineRole.CENTER)

    nearest_dim: ClassifiedLine | None = None
    nearest_dim_dist = max_distance
    nearest_other: ClassifiedLine | None = None
    nearest_other_dist = max_distance

    for line in lines:
        if line.role in ignored_roles:
            continue

        # Distance from text center to nearest point on line segment
        dist = _point_to_segment_distance(text_center, line.start, line.end)

        if line.role == LineRole.DIMENSION:
            if dist < nearest_dim_dist:
                nearest_dim, nearest_dim_dist = line, dist
        elif dist < nearest_other_dist:
            nearest_other, nearest_other_dist = line, dist

    if nearest_dim is None:
        return nearest_other
    if nearest_other is None:
        return nearest_dim

    # A non-DIMENSION line must be "much closer" to beat a DIMENSION line:
    # strictly closer, and no more than half the DIMENSION line's distance.
    if (
        nearest_other_dist < nearest_dim_dist
        and nearest_other_dist <= nearest_dim_dist * 0.5
    ):
        return nearest_other
    return nearest_dim


def _point_to_segment_distance(
    point: tuple[float, float],
    seg_start: tuple[float, float],
    seg_end: tuple[float, float],
) -> float:
    """Return the Euclidean distance from ``point`` to a finite segment.

    The point is projected onto the line through the segment and the
    projection parameter is clamped to [0, 1], so a point lying beyond either
    end is measured to the nearest endpoint.  A segment whose squared length
    is below 0.0001 is treated as the single point ``seg_start``.
    """
    ax, ay = seg_start
    bx, by = seg_end
    qx, qy = point

    span_x = bx - ax
    span_y = by - ay
    squared_length = span_x * span_x + span_y * span_y

    if squared_length < 0.0001:
        # Degenerate segment: measure straight to its start point.
        nearest_x, nearest_y = ax, ay
    else:
        # Fraction along the segment of the perpendicular foot, clamped so
        # the closest point never leaves the segment.
        along = ((qx - ax) * span_x + (qy - ay) * span_y) / squared_length
        along = max(0.0, min(1.0, along))
        nearest_x = ax + along * span_x
        nearest_y = ay + along * span_y

    return ((qx - nearest_x) ** 2 + (qy - nearest_y) ** 2) ** 0.5


def _line_direction(line: ClassifiedLine) -> DimensionDirection:
    """Classify a line as HORIZONTAL or VERTICAL from its endpoint extents.

    The line is HORIZONTAL only when its absolute x-extent is strictly larger
    than its absolute y-extent; every other case, including equal extents, is
    reported as VERTICAL.
    """
    span_x = abs(line.end[0] - line.start[0])
    span_y = abs(line.end[1] - line.start[1])
    return (
        DimensionDirection.HORIZONTAL
        if span_x > span_y
        else DimensionDirection.VERTICAL
    )


def _line_in_region(
    line: ClassifiedLine,
    region: tuple[float, float, float, float],
) -> bool:
    """Tell whether the midpoint of ``line`` lies inside ``region``.

    ``region`` is (x0, y0, x1, y1); both bounds are inclusive on each axis.
    Only the midpoint is tested, so a line may extend past the region and
    still be accepted.
    """
    mid_x = (line.start[0] + line.end[0]) / 2
    mid_y = (line.start[1] + line.end[1]) / 2
    x_min, y_min, x_max, y_max = region
    inside_x = x_min <= mid_x <= x_max
    inside_y = y_min <= mid_y <= y_max
    return inside_x and inside_y