feat: pdf2cad
This commit is contained in:
82
tests/test_text_extractor.py
Normal file
82
tests/test_text_extractor.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Tests for PDF text extraction."""
|
||||
import pymupdf
|
||||
|
||||
from pdf2imos.extract.text import extract_text, extract_words
|
||||
from pdf2imos.models import RawText
|
||||
|
||||
|
||||
class TestExtractText:
    """Checks for ``extract_text`` run against the first page of fixture PDFs.

    The ``*_pdf`` arguments are pytest fixtures (presumably defined in a
    ``conftest.py`` -- confirm) that resolve to paths of sample PDF files.
    Each document is opened in a ``with`` block so its handle is released
    even when an assertion fails.
    """

    def test_returns_list_of_raw_text(self, simple_panel_pdf):
        """extract_text returns a list whose items are all RawText."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            assert isinstance(result, list)
            assert all(isinstance(t, RawText) for t in result)

    def test_dimension_values_present(self, simple_panel_pdf):
        """simple_panel.pdf must have dimension values 600, 720, 18."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            text_values = [t.text for t in result]
            # Substring match: a value may be embedded in a longer span.
            for expected in ("600", "720", "18"):
                assert any(expected in v for v in text_values), (
                    f"'{expected}' not found in: {text_values}"
                )

    def test_material_annotation_in_cabinet(self, cabinet_basic_pdf):
        """cabinet_basic.pdf must have material annotation text."""
        with pymupdf.open(str(cabinet_basic_pdf)) as doc:
            result = extract_text(doc[0])
            all_text = " ".join(t.text for t in result)
            # Lower-case once; any one of the markers is sufficient.
            lowered = all_text.lower()
            assert (
                "melamine" in lowered
                or "mdf" in lowered
                or "18mm" in lowered
            ), f"No material annotation found in: {all_text[:200]}"

    def test_bboxes_within_page(self, simple_panel_pdf):
        """All bounding boxes must be within page dimensions."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            page = doc[0]
            result = extract_text(page)
            pw, ph = page.rect.width, page.rect.height
            for t in result:
                x0, y0, x1, y1 = t.bbox
                # A tolerance of 1 on every edge absorbs small overshoot.
                assert x0 >= -1, f"x0 out of bounds: {x0}"
                assert y0 >= -1, f"y0 out of bounds: {y0}"
                assert x1 <= pw + 1, f"x1 out of bounds: {x1}"
                assert y1 <= ph + 1, f"y1 out of bounds: {y1}"

    def test_no_whitespace_only_spans(self, simple_panel_pdf):
        """No empty or whitespace-only text spans returned."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            for t in result:
                assert t.text.strip(), f"Whitespace-only span found: repr={repr(t.text)}"
|
||||
|
||||
|
||||
class TestExtractWords:
    """Checks for ``extract_words`` run against the first page of fixture PDFs.

    The fixture arguments are supplied by pytest (presumably from a
    ``conftest.py`` -- confirm). Each document is opened in a ``with``
    block so its handle is released even when an assertion fails.
    """

    def test_returns_list_of_raw_text(self, simple_panel_pdf):
        """extract_words returns a list whose items are all RawText."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            assert isinstance(result, list)
            assert all(isinstance(t, RawText) for t in result)

    def test_dimension_values_present(self, simple_panel_pdf):
        """Word extraction finds dimension values."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            text_values = [t.text for t in result]
            # Substring match: a value may be embedded in a longer word.
            for expected in ("600", "720"):
                assert any(expected in v for v in text_values), (
                    f"'{expected}' not in words: {text_values}"
                )

    def test_word_extraction_font_empty(self, simple_panel_pdf):
        """Word-level extraction has empty font info (by design)."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            assert all(t.font == "" for t in result)

    def test_all_fixtures_extractable(self, all_fixture_pdfs):
        """All fixture PDFs can be text-extracted without error."""
        for pdf_path in all_fixture_pdfs:
            # One ``with`` per file so handles do not pile up across the loop.
            with pymupdf.open(str(pdf_path)) as doc:
                result = extract_words(doc[0])
                assert len(result) > 0, f"No words in {pdf_path.name}"
|
||||
Reference in New Issue
Block a user