"""Tests for PDF text extraction."""

import pymupdf

from pdf2imos.extract.text import extract_text, extract_words
from pdf2imos.models import RawText


class TestExtractText:
    """Checks on ``extract_text`` applied to the first page of fixture PDFs.

    Each test opens its document with a ``with`` block so the PyMuPDF
    document is closed even when an assertion fails.
    """

    def test_returns_list_of_raw_text(self, simple_panel_pdf):
        """extract_text returns a list whose items are all RawText."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            assert isinstance(result, list)
            assert all(isinstance(t, RawText) for t in result)

    def test_dimension_values_present(self, simple_panel_pdf):
        """simple_panel.pdf must have dimension values 600, 720, 18."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            text_values = [t.text for t in result]
            # Substring match: a value may be embedded in a longer span.
            for expected in ("600", "720", "18"):
                assert any(expected in v for v in text_values), (
                    f"'{expected}' not found in: {text_values}"
                )

    def test_material_annotation_in_cabinet(self, cabinet_basic_pdf):
        """cabinet_basic.pdf must have material annotation text."""
        with pymupdf.open(str(cabinet_basic_pdf)) as doc:
            result = extract_text(doc[0])
            all_text = " ".join(t.text for t in result)
            lowered = all_text.lower()
            # Any one of these case-insensitive keywords is sufficient.
            assert any(
                keyword in lowered for keyword in ("melamine", "mdf", "18mm")
            ), f"No material annotation found in: {all_text[:200]}"

    def test_bboxes_within_page(self, simple_panel_pdf):
        """All bounding boxes must be within page dimensions.

        A tolerance of 1 unit is allowed on every edge.
        """
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            page = doc[0]
            result = extract_text(page)
            pw, ph = page.rect.width, page.rect.height
            for t in result:
                x0, y0, x1, y1 = t.bbox
                assert x0 >= -1, f"x0 out of bounds: {x0}"
                assert y0 >= -1, f"y0 out of bounds: {y0}"
                assert x1 <= pw + 1, f"x1 out of bounds: {x1}"
                assert y1 <= ph + 1, f"y1 out of bounds: {y1}"

    def test_no_whitespace_only_spans(self, simple_panel_pdf):
        """No empty or whitespace-only text spans returned."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_text(doc[0])
            for t in result:
                assert t.text.strip(), (
                    f"Whitespace-only span found: repr={t.text!r}"
                )
class TestExtractWords:
    """Checks on ``extract_words`` applied to the first page of fixture PDFs.

    Each test opens its document with a ``with`` block so the PyMuPDF
    document is closed even when an assertion fails.
    """

    def test_returns_list_of_raw_text(self, simple_panel_pdf):
        """extract_words returns a list whose items are all RawText."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            assert isinstance(result, list)
            assert all(isinstance(t, RawText) for t in result)

    def test_dimension_values_present(self, simple_panel_pdf):
        """Word extraction finds dimension values."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            text_values = [t.text for t in result]
            # Substring match: a value may be embedded in a longer word.
            for expected in ("600", "720"):
                assert any(expected in v for v in text_values), (
                    f"'{expected}' not in words: {text_values}"
                )

    def test_word_extraction_font_empty(self, simple_panel_pdf):
        """Word-level extraction has empty font info (by design)."""
        with pymupdf.open(str(simple_panel_pdf)) as doc:
            result = extract_words(doc[0])
            assert all(t.font == "" for t in result)

    def test_all_fixtures_extractable(self, all_fixture_pdfs):
        """All fixture PDFs can be text-extracted without error."""
        for pdf_path in all_fixture_pdfs:
            # Close each document before opening the next so handles do not
            # accumulate across the loop.
            with pymupdf.open(str(pdf_path)) as doc:
                result = extract_words(doc[0])
                assert len(result) > 0, f"No words in {pdf_path.name}"