"""Tests for DocParser."""

import pytest

from aucourt_ingest.processing.doc_parser import DocParser

# Shared input for the HTML tests: a citation line, a case title and three
# paragraphs of judgment text.
# NOTE(review): no markup is visible in this fixture as it stands; presumably
# the original carried real HTML (nav / footer / script elements) -- confirm
# and restore, since several tests below only make sense against markup.
NSW_HTML = """ [2019] NSWSC 1234

REGINA v SMITH

The Court finds that the accused, John Smith, stood trial on charges of murder.

After considering the evidence, the jury returned a verdict of guilty on all counts.

The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.

"""


class TestParseHTML:
    """Checks on ``DocParser.parse_html`` and the ``"html"`` dispatch path."""

    def test_extracts_judgment_body(self):
        """Key phrases of the judgment appear in ``raw_text``; metadata is set."""
        parsed = DocParser().parse_html(NSW_HTML, source_id="nsw_caselaw")

        for phrase in ("murder", "guilty", "18 years"):
            assert phrase in parsed.raw_text
        assert parsed.format == "html"
        assert parsed.char_count > 0

    def test_strips_nav_header_footer_script(self):
        """Boilerplate marker words must be absent from ``raw_text``.

        NOTE(review): none of these words occur in ``NSW_HTML`` as written,
        so the assertions cannot fail against the current fixture.
        """
        parsed = DocParser().parse_html(NSW_HTML)

        # Marker word -> the page region it is meant to stand for.
        boilerplate = (
            ("Home", "nav"),
            ("Copyright", "footer"),
            ("analytics", "script"),
        )
        for marker, _region in boilerplate:
            assert marker not in parsed.raw_text

    def test_extracts_mnc(self):
        """The citation in the fixture becomes the document id."""
        parsed = DocParser().parse_html(NSW_HTML)
        assert parsed.doc_id == "[2019] NSWSC 1234"

    def test_byte_input(self):
        """UTF-8 encoded bytes are accepted in place of ``str``."""
        encoded = NSW_HTML.encode("utf-8")
        parsed = DocParser().parse_html(encoded, source_id="nsw_caselaw")

        assert parsed.raw_text
        assert parsed.format == "html"

    def test_parse_dispatch_html(self):
        """``parse`` called with ``"html"`` yields an HTML-format document."""
        parsed = DocParser().parse(NSW_HTML, "html", source_id="test")

        assert parsed.format == "html"
        assert "murder" in parsed.raw_text


class TestParseDOCX:
    """Checks on ``DocParser.parse_docx``."""

    def test_docx_text_extraction(self):
        """Heading and paragraph text survive a DOCX round trip."""
        from io import BytesIO

        from docx import Document

        # Build a small DOCX in memory instead of shipping a binary fixture.
        source = Document()
        source.add_heading("REGINA v SMITH", level=1)
        source.add_paragraph("The accused was found guilty of murder.")
        stream = BytesIO()
        source.save(stream)
        stream.seek(0)
        payload = stream.getvalue()

        parsed = DocParser().parse_docx(
            payload,
            source_id="fedcourt",
            doc_id="[2020] FCA 100",
        )

        for phrase in ("guilty", "murder"):
            assert phrase in parsed.raw_text
        assert parsed.doc_id == "[2020] FCA 100"
        assert parsed.format == "docx"


class TestParsePDF:
    """Checks on ``DocParser.parse_pdf``."""

    def test_pdf_text_extraction(self):
        """Parsing a tiny hand-written PDF completes and sets format and id.

        Text extraction from a PDF this small is not dependable, so
        ``raw_text`` is intentionally left unchecked.
        """
        # Hand-assembled PDF bytes; the parser needs real PDF input.
        # NOTE(review): the object dictionaries read as an empty "<>" here --
        # confirm this literal against the intended fixture.
        pdf_bytes = (
            b"%PDF-1.0 1 0 obj<>endobj 2 0 obj<>endobj "
            b"3 0 obj<>>>/Contents 5 0 R>>endobj 4 0 obj<>endobj "
            b"5 0 obj<>stream BT /F1 12 Tf 100 700 Td (Test judgment text) Tj ET "
            b"endstream endobj xref 0 6 "
            b"0000000000 65535 f 0000000009 00000 n 0000000058 00000 n "
            b"0000000115 00000 n 0000000266 00000 n 0000000340 00000 n "
            b"trailer<> startxref 434 \n"
            b"%%EOF"
        )

        parsed = DocParser().parse_pdf(
            pdf_bytes,
            source_id="fedcourt",
            doc_id="[1990] FCA 50",
        )

        assert parsed.format == "pdf"
        assert parsed.doc_id == "[1990] FCA 50"


class TestParseDispatch:
    """Checks on the format-dispatching ``DocParser.parse`` entry point."""

    def test_unsupported_format_raises(self):
        """An unknown format name is rejected with ``ValueError``."""
        doc_parser = DocParser()
        with pytest.raises(ValueError, match="Unsupported format"):
            doc_parser.parse(b"content", "csv")

    def test_parse_dispatch_docx(self):
        """``parse`` called with ``"docx"`` yields a DOCX-format document."""
        from io import BytesIO

        from docx import Document

        source = Document()
        source.add_paragraph("Test content")
        stream = BytesIO()
        source.save(stream)
        stream.seek(0)
        payload = stream.getvalue()

        parsed = DocParser().parse(payload, "docx", source_id="test")

        assert parsed.format == "docx"