# aucourt-ingest/tests/test_doc_parser.py

"""Tests for DocParser."""

import pytest
from aucourt_ingest.processing.doc_parser import DocParser


# HTML fixture shaped like a judgment page: the judgment paragraphs sit in
# div.judgment > div.body, surrounded by <nav>, <footer> and <script> elements
# whose text the tests in this module expect to be absent from the parsed
# output. The <title> carries the citation "[2019] NSWSC 1234".
NSW_HTML = """<!DOCTYPE html>
<html>
<head><title>[2019] NSWSC 1234</title></head>
<body>
<nav class="navbar"><a href="/">Home</a></nav>
<div class="judgment">
<div class="body">
<h1>REGINA v SMITH</h1>
<p>The Court finds that the accused, John Smith, stood trial on charges of murder.</p>
<p>After considering the evidence, the jury returned a verdict of guilty on all counts.</p>
<p>The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.</p>
</div>
</div>
<footer>Copyright NSW Caselaw</footer>
<script>console.log('analytics')</script>
</body>
</html>
"""


class TestParseHTML:
    """Checks for ``DocParser.parse_html`` and HTML dispatch, run against ``NSW_HTML``."""

    def test_extracts_judgment_body(self):
        """Judgment paragraphs appear in ``raw_text`` and the result is tagged as HTML."""
        parsed = DocParser().parse_html(NSW_HTML, source_id="nsw_caselaw")
        for phrase in ("murder", "guilty", "18 years"):
            assert phrase in parsed.raw_text
        assert parsed.format == "html"
        assert parsed.char_count > 0

    def test_strips_nav_header_footer_script(self):
        """Text from the nav, footer and script elements is absent from ``raw_text``."""
        parsed = DocParser().parse_html(NSW_HTML)
        # One marker word per boilerplate element of the fixture:
        # "Home" -> <nav>, "Copyright" -> <footer>, "analytics" -> <script>.
        for marker in ("Home", "Copyright", "analytics"):
            assert marker not in parsed.raw_text

    def test_extracts_mnc(self):
        """``doc_id`` equals the citation string present in the fixture's <title>."""
        expected_citation = "[2019] NSWSC 1234"
        assert DocParser().parse_html(NSW_HTML).doc_id == expected_citation

    def test_byte_input(self):
        """UTF-8 encoded bytes are accepted in place of a ``str``."""
        payload = NSW_HTML.encode("utf-8")
        parsed = DocParser().parse_html(payload, source_id="nsw_caselaw")
        assert parsed.raw_text
        assert parsed.format == "html"

    def test_parse_dispatch_html(self):
        """``parse`` called with the ``"html"`` format yields an HTML result with the body text."""
        parsed = DocParser().parse(NSW_HTML, "html", source_id="test")
        assert parsed.format == "html"
        assert "murder" in parsed.raw_text


class TestParseDOCX:
    """Checks for ``DocParser.parse_docx``."""

    def test_docx_text_extraction(self):
        """Paragraph text of an in-memory DOCX is extracted; the supplied ``doc_id`` is kept."""
        from io import BytesIO
        from docx import Document

        # Build a small document with one heading and one paragraph.
        document = Document()
        document.add_heading("REGINA v SMITH", level=1)
        document.add_paragraph("The accused was found guilty of murder.")

        # Serialise it to bytes without touching the filesystem.
        stream = BytesIO()
        document.save(stream)
        stream.seek(0)
        payload = stream.read()

        result = DocParser().parse_docx(
            payload,
            source_id="fedcourt",
            doc_id="[2020] FCA 100",
        )
        for word in ("guilty", "murder"):
            assert word in result.raw_text
        assert result.doc_id == "[2020] FCA 100"
        assert result.format == "docx"


class TestParsePDF:
    """Checks for ``DocParser.parse_pdf``."""

    @staticmethod
    def _build_minimal_pdf(text: str) -> bytes:
        """Assemble a one-page PDF that draws *text* in Helvetica.

        The stream ``/Length``, the cross-reference offsets and the
        ``startxref`` value are computed from the bytes actually written, so
        they always agree with the file layout instead of relying on
        hand-counted numbers.

        Args:
            text: ASCII text to draw. Backslashes and parentheses are escaped
                for the PDF literal-string syntax.

        Returns:
            The complete PDF file as bytes.
        """
        escaped = text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
        content = f"BT /F1 12 Tf 100 700 Td ({escaped}) Tj ET".encode("ascii")

        # Object bodies in object-number order (1..5).
        objects = [
            b"<</Type/Catalog/Pages 2 0 R>>",
            b"<</Type/Pages/Kids[3 0 R]/Count 1>>",
            b"<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R"
            b"/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>",
            b"<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>",
            b"<</Length %d>>\nstream\n%b\nendstream" % (len(content), content),
        ]

        pdf = bytearray(b"%PDF-1.0\n")
        offsets = []
        for number, body in enumerate(objects, start=1):
            # Record where each object starts for the xref table below.
            offsets.append(len(pdf))
            pdf += b"%d 0 obj\n%b\nendobj\n" % (number, body)

        xref_offset = len(pdf)
        entry_count = len(objects) + 1  # +1 for the mandatory free entry 0
        pdf += b"xref\n0 %d\n" % entry_count
        # Each xref entry is exactly 20 bytes, including its two-byte line ending.
        pdf += b"0000000000 65535 f \n"
        for offset in offsets:
            pdf += b"%010d 00000 n \n" % offset
        pdf += b"trailer<</Size %d/Root 1 0 R>>\n" % entry_count
        pdf += b"startxref\n%d\n%%%%EOF" % xref_offset
        return bytes(pdf)

    def test_pdf_text_extraction(self):
        """Parsing a small well-formed PDF succeeds and keeps the supplied metadata."""
        minimal_pdf = self._build_minimal_pdf("Test judgment text")

        parser = DocParser()
        result = parser.parse_pdf(minimal_pdf, source_id="fedcourt", doc_id="[1990] FCA 50")
        assert result.format == "pdf"
        assert result.doc_id == "[1990] FCA 50"
        # NOTE(review): the extracted text is deliberately not asserted; whether
        # the PDF backend returns "Test judgment text" for this fixture has not
        # been confirmed. Consider asserting on result.raw_text once verified.


class TestParseDispatch:
    """Checks for the format dispatch performed by ``DocParser.parse``."""

    def test_unsupported_format_raises(self):
        """An unrecognised format name raises ``ValueError`` mentioning "Unsupported format"."""
        dispatcher = DocParser()
        with pytest.raises(ValueError, match="Unsupported format"):
            dispatcher.parse(b"content", "csv")

    def test_parse_dispatch_docx(self):
        """``parse`` called with the ``"docx"`` format yields a result tagged as DOCX."""
        from io import BytesIO
        from docx import Document

        # Serialise a one-paragraph document to bytes in memory.
        document = Document()
        document.add_paragraph("Test content")
        stream = BytesIO()
        document.save(stream)
        stream.seek(0)
        payload = stream.read()

        parsed = DocParser().parse(payload, "docx", source_id="test")
        assert parsed.format == "docx"