aucourt-ingest/tests/test_doc_parser.py

"""Tests for DocParser."""

import pytest
from aucourt_ingest.processing.doc_parser import DocParser


# HTML fixture shared by the HTML parsing tests below.
# - The <title> holds the citation string that test_extracts_mnc expects as doc_id.
# - The judgment paragraphs live in div.judgment > div.body.
# - <nav>, <footer> and <script> carry marker words ("Home", "Copyright",
#   "analytics") that the tests expect to be absent from the parsed text.
# NOTE(review): the fixture contains no <header> element, so header stripping
# is not actually exercised by any test that uses it.
NSW_HTML = """<!DOCTYPE html>
<html>
<head><title>[2019] NSWSC 1234</title></head>
<body>
<nav class="navbar"><a href="/">Home</a></nav>
<div class="judgment">
<div class="body">
<h1>REGINA v SMITH</h1>
<p>The Court finds that the accused, John Smith, stood trial on charges of murder.</p>
<p>After considering the evidence, the jury returned a verdict of guilty on all counts.</p>
<p>The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.</p>
</div>
</div>
<footer>Copyright NSW Caselaw</footer>
<script>console.log('analytics')</script>
</body>
</html>
"""


class TestParseHTML:
    """Checks for ``DocParser.parse_html`` (and HTML dispatch) using ``NSW_HTML``."""

    def test_extracts_judgment_body(self):
        """Key phrases from the judgment paragraphs appear in ``raw_text``."""
        parsed = DocParser().parse_html(NSW_HTML, source_id="nsw_caselaw")
        for phrase in ("murder", "guilty", "18 years"):
            assert phrase in parsed.raw_text
        assert parsed.format == "html"
        assert parsed.char_count > 0

    def test_strips_nav_header_footer_script(self):
        """Marker words from nav, footer and script must not reach ``raw_text``."""
        parsed = DocParser().parse_html(NSW_HTML)
        # One marker per boilerplate element present in the fixture:
        # "Home" -> nav, "Copyright" -> footer, "analytics" -> script.
        for marker in ("Home", "Copyright", "analytics"):
            assert marker not in parsed.raw_text

    def test_extracts_mnc(self):
        """``doc_id`` equals the citation found in the fixture's <title>."""
        expected_citation = "[2019] NSWSC 1234"
        parsed = DocParser().parse_html(NSW_HTML)
        assert parsed.doc_id == expected_citation

    def test_byte_input(self):
        """UTF-8 encoded bytes are accepted as input, not only ``str``."""
        payload = NSW_HTML.encode("utf-8")
        parsed = DocParser().parse_html(payload, source_id="nsw_caselaw")
        assert parsed.raw_text
        assert parsed.format == "html"

    def test_parse_dispatch_html(self):
        """``parse`` called with format ``"html"`` yields an HTML result."""
        parsed = DocParser().parse(NSW_HTML, "html", source_id="test")
        assert parsed.format == "html"
        assert "murder" in parsed.raw_text


class TestParseDOCX:
    """Checks for ``DocParser.parse_docx`` using a document built in memory."""

    def test_docx_text_extraction(self):
        """Paragraph text is extracted and the supplied ``doc_id`` is kept."""
        from io import BytesIO
        from docx import Document

        # Build a small DOCX with a heading and one paragraph.
        source_doc = Document()
        source_doc.add_heading("REGINA v SMITH", level=1)
        source_doc.add_paragraph("The accused was found guilty of murder.")

        # Serialise to bytes without touching the filesystem.
        stream = BytesIO()
        source_doc.save(stream)
        stream.seek(0)
        docx_bytes = stream.getvalue()

        parsed = DocParser().parse_docx(
            docx_bytes,
            source_id="fedcourt",
            doc_id="[2020] FCA 100",
        )
        for phrase in ("guilty", "murder"):
            assert phrase in parsed.raw_text
        assert parsed.doc_id == "[2020] FCA 100"
        assert parsed.format == "docx"


class TestParsePDF:
    """Smoke check for ``DocParser.parse_pdf`` on a hand-written PDF."""

    def test_pdf_text_extraction(self):
        """Parsing the fixture completes and reports the expected metadata."""
        # Hand-written one-page PDF whose content stream shows a single string.
        # NOTE(review): the declared /Length (44) and the xref byte offsets do
        # not appear to match the actual layout of this literal, so it is only
        # loosely a valid PDF -- confirm the parser is expected to tolerate that.
        pdf_bytes = b"""%PDF-1.0
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>endobj
4 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj
5 0 obj<</Length 44>>stream
BT /F1 12 Tf 100 700 Td (Test judgment text) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000340 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref
434
%%EOF"""

        parsed = DocParser().parse_pdf(
            pdf_bytes,
            source_id="fedcourt",
            doc_id="[1990] FCA 50",
        )
        assert parsed.format == "pdf"
        assert parsed.doc_id == "[1990] FCA 50"
        # pdfminer may or may not extract text from this fixture, so raw_text
        # is deliberately not asserted; getting here without an exception is
        # the main thing being checked.


class TestParseDispatch:
    """Checks for the format-based dispatch performed by ``DocParser.parse``."""

    def test_unsupported_format_raises(self):
        """An unknown format string is rejected with ``ValueError``."""
        dispatcher = DocParser()
        with pytest.raises(ValueError, match="Unsupported format"):
            dispatcher.parse(b"content", "csv")

    def test_parse_dispatch_docx(self):
        """``parse`` called with format ``"docx"`` yields a DOCX result."""
        from io import BytesIO
        from docx import Document

        # Build a one-paragraph DOCX and serialise it in memory.
        source_doc = Document()
        source_doc.add_paragraph("Test content")
        stream = BytesIO()
        source_doc.save(stream)
        stream.seek(0)
        docx_bytes = stream.getvalue()

        parsed = DocParser().parse(docx_bytes, "docx", source_id="test")
        assert parsed.format == "docx"
AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`"""Tests for DocParser."""`

			`import pytest`
			`from aucourt_ingest.processing.doc_parser import DocParser`


			`NSW_HTML = """<!DOCTYPE html>`
			`<html>`
			`<head><title>[2019] NSWSC 1234</title></head>`
			`<body>`
			`<nav class="navbar"><a href="/">Home</a></nav>`
			`<div class="judgment">`
			`<div class="body">`
			`<h1>REGINA v SMITH</h1>`
			`<p>The Court finds that the accused, John Smith, stood trial on charges of murder.</p>`
			`<p>After considering the evidence, the jury returned a verdict of guilty on all counts.</p>`
			`<p>The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.</p>`
			`</div>`
			`</div>`
			`<footer>Copyright NSW Caselaw</footer>`
			`<script>console.log('analytics')</script>`
			`</body>`
			`</html>`
			`"""`


			`class TestParseHTML:`
			`def test_extracts_judgment_body(self):`
			`parser = DocParser()`
			`doc = parser.parse_html(NSW_HTML, source_id="nsw_caselaw")`
			`assert "murder" in doc.raw_text`
			`assert "guilty" in doc.raw_text`
			`assert "18 years" in doc.raw_text`
			`assert doc.format == "html"`
			`assert doc.char_count > 0`

			`def test_strips_nav_header_footer_script(self):`
			`parser = DocParser()`
			`doc = parser.parse_html(NSW_HTML)`
			`assert "Home" not in doc.raw_text # nav`
			`assert "Copyright" not in doc.raw_text # footer`
			`assert "analytics" not in doc.raw_text # script`

			`def test_extracts_mnc(self):`
			`parser = DocParser()`
			`doc = parser.parse_html(NSW_HTML)`
			`assert doc.doc_id == "[2019] NSWSC 1234"`

			`def test_byte_input(self):`
			`parser = DocParser()`
			`doc = parser.parse_html(NSW_HTML.encode("utf-8"), source_id="nsw_caselaw")`
			`assert doc.raw_text`
			`assert doc.format == "html"`

			`def test_parse_dispatch_html(self):`
			`parser = DocParser()`
			`doc = parser.parse(NSW_HTML, "html", source_id="test")`
			`assert doc.format == "html"`
			`assert "murder" in doc.raw_text`


			`class TestParseDOCX:`
			`def test_docx_text_extraction(self):`
			`from docx import Document`
			`from io import BytesIO`

			`doc = Document()`
			`doc.add_heading("REGINA v SMITH", level=1)`
			`doc.add_paragraph("The accused was found guilty of murder.")`

			`buf = BytesIO()`
			`doc.save(buf)`
			`buf.seek(0)`

			`parser = DocParser()`
			`result = parser.parse_docx(buf.getvalue(), source_id="fedcourt", doc_id="[2020] FCA 100")`
			`assert "guilty" in result.raw_text`
			`assert "murder" in result.raw_text`
			`assert result.doc_id == "[2020] FCA 100"`
			`assert result.format == "docx"`


			`class TestParsePDF:`
			`def test_pdf_text_extraction(self):`
			`# Create a minimal valid PDF with text`
			`# pdfminer needs actual PDF bytes`
			`minimal_pdf = b"""%PDF-1.0`
			`1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj`
			`2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj`
			`3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>endobj`
			`4 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj`
			`5 0 obj<</Length 44>>stream`
			`BT /F1 12 Tf 100 700 Td (Test judgment text) Tj ET`
			`endstream`
			`endobj`
			`xref`
			`0 6`
			`0000000000 65535 f`
			`0000000009 00000 n`
			`0000000058 00000 n`
			`0000000115 00000 n`
			`0000000266 00000 n`
			`0000000340 00000 n`
			`trailer<</Size 6/Root 1 0 R>>`
			`startxref`
			`434`
			`%%EOF"""`

			`parser = DocParser()`
			`result = parser.parse_pdf(minimal_pdf, source_id="fedcourt", doc_id="[1990] FCA 50")`
			`assert result.format == "pdf"`
			`assert result.doc_id == "[1990] FCA 50"`
			`# pdfminer may or may not extract text from this minimal PDF`
			`# Just check it doesn't crash`


			`class TestParseDispatch:`
			`def test_unsupported_format_raises(self):`
			`parser = DocParser()`
			`with pytest.raises(ValueError, match="Unsupported format"):`
			`parser.parse(b"content", "csv")`

			`def test_parse_dispatch_docx(self):`
			`from docx import Document`
			`from io import BytesIO`

			`doc = Document()`
			`doc.add_paragraph("Test content")`
			`buf = BytesIO()`
			`doc.save(buf)`
			`buf.seek(0)`

			`parser = DocParser()`
			`result = parser.parse(buf.getvalue(), "docx", source_id="test")`
			`assert result.format == "docx"`