aucourt-ingest/tests/test_doc_parser.py
slothitude d77fe12cfc AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline
Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph),
property graph with 8 node types, juror subgraph queries with 6 personas,
orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-30 11:56:23 +10:00

135 lines
4.1 KiB
Python

"""Tests for DocParser."""
import pytest
from aucourt_ingest.processing.doc_parser import DocParser
# Sample judgment page shared by the HTML tests below.
# - The judgment text lives in <div class="judgment"><div class="body">.
# - <nav>, <footer> and <script> hold boilerplate ("Home", "Copyright",
#   "analytics") that the tests expect to be absent from the parsed text.
# - The <title> carries the citation "[2019] NSWSC 1234" that the tests
#   expect as the document id.
# The fixture has no <header> element.
NSW_HTML = """<!DOCTYPE html>
<html>
<head><title>[2019] NSWSC 1234</title></head>
<body>
<nav class="navbar"><a href="/">Home</a></nav>
<div class="judgment">
<div class="body">
<h1>REGINA v SMITH</h1>
<p>The Court finds that the accused, John Smith, stood trial on charges of murder.</p>
<p>After considering the evidence, the jury returned a verdict of guilty on all counts.</p>
<p>The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.</p>
</div>
</div>
<footer>Copyright NSW Caselaw</footer>
<script>console.log('analytics')</script>
</body>
</html>
"""
class TestParseHTML:
    """HTML parsing checks for ``DocParser`` using the ``NSW_HTML`` sample."""

    def test_extracts_judgment_body(self):
        """Judgment paragraphs survive parsing and basic fields are set."""
        parsed = DocParser().parse_html(NSW_HTML, source_id="nsw_caselaw")

        assert "murder" in parsed.raw_text
        assert "guilty" in parsed.raw_text
        assert "18 years" in parsed.raw_text
        assert parsed.format == "html"
        assert parsed.char_count > 0

    def test_strips_nav_header_footer_script(self):
        """Boilerplate outside the judgment body is absent from the text.

        NOTE(review): ``NSW_HTML`` contains no ``<header>`` element, so only
        nav, footer and script removal are actually exercised here.
        """
        parsed = DocParser().parse_html(NSW_HTML)
        text = parsed.raw_text

        assert "Home" not in text  # nav
        assert "Copyright" not in text  # footer
        assert "analytics" not in text  # script

    def test_extracts_mnc(self):
        """``doc_id`` equals the citation found in the sample's ``<title>``."""
        parsed = DocParser().parse_html(NSW_HTML)
        assert parsed.doc_id == "[2019] NSWSC 1234"

    def test_byte_input(self):
        """UTF-8 encoded bytes are accepted as well as ``str`` input."""
        encoded = NSW_HTML.encode("utf-8")
        parsed = DocParser().parse_html(encoded, source_id="nsw_caselaw")

        assert parsed.raw_text
        assert parsed.format == "html"

    def test_parse_dispatch_html(self):
        """``parse`` called with format ``"html"`` yields an HTML result."""
        parsed = DocParser().parse(NSW_HTML, "html", source_id="test")

        assert parsed.format == "html"
        assert "murder" in parsed.raw_text
class TestParseDOCX:
    """DOCX parsing checks for ``DocParser``."""

    def test_docx_text_extraction(self):
        """Heading and paragraph text of an in-memory DOCX are extracted.

        The document is built with python-docx and serialised to bytes, so no
        fixture file is needed on disk.
        """
        from io import BytesIO
        from docx import Document

        # Build the smallest document that has both a heading and body text.
        word_doc = Document()
        word_doc.add_heading("REGINA v SMITH", level=1)
        word_doc.add_paragraph("The accused was found guilty of murder.")

        stream = BytesIO()
        word_doc.save(stream)
        stream.seek(0)
        payload = stream.getvalue()

        parsed = DocParser().parse_docx(
            payload, source_id="fedcourt", doc_id="[2020] FCA 100"
        )

        assert "guilty" in parsed.raw_text
        assert "murder" in parsed.raw_text
        # The caller-supplied id is expected to be carried through unchanged.
        assert parsed.doc_id == "[2020] FCA 100"
        assert parsed.format == "docx"
class TestParsePDF:
    """PDF parsing checks for ``DocParser``."""

    @staticmethod
    def _build_minimal_pdf(text):
        """Return the bytes of a one-page PDF that draws *text* in Helvetica.

        The file is assembled programmatically so that the stream ``/Length``,
        every cross-reference offset and the ``startxref`` value match the
        real byte positions. A hand-typed fixture easily gets these wrong,
        which leaves the parser dependent on the PDF library's recovery path.

        Args:
            text: ASCII text to place on the page. Backslashes and
                parentheses are escaped for the PDF literal-string syntax.

        Returns:
            The complete PDF document as ``bytes``.

        Raises:
            UnicodeEncodeError: If *text* contains non-ASCII characters.
        """
        escaped = (
            text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
        )
        content = f"BT /F1 12 Tf 100 700 Td ({escaped}) Tj ET".encode("ascii")

        # Object bodies in object-number order (1..5).
        bodies = [
            b"<</Type/Catalog/Pages 2 0 R>>",
            b"<</Type/Pages/Kids[3 0 R]/Count 1>>",
            b"<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R"
            b"/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>",
            b"<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>",
            b"<</Length %d>>\nstream\n" % len(content) + content + b"\nendstream",
        ]

        pdf = bytearray(b"%PDF-1.0\n")
        offsets = []
        for number, body in enumerate(bodies, start=1):
            # Record where each object starts before appending it.
            offsets.append(len(pdf))
            pdf += b"%d 0 obj\n" % number + body + b"\nendobj\n"

        entry_count = len(bodies) + 1  # plus the mandatory free entry 0
        xref_offset = len(pdf)
        pdf += b"xref\n0 %d\n" % entry_count
        # Each xref entry must be exactly 20 bytes including its 2-byte EOL.
        pdf += b"0000000000 65535 f \n"
        for offset in offsets:
            pdf += b"%010d 00000 n \n" % offset
        pdf += b"trailer\n<</Size %d/Root 1 0 R>>\n" % entry_count
        pdf += b"startxref\n%d\n" % xref_offset
        pdf += b"%%EOF"
        return bytes(pdf)

    def test_pdf_text_extraction(self):
        """A minimal PDF parses without error and keeps the supplied id."""
        minimal_pdf = self._build_minimal_pdf("Test judgment text")

        parser = DocParser()
        result = parser.parse_pdf(
            minimal_pdf, source_id="fedcourt", doc_id="[1990] FCA 50"
        )

        assert result.format == "pdf"
        assert result.doc_id == "[1990] FCA 50"
        # NOTE(review): the extracted text is intentionally not asserted here.
        # With a well-formed fixture, consider adding
        # `assert "Test judgment text" in result.raw_text` once confirmed
        # against the PDF backend in use.
class TestParseDispatch:
    """Checks for format routing through ``DocParser.parse``."""

    def test_unsupported_format_raises(self):
        """An unknown format name is rejected with ``ValueError``."""
        doc_parser = DocParser()
        # Construct the parser outside the context manager so that only the
        # parse() call itself is allowed to raise.
        with pytest.raises(ValueError, match="Unsupported format"):
            doc_parser.parse(b"content", "csv")

    def test_parse_dispatch_docx(self):
        """``parse`` called with format ``"docx"`` yields a DOCX result."""
        from io import BytesIO
        from docx import Document

        word_doc = Document()
        word_doc.add_paragraph("Test content")

        stream = BytesIO()
        word_doc.save(stream)
        stream.seek(0)
        payload = stream.getvalue()

        parsed = DocParser().parse(payload, "docx", source_id="test")
        assert parsed.format == "docx"