136 lines
4.1 KiB
Python
136 lines
4.1 KiB
Python
|
|
"""Tests for DocParser."""
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
from aucourt_ingest.processing.doc_parser import DocParser
|
||
|
|
|
||
|
|
|
||
|
|
# Fixture: a small judgment page used by the HTML parsing tests below.
# It contains a <title> holding the citation "[2019] NSWSC 1234", a
# div.judgment > div.body with the judgment text, and page chrome (a <nav>,
# a <footer> and a <script>) that the tests expect to be absent from the
# extracted text.
NSW_HTML = """<!DOCTYPE html>
<html>
<head><title>[2019] NSWSC 1234</title></head>
<body>
<nav class="navbar"><a href="/">Home</a></nav>
<div class="judgment">
<div class="body">
<h1>REGINA v SMITH</h1>
<p>The Court finds that the accused, John Smith, stood trial on charges of murder.</p>
<p>After considering the evidence, the jury returned a verdict of guilty on all counts.</p>
<p>The accused is sentenced to imprisonment for 18 years with a non-parole period of 13 years.</p>
</div>
</div>
<footer>Copyright NSW Caselaw</footer>
<script>console.log('analytics')</script>
</body>
</html>
"""
|
||
|
|
|
||
|
|
|
||
|
|
class TestParseHTML:
    """HTML parsing tests driven by the module-level ``NSW_HTML`` fixture."""

    def test_extracts_judgment_body(self):
        """The judgment paragraphs end up in ``raw_text`` of an html document."""
        parsed = DocParser().parse_html(NSW_HTML, source_id="nsw_caselaw")

        # One fragment from each of the three <p> elements in the fixture.
        for fragment in ("murder", "guilty", "18 years"):
            assert fragment in parsed.raw_text
        assert parsed.format == "html"
        assert parsed.char_count > 0

    def test_strips_nav_header_footer_script(self):
        """Text from page chrome must not appear in the extracted text."""
        extracted = DocParser().parse_html(NSW_HTML).raw_text

        # Each marker occurs only inside the named chrome element of the
        # fixture, so finding it would mean that element was not removed.
        chrome_markers = {
            "nav": "Home",
            "footer": "Copyright",
            "script": "analytics",
        }
        for marker in chrome_markers.values():
            assert marker not in extracted

    def test_extracts_mnc(self):
        """``doc_id`` equals the citation string present in the fixture.

        The expected value is the text of the fixture's <title>; how the
        parser locates it is not visible from this test.
        """
        parsed = DocParser().parse_html(NSW_HTML)
        assert parsed.doc_id == "[2019] NSWSC 1234"

    def test_byte_input(self):
        """UTF-8 encoded bytes are accepted in place of ``str`` input."""
        encoded = NSW_HTML.encode("utf-8")
        parsed = DocParser().parse_html(encoded, source_id="nsw_caselaw")

        assert parsed.raw_text
        assert parsed.format == "html"

    def test_parse_dispatch_html(self):
        """``parse`` called with the ``"html"`` format yields an html document."""
        parsed = DocParser().parse(NSW_HTML, "html", source_id="test")

        assert parsed.format == "html"
        assert "murder" in parsed.raw_text
|
||
|
|
|
||
|
|
|
||
|
|
class TestParseDOCX:
    """DOCX parsing tests using a document built in memory with python-docx."""

    def test_docx_text_extraction(self):
        """Paragraph text survives a build -> serialize -> parse round trip."""
        from io import BytesIO

        from docx import Document

        # Build a two-element document: one heading and one paragraph.
        source = Document()
        source.add_heading("REGINA v SMITH", level=1)
        source.add_paragraph("The accused was found guilty of murder.")

        # Serialize to bytes. getvalue() returns the whole buffer regardless
        # of the current stream position, so no rewind is required.
        stream = BytesIO()
        source.save(stream)
        payload = stream.getvalue()

        result = DocParser().parse_docx(
            payload,
            source_id="fedcourt",
            doc_id="[2020] FCA 100",
        )

        for fragment in ("guilty", "murder"):
            assert fragment in result.raw_text
        # The explicitly supplied doc_id is reported back unchanged.
        assert result.doc_id == "[2020] FCA 100"
        assert result.format == "docx"
|
||
|
|
|
||
|
|
|
||
|
|
class TestParsePDF:
    """PDF parsing smoke test using a hand-written single-page PDF."""

    def test_pdf_text_extraction(self):
        """Parsing the inline PDF completes and reports format and doc_id.

        The parser is given real PDF bytes: a catalog, a page tree, one page,
        a Helvetica font and a content stream that draws "Test judgment text".
        """
        pdf_bytes = b"""%PDF-1.0
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>endobj
4 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj
5 0 obj<</Length 44>>stream
BT /F1 12 Tf 100 700 Td (Test judgment text) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000340 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref
434
%%EOF"""

        result = DocParser().parse_pdf(
            pdf_bytes,
            source_id="fedcourt",
            doc_id="[1990] FCA 50",
        )

        assert result.format == "pdf"
        assert result.doc_id == "[1990] FCA 50"
        # NOTE: the extraction backend (pdfminer, per the original author's
        # note) may or may not recover text from a PDF this small, so
        # raw_text is deliberately not asserted; the test only requires that
        # parsing finishes without raising.
|
||
|
|
|
||
|
|
|
||
|
|
class TestParseDispatch:
    """Tests for the format-dispatching ``DocParser.parse`` entry point."""

    def test_unsupported_format_raises(self):
        """An unknown format name is rejected with a ValueError."""
        # Construct the parser outside the raises-block so that only the
        # parse() call itself is subject to the exception check.
        parser = DocParser()
        with pytest.raises(ValueError, match="Unsupported format"):
            parser.parse(b"content", "csv")

    def test_parse_dispatch_docx(self):
        """``parse`` called with the ``"docx"`` format yields a docx document."""
        from io import BytesIO

        from docx import Document

        source = Document()
        source.add_paragraph("Test content")

        # getvalue() returns the full buffer irrespective of stream position.
        stream = BytesIO()
        source.save(stream)
        payload = stream.getvalue()

        result = DocParser().parse(payload, "docx", source_id="test")
        assert result.format == "docx"
|