Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
149 lines
5.6 KiB
Python
149 lines
5.6 KiB
Python
"""Tests for ChunkEngine (mocked LLM)."""
|
|
|
|
import json
|
|
import pytest
|
|
|
|
from aucourt_ingest.processing.chunk_engine import ChunkEngine
|
|
|
|
|
|
class MockLLM:
|
|
def __init__(self, sections: list[dict] | None = None):
|
|
self._sections = sections
|
|
|
|
async def create_message(self, prompt: str, system: str) -> str:
|
|
if self._sections is not None:
|
|
return json.dumps(self._sections)
|
|
# Default: return single section covering whole text
|
|
return json.dumps([{"section_type": "judgment", "start_char": 0, "end_char": 1000, "speaker": None}])
|
|
|
|
|
|
# Miniature judgment used as chunking input. Four paragraphs separated by
# blank lines: the opening, two witness passages, and the judgment itself.
# The tests below slice this string by hard-coded character offsets, so any
# edit to the text shifts where those section boundaries fall.
SAMPLE_TEXT = """
This is the opening of the judgment. The accused, John Smith, stood trial on one charge of murder
contrary to section 18 of the Crimes Act 1900 (NSW). The prosecution called five witnesses.

The first witness, Dr Jane Brown, gave evidence about the forensic examination.
She stated that the cause of death was blunt force trauma to the head.

The second witness, Officer Mark Jones, testified about the arrest.
He described the circumstances of the arrest and the items recovered.

His Honour then delivered the judgment.
After careful consideration of all the evidence, I find the accused guilty of murder.
The appropriate sentence is imprisonment for 18 years with a non-parole period of 13 years.
"""
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_chunk_with_single_section():
    """A mocked LLM reporting one judgment section spanning SAMPLE_TEXT.

    Checks the first chunk's type, document id, sequence, text and token
    count, then that every chunk's sequence equals its list position.
    """
    doc_id = "[2019] NSWSC 1234"
    whole_document = {
        "section_type": "judgment",
        "start_char": 0,
        "end_char": len(SAMPLE_TEXT),
        "speaker": None,
    }
    llm = MockLLM([whole_document])

    chunks = await ChunkEngine(llm=llm).chunk(SAMPLE_TEXT, doc_id=doc_id)

    assert len(chunks) >= 1

    first = chunks[0]
    assert first.chunk_type == "judgment"
    assert first.doc_id == doc_id
    assert first.sequence == 0
    assert first.text
    assert first.token_count > 0

    # Sequence numbers must run 0, 1, 2, ... in list order.
    for position, chunk in enumerate(chunks):
        assert chunk.sequence == position
|
|
|
|
|
|
def test_chunk_with_multiple_sections():
    """Four precomputed sections yield four chunks with matching metadata.

    This test is synchronous: ``chunk_with_sections`` is called without
    ``await``, so neither ``async def`` nor the asyncio marker is needed.
    It checks chunk type and speaker where the sections specify them, and
    that every chunk (including the third) has a sequence equal to its
    list position.
    """
    doc_id = "[2019] NSWSC 1234"
    sections = [
        {"section_type": "opening", "start_char": 0, "end_char": 180, "speaker": None},
        {"section_type": "testimony", "start_char": 180, "end_char": 320, "speaker": "Dr Jane Brown"},
        {"section_type": "testimony", "start_char": 320, "end_char": 450, "speaker": "Officer Mark Jones"},
        {"section_type": "judgment", "start_char": 450, "end_char": len(SAMPLE_TEXT), "speaker": None},
    ]
    engine = ChunkEngine()
    chunks = engine.chunk_with_sections(SAMPLE_TEXT, doc_id, sections)

    assert len(chunks) == 4

    assert chunks[0].chunk_type == "opening"

    assert chunks[1].chunk_type == "testimony"
    assert chunks[1].speaker == "Dr Jane Brown"

    assert chunks[2].chunk_type == "testimony"
    assert chunks[2].speaker == "Officer Mark Jones"

    assert chunks[3].chunk_type == "judgment"

    # Check ordering for all four chunks rather than a hand-picked subset.
    for position, chunk in enumerate(chunks):
        assert chunk.sequence == position
|
|
|
|
|
|
def test_chunk_with_precomputed_sections():
    """Exercise the synchronous chunk_with_sections path (no LLM supplied)."""
    doc_id = "[2019] NSWSC 1234"
    boundary = 100  # character offset where "opening" ends and "judgment" begins
    engine = ChunkEngine()
    opening = {"section_type": "opening", "start_char": 0, "end_char": boundary, "speaker": None}
    judgment = {"section_type": "judgment", "start_char": boundary, "end_char": len(SAMPLE_TEXT), "speaker": None}

    chunks = engine.chunk_with_sections(SAMPLE_TEXT, doc_id, [opening, judgment])

    assert len(chunks) == 2
    for chunk in chunks:
        assert chunk.doc_id == doc_id

    # Sequence numbers must be contiguous across the two sections.
    head, tail = chunks[0], chunks[1]
    assert head.sequence == 0
    assert tail.sequence == 1
|
|
|
|
|
|
def test_chunk_ids_are_unique():
    """No two chunks produced from the same document may share a chunk_id."""
    engine = ChunkEngine()
    opening = {"section_type": "opening", "start_char": 0, "end_char": 50, "speaker": None}
    judgment = {"section_type": "judgment", "start_char": 50, "end_char": len(SAMPLE_TEXT), "speaker": None}

    chunks = engine.chunk_with_sections(
        SAMPLE_TEXT,
        "[2019] NSWSC 1234",
        [opening, judgment],
    )

    # Collapsing the ids into a set drops duplicates; sizes differ if any repeat.
    all_ids = [chunk.chunk_id for chunk in chunks]
    distinct_ids = set(all_ids)
    assert len(all_ids) == len(distinct_ids), "Chunk IDs must be unique"
|
|
|
|
|
|
def test_empty_section_skipped():
    """A zero-length section (start_char == end_char) must produce no chunk."""
    engine = ChunkEngine()
    zero_length = {"section_type": "opening", "start_char": 0, "end_char": 0, "speaker": None}
    remainder = {"section_type": "judgment", "start_char": 50, "end_char": len(SAMPLE_TEXT), "speaker": None}

    chunks = engine.chunk_with_sections(
        SAMPLE_TEXT,
        "[2019] NSWSC 1234",
        [zero_length, remainder],
    )

    # Only the non-empty judgment section should survive.
    assert len(chunks) == 1
    survivor = chunks[0]
    assert survivor.chunk_type == "judgment"
|
|
|
|
|
|
def test_large_section_splits():
    """A section larger than max token budget should split into multiple chunks."""
    # Ceiling this test expects for judgment chunks.
    # NOTE(review): mirrors the engine's configured limit — confirm if that changes.
    max_judgment_tokens = 600
    engine = ChunkEngine()

    # 21 characters repeated 500 times = 10,500 characters of input.
    big_text = "The Court finds that " * 500
    oversized = {
        "section_type": "judgment",
        "start_char": 0,
        "end_char": len(big_text),
        "speaker": None,
    }

    chunks = engine.chunk_with_sections(big_text, "[2019] NSWSC 1234", [oversized])

    assert len(chunks) > 1, "Large section should be split"
    for piece in chunks:
        assert piece.chunk_type == "judgment"
        assert piece.token_count <= max_judgment_tokens
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_broken_json_fallback():
    """An LLM reply that is not valid JSON must still yield chunks.

    The mock's ``create_message`` is replaced on the instance with a
    coroutine returning the literal string ``"not json"``. The engine is
    then expected to fall back to treating the text as a judgment section:
    at least one chunk, the first of type ``"judgment"``.
    """

    async def bad_create_message(prompt, system):
        return "not json"

    mock = MockLLM()
    # Override on the instance only; the canned sections in MockLLM are
    # never consulted, so no other mock state needs to be set up.
    mock.create_message = bad_create_message

    engine = ChunkEngine(llm=mock)
    chunks = await engine.chunk("some text", "[2019] NSWSC 1234")

    # Should fall back to single judgment section
    assert len(chunks) >= 1
    assert chunks[0].chunk_type == "judgment"
|