"""Tests for ChunkEngine (mocked LLM).""" import json import pytest from aucourt_ingest.processing.chunk_engine import ChunkEngine class MockLLM: def __init__(self, sections: list[dict] | None = None): self._sections = sections async def create_message(self, prompt: str, system: str) -> str: if self._sections is not None: return json.dumps(self._sections) # Default: return single section covering whole text return json.dumps([{"section_type": "judgment", "start_char": 0, "end_char": 1000, "speaker": None}]) SAMPLE_TEXT = """ This is the opening of the judgment. The accused, John Smith, stood trial on one charge of murder contrary to section 18 of the Crimes Act 1900 (NSW). The prosecution called five witnesses. The first witness, Dr Jane Brown, gave evidence about the forensic examination. She stated that the cause of death was blunt force trauma to the head. The second witness, Officer Mark Jones, testified about the arrest. He described the circumstances of the arrest and the items recovered. His Honour then delivered the judgment. After careful consideration of all the evidence, I find the accused guilty of murder. The appropriate sentence is imprisonment for 18 years with a non-parole period of 13 years. """ @pytest.mark.asyncio async def test_chunk_with_single_section(): mock = MockLLM([{ "section_type": "judgment", "start_char": 0, "end_char": len(SAMPLE_TEXT), "speaker": None, }]) engine = ChunkEngine(llm=mock) chunks = await engine.chunk(SAMPLE_TEXT, doc_id="[2019] NSWSC 1234") assert len(chunks) >= 1 assert chunks[0].chunk_type == "judgment" assert chunks[0].doc_id == "[2019] NSWSC 1234" assert chunks[0].sequence == 0 assert chunks[0].text assert chunks[0].token_count > 0 # Check sequential ordering for i, c in enumerate(chunks): assert c.sequence == i @pytest.mark.asyncio async def test_chunk_with_multiple_sections(): sections = [ {"section_type": "opening", "start_char": 0, "end_char": 180, "speaker": None}, {"section_type": "testimony", "start_char": 180, "end_char": 320, "speaker": "Dr Jane Brown"}, {"section_type": "testimony", "start_char": 320, "end_char": 450, "speaker": "Officer Mark Jones"}, {"section_type": "judgment", "start_char": 450, "end_char": len(SAMPLE_TEXT), "speaker": None}, ] engine = ChunkEngine() chunks = engine.chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections) assert len(chunks) == 4 assert chunks[0].chunk_type == "opening" assert chunks[0].sequence == 0 assert chunks[1].chunk_type == "testimony" assert chunks[1].speaker == "Dr Jane Brown" assert chunks[1].sequence == 1 assert chunks[2].chunk_type == "testimony" assert chunks[2].speaker == "Officer Mark Jones" assert chunks[3].chunk_type == "judgment" assert chunks[3].sequence == 3 def test_chunk_with_precomputed_sections(): """Test synchronous chunk_with_sections method (no LLM).""" engine = ChunkEngine() sections = [ {"section_type": "opening", "start_char": 0, "end_char": 100, "speaker": None}, {"section_type": "judgment", "start_char": 100, "end_char": len(SAMPLE_TEXT), "speaker": None}, ] chunks = engine.chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections) assert len(chunks) == 2 assert all(c.doc_id == "[2019] NSWSC 1234" for c in chunks) # Sequence should be continuous assert chunks[0].sequence == 0 assert chunks[1].sequence == 1 def test_chunk_ids_are_unique(): engine = ChunkEngine() sections = [ {"section_type": "opening", "start_char": 0, "end_char": 50, "speaker": None}, {"section_type": "judgment", "start_char": 50, "end_char": len(SAMPLE_TEXT), "speaker": None}, ] chunks = engine.chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections) ids = [c.chunk_id for c in chunks] assert len(ids) == len(set(ids)), "Chunk IDs must be unique" def test_empty_section_skipped(): engine = ChunkEngine() sections = [ {"section_type": "opening", "start_char": 0, "end_char": 0, "speaker": None}, {"section_type": "judgment", "start_char": 50, "end_char": len(SAMPLE_TEXT), "speaker": None}, ] chunks = engine.chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections) assert len(chunks) == 1 assert chunks[0].chunk_type == "judgment" def test_large_section_splits(): """A section larger than max token budget should split into multiple chunks.""" engine = ChunkEngine() # Create text large enough to exceed judgment max of 600 tokens (~2400 chars) big_text = "The Court finds that " * 500 # ~5000 chars sections = [ {"section_type": "judgment", "start_char": 0, "end_char": len(big_text), "speaker": None}, ] chunks = engine.chunk_with_sections(big_text, "[2019] NSWSC 1234", sections) assert len(chunks) > 1, "Large section should be split" for c in chunks: assert c.chunk_type == "judgment" assert c.token_count <= 600 # max tokens for judgment type @pytest.mark.asyncio async def test_broken_json_fallback(): mock = MockLLM(sections=None) # Will return bad response mock._sections = "not json at all" mock._return_bad = True async def bad_create_message(prompt, system): return "not json" mock.create_message = bad_create_message engine = ChunkEngine(llm=mock) chunks = await engine.chunk("some text", "[2019] NSWSC 1234") # Should fall back to single judgment section assert len(chunks) >= 1 assert chunks[0].chunk_type == "judgment"