# aucourt-ingest/tests/test_chunk_engine.py

"""Tests for ChunkEngine (mocked LLM)."""

import json
import pytest

from aucourt_ingest.processing.chunk_engine import ChunkEngine


class MockLLM:
    def __init__(self, sections: list[dict] | None = None):
        self._sections = sections

    async def create_message(self, prompt: str, system: str) -> str:
        if self._sections is not None:
            return json.dumps(self._sections)
        # Default: return single section covering whole text
        return json.dumps([{"section_type": "judgment", "start_char": 0, "end_char": 1000, "speaker": None}])


# Shared fixture: a short, fictional judgment transcript.
# The tests below address it by character offset (start_char / end_char), so
# the literal -- including the newline right after the opening quotes -- must
# not be reformatted without revisiting those offsets.
SAMPLE_TEXT = """
This is the opening of the judgment. The accused, John Smith, stood trial on one charge of murder
contrary to section 18 of the Crimes Act 1900 (NSW). The prosecution called five witnesses.

The first witness, Dr Jane Brown, gave evidence about the forensic examination.
She stated that the cause of death was blunt force trauma to the head.

The second witness, Officer Mark Jones, testified about the arrest.
He described the circumstances of the arrest and the items recovered.

His Honour then delivered the judgment.
After careful consideration of all the evidence, I find the accused guilty of murder.
The appropriate sentence is imprisonment for 18 years with a non-parole period of 13 years.
"""


@pytest.mark.asyncio
async def test_chunk_with_single_section():
    """A single LLM-reported section over the sample yields ordered judgment chunks."""
    whole_document = {
        "section_type": "judgment",
        "start_char": 0,
        "end_char": len(SAMPLE_TEXT),
        "speaker": None,
    }
    engine = ChunkEngine(llm=MockLLM([whole_document]))
    chunks = await engine.chunk(SAMPLE_TEXT, doc_id="[2019] NSWSC 1234")

    assert len(chunks) >= 1

    first = chunks[0]
    assert first.chunk_type == "judgment"
    assert first.doc_id == "[2019] NSWSC 1234"
    assert first.sequence == 0
    assert first.text
    assert first.token_count > 0

    # Sequence numbers must count up from zero without gaps.
    for expected_sequence, chunk in enumerate(chunks):
        assert chunk.sequence == expected_sequence


def test_chunk_with_multiple_sections():
    """Four caller-supplied sections map to four chunks, in order.

    Exercises the synchronous ``chunk_with_sections`` API directly (no LLM),
    so the test is a plain function: nothing here is awaited and no asyncio
    marker is needed.
    """
    sections = [
        {"section_type": "opening", "start_char": 0, "end_char": 180, "speaker": None},
        {"section_type": "testimony", "start_char": 180, "end_char": 320, "speaker": "Dr Jane Brown"},
        {"section_type": "testimony", "start_char": 320, "end_char": 450, "speaker": "Officer Mark Jones"},
        {"section_type": "judgment", "start_char": 450, "end_char": len(SAMPLE_TEXT), "speaker": None},
    ]
    engine = ChunkEngine()
    chunks = engine.chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections)

    assert len(chunks) == 4
    assert chunks[0].chunk_type == "opening"
    assert chunks[0].sequence == 0
    # Testimony chunks must carry the speaker given in their section.
    assert chunks[1].chunk_type == "testimony"
    assert chunks[1].speaker == "Dr Jane Brown"
    assert chunks[1].sequence == 1
    assert chunks[2].chunk_type == "testimony"
    assert chunks[2].speaker == "Officer Mark Jones"
    assert chunks[3].chunk_type == "judgment"
    assert chunks[3].sequence == 3


def test_chunk_with_precomputed_sections():
    """Chunk from caller-supplied sections via the synchronous API (no LLM)."""
    doc_id = "[2019] NSWSC 1234"
    boundary = 100
    sections = [
        {"section_type": "opening", "start_char": 0, "end_char": boundary, "speaker": None},
        {"section_type": "judgment", "start_char": boundary, "end_char": len(SAMPLE_TEXT), "speaker": None},
    ]
    chunks = ChunkEngine().chunk_with_sections(SAMPLE_TEXT, doc_id, sections)

    assert len(chunks) == 2
    for chunk in chunks:
        assert chunk.doc_id == doc_id
    # Sequence numbers run 0, 1 across the two sections.
    opening_chunk, judgment_chunk = chunks
    assert opening_chunk.sequence == 0
    assert judgment_chunk.sequence == 1


def test_chunk_ids_are_unique():
    """No two chunks produced for one document may share a chunk_id."""
    split_at = 50
    sections = [
        {"section_type": "opening", "start_char": 0, "end_char": split_at, "speaker": None},
        {"section_type": "judgment", "start_char": split_at, "end_char": len(SAMPLE_TEXT), "speaker": None},
    ]
    chunks = ChunkEngine().chunk_with_sections(SAMPLE_TEXT, "[2019] NSWSC 1234", sections)

    # A set collapses duplicates, so equal sizes mean every id is distinct.
    distinct_ids = {chunk.chunk_id for chunk in chunks}
    assert len(chunks) == len(distinct_ids), "Chunk IDs must be unique"


def test_empty_section_skipped():
    """A zero-length section produces no chunk; only the non-empty one remains."""
    zero_length = {"section_type": "opening", "start_char": 0, "end_char": 0, "speaker": None}
    remainder = {
        "section_type": "judgment",
        "start_char": 50,
        "end_char": len(SAMPLE_TEXT),
        "speaker": None,
    }
    chunks = ChunkEngine().chunk_with_sections(
        SAMPLE_TEXT,
        "[2019] NSWSC 1234",
        [zero_length, remainder],
    )

    assert len(chunks) == 1
    assert chunks[0].chunk_type == "judgment"


def test_large_section_splits():
    """A section larger than its token budget is split into several chunks.

    Every resulting chunk must keep the section's type and stay within the
    per-chunk token ceiling this test expects for "judgment" sections.
    """
    # Ceiling this test expects for judgment chunks (roughly 2400 chars at
    # ~4 chars/token). NOTE(review): confirm it matches ChunkEngine's budget.
    judgment_max_tokens = 600

    engine = ChunkEngine()
    # 21-char phrase x 500 = 10,500 chars, comfortably past the ceiling above.
    big_text = "The Court finds that " * 500
    sections = [
        {"section_type": "judgment", "start_char": 0, "end_char": len(big_text), "speaker": None},
    ]
    chunks = engine.chunk_with_sections(big_text, "[2019] NSWSC 1234", sections)

    assert len(chunks) > 1, "Large section should be split"
    for c in chunks:
        assert c.chunk_type == "judgment"
        assert c.token_count <= judgment_max_tokens


@pytest.mark.asyncio
async def test_broken_json_fallback():
    """An unparseable LLM reply still yields at least one "judgment" chunk."""
    mock = MockLLM()

    async def bad_create_message(prompt, system):
        # Deliberately not JSON, to drive the engine's parse-failure path.
        return "not json"

    # Shadow the mock's method on this instance only; the instance attribute
    # takes precedence over the class method, so the engine gets the bad reply.
    mock.create_message = bad_create_message

    engine = ChunkEngine(llm=mock)
    chunks = await engine.chunk("some text", "[2019] NSWSC 1234")

    # Expected fallback: the text is treated as a single judgment section.
    assert len(chunks) >= 1
    assert chunks[0].chunk_type == "judgment"