aucourt-ingest/tests/test_pipeline.py

"""End-to-end integration test: 3 documents through full pipeline to graph.

Uses mocked LLM and embedding clients. Tests the full flow:
  RawDocument → DocParser → MetaExtractor → ChunkEngine → EmbedEngine → GraphBuilder
"""

from __future__ import annotations

import json
import pytest

from aucourt_ingest.models import (
    CaseMeta, Chunk, RawDocument, FetchStatus, Verdict, MatterType,
)
from aucourt_ingest.processing.pipeline import FullPipeline
from aucourt_ingest.orchestrator import Orchestrator
from aucourt_ingest.storage.in_memory_graph_db import InMemoryGraphDB


# ── Mock LLM clients ──

class MockLLMClient:
    """Mock LLM that returns canned JSON for meta extraction and chunking."""

    def _make_meta_response(self, text: str) -> str:
        """Generate meta JSON with MNC extracted from text."""
        import re
        mnc_match = re.search(r'\[(\d{4})\]\s+(\w+)\s+(\d+)', text)
        if mnc_match:
            mnc = f"[{mnc_match.group(1)}] {mnc_match.group(2)} {mnc_match.group(3)}"
            name = f"R v Case{mnc_match.group(3)}"
        else:
            mnc = "[2023] NSWSC 999"
            name = "R v Default"
        return json.dumps({
            "case_name": name,
            "mnc": mnc,
            "court": mnc.split()[1] if len(mnc.split()) > 1 else "NSWSC",
            "date_delivered": "2023-03-15",
            "jurisdiction": "NSW",
            "matter_type": "criminal",
            "judge": ["Judge Williams"],
            "charges": ["murder", "assault"],
            "verdict": "guilty",
            "exoneration_flag": False,
            "is_appeal": False,
            "appeal_of": "",
            "suppression_order": False,
            "inadmissible_evidence": ["hearsay statement"],
        })

    CHUNK_RESPONSE = """```json
{
    "sections": [
        {"type": "opening", "start": 0, "end": 500, "title": "Opening"},
        {"type": "testimony", "start": 500, "end": 2000, "title": "Witness Testimony", "speaker": "Dr Jones"},
        {"type": "exhibit", "start": 2000, "end": 3000, "title": "Exhibit A"},
        {"type": "closing", "start": 3000, "end": 3500, "title": "Closing Arguments"},
        {"type": "judgment", "start": 3500, "end": 5000, "title": "Judgment"},
        {"type": "sentence", "start": 5000, "end": 5500, "title": "Sentence"}
    ]
}
```"""

    CHUNK_RESPONSE = """```json
{
    "sections": [
        {"type": "opening", "start": 0, "end": 500, "title": "Opening"},
        {"type": "testimony", "start": 500, "end": 2000, "title": "Witness Testimony", "speaker": "Dr Jones"},
        {"type": "exhibit", "start": 2000, "end": 3000, "title": "Exhibit A"},
        {"type": "closing", "start": 3000, "end": 3500, "title": "Closing Arguments"},
        {"type": "judgment", "start": 3500, "end": 5000, "title": "Judgment"},
        {"type": "sentence", "start": 5000, "end": 5500, "title": "Sentence"}
    ]
}
```"""

    async def create_message(self, prompt: str, system: str) -> str:
        combined = (prompt + system).lower()
        if "metadata" in combined or "case" in combined or "extract" in combined:
            return self._make_meta_response(prompt)
        if "section" in combined or "chunk" in combined or "boundary" in combined:
            return self.CHUNK_RESPONSE
        return '{"sections": []}'


class MockEmbeddingClient:
    """Mock embedding client that returns deterministic vectors."""

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        # Return deterministic 4-dim vectors
        result = []
        for text in texts:
            vec = [(hash(text) % 1000) / 1000.0 for _ in range(4)]
            import math
            norm = math.sqrt(sum(v * v for v in vec)) or 1.0
            result.append([v / norm for v in vec])
        return result


# ── Mock storage ──

class MockMetaDB:
    def __init__(self):
        self._queue: list[dict] = []
        self._status_updates: list[tuple[str, str, str | None]] = []
        self._meta_updates: list[tuple[str, dict]] = []

    async def connect(self): pass
    async def close(self): pass

    async def dequeue(self) -> dict | None:
        return self._queue.pop(0) if self._queue else None

    async def update_status(self, doc_id: str, status: str, error: str | None = None):
        self._status_updates.append((doc_id, status, error))

    async def update_doc_meta(self, doc_id: str, meta: dict):
        self._meta_updates.append((doc_id, meta))

    async def get_documents_by_status(self, status: str) -> list[dict]:
        return []


class MockDocStore:
    def __init__(self):
        self._docs: dict[tuple[str, str], str] = {}

    def store(self, source_id: str, doc_id: str, content: str):
        self._docs[(source_id, doc_id)] = content

    async def load(self, source_id: str, doc_id: str) -> str | None:
        return self._docs.get((source_id, doc_id))

    def exists(self, source_id: str, doc_id: str) -> bool:
        return (source_id, doc_id) in self._docs


class MockVectorIndex:
    def __init__(self):
        self._stored: list[tuple[str, list[Chunk]]] = []

    async def store_chunks(self, doc_id: str, chunks: list[Chunk]):
        self._stored.append((doc_id, chunks))


# ── Test fixtures ──

SAMPLE_HTML = """
<html>
<head><title>[2023] NSWSC 456 — R v Smith</title></head>
<body>
<div class="judgment">
<h1>THE SUPREME COURT OF NEW SOUTH WALES</h1>
<h2>R v SMITH</h2>
<h3>[2023] NSWSC 456</h3>
<div class="judgment-body">
<p class="headnote">The accused was found guilty of murder after a trial before his Honour.</p>

<p>The Crown called three witnesses. Dr Jane Jones, a forensic pathologist, gave
expert testimony regarding the cause of death. She stated that the victim died
from blunt force trauma to the head consistent with the weapon recovered at
the scene.</p>

<p>The defence argued that the prosecution had failed to prove intent beyond
reasonable doubt. Counsel for the accused submitted that the evidence was
circumstantial and that the hearsay statement from the witness should be
excluded.</p>

<p>Exhibit A — a photograph of the scene showing the weapon placed near the
body. The photograph was authenticated by Senior Constable Brown.</p>

<p>Closing submissions were heard on 14 March 2023. The prosecution submitted
that the cumulative evidence established guilt beyond reasonable doubt.</p>

<p>His Honour found the accused GUILTY of murder. The conviction was based on
the forensic evidence, the witness testimony, and the exhibits presented during
the trial. The appeal against conviction was noted.</p>

<p>The accused was sentenced to imprisonment for a term of 25 years with a
non-parole period of 18 years.</p>
</div>
</div>
</body>
</html>
"""


def _enqueue_doc(meta_db: MockMetaDB, doc_store: MockDocStore,
                  doc_id: str, source_id: str, content: str):
    doc_store.store(source_id, doc_id, content)
    meta_db._queue.append({"doc_id": doc_id, "source_id": source_id})


# ── Integration tests ──

class TestFullPipeline:
    """Test FullPipeline with mocked LLM and embeddings."""

    @pytest.fixture
    def pipeline(self):
        return FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )

    @pytest.mark.asyncio
    async def test_parse_html(self, pipeline):
        raw = RawDocument(
            source_id="nsw_caselaw",
            doc_id="[2023] NSWSC 456",
            url="https://example.com",
            fetch_timestamp="2023-03-15",
            raw_text=SAMPLE_HTML,
            format="html",
        )
        text, fmt = await pipeline.parse(raw)

        assert fmt == "html"
        assert "SUPREME COURT" in text
        assert "guilty of murder" in text
        # Nav/script/header/footer should be stripped
        assert "<script>" not in text
        assert "<nav>" not in text

    @pytest.mark.asyncio
    async def test_extract_meta(self, pipeline):
        meta = await pipeline.extract_meta("[2023] NSWSC 456", SAMPLE_HTML)

        assert meta is not None
        assert meta.case_name == "R v Case456"
        assert meta.mnc == "[2023] NSWSC 456"
        assert meta.court == "NSWSC"
        assert len(meta.judge) == 1
        assert "murder" in meta.charges
        assert meta.inadmissible_evidence == ["hearsay statement"]

    @pytest.mark.asyncio
    async def test_chunk(self, pipeline):
        chunks = await pipeline.chunk("[2023] NSWSC 456", SAMPLE_HTML)

        assert len(chunks) > 0
        for chunk in chunks:
            assert chunk.doc_id == "[2023] NSWSC 456"
            assert chunk.chunk_type in ["opening", "testimony", "exhibit",
                                        "ruling", "closing", "judgment", "sentence"]
            assert chunk.sequence >= 0
            assert len(chunk.text) > 0

    @pytest.mark.asyncio
    async def test_embed(self, pipeline):
        chunks = [
            Chunk(chunk_id="c0", doc_id="doc1", chunk_type="opening",
                   sequence=0, text="Opening text"),
            Chunk(chunk_id="c1", doc_id="doc1", chunk_type="testimony",
                   sequence=1, text="Testimony text"),
        ]
        embedded = await pipeline.embed(chunks)

        assert len(embedded) == 2
        for chunk in embedded:
            assert chunk.embedding is not None
            assert len(chunk.embedding) > 0

    @pytest.mark.asyncio
    async def test_embed_empty_list(self, pipeline):
        result = await pipeline.embed([])
        assert result == []


class TestEndToEndOrchestration:
    """End-to-end: 3 documents through orchestrator with full pipeline to graph."""

    @pytest.mark.asyncio
    async def test_three_documents_to_graph(self):
        meta_db = MockMetaDB()
        doc_store = MockDocStore()

        pipeline = FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )
        graph_db = InMemoryGraphDB()
        vector_index = MockVectorIndex()

        # Enqueue 3 documents
        for i, (doc_id, content) in enumerate([
            ("[2023] NSWSC 456", SAMPLE_HTML),
            ("[2023] NSWSC 789", SAMPLE_HTML.replace("R v Smith", "R v Jones").replace("456", "789")),
            ("[2023] NSWSC 101", SAMPLE_HTML.replace("R v Smith", "R v Brown").replace("456", "101")),
        ]):
            _enqueue_doc(meta_db, doc_store, doc_id, "nsw_caselaw", content)

        # Run orchestrator
        orch = Orchestrator(meta_db, doc_store, graph_db, vector_index, pipeline)
        stats = await orch.process_queue()

        # Verify all 3 processed
        assert stats["processed"] == 3
        assert stats["errors"] == 0
        assert stats["parsed"] == 3
        assert stats["embedded"] == 3
        assert stats["graphed"] == 3

        # Verify graph has nodes
        case_count = await graph_db.node_count("Case")
        assert case_count == 3

        judge_count = await graph_db.node_count("Judge")
        assert judge_count == 1  # All 3 cases have same judge → deduped

        charge_count = await graph_db.node_count("Charge")
        assert charge_count == 2  # murder + assault

        chunk_count = await graph_db.node_count("Chunk")
        assert chunk_count >= 3  # At least 1 chunk per document

        ruling_count = await graph_db.node_count("Ruling")
        assert ruling_count == 1  # Same ruling deduped across cases

        # Verify edges
        heard_by = await graph_db.relationship_count("HEARD_BY")
        assert heard_by >= 3

        charged_with = await graph_db.relationship_count("CHARGED_WITH")
        assert charged_with >= 3

        has_ruling = await graph_db.relationship_count("HAS_RULING")
        assert has_ruling >= 3

    @pytest.mark.asyncio
    async def test_metadata_stored_in_metadb(self):
        meta_db = MockMetaDB()
        doc_store = MockDocStore()

        pipeline = FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )

        _enqueue_doc(meta_db, doc_store, "[2023] NSWSC 456", "nsw_caselaw", SAMPLE_HTML)

        orch = Orchestrator(meta_db, doc_store, pipeline=pipeline)
        await orch.process_queue()

        assert len(meta_db._meta_updates) == 1
        _, meta = meta_db._meta_updates[0]
        assert meta["mnc"] == "[2023] NSWSC 456"
        assert "NSWSC" in meta.get("court", "")

    @pytest.mark.asyncio
    async def test_vector_index_receives_chunks(self):
        meta_db = MockMetaDB()
        doc_store = MockDocStore()

        pipeline = FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )
        graph_db = InMemoryGraphDB()
        vector_index = MockVectorIndex()

        _enqueue_doc(meta_db, doc_store, "[2023] NSWSC 456", "nsw_caselaw", SAMPLE_HTML)

        orch = Orchestrator(meta_db, doc_store, graph_db, vector_index, pipeline)
        await orch.process_queue()

        assert len(vector_index._stored) == 1
        doc_id, chunks = vector_index._stored[0]
        assert doc_id == "[2023] NSWSC 456"
        assert len(chunks) > 0
        for chunk in chunks:
            assert chunk.embedding is not None

    @pytest.mark.asyncio
    async def test_status_transitions_e2e(self):
        meta_db = MockMetaDB()
        doc_store = MockDocStore()

        pipeline = FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )
        graph_db = InMemoryGraphDB()
        vector_index = MockVectorIndex()

        _enqueue_doc(meta_db, doc_store, "[2023] NSWSC 456", "nsw_caselaw", SAMPLE_HTML)

        orch = Orchestrator(meta_db, doc_store, graph_db, vector_index, pipeline)
        await orch.process_queue()

        statuses = [u[1] for u in meta_db._status_updates]
        assert FetchStatus.PARSED in statuses
        assert FetchStatus.EMBEDDED in statuses
        assert FetchStatus.GRAPHED in statuses

    @pytest.mark.asyncio
    async def test_graph_queryable_after_processing(self):
        """Verify graph is queryable — can find nodes and traverse edges."""
        meta_db = MockMetaDB()
        doc_store = MockDocStore()

        pipeline = FullPipeline(
            llm_client=MockLLMClient(),
            embedding_client=MockEmbeddingClient(),
        )
        graph_db = InMemoryGraphDB()
        vector_index = MockVectorIndex()

        _enqueue_doc(meta_db, doc_store, "[2023] NSWSC 456", "nsw_caselaw", SAMPLE_HTML)

        orch = Orchestrator(meta_db, doc_store, graph_db, vector_index, pipeline)
        await orch.process_queue()

        # Find case node
        case_nodes = await graph_db.query_nodes("Case", {"mnc": "[2023] NSWSC 456"})
        assert len(case_nodes) == 1
        case_id = case_nodes[0].node_id

        # Traverse to judges
        judge_rels = await graph_db.get_relationships(case_id, "HEARD_BY")
        assert len(judge_rels) >= 1
        judge_node = await graph_db.get_node(judge_rels[0].to_id)
        assert judge_node is not None
        assert judge_node.label == "Judge"

        # Verify ruling (inadmissible evidence)
        ruling_rels = await graph_db.get_relationships(case_id, "HAS_RULING")
        assert len(ruling_rels) >= 1
        ruling_node = await graph_db.get_node(ruling_rels[0].to_id)
        assert ruling_node is not None
        assert ruling_node.properties.get("outcome") == "inadmissible"