"""End-to-end integration test: 3 documents through full pipeline to graph. Uses mocked LLM and embedding clients. Tests the full flow: RawDocument → DocParser → MetaExtractor → ChunkEngine → EmbedEngine → GraphBuilder """ from __future__ import annotations import json import pytest from aucourt_ingest.models import ( CaseMeta, Chunk, RawDocument, FetchStatus, Verdict, MatterType, ) from aucourt_ingest.processing.pipeline import FullPipeline from aucourt_ingest.orchestrator import Orchestrator from aucourt_ingest.storage.in_memory_graph_db import InMemoryGraphDB # ── Mock LLM clients ── class MockLLMClient: """Mock LLM that returns canned JSON for meta extraction and chunking.""" def _make_meta_response(self, text: str) -> str: """Generate meta JSON with MNC extracted from text.""" import re mnc_match = re.search(r'\[(\d{4})\]\s+(\w+)\s+(\d+)', text) if mnc_match: mnc = f"[{mnc_match.group(1)}] {mnc_match.group(2)} {mnc_match.group(3)}" name = f"R v Case{mnc_match.group(3)}" else: mnc = "[2023] NSWSC 999" name = "R v Default" return json.dumps({ "case_name": name, "mnc": mnc, "court": mnc.split()[1] if len(mnc.split()) > 1 else "NSWSC", "date_delivered": "2023-03-15", "jurisdiction": "NSW", "matter_type": "criminal", "judge": ["Judge Williams"], "charges": ["murder", "assault"], "verdict": "guilty", "exoneration_flag": False, "is_appeal": False, "appeal_of": "", "suppression_order": False, "inadmissible_evidence": ["hearsay statement"], }) CHUNK_RESPONSE = """```json { "sections": [ {"type": "opening", "start": 0, "end": 500, "title": "Opening"}, {"type": "testimony", "start": 500, "end": 2000, "title": "Witness Testimony", "speaker": "Dr Jones"}, {"type": "exhibit", "start": 2000, "end": 3000, "title": "Exhibit A"}, {"type": "closing", "start": 3000, "end": 3500, "title": "Closing Arguments"}, {"type": "judgment", "start": 3500, "end": 5000, "title": "Judgment"}, {"type": "sentence", "start": 5000, "end": 5500, "title": "Sentence"} ] } ```""" CHUNK_RESPONSE = """```json { "sections": [ {"type": "opening", "start": 0, "end": 500, "title": "Opening"}, {"type": "testimony", "start": 500, "end": 2000, "title": "Witness Testimony", "speaker": "Dr Jones"}, {"type": "exhibit", "start": 2000, "end": 3000, "title": "Exhibit A"}, {"type": "closing", "start": 3000, "end": 3500, "title": "Closing Arguments"}, {"type": "judgment", "start": 3500, "end": 5000, "title": "Judgment"}, {"type": "sentence", "start": 5000, "end": 5500, "title": "Sentence"} ] } ```""" async def create_message(self, prompt: str, system: str) -> str: combined = (prompt + system).lower() if "metadata" in combined or "case" in combined or "extract" in combined: return self._make_meta_response(prompt) if "section" in combined or "chunk" in combined or "boundary" in combined: return self.CHUNK_RESPONSE return '{"sections": []}' class MockEmbeddingClient: """Mock embedding client that returns deterministic vectors.""" async def embed_batch(self, texts: list[str]) -> list[list[float]]: # Return deterministic 4-dim vectors result = [] for text in texts: vec = [(hash(text) % 1000) / 1000.0 for _ in range(4)] import math norm = math.sqrt(sum(v * v for v in vec)) or 1.0 result.append([v / norm for v in vec]) return result # ── Mock storage ── class MockMetaDB: def __init__(self): self._queue: list[dict] = [] self._status_updates: list[tuple[str, str, str | None]] = [] self._meta_updates: list[tuple[str, dict]] = [] async def connect(self): pass async def close(self): pass async def dequeue(self) -> dict | None: return self._queue.pop(0) if self._queue else None async def update_status(self, doc_id: str, status: str, error: str | None = None): self._status_updates.append((doc_id, status, error)) async def update_doc_meta(self, doc_id: str, meta: dict): self._meta_updates.append((doc_id, meta)) async def get_documents_by_status(self, status: str) -> list[dict]: return [] class MockDocStore: def __init__(self): self._docs: dict[tuple[str, str], str] = {} def store(self, source_id: str, doc_id: str, content: str): self._docs[(source_id, doc_id)] = content async def load(self, source_id: str, doc_id: str) -> str | None: return self._docs.get((source_id, doc_id)) def exists(self, source_id: str, doc_id: str) -> bool: return (source_id, doc_id) in self._docs class MockVectorIndex: def __init__(self): self._stored: list[tuple[str, list[Chunk]]] = [] async def store_chunks(self, doc_id: str, chunks: list[Chunk]): self._stored.append((doc_id, chunks)) # ── Test fixtures ── SAMPLE_HTML = """
The accused was found guilty of murder after a trial before his Honour.
The Crown called three witnesses. Dr Jane Jones, a forensic pathologist, gave expert testimony regarding the cause of death. She stated that the victim died from blunt force trauma to the head consistent with the weapon recovered at the scene.
The defence argued that the prosecution had failed to prove intent beyond reasonable doubt. Counsel for the accused submitted that the evidence was circumstantial and that the hearsay statement from the witness should be excluded.
Exhibit A — a photograph of the scene showing the weapon placed near the body. The photograph was authenticated by Senior Constable Brown.
Closing submissions were heard on 14 March 2023. The prosecution submitted that the cumulative evidence established guilt beyond reasonable doubt.
His Honour found the accused GUILTY of murder. The conviction was based on the forensic evidence, the witness testimony, and the exhibits presented during the trial. The appeal against conviction was noted.
The accused was sentenced to imprisonment for a term of 25 years with a non-parole period of 18 years.