# aucourt-ingest/aucourt_ingest/models.py

"""Data models for AuCourtIngest."""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional


# --- Source / Fetch ---

class FetchStatus(str, Enum):
    """String-valued status labels for a fetched document.

    Members also subclass ``str``, so each one compares equal to its raw
    lowercase value (``FetchStatus.PENDING == "pending"``).
    """

    # NOTE(review): the declaration order presumably mirrors the pipeline
    # stages (fetch -> parse -> embed -> graph) — confirm against the caller.
    PENDING = "pending"
    FETCHED = "fetched"
    PARSED = "parsed"
    EMBEDDED = "embedded"
    GRAPHED = "graphed"
    FAILED = "failed"


@dataclass
class RawDocument:
    """Normalised output from DocParser — a single fetched document."""

    source_id: str              # e.g. "nsw_caselaw"
    doc_id: str                 # MNC or internal UUID
    url: str
    fetch_timestamp: str       # ISO 8601
    raw_text: str              # full extracted text
    format: str                # html | docx | pdf
    pages: int | None = None
    char_count: int = 0

    def __post_init__(self):
        self.char_count = len(self.raw_text)


@dataclass
class FetchQueueItem:
    """An entry in the fetch queue: a URL belonging to a given source."""

    source_id: str
    url: str
    priority: int = 5            # lower is more urgent; 1 = highest
    attempts: int = 0
    doc_id: str = ""             # MNC or other identifier when known at discovery time


@dataclass
class SourceState:
    source_id: str
    last_poll: str | None = None
    last_rss_etag: str | None = None
    docs_fetched: int = 0
    docs_failed: int = 0


# --- Metadata ---

class Verdict(str, Enum):
    """String-valued outcome labels for a court decision.

    Members also subclass ``str`` and therefore compare equal to their raw
    values. Note that ``N_A`` carries the value ``"n/a"`` (with a slash).
    """

    GUILTY = "guilty"
    NOT_GUILTY = "not_guilty"
    APPEAL_ALLOWED = "appeal_allowed"
    APPEAL_DISMISSED = "appeal_dismissed"
    CONVICTION_QUASHED = "conviction_quashed"
    SENTENCE_VARIED = "sentence_varied"
    HUNG = "hung"
    CIVIL_JUDGMENT = "civil_judgment"
    N_A = "n/a"


class MatterType(str, Enum):
    """String-valued categories of matter.

    Members also subclass ``str`` and compare equal to their raw values.
    """

    CRIMINAL = "criminal"
    CIVIL = "civil"
    APPEAL = "appeal"
    CORONIAL = "coronial"


@dataclass
class CaseMeta:
    """Extracted metadata for a single court decision."""

    # Identity
    case_name: str              # e.g. "R v Smith"
    mnc: str                    # Medium Neutral Citation
    court: str                  # NSWSC | HCA | FCA | QSC etc
    judge: list[str] = field(default_factory=list)
    date_delivered: str | None = None  # ISO 8601
    jurisdiction: str | None = None    # NSW | CTH | QLD | VIC etc

    # Classification
    matter_type: str | None = None
    charges: list[str] = field(default_factory=list)
    charge_categories: list[str] = field(default_factory=list)

    # Outcome
    verdict: str | None = None
    sentence: str | None = None       # "18 years NMP 13"
    outcome_notes: str | None = None

    # Flags
    is_appeal: bool = False
    appeal_of: str | None = None      # MNC of original decision
    was_appealed: bool = False         # filled later by back-reference
    exoneration_flag: bool = False
    inadmissible_evidence: list[str] = field(default_factory=list)
    suppression_order: bool = False


# --- Chunks ---

class ChunkType(str, Enum):
    """String-valued labels for the kind of content a chunk holds.

    Members also subclass ``str`` and compare equal to their raw values.
    """

    OPENING = "opening"
    TESTIMONY = "testimony"
    EXHIBIT = "exhibit"
    RULING = "ruling"
    CLOSING = "closing"
    JUDGMENT = "judgment"
    SENTENCE = "sentence"


@dataclass
class Chunk:
    """Semantically meaningful unit from a court document."""

    chunk_id: str                # UUID
    doc_id: str                  # parent document MNC
    chunk_type: str              # ChunkType value
    sequence: int                # position in document
    text: str
    token_count: int = 0
    speaker: str | None = None  # witness name if testimony
    page_ref: str | None = None # "p.47" if available
    embedding: list[float] = field(default_factory=list)

    def __post_init__(self):
        # Rough token estimate: 1 token ~ 4 chars
        self.token_count = len(self.text) // 4


# --- Jury ---

@dataclass
class JurorPersona:
    """Configuration for one juror persona.

    ``exclude_edges`` defaults to a fresh ``["RULED_INADMISSIBLE"]`` list for
    each instance; ``chunk_types`` defaults to a fresh empty list.
    """

    name: str                     # persona label, e.g. "nurse"
    anchor_nodes: list[str]       # e.g. ["Witness[role=expert]"]
    edge_types: list[str]         # e.g. ["GAVE_TESTIMONY", "DESCRIBED_IN"]
    chunk_types: list[str] = field(default_factory=list)
    exclude_edges: list[str] = field(
        default_factory=lambda: ["RULED_INADMISSIBLE"],
    )


@dataclass
class JurorContext:
    """The outcome of running a juror subgraph query."""

    persona: str
    case_mnc: str
    context_text: str              # chunks assembled up to the token budget
    source_chunk_ids: list[str]    # kept for citation / traceability
    total_tokens: int = 0