aucourt-ingest/aucourt_ingest/models.py

"""Data models for AuCourtIngest."""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional


# --- Source / Fetch ---

class FetchStatus(str, Enum):
    """Status labels used on the source / fetch side of the models.

    The ``str`` mix-in makes every member an actual string, so a member
    compares equal to its plain value (``FetchStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    FETCHED = "fetched"
    PARSED = "parsed"
    EMBEDDED = "embedded"
    GRAPHED = "graphed"

    # The only member whose name does not describe a completed step.
    FAILED = "failed"


@dataclass
class RawDocument:
    """One fetched document, in the normalised form produced by DocParser.

    ``char_count`` is a derived value: whatever is passed to the constructor
    is overwritten with ``len(raw_text)`` as soon as the instance is built.
    """

    # Identifier of the originating source, e.g. "nsw_caselaw".
    source_id: str
    # MNC or internal UUID.
    doc_id: str
    url: str
    # ISO 8601 timestamp of the fetch.
    fetch_timestamp: str
    # Full extracted text.
    raw_text: str
    # One of: html | docx | pdf.
    format: str
    pages: int | None = None
    char_count: int = 0

    def __post_init__(self) -> None:
        """Set ``char_count`` to the length of ``raw_text``."""
        text_length = len(self.raw_text)
        self.char_count = text_length


@dataclass
class FetchQueueItem:
    """A URL queued for fetching, together with its bookkeeping fields."""

    source_id: str
    url: str
    # Lower number means more urgent; 1 is the highest priority.
    priority: int = 5
    attempts: int = 0
    # MNC or other identifier, when it is already known at discovery time;
    # defaults to the empty string.
    doc_id: str = ""


@dataclass
class SourceState:
    """Per-source bookkeeping record.

    Only ``source_id`` is required; the poll marker and RSS ETag start as
    ``None`` and both document counters start at zero.
    """

    source_id: str

    # Polling markers.
    last_poll: str | None = None
    last_rss_etag: str | None = None

    # Running counters.
    docs_fetched: int = 0
    docs_failed: int = 0


# --- Metadata ---

class Verdict(str, Enum):
    """Outcome labels for a court decision.

    Members are real strings thanks to the ``str`` mix-in, so each one
    compares equal to its value (``Verdict.GUILTY == "guilty"``).
    """

    GUILTY = "guilty"
    NOT_GUILTY = "not_guilty"
    APPEAL_ALLOWED = "appeal_allowed"
    APPEAL_DISMISSED = "appeal_dismissed"
    CONVICTION_QUASHED = "conviction_quashed"
    SENTENCE_VARIED = "sentence_varied"
    HUNG = "hung"
    CIVIL_JUDGMENT = "civil_judgment"

    # Note the slash: this value is "n/a", not "n_a".
    N_A = "n/a"


class MatterType(str, Enum):
    """Categories of matter, stored as lowercase strings.

    The ``str`` mix-in lets a member be compared directly with its value
    (``MatterType.CIVIL == "civil"``).
    """

    CRIMINAL = "criminal"
    CIVIL = "civil"
    APPEAL = "appeal"
    CORONIAL = "coronial"


@dataclass
class CaseMeta:
    """Metadata extracted for one court decision.

    ``case_name``, ``mnc`` and ``court`` are required. Every other field is
    optional: text fields default to ``None``, flags to ``False``, and each
    list field gets its own fresh empty list per instance.
    """

    # ---- Identity ----
    # Case name, e.g. "R v Smith".
    case_name: str
    # Medium Neutral Citation.
    mnc: str
    # Court code: NSWSC | HCA | FCA | QSC etc.
    court: str
    judge: list[str] = field(default_factory=list)
    # Delivery date, ISO 8601.
    date_delivered: str | None = None
    # Jurisdiction code: NSW | CTH | QLD | VIC etc.
    jurisdiction: str | None = None

    # ---- Classification ----
    matter_type: str | None = None
    charges: list[str] = field(default_factory=list)
    charge_categories: list[str] = field(default_factory=list)

    # ---- Outcome ----
    verdict: str | None = None
    # Free-text sentence, e.g. "18 years NMP 13".
    sentence: str | None = None
    outcome_notes: str | None = None

    # ---- Flags ----
    is_appeal: bool = False
    # MNC of the original decision.
    appeal_of: str | None = None
    # Filled in later by back-reference, not at extraction time.
    was_appealed: bool = False
    exoneration_flag: bool = False
    inadmissible_evidence: list[str] = field(default_factory=list)
    suppression_order: bool = False


# --- Chunks ---

class ChunkType(str, Enum):
    """Labels for the kinds of chunk a document is divided into.

    Each member is a string (``str`` mix-in) and therefore compares equal to
    its lowercase value (``ChunkType.RULING == "ruling"``).
    """

    OPENING = "opening"
    TESTIMONY = "testimony"
    EXHIBIT = "exhibit"
    RULING = "ruling"
    CLOSING = "closing"
    JUDGMENT = "judgment"
    SENTENCE = "sentence"


@dataclass
class Chunk:
    """A semantically meaningful unit taken from a court document.

    ``token_count`` is derived, not stored input: any value handed to the
    constructor is replaced by an estimate computed from ``text``.
    """

    # UUID of this chunk.
    chunk_id: str
    # MNC of the parent document.
    doc_id: str
    # A ChunkType value, held as a plain string.
    chunk_type: str
    # Position of the chunk within its document.
    sequence: int
    text: str
    token_count: int = 0
    # Witness name, when the chunk is testimony.
    speaker: str | None = None
    # Page reference such as "p.47", when available.
    page_ref: str | None = None
    embedding: list[float] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Estimate ``token_count`` from the length of ``text``."""
        # Rough heuristic: about four characters per token, rounded down.
        estimated_tokens, _ = divmod(len(self.text), 4)
        self.token_count = estimated_tokens


# --- Jury ---

@dataclass
class JurorPersona:
    """Named juror persona and the graph selectors attached to it.

    ``chunk_types`` defaults to an empty list and ``exclude_edges`` to
    ``["RULED_INADMISSIBLE"]``; both defaults are built fresh per instance.
    """

    # Persona name, e.g. "nurse".
    name: str
    # Anchor node selectors, e.g. ["Witness[role=expert]"].
    anchor_nodes: list[str]
    # Edge type names, e.g. ["GAVE_TESTIMONY", "DESCRIBED_IN"].
    edge_types: list[str]
    chunk_types: list[str] = field(default_factory=list)
    exclude_edges: list[str] = field(
        default_factory=lambda: ["RULED_INADMISSIBLE"],
    )


@dataclass
class JurorContext:
    """What a juror subgraph query returns.

    ``total_tokens`` is a plain field defaulting to 0; this class does not
    compute it from ``context_text``.
    """

    persona: str
    case_mnc: str
    # Chunk text assembled up to the token budget.
    context_text: str
    # Ids of the contributing chunks, kept for citation / traceability.
    source_chunk_ids: list[str]
    total_tokens: int = 0
AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline. Includes the source layer (5 court sources), the processing pipeline (parse/extract/chunk/embed/graph), a property graph with 8 node types, juror subgraph queries with 6 personas, an orchestrator with bootstrap/watch/backfill/audit/process modes, and 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`"""Data models for AuCourtIngest."""`

			`from __future__ import annotations`

			`from dataclasses import dataclass, field`
			`from enum import Enum`
			`from typing import Optional`


			`# --- Source / Fetch ---`

			`class FetchStatus(str, Enum):`
			`PENDING = "pending"`
			`FETCHED = "fetched"`
			`PARSED = "parsed"`
			`EMBEDDED = "embedded"`
			`GRAPHED = "graphed"`
			`FAILED = "failed"`


			`@dataclass`
			`class RawDocument:`
			`"""Normalised output from DocParser — a single fetched document."""`

			`source_id: str # e.g. "nsw_caselaw"`
			`doc_id: str # MNC or internal UUID`
			`url: str`
			`fetch_timestamp: str # ISO 8601`
			`raw_text: str # full extracted text`
			`format: str # html \| docx \| pdf`
			`pages: int \| None = None`
			`char_count: int = 0`

			`def __post_init__(self):`
			`self.char_count = len(self.raw_text)`


			`@dataclass`
			`class FetchQueueItem:`
			`source_id: str`
			`url: str`
			`priority: int = 5 # 1 = highest`
			`attempts: int = 0`
			`doc_id: str = "" # MNC or other identifier, if known at discovery time`


			`@dataclass`
			`class SourceState:`
			`source_id: str`
			`last_poll: str \| None = None`
			`last_rss_etag: str \| None = None`
			`docs_fetched: int = 0`
			`docs_failed: int = 0`


			`# --- Metadata ---`

			`class Verdict(str, Enum):`
			`GUILTY = "guilty"`
			`NOT_GUILTY = "not_guilty"`
			`APPEAL_ALLOWED = "appeal_allowed"`
			`APPEAL_DISMISSED = "appeal_dismissed"`
			`CONVICTION_QUASHED = "conviction_quashed"`
			`SENTENCE_VARIED = "sentence_varied"`
			`HUNG = "hung"`
			`CIVIL_JUDGMENT = "civil_judgment"`
			`N_A = "n/a"`


			`class MatterType(str, Enum):`
			`CRIMINAL = "criminal"`
			`CIVIL = "civil"`
			`APPEAL = "appeal"`
			`CORONIAL = "coronial"`


			`@dataclass`
			`class CaseMeta:`
			`"""Extracted metadata for a single court decision."""`

			`# Identity`
			`case_name: str # e.g. "R v Smith"`
			`mnc: str # Medium Neutral Citation`
			`court: str # NSWSC \| HCA \| FCA \| QSC etc`
			`judge: list[str] = field(default_factory=list)`
			`date_delivered: str \| None = None # ISO 8601`
			`jurisdiction: str \| None = None # NSW \| CTH \| QLD \| VIC etc`

			`# Classification`
			`matter_type: str \| None = None`
			`charges: list[str] = field(default_factory=list)`
			`charge_categories: list[str] = field(default_factory=list)`

			`# Outcome`
			`verdict: str \| None = None`
			`sentence: str \| None = None # "18 years NMP 13"`
			`outcome_notes: str \| None = None`

			`# Flags`
			`is_appeal: bool = False`
			`appeal_of: str \| None = None # MNC of original decision`
			`was_appealed: bool = False # filled later by back-reference`
			`exoneration_flag: bool = False`
			`inadmissible_evidence: list[str] = field(default_factory=list)`
			`suppression_order: bool = False`


			`# --- Chunks ---`

			`class ChunkType(str, Enum):`
			`OPENING = "opening"`
			`TESTIMONY = "testimony"`
			`EXHIBIT = "exhibit"`
			`RULING = "ruling"`
			`CLOSING = "closing"`
			`JUDGMENT = "judgment"`
			`SENTENCE = "sentence"`


			`@dataclass`
			`class Chunk:`
			`"""Semantically meaningful unit from a court document."""`

			`chunk_id: str # UUID`
			`doc_id: str # parent document MNC`
			`chunk_type: str # ChunkType value`
			`sequence: int # position in document`
			`text: str`
			`token_count: int = 0`
			`speaker: str \| None = None # witness name if testimony`
			`page_ref: str \| None = None # "p.47" if available`
			`embedding: list[float] = field(default_factory=list)`

			`def __post_init__(self):`
			`# Rough token estimate: 1 token ~ 4 chars`
			`self.token_count = len(self.text) // 4`


			`# --- Jury ---`

			`@dataclass`
			`class JurorPersona:`
			`name: str # e.g. "nurse"`
			`anchor_nodes: list[str] # e.g. ["Witness[role=expert]"]`
			`edge_types: list[str] # e.g. ["GAVE_TESTIMONY", "DESCRIBED_IN"]`
			`chunk_types: list[str] = field(default_factory=list)`
			`exclude_edges: list[str] = field(default_factory=lambda: ["RULED_INADMISSIBLE"])`


			`@dataclass`
			`class JurorContext:`
			`"""Result of a juror subgraph query."""`

			`persona: str`
			`case_mnc: str`
			`context_text: str # assembled chunks up to token budget`
			`source_chunk_ids: list[str] # for citation / traceability`
			`total_tokens: int = 0`