"""Data models for AuCourtIngest."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from enum import Enum
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
|
||
|
|
# --- Source / Fetch ---
|
||
|
|
|
||
|
|
class FetchStatus(str, Enum):
    """Closed set of fetch-status labels.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value (e.g. ``FetchStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    FETCHED = "fetched"
    PARSED = "parsed"
    EMBEDDED = "embedded"
    GRAPHED = "graphed"
    FAILED = "failed"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class RawDocument:
|
||
|
|
"""Normalised output from DocParser — a single fetched document."""
|
||
|
|
|
||
|
|
source_id: str # e.g. "nsw_caselaw"
|
||
|
|
doc_id: str # MNC or internal UUID
|
||
|
|
url: str
|
||
|
|
fetch_timestamp: str # ISO 8601
|
||
|
|
raw_text: str # full extracted text
|
||
|
|
format: str # html | docx | pdf
|
||
|
|
pages: int | None = None
|
||
|
|
char_count: int = 0
|
||
|
|
|
||
|
|
def __post_init__(self):
|
||
|
|
self.char_count = len(self.raw_text)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class FetchQueueItem:
    """Entry in the fetch queue: a URL belonging to one source.

    Carries a priority, an attempt counter and, when already known, the
    document identifier.
    """

    source_id: str
    url: str
    priority: int = 5  # 1 = highest
    attempts: int = 0
    doc_id: str = ""  # MNC or other identifier, if known at discovery time
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class SourceState:
|
||
|
|
source_id: str
|
||
|
|
last_poll: str | None = None
|
||
|
|
last_rss_etag: str | None = None
|
||
|
|
docs_fetched: int = 0
|
||
|
|
docs_failed: int = 0
|
||
|
|
|
||
|
|
|
||
|
|
# --- Metadata ---
|
||
|
|
|
||
|
|
class Verdict(str, Enum):
    """Closed set of outcome labels for a decision.

    Members are also ``str`` instances and compare equal to their string
    value. ``N_A`` has the value ``"n/a"``.
    """

    GUILTY = "guilty"
    NOT_GUILTY = "not_guilty"
    APPEAL_ALLOWED = "appeal_allowed"
    APPEAL_DISMISSED = "appeal_dismissed"
    CONVICTION_QUASHED = "conviction_quashed"
    SENTENCE_VARIED = "sentence_varied"
    HUNG = "hung"
    CIVIL_JUDGMENT = "civil_judgment"
    N_A = "n/a"
|
||
|
|
|
||
|
|
|
||
|
|
class MatterType(str, Enum):
    """Closed set of matter-type labels.

    Members are also ``str`` instances and compare equal to their
    lowercase string value.
    """

    CRIMINAL = "criminal"
    CIVIL = "civil"
    APPEAL = "appeal"
    CORONIAL = "coronial"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class CaseMeta:
|
||
|
|
"""Extracted metadata for a single court decision."""
|
||
|
|
|
||
|
|
# Identity
|
||
|
|
case_name: str # e.g. "R v Smith"
|
||
|
|
mnc: str # Medium Neutral Citation
|
||
|
|
court: str # NSWSC | HCA | FCA | QSC etc
|
||
|
|
judge: list[str] = field(default_factory=list)
|
||
|
|
date_delivered: str | None = None # ISO 8601
|
||
|
|
jurisdiction: str | None = None # NSW | CTH | QLD | VIC etc
|
||
|
|
|
||
|
|
# Classification
|
||
|
|
matter_type: str | None = None
|
||
|
|
charges: list[str] = field(default_factory=list)
|
||
|
|
charge_categories: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
# Outcome
|
||
|
|
verdict: str | None = None
|
||
|
|
sentence: str | None = None # "18 years NMP 13"
|
||
|
|
outcome_notes: str | None = None
|
||
|
|
|
||
|
|
# Flags
|
||
|
|
is_appeal: bool = False
|
||
|
|
appeal_of: str | None = None # MNC of original decision
|
||
|
|
was_appealed: bool = False # filled later by back-reference
|
||
|
|
exoneration_flag: bool = False
|
||
|
|
inadmissible_evidence: list[str] = field(default_factory=list)
|
||
|
|
suppression_order: bool = False
|
||
|
|
|
||
|
|
|
||
|
|
# --- Chunks ---
|
||
|
|
|
||
|
|
class ChunkType(str, Enum):
    """Closed set of labels for the kind of a document chunk.

    Members are also ``str`` instances and compare equal to their
    lowercase string value.
    """

    OPENING = "opening"
    TESTIMONY = "testimony"
    EXHIBIT = "exhibit"
    RULING = "ruling"
    CLOSING = "closing"
    JUDGMENT = "judgment"
    SENTENCE = "sentence"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class Chunk:
|
||
|
|
"""Semantically meaningful unit from a court document."""
|
||
|
|
|
||
|
|
chunk_id: str # UUID
|
||
|
|
doc_id: str # parent document MNC
|
||
|
|
chunk_type: str # ChunkType value
|
||
|
|
sequence: int # position in document
|
||
|
|
text: str
|
||
|
|
token_count: int = 0
|
||
|
|
speaker: str | None = None # witness name if testimony
|
||
|
|
page_ref: str | None = None # "p.47" if available
|
||
|
|
embedding: list[float] = field(default_factory=list)
|
||
|
|
|
||
|
|
def __post_init__(self):
|
||
|
|
# Rough token estimate: 1 token ~ 4 chars
|
||
|
|
self.token_count = len(self.text) // 4
|
||
|
|
|
||
|
|
|
||
|
|
# --- Jury ---
|
||
|
|
|
||
|
|
@dataclass
class JurorPersona:
    """Named juror persona with the graph anchors and edges tied to it.

    ``chunk_types`` defaults to an empty list and ``exclude_edges`` to
    ``["RULED_INADMISSIBLE"]``; each instance gets its own fresh list.
    """

    name: str  # e.g. "nurse"
    anchor_nodes: list[str]  # e.g. ["Witness[role=expert]"]
    edge_types: list[str]  # e.g. ["GAVE_TESTIMONY", "DESCRIBED_IN"]
    chunk_types: list[str] = field(default_factory=list)
    exclude_edges: list[str] = field(default_factory=lambda: ["RULED_INADMISSIBLE"])
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class JurorContext:
    """Result of a juror subgraph query.

    ``total_tokens`` defaults to 0 and is not derived by this class; the
    code that builds the context has to supply it.
    """

    persona: str
    case_mnc: str
    context_text: str  # assembled chunks up to token budget
    source_chunk_ids: list[str]  # for citation / traceability
    total_tokens: int = 0
|