aucourt-ingest/aucourt_ingest/models.py
slothitude d77fe12cfc AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline
Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph),
property graph with 8 node types, juror subgraph queries with 6 personas,
orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-30 11:56:23 +10:00

158 lines
4.2 KiB
Python

"""Data models for AuCourtIngest."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
# --- Source / Fetch ---
class FetchStatus(str, Enum):
    """Status labels used in the source / fetch layer.

    Every member is also a ``str`` instance, so it compares equal to its
    plain-string value (``FetchStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    FETCHED = "fetched"
    PARSED = "parsed"
    EMBEDDED = "embedded"
    GRAPHED = "graphed"
    FAILED = "failed"
@dataclass
class RawDocument:
    """A single fetched document in the normalised form emitted by DocParser.

    Attributes:
        source_id: Source identifier, e.g. "nsw_caselaw".
        doc_id: MNC or internal UUID.
        url: URL associated with the document.
        fetch_timestamp: ISO 8601 timestamp string.
        raw_text: Full extracted text.
        format: One of "html", "docx" or "pdf".
        pages: Optional integer page figure; ``None`` by default.
        char_count: Length of ``raw_text``. It is recomputed on every
            construction, so a value passed by the caller is overwritten.
    """

    source_id: str
    doc_id: str
    url: str
    fetch_timestamp: str
    raw_text: str
    format: str
    pages: int | None = None
    char_count: int = 0

    def __post_init__(self) -> None:
        # Derive the character count from the text itself rather than
        # trusting whatever was supplied for ``char_count``.
        body = self.raw_text
        self.char_count = len(body)
@dataclass
class FetchQueueItem:
    """One entry in the fetch queue.

    Attributes:
        source_id: Identifier of the source the URL belongs to.
        url: URL to fetch.
        priority: Queue priority; 1 is the highest, default is 5.
        attempts: Attempt counter, starting at 0.
        doc_id: MNC or other identifier when it is already known at
            discovery time; empty string otherwise.
    """

    source_id: str
    url: str
    priority: int = 5
    attempts: int = 0
    doc_id: str = ""
@dataclass
class SourceState:
    """Per-source bookkeeping record.

    Attributes:
        source_id: Identifier of the source this record describes.
        last_poll: Optional string recorded for the last poll; ``None``
            by default.
        last_rss_etag: Optional ETag string for the source's RSS feed;
            ``None`` by default.
        docs_fetched: Counter of fetched documents, starting at 0.
        docs_failed: Counter of failed documents, starting at 0.
    """

    source_id: str
    last_poll: str | None = None
    last_rss_etag: str | None = None
    docs_fetched: int = 0
    docs_failed: int = 0
# --- Metadata ---
class Verdict(str, Enum):
    """Outcome labels for a court decision.

    Members are ``str`` instances and compare equal to their string
    values. Note that ``N_A`` carries the value ``"n/a"`` (with a slash).
    """

    GUILTY = "guilty"
    NOT_GUILTY = "not_guilty"
    APPEAL_ALLOWED = "appeal_allowed"
    APPEAL_DISMISSED = "appeal_dismissed"
    CONVICTION_QUASHED = "conviction_quashed"
    SENTENCE_VARIED = "sentence_varied"
    HUNG = "hung"
    CIVIL_JUDGMENT = "civil_judgment"
    N_A = "n/a"
class MatterType(str, Enum):
    """Matter classification labels.

    Members are ``str`` instances and compare equal to their string values.
    """

    CRIMINAL = "criminal"
    CIVIL = "civil"
    APPEAL = "appeal"
    CORONIAL = "coronial"
@dataclass
class CaseMeta:
    """Metadata extracted for one court decision.

    Only ``case_name``, ``mnc`` and ``court`` are required; every other
    field has a default. List-valued fields default to a fresh empty list
    per instance.
    """

    # --- Identity -------------------------------------------------------
    # Case name, e.g. "R v Smith".
    case_name: str
    # Medium Neutral Citation.
    mnc: str
    # Court code: NSWSC, HCA, FCA, QSC, etc.
    court: str
    judge: list[str] = field(default_factory=list)
    # Delivery date as an ISO 8601 string.
    date_delivered: str | None = None
    # Jurisdiction code: NSW, CTH, QLD, VIC, etc.
    jurisdiction: str | None = None

    # --- Classification -------------------------------------------------
    matter_type: str | None = None
    charges: list[str] = field(default_factory=list)
    charge_categories: list[str] = field(default_factory=list)

    # --- Outcome --------------------------------------------------------
    verdict: str | None = None
    # Free-text sentence, e.g. "18 years NMP 13".
    sentence: str | None = None
    outcome_notes: str | None = None

    # --- Flags ----------------------------------------------------------
    is_appeal: bool = False
    # MNC of the original decision.
    appeal_of: str | None = None
    # Filled in later by back-reference.
    was_appealed: bool = False
    exoneration_flag: bool = False
    inadmissible_evidence: list[str] = field(default_factory=list)
    suppression_order: bool = False
# --- Chunks ---
class ChunkType(str, Enum):
    """Labels for the kind of content a chunk holds.

    Members are ``str`` instances and compare equal to their string values.
    """

    OPENING = "opening"
    TESTIMONY = "testimony"
    EXHIBIT = "exhibit"
    RULING = "ruling"
    CLOSING = "closing"
    JUDGMENT = "judgment"
    SENTENCE = "sentence"
@dataclass
class Chunk:
    """A semantically meaningful unit taken from a court document.

    Attributes:
        chunk_id: UUID of the chunk.
        doc_id: MNC of the parent document.
        chunk_type: A ``ChunkType`` value.
        sequence: Position of the chunk within the document.
        text: Chunk text.
        token_count: Rough token estimate derived from ``text``. It is
            recomputed on every construction, so a value passed by the
            caller is overwritten.
        speaker: Witness name when the chunk is testimony.
        page_ref: Page reference such as "p.47", if available.
        embedding: List of floats; a fresh empty list by default.
    """

    chunk_id: str
    doc_id: str
    chunk_type: str
    sequence: int
    text: str
    token_count: int = 0
    speaker: str | None = None
    page_ref: str | None = None
    embedding: list[float] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Heuristic: roughly four characters per token, rounded down.
        estimate, _remainder = divmod(len(self.text), 4)
        self.token_count = estimate
# --- Jury ---
@dataclass
class JurorPersona:
    """Definition of a juror persona.

    Attributes:
        name: Persona name, e.g. "nurse".
        anchor_nodes: Anchor node selectors, e.g. ["Witness[role=expert]"].
        edge_types: Edge type names, e.g. ["GAVE_TESTIMONY", "DESCRIBED_IN"].
        chunk_types: List of chunk type strings; empty by default.
        exclude_edges: Edge type names to exclude; defaults to a fresh
            ``["RULED_INADMISSIBLE"]`` list for each instance.
    """

    name: str
    anchor_nodes: list[str]
    edge_types: list[str]
    chunk_types: list[str] = field(default_factory=list)
    exclude_edges: list[str] = field(
        default_factory=lambda: ["RULED_INADMISSIBLE"],
    )
@dataclass
class JurorContext:
    """Outcome of a juror subgraph query.

    Attributes:
        persona: Persona the context was built for.
        case_mnc: MNC of the case.
        context_text: Chunks assembled up to the token budget.
        source_chunk_ids: IDs of the contributing chunks, kept for
            citation and traceability.
        total_tokens: Token total; 0 by default.
    """

    persona: str
    case_mnc: str
    context_text: str
    source_chunk_ids: list[str]
    total_tokens: int = 0