Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
144 lines
5.3 KiB
Python
144 lines
5.3 KiB
Python
"""NSW Caselaw source — browse pagination via JSON API."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from aucourt_ingest.models import FetchQueueItem, FetchStatus, RawDocument
|
|
from aucourt_ingest.sources.base import BaseSource
|
|
from aucourt_ingest.storage.doc_store import DocStore
|
|
from aucourt_ingest.storage.meta_db import MetaDB
|
|
from aucourt_ingest.utils.mnc_parser import parse_mnc
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
# Browse endpoint queried by NSWCaselawSource.discover with the
# "filter" and "page" query parameters.
NSW_CASELAW_BROWSE_URL = "https://www.caselaw.nsw.gov.au/browse/list"
|
|
# URL template for a single decision page; "{id}" is filled with the "id"
# field of an entry returned by the browse endpoint.
NSW_CASELAW_DECISION_URL = "https://www.caselaw.nsw.gov.au/decision/{id}"
|
|
|
|
|
|
class NSWCaselawSource(BaseSource):
    """Fetch criminal decisions from NSW Caselaw via browse/list JSON API.

    API: GET /browse/list?filter=criminal&page=N
    Returns JSON with searchableDecisions array (200 per page).
    Total: ~125,000 criminal decisions across 630 pages.
    """

    source_id = "nsw_caselaw"

    async def discover(self, page: int = 1, **kwargs) -> list[FetchQueueItem]:
        """Fetch one page of criminal decisions from the browse API.

        Entries that have no MNC or are flagged ``restricted`` are skipped.
        Entries that carry an MNC but no ``id`` cannot be turned into a
        decision URL and are skipped with a warning instead of aborting the
        whole page. When ``self.meta_db`` is set, every queued decision is
        also registered through ``meta_db.insert_document``.

        Args:
            page: Page number sent as the ``page`` query parameter.
            **kwargs: Ignored by this source.

        Returns:
            One ``FetchQueueItem`` per eligible decision on the page; the
            item's ``doc_id`` is the decision's MNC.

        Raises:
            Whatever ``resp.raise_for_status()`` raises for an error response.
        """
        params = {"filter": "criminal", "page": page}
        async with self.rate_limiter:
            resp = await self.client.get(NSW_CASELAW_BROWSE_URL, params=params)
            resp.raise_for_status()

        data = resp.json()
        # `or []` also covers an explicit JSON null for the key.
        decisions = data.get("searchableDecisions") or []
        items = []

        for dec in decisions:
            mnc = dec.get("mnc", "")
            if not mnc or dec.get("restricted", False):
                continue

            decision_id = dec.get("id")
            if decision_id is None or decision_id == "":
                # Without an id there is no decision URL to fetch.
                logger.warning(
                    "NSW Caselaw page %s: skipping %s (no decision id)", page, mnc
                )
                continue

            url = NSW_CASELAW_DECISION_URL.format(id=decision_id)
            items.append(FetchQueueItem(
                source_id=self.source_id,
                url=url,
                priority=5,
                doc_id=mnc,
            ))
            if self.meta_db:
                await self.meta_db.insert_document(mnc, self.source_id, url)

        total = data.get("totalElements", 0)
        total_pages = data.get("totalPages", 1)
        logger.info(
            "NSW Caselaw page %s/%s: %s decisions (total %s)",
            page, total_pages, len(items), total,
        )
        return items

    async def fetch(self, url: str, **kwargs) -> RawDocument:
        """Fetch a single decision page and extract judgment text.

        Text is taken from the first match of, in order:
        ``<div class="judgment"> > <div class="body">``, the judgment div
        itself, ``<main>``, ``<div class="container">``; if none exists the
        raw response text is used.

        Args:
            url: Decision page URL.
            **kwargs: ``doc_id`` (optional) — identifier already known to the
                caller (the MNC from ``discover``); it takes precedence over
                anything parsed from the page.

        Returns:
            A ``RawDocument`` with ``format="html"`` and a UTC ISO-8601
            ``fetch_timestamp``.

        Raises:
            Whatever ``resp.raise_for_status()`` raises for an error response.
        """
        async with self.rate_limiter:
            resp = await self.client.get(url)
            resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # NSW Caselaw structure: <div class="judgment"> > <div class="body">
        judgment_div = soup.find("div", class_="judgment")
        if judgment_div:
            text_root = judgment_div.find("div", class_="body") or judgment_div
            raw_text = text_root.get_text(separator="\n", strip=True)
        else:
            body = soup.find("main") or soup.find("div", class_="container")
            raw_text = body.get_text(separator="\n", strip=True) if body else resp.text

        # Identifier precedence: caller-provided doc_id, then an MNC parsed
        # from the start of the text, then the last URL path segment.
        known_id = kwargs.get("doc_id", "")
        parsed = parse_mnc(raw_text[:500])
        # NOTE(review): assumes parse_mnc returns a falsy value when nothing
        # matches; an empty `.mnc` on a truthy result also falls through.
        parsed_mnc = parsed.mnc if parsed else ""
        doc_id = known_id or parsed_mnc or url.split("/")[-1]

        return RawDocument(
            source_id=self.source_id,
            doc_id=doc_id,
            url=url,
            fetch_timestamp=datetime.now(timezone.utc).isoformat(),
            raw_text=raw_text,
            format="html",
        )
|
|
|
|
|
|
async def bootstrap_nsw_caselaw(
    config, meta_db: MetaDB, doc_store: DocStore,
    limit: int = 0, pages: int = 0,
) -> int:
    """Crawl NSW Caselaw browse pages and store each newly fetched decision.

    Pages are walked from 1 upwards. For every discovered item that is not
    already recorded as FETCHED or PARSED, the decision is fetched, saved to
    ``doc_store`` and its status and statistics are updated in ``meta_db``.
    A failure on one item is logged and recorded as FAILED; the crawl then
    continues with the next item.

    Args:
        config: Passed through to ``NSWCaselawSource``.
        meta_db: Metadata store used for status lookups and updates.
        doc_store: Store that receives the extracted text.
        limit: Stop after this many successful fetches; 0 means no limit.
        pages: Highest page number to process; 0 means keep going until
            ``discover`` returns an empty page.

    Returns:
        Number of documents fetched successfully in this run.
    """
    source = NSWCaselawSource(config, meta_db)
    await meta_db.init_source(source.source_id)

    done_states = (FetchStatus.FETCHED, FetchStatus.PARSED)
    total_fetched = 0
    page = 1

    try:
        # A falsy `pages` disables the page cap.
        while not pages or page <= pages:
            batch = await source.discover(page=page)
            if not batch:
                # An empty page ends the crawl.
                break

            for item in batch:
                if limit and total_fetched >= limit:
                    break

                doc_id = item.doc_id or item.url.split("/")[-1]
                record = await meta_db.get_document(doc_id)
                if record and record["fetch_status"] in done_states:
                    continue

                try:
                    raw = await source.fetch(item.url, doc_id=doc_id)
                    # From here on, use the identifier the fetch settled on.
                    doc_id = raw.doc_id
                    doc_store.save(source.source_id, raw.doc_id, raw.raw_text, fmt="html")
                    await meta_db.update_status(doc_id, FetchStatus.FETCHED)
                    await meta_db.update_doc_meta(doc_id, char_count=raw.char_count)
                    await meta_db.increment_source_stats(source.source_id, fetched=1)
                    total_fetched += 1
                    logger.info(f"[{total_fetched}] {doc_id} ({raw.char_count} chars)")
                except Exception as e:
                    logger.error(f"Failed: {item.url}: {e}")
                    await meta_db.update_status(doc_id, FetchStatus.FAILED, error_message=str(e))
                    await meta_db.increment_source_stats(source.source_id, failed=1)

            # The inner break only leaves the item loop; re-check the limit
            # so the page loop stops as well.
            if limit and total_fetched >= limit:
                break
            page += 1
    finally:
        # Close the source whether the crawl finished or raised.
        await source.close()

    return total_fetched
|