# aucourt-ingest/aucourt_ingest/sources/nsw_caselaw.py

"""NSW Caselaw source — browse pagination via JSON API."""

from __future__ import annotations

import logging
from datetime import datetime, timezone

from bs4 import BeautifulSoup

from aucourt_ingest.models import FetchQueueItem, FetchStatus, RawDocument
from aucourt_ingest.sources.base import BaseSource
from aucourt_ingest.storage.doc_store import DocStore
from aucourt_ingest.storage.meta_db import MetaDB
from aucourt_ingest.utils.mnc_parser import parse_mnc

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Browse endpoint; queried with `filter` and `page` query parameters.
NSW_CASELAW_BROWSE_URL = "https://www.caselaw.nsw.gov.au/browse/list"
# Decision page URL template; `{id}` is filled with a decision's `id` field
# taken from the browse response.
NSW_CASELAW_DECISION_URL = "https://www.caselaw.nsw.gov.au/decision/{id}"


class NSWCaselawSource(BaseSource):
    """Fetch criminal decisions from NSW Caselaw via browse/list JSON API.

    API: GET /browse/list?filter=criminal&page=N
    Returns JSON with searchableDecisions array (200 per page).
    Total: ~125,000 criminal decisions across 630 pages.
    """

    source_id = "nsw_caselaw"

    async def discover(self, page: int = 1, **kwargs) -> list[FetchQueueItem]:
        """Fetch one page of criminal decisions from the browse API.

        Entries that have no MNC, are flagged ``restricted``, or carry no
        ``id`` are skipped. Every kept entry is also registered through
        ``self.meta_db.insert_document`` when a meta DB is configured,
        keyed by its MNC.

        Args:
            page: Page number sent as the ``page`` query parameter.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One ``FetchQueueItem`` per kept decision. The list can be empty
            even if the page listed decisions, when all of them were skipped.

        Raises:
            Any error raised by the HTTP client, including the one raised by
            ``raise_for_status()`` for an error response.
        """
        params = {"filter": "criminal", "page": page}
        async with self.rate_limiter:
            resp = await self.client.get(NSW_CASELAW_BROWSE_URL, params=params)
            resp.raise_for_status()

        data = resp.json()
        # A missing or null array is treated as an empty page.
        decisions = data.get("searchableDecisions") or []
        items: list[FetchQueueItem] = []

        for dec in decisions:
            mnc = dec.get("mnc", "")
            if not mnc or dec.get("restricted", False):
                continue

            decision_id = dec.get("id")
            if decision_id is None or decision_id == "":
                # One malformed entry must not abort the whole page: without
                # an id there is no decision URL to build, so skip it.
                logger.warning(
                    "NSW Caselaw page %s: decision %s has no id, skipping",
                    page, mnc,
                )
                continue

            url = NSW_CASELAW_DECISION_URL.format(id=decision_id)
            items.append(FetchQueueItem(
                source_id=self.source_id,
                url=url,
                priority=5,
                doc_id=mnc,
            ))
            if self.meta_db:
                await self.meta_db.insert_document(mnc, self.source_id, url)

        total = data.get("totalElements", 0)
        total_pages = data.get("totalPages", 1)
        logger.info(
            "NSW Caselaw page %s/%s: %s decisions (total %s)",
            page, total_pages, len(items), total,
        )
        return items

    async def fetch(self, url: str, **kwargs) -> RawDocument:
        """Fetch a single decision page and extract judgment text.

        Text is taken from the first container that exists, in this order:
        ``div.judgment > div.body``, ``div.judgment``, ``<main>``,
        ``div.container``. If none is present the raw response text is used.

        The document id is resolved in this order: the ``doc_id`` keyword
        argument, the MNC parsed from the first 500 characters of the
        extracted text, then the last path segment of ``url``.

        Args:
            url: Decision page URL.
            **kwargs: ``doc_id`` (optional) is the caller-known identifier,
                normally the MNC produced by ``discover``.

        Returns:
            A ``RawDocument`` holding the extracted text, with ``format``
            set to ``"html"`` and a UTC ISO-8601 fetch timestamp.

        Raises:
            Any error raised by the HTTP client, including the one raised by
            ``raise_for_status()`` for an error response.
        """
        async with self.rate_limiter:
            resp = await self.client.get(url)
            resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # NSW Caselaw structure: <div class="judgment"> > <div class="body">
        judgment_div = soup.find("div", class_="judgment")
        if judgment_div:
            container = judgment_div.find("div", class_="body") or judgment_div
        else:
            container = soup.find("main") or soup.find("div", class_="container")

        if container:
            raw_text = container.get_text(separator="\n", strip=True)
        else:
            # No recognisable container: keep the whole response body.
            raw_text = resp.text

        # Prefer caller-provided doc_id (MNC from discover); only parse the
        # text when the caller did not supply one.
        doc_id = kwargs.get("doc_id", "")
        if not doc_id:
            parsed = parse_mnc(raw_text[:500])
            if parsed and parsed.mnc:
                doc_id = parsed.mnc
        if not doc_id:
            # Last resort: final URL path segment (ignoring a trailing slash).
            doc_id = url.rstrip("/").rsplit("/", 1)[-1]

        return RawDocument(
            source_id=self.source_id,
            doc_id=doc_id,
            url=url,
            fetch_timestamp=datetime.now(timezone.utc).isoformat(),
            raw_text=raw_text,
            format="html",
        )


async def bootstrap_nsw_caselaw(
    config, meta_db: MetaDB, doc_store: DocStore,
    limit: int = 0, pages: int = 0,
) -> int:
    """Discover and fetch criminal decisions from NSW Caselaw.

    Walks the browse pages starting at page 1. For each discovered item that
    is not already marked FETCHED or PARSED in ``meta_db``, the decision is
    fetched, saved to ``doc_store`` and its status/statistics are updated.
    A failure on a single document is logged, recorded as FAILED and does
    not stop the run.

    NOTE(review): the walk stops at the first page for which ``discover``
    returns no items. ``discover`` filters entries, so a page whose entries
    were all filtered out also ends the walk — confirm this is intended.

    Args:
        config: Passed through to ``NSWCaselawSource``.
        meta_db: Metadata store used for status tracking and statistics.
        doc_store: Destination for the fetched document text.
        limit: Maximum number of documents to fetch in this run; ``0`` means
            no limit. Already-fetched documents do not count.
        pages: Highest page number to visit; ``0`` means no limit.

    Returns:
        Number of documents successfully fetched in this run.

    Raises:
        Errors from ``meta_db.init_source`` and ``source.discover`` propagate
        after the source has been closed.
    """
    source = NSWCaselawSource(config, meta_db)
    total_fetched = 0
    page = 1

    try:
        # Inside the try so the source is closed even if this setup step fails.
        await meta_db.init_source(source.source_id)

        while not (pages and page > pages):
            items = await source.discover(page=page)
            if not items:
                break

            for item in items:
                if limit and total_fetched >= limit:
                    break

                doc_id = item.doc_id or item.url.split("/")[-1]
                existing = await meta_db.get_document(doc_id)
                if existing and existing["fetch_status"] in (FetchStatus.FETCHED, FetchStatus.PARSED):
                    continue

                try:
                    raw = await source.fetch(item.url, doc_id=doc_id)
                    # From here on use the id the source settled on, so a
                    # later failure is recorded against the same id.
                    doc_id = raw.doc_id
                    doc_store.save(source.source_id, raw.doc_id, raw.raw_text, fmt="html")
                    await meta_db.update_status(doc_id, FetchStatus.FETCHED)
                    await meta_db.update_doc_meta(doc_id, char_count=raw.char_count)
                    await meta_db.increment_source_stats(source.source_id, fetched=1)
                    total_fetched += 1
                    logger.info("[%s] %s (%s chars)", total_fetched, doc_id, raw.char_count)
                except Exception as e:
                    # Best effort per document: record the failure and move on.
                    logger.error("Failed: %s: %s", item.url, e)
                    await meta_db.update_status(doc_id, FetchStatus.FAILED, error_message=str(e))
                    await meta_db.increment_source_stats(source.source_id, failed=1)

            # The inner break only leaves the page; stop paging as well.
            if limit and total_fetched >= limit:
                break
            page += 1
    finally:
        await source.close()

    return total_fetched