aucourt-ingest/aucourt_ingest/config.py

"""Configuration loader for AuCourtIngest."""

from __future__ import annotations

import tomllib
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class RateLimitConfig:
    """Throttling settings for one source or one named rate limit.

    This class only stores the values; nothing here enforces them.
    """

    # Presumably requests per second — TODO confirm against the consumer.
    rps: float = 1.0
    # Presumably the number of simultaneous requests allowed — TODO confirm.
    concurrent: int = 1
    # Presumably a back-off period in seconds — TODO confirm.
    retry_after: int = 60


@dataclass
class SourceConfig:
    """Settings for a single ingest source.

    The first three fields are required; every other field has a default.
    """

    source_id: str
    base_url: str
    fetch_strategy: str

    # default_factory gives each instance its own RateLimitConfig and its own
    # list, so mutating one source's defaults never affects another source.
    rate_limit: RateLimitConfig = field(default_factory=RateLimitConfig)
    doc_formats: list[str] = field(default_factory=lambda: ["html"])

    # Optional settings; None means the value was not supplied.
    coverage_from: int | None = None  # presumably a starting year — TODO confirm
    rss_feed: str | None = None
    browse_url: str | None = None


@dataclass
class StorageConfig:
    """Filesystem locations and Neo4j connection settings.

    The Neo4j password is excluded from the generated ``repr`` so that
    printing or logging a config object does not leak the credential. It is
    still accepted by ``__init__`` (same position as before) and still takes
    part in equality comparison.
    """

    # The path defaults below are literals: passing a custom ``data_dir`` to
    # the constructor does NOT relocate the other four paths automatically.
    data_dir: Path = Path("data")
    docs_dir: Path = Path("data/docs")
    raw_dir: Path = Path("data/raw")
    meta_db_path: Path = Path("data/meta.db")
    chromadb_dir: Path = Path("data/chromadb")

    neo4j_uri: str = "bolt://localhost:7687"
    neo4j_user: str = "neo4j"
    # repr=False keeps the secret out of repr()/str() output.
    neo4j_password: str = field(default="password", repr=False)
    neo4j_database: str = "au_legal"


@dataclass
class LLMConfig:
    """API credentials and model names for the LLM-backed steps.

    Both API keys are excluded from the generated ``repr`` so that printing
    or logging a config object does not leak them. They are still accepted by
    ``__init__`` (same positions as before) and still take part in equality.
    """

    # repr=False keeps the secrets out of repr()/str() output.
    anthropic_api_key: str = field(default="", repr=False)
    openai_api_key: str = field(default="", repr=False)

    extraction_model: str = "claude-haiku-4-5-20251001"
    embedding_model: str = "text-embedding-3-small"
    # Presumably the number of texts sent per embedding request — TODO confirm.
    embedding_batch_size: int = 100


@dataclass
class TelegramConfig:
    """Telegram settings: bot credentials, target chat, and an on/off flag.

    The bot token is excluded from the generated ``repr`` so that printing or
    logging a config object does not leak it. It is still accepted by
    ``__init__`` (same position as before) and still takes part in equality.
    """

    # repr=False keeps the secret out of repr()/str() output.
    bot_token: str = field(default="", repr=False)
    chat_id: str = ""
    # Off by default.
    enabled: bool = False


@dataclass
class ServerConfig:
    """Values read from the ``[server]`` section of the config file."""

    # Bind address and port; the default address is the IPv4 loopback.
    host: str = "127.0.0.1"
    port: int = 8000

    # Free-form backend name; "memory" is the default.
    graph_backend: str = "memory"
    # Presumably a token budget applied when a request gives none — TODO confirm.
    default_max_tokens: int = 4000


@dataclass
class AppConfig:
    """Root config — loaded from config.toml.

    Every section of the file is optional. A key that is absent keeps the
    default declared on the corresponding section dataclass, so scalar
    defaults are not repeated inside :meth:`load`.
    """

    sources: dict[str, SourceConfig] = field(default_factory=dict)
    rate_limits: dict[str, RateLimitConfig] = field(default_factory=dict)
    storage: StorageConfig = field(default_factory=StorageConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    telegram: TelegramConfig = field(default_factory=TelegramConfig)
    server: ServerConfig = field(default_factory=ServerConfig)
    user_agent: str = "AuCourtIngest/0.1 (legal research)"

    @staticmethod
    def _present(table: dict, *keys: str) -> dict:
        """Return the entries of *table* whose key is one of *keys*.

        Keys missing from *table* are simply left out, which lets the target
        dataclass apply its own default when the result is splatted into its
        constructor. ``table.keys()`` is used on purpose: a value that is not
        a TOML table (e.g. a string) fails with ``AttributeError`` instead of
        being silently treated as empty.
        """
        return {key: table[key] for key in table.keys() & set(keys)}

    @classmethod
    def _rate_limit(cls, table: dict) -> RateLimitConfig:
        """Build a RateLimitConfig from a TOML table, defaulting absent keys."""
        return RateLimitConfig(
            **cls._present(table, "rps", "concurrent", "retry_after")
        )

    @classmethod
    def load(cls, path: str | Path = "config.toml") -> AppConfig:
        """Load the application config from a TOML file.

        Args:
            path: Location of the TOML file.

        Returns:
            The populated config. If *path* does not exist, a config made
            entirely of defaults is returned instead.

        Raises:
            tomllib.TOMLDecodeError: If the file is not valid TOML.
            AttributeError: If a section that must be a table holds a
                different TOML type.
        """
        p = Path(path)
        if not p.exists():
            return cls()

        with p.open("rb") as f:
            raw = tomllib.load(f)

        config = cls()

        # Sources: one SourceConfig per entry, keyed by the table name.
        for src_id, src_raw in raw.get("sources", {}).items():
            config.sources[src_id] = SourceConfig(
                source_id=src_id,
                # These two are required dataclass fields, so an absent key
                # has to be filled in here; it becomes the empty string.
                base_url=src_raw.get("base_url", ""),
                fetch_strategy=src_raw.get("fetch_strategy", ""),
                rate_limit=cls._rate_limit(src_raw.get("rate_limit", {})),
                **cls._present(
                    src_raw,
                    "doc_formats",
                    "coverage_from",
                    "rss_feed",
                    "browse_url",
                ),
            )

        # Standalone rate limits (fallback), stored by name. They are not
        # merged into the per-source limits here.
        for name, table in raw.get("rate_limits", {}).items():
            config.rate_limits[name] = cls._rate_limit(table)

        # Storage: the four sub-paths are always derived from data_dir; the
        # file cannot set them individually.
        stg = raw.get("storage", {})
        data_dir = Path(stg.get("data_dir", "data"))
        config.storage = StorageConfig(
            data_dir=data_dir,
            docs_dir=data_dir / "docs",
            raw_dir=data_dir / "raw",
            meta_db_path=data_dir / "meta.db",
            chromadb_dir=data_dir / "chromadb",
            **cls._present(
                stg,
                "neo4j_uri",
                "neo4j_user",
                "neo4j_password",
                "neo4j_database",
            ),
        )

        # LLM
        config.llm = LLMConfig(
            **cls._present(
                raw.get("llm", {}),
                "anthropic_api_key",
                "openai_api_key",
                "extraction_model",
                "embedding_model",
                "embedding_batch_size",
            )
        )

        # Telegram
        tg = cls._present(
            raw.get("telegram", {}), "bot_token", "chat_id", "enabled"
        )
        if "chat_id" in tg:
            # TOML is typed: `chat_id = 12345` arrives as an int. The field is
            # declared as str, so normalise it here.
            tg["chat_id"] = str(tg["chat_id"])
        config.telegram = TelegramConfig(**tg)

        # Server
        config.server = ServerConfig(
            **cls._present(
                raw.get("server", {}),
                "host",
                "port",
                "graph_backend",
                "default_max_tokens",
            )
        )

        # config.user_agent already holds the dataclass default at this point.
        config.user_agent = raw.get("user_agent", config.user_agent)

        return config
AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`"""Configuration loader for AuCourtIngest."""`

			`from __future__ import annotations`

			`import tomllib`
			`from dataclasses import dataclass, field`
			`from pathlib import Path`


			`@dataclass`
			`class RateLimitConfig:`
			`rps: float = 1.0`
			`concurrent: int = 1`
			`retry_after: int = 60`


			`@dataclass`
			`class SourceConfig:`
			`source_id: str`
			`base_url: str`
			`fetch_strategy: str`
			`rate_limit: RateLimitConfig = field(default_factory=RateLimitConfig)`
			`doc_formats: list[str] = field(default_factory=lambda: ["html"])`
			`coverage_from: int \| None = None`
			`rss_feed: str \| None = None`
			`browse_url: str \| None = None`


			`@dataclass`
			`class StorageConfig:`
			`data_dir: Path = Path("data")`
			`docs_dir: Path = Path("data/docs")`
			`raw_dir: Path = Path("data/raw")`
			`meta_db_path: Path = Path("data/meta.db")`
			`chromadb_dir: Path = Path("data/chromadb")`
			`neo4j_uri: str = "bolt://localhost:7687"`
			`neo4j_user: str = "neo4j"`
			`neo4j_password: str = "password"`
			`neo4j_database: str = "au_legal"`


			`@dataclass`
			`class LLMConfig:`
			`anthropic_api_key: str = ""`
			`openai_api_key: str = ""`
			`extraction_model: str = "claude-haiku-4-5-20251001"`
			`embedding_model: str = "text-embedding-3-small"`
			`embedding_batch_size: int = 100`


			`@dataclass`
			`class TelegramConfig:`
			`bot_token: str = ""`
			`chat_id: str = ""`
			`enabled: bool = False`


Stage 9: add read-only FastAPI query API for juror RAG queries 8 GET endpoints under /api/v1 for health, personas, cases, vector search, juror context, and hybrid search. Includes QueryService composing SubgraphQuery + VectorIndex + GraphDB, Pydantic response models, error handlers, and `serve` CLI mode via uvicorn. 20 new tests, 190 total, zero regressions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 02:08:55 +00:00			`@dataclass`
			`class ServerConfig:`
			`host: str = "127.0.0.1"`
			`port: int = 8000`
			`graph_backend: str = "memory"`
			`default_max_tokens: int = 4000`


AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`@dataclass`
			`class AppConfig:`
			`"""Root config — loaded from config.toml."""`

			`sources: dict[str, SourceConfig] = field(default_factory=dict)`
			`rate_limits: dict[str, RateLimitConfig] = field(default_factory=dict)`
			`storage: StorageConfig = field(default_factory=StorageConfig)`
			`llm: LLMConfig = field(default_factory=LLMConfig)`
			`telegram: TelegramConfig = field(default_factory=TelegramConfig)`
Stage 9: add read-only FastAPI query API for juror RAG queries 8 GET endpoints under /api/v1 for health, personas, cases, vector search, juror context, and hybrid search. Includes QueryService composing SubgraphQuery + VectorIndex + GraphDB, Pydantic response models, error handlers, and `serve` CLI mode via uvicorn. 20 new tests, 190 total, zero regressions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 02:08:55 +00:00			`server: ServerConfig = field(default_factory=ServerConfig)`
AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`user_agent: str = "AuCourtIngest/0.1 (legal research)"`

			`@classmethod`
			`def load(cls, path: str \| Path = "config.toml") -> AppConfig:`
			`p = Path(path)`
			`if not p.exists():`
			`return cls()`

			`with open(p, "rb") as f:`
			`raw = tomllib.load(f)`

			`config = cls()`

			`# Parse sources`
			`for src_id, src_raw in raw.get("sources", {}).items():`
			`rl_raw = src_raw.get("rate_limit", {})`
			`rate_limit = RateLimitConfig(`
			`rps=rl_raw.get("rps", 1.0),`
			`concurrent=rl_raw.get("concurrent", 1),`
			`retry_after=rl_raw.get("retry_after", 60),`
			`)`
			`source = SourceConfig(`
			`source_id=src_id,`
			`base_url=src_raw.get("base_url", ""),`
			`fetch_strategy=src_raw.get("fetch_strategy", ""),`
			`rate_limit=rate_limit,`
			`doc_formats=src_raw.get("doc_formats", ["html"]),`
			`coverage_from=src_raw.get("coverage_from"),`
			`rss_feed=src_raw.get("rss_feed"),`
			`browse_url=src_raw.get("browse_url"),`
			`)`
			`config.sources[src_id] = source`

			`# Parse standalone rate limits (fallback)`
			`for k, v in raw.get("rate_limits", {}).items():`
			`config.rate_limits[k] = RateLimitConfig(`
			`rps=v.get("rps", 1.0),`
			`concurrent=v.get("concurrent", 1),`
			`retry_after=v.get("retry_after", 60),`
			`)`

			`# Storage`
			`stg = raw.get("storage", {})`
			`data_dir = Path(stg.get("data_dir", "data"))`
			`config.storage = StorageConfig(`
			`data_dir=data_dir,`
			`docs_dir=data_dir / "docs",`
			`raw_dir=data_dir / "raw",`
			`meta_db_path=data_dir / "meta.db",`
			`chromadb_dir=data_dir / "chromadb",`
			`neo4j_uri=stg.get("neo4j_uri", "bolt://localhost:7687"),`
			`neo4j_user=stg.get("neo4j_user", "neo4j"),`
			`neo4j_password=stg.get("neo4j_password", "password"),`
			`neo4j_database=stg.get("neo4j_database", "au_legal"),`
			`)`

			`# LLM`
			`llm = raw.get("llm", {})`
			`config.llm = LLMConfig(`
			`anthropic_api_key=llm.get("anthropic_api_key", ""),`
			`openai_api_key=llm.get("openai_api_key", ""),`
			`extraction_model=llm.get("extraction_model", "claude-haiku-4-5-20251001"),`
			`embedding_model=llm.get("embedding_model", "text-embedding-3-small"),`
			`embedding_batch_size=llm.get("embedding_batch_size", 100),`
			`)`

			`# Telegram`
			`tg = raw.get("telegram", {})`
			`config.telegram = TelegramConfig(`
			`bot_token=tg.get("bot_token", ""),`
			`chat_id=tg.get("chat_id", ""),`
			`enabled=tg.get("enabled", False),`
			`)`

Stage 9: add read-only FastAPI query API for juror RAG queries 8 GET endpoints under /api/v1 for health, personas, cases, vector search, juror context, and hybrid search. Includes QueryService composing SubgraphQuery + VectorIndex + GraphDB, Pydantic response models, error handlers, and `serve` CLI mode via uvicorn. 20 new tests, 190 total, zero regressions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 02:08:55 +00:00			`# Server`
			`srv = raw.get("server", {})`
			`config.server = ServerConfig(`
			`host=srv.get("host", "127.0.0.1"),`
			`port=srv.get("port", 8000),`
			`graph_backend=srv.get("graph_backend", "memory"),`
			`default_max_tokens=srv.get("default_max_tokens", 4000),`
			`)`

AuCourtIngest: complete 8-stage Australian legal case ingestion pipeline Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-05-30 01:56:23 +00:00			`config.user_agent = raw.get("user_agent", "AuCourtIngest/0.1 (legal research)")`

			`return config`