"""Configuration loader for AuCourtIngest.""" from __future__ import annotations import tomllib from dataclasses import dataclass, field from pathlib import Path @dataclass class RateLimitConfig: rps: float = 1.0 concurrent: int = 1 retry_after: int = 60 @dataclass class SourceConfig: source_id: str base_url: str fetch_strategy: str rate_limit: RateLimitConfig = field(default_factory=RateLimitConfig) doc_formats: list[str] = field(default_factory=lambda: ["html"]) coverage_from: int | None = None rss_feed: str | None = None browse_url: str | None = None @dataclass class StorageConfig: data_dir: Path = Path("data") docs_dir: Path = Path("data/docs") raw_dir: Path = Path("data/raw") meta_db_path: Path = Path("data/meta.db") chromadb_dir: Path = Path("data/chromadb") neo4j_uri: str = "bolt://localhost:7687" neo4j_user: str = "neo4j" neo4j_password: str = "password" neo4j_database: str = "au_legal" @dataclass class LLMConfig: anthropic_api_key: str = "" openai_api_key: str = "" extraction_model: str = "claude-haiku-4-5-20251001" embedding_model: str = "text-embedding-3-small" embedding_batch_size: int = 100 @dataclass class TelegramConfig: bot_token: str = "" chat_id: str = "" enabled: bool = False @dataclass class ServerConfig: host: str = "127.0.0.1" port: int = 8000 graph_backend: str = "memory" default_max_tokens: int = 4000 @dataclass class AppConfig: """Root config — loaded from config.toml.""" sources: dict[str, SourceConfig] = field(default_factory=dict) rate_limits: dict[str, RateLimitConfig] = field(default_factory=dict) storage: StorageConfig = field(default_factory=StorageConfig) llm: LLMConfig = field(default_factory=LLMConfig) telegram: TelegramConfig = field(default_factory=TelegramConfig) server: ServerConfig = field(default_factory=ServerConfig) user_agent: str = "AuCourtIngest/0.1 (legal research)" @classmethod def load(cls, path: str | Path = "config.toml") -> AppConfig: p = Path(path) if not p.exists(): return cls() with open(p, "rb") as f: raw = tomllib.load(f) config = cls() # Parse sources for src_id, src_raw in raw.get("sources", {}).items(): rl_raw = src_raw.get("rate_limit", {}) rate_limit = RateLimitConfig( rps=rl_raw.get("rps", 1.0), concurrent=rl_raw.get("concurrent", 1), retry_after=rl_raw.get("retry_after", 60), ) source = SourceConfig( source_id=src_id, base_url=src_raw.get("base_url", ""), fetch_strategy=src_raw.get("fetch_strategy", ""), rate_limit=rate_limit, doc_formats=src_raw.get("doc_formats", ["html"]), coverage_from=src_raw.get("coverage_from"), rss_feed=src_raw.get("rss_feed"), browse_url=src_raw.get("browse_url"), ) config.sources[src_id] = source # Parse standalone rate limits (fallback) for k, v in raw.get("rate_limits", {}).items(): config.rate_limits[k] = RateLimitConfig( rps=v.get("rps", 1.0), concurrent=v.get("concurrent", 1), retry_after=v.get("retry_after", 60), ) # Storage stg = raw.get("storage", {}) data_dir = Path(stg.get("data_dir", "data")) config.storage = StorageConfig( data_dir=data_dir, docs_dir=data_dir / "docs", raw_dir=data_dir / "raw", meta_db_path=data_dir / "meta.db", chromadb_dir=data_dir / "chromadb", neo4j_uri=stg.get("neo4j_uri", "bolt://localhost:7687"), neo4j_user=stg.get("neo4j_user", "neo4j"), neo4j_password=stg.get("neo4j_password", "password"), neo4j_database=stg.get("neo4j_database", "au_legal"), ) # LLM llm = raw.get("llm", {}) config.llm = LLMConfig( anthropic_api_key=llm.get("anthropic_api_key", ""), openai_api_key=llm.get("openai_api_key", ""), extraction_model=llm.get("extraction_model", "claude-haiku-4-5-20251001"), embedding_model=llm.get("embedding_model", "text-embedding-3-small"), embedding_batch_size=llm.get("embedding_batch_size", 100), ) # Telegram tg = raw.get("telegram", {}) config.telegram = TelegramConfig( bot_token=tg.get("bot_token", ""), chat_id=tg.get("chat_id", ""), enabled=tg.get("enabled", False), ) # Server srv = raw.get("server", {}) config.server = ServerConfig( host=srv.get("host", "127.0.0.1"), port=srv.get("port", 8000), graph_backend=srv.get("graph_backend", "memory"), default_max_tokens=srv.get("default_max_tokens", 4000), ) config.user_agent = raw.get("user_agent", "AuCourtIngest/0.1 (legal research)") return config