aucourt-ingest/aucourt_ingest/config.py

162 lines
5.1 KiB
Python
Raw Normal View History

"""Configuration loader for AuCourtIngest."""
from __future__ import annotations
import tomllib
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class RateLimitConfig:
    """Throttling settings for a source.

    Used both as ``SourceConfig.rate_limit`` and as the value type of
    ``AppConfig.rate_limits``.  Units are not stated in this module: the
    names suggest requests-per-second, parallel requests and a back-off in
    seconds -- TODO confirm against the code that consumes these values.
    """

    rps: float = 1.0
    concurrent: int = 1
    retry_after: int = 60
@dataclass
class SourceConfig:
    """Settings for a single ingest source.

    ``source_id``, ``base_url`` and ``fetch_strategy`` are required; the
    remaining fields are optional.  The set of accepted ``fetch_strategy``
    values is not defined in this module.
    """

    # Required fields (no defaults).
    source_id: str
    base_url: str
    fetch_strategy: str

    # Optional fields.  Mutable defaults go through ``default_factory`` so
    # every instance gets its own object.
    rate_limit: RateLimitConfig = field(default_factory=RateLimitConfig)
    doc_formats: list[str] = field(default_factory=lambda: ["html"])
    coverage_from: int | None = None  # presumably a starting year -- TODO confirm
    rss_feed: str | None = None
    browse_url: str | None = None
@dataclass
class StorageConfig:
    """Filesystem locations and Neo4j connection settings.

    The default sub-paths all live under the default ``data_dir`` ("data").
    They are independent fields: changing ``data_dir`` on an instance does
    not move the other paths.
    """

    # Local paths.
    data_dir: Path = Path("data")
    docs_dir: Path = Path("data/docs")
    raw_dir: Path = Path("data/raw")
    meta_db_path: Path = Path("data/meta.db")
    chromadb_dir: Path = Path("data/chromadb")

    # Neo4j connection.
    # NOTE(review): the default password is the literal "password" --
    # presumably a local-development placeholder; confirm it is always
    # overridden outside development.
    neo4j_uri: str = "bolt://localhost:7687"
    neo4j_user: str = "neo4j"
    neo4j_password: str = "password"
    neo4j_database: str = "au_legal"
@dataclass
class LLMConfig:
    """API keys and model names for the extraction and embedding steps.

    Both API keys default to the empty string; how an empty key is handled
    is up to the code that consumes this config.
    """

    # Credentials.
    anthropic_api_key: str = ""
    openai_api_key: str = ""

    # Model selection.
    extraction_model: str = "claude-haiku-4-5-20251001"
    embedding_model: str = "text-embedding-3-small"
    embedding_batch_size: int = 100
@dataclass
class TelegramConfig:
    """Telegram settings: bot token, target chat and an on/off switch.

    ``enabled`` defaults to ``False`` and both identifiers default to the
    empty string.
    """

    bot_token: str = ""
    chat_id: str = ""
    enabled: bool = False
@dataclass
class ServerConfig:
    """Server settings: bind address, graph backend and a token budget.

    The accepted ``graph_backend`` values are not defined in this module;
    the default is ``"memory"``.
    """

    # Bind address (loopback by default).
    host: str = "127.0.0.1"
    port: int = 8000

    graph_backend: str = "memory"
    default_max_tokens: int = 4000
@dataclass
class AppConfig:
    """Root config — loaded from config.toml.

    Aggregates the per-section dataclasses.  Every field has a default, so
    ``AppConfig()`` is a complete configuration on its own; :meth:`load`
    only overrides the values that actually appear in the TOML file.
    """

    sources: dict[str, SourceConfig] = field(default_factory=dict)
    rate_limits: dict[str, RateLimitConfig] = field(default_factory=dict)
    storage: StorageConfig = field(default_factory=StorageConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    telegram: TelegramConfig = field(default_factory=TelegramConfig)
    server: ServerConfig = field(default_factory=ServerConfig)
    user_agent: str = "AuCourtIngest/0.1 (legal research)"

    @staticmethod
    def _present(table: dict, *keys: str) -> dict:
        """Return the entries of *table* whose key is one of *keys*.

        Passing only the keys that are present lets each dataclass supply
        its own default for the rest, so defaults are declared in exactly
        one place.  Keys not listed in *keys* are ignored.
        """
        return {key: table[key] for key in keys if key in table}

    @classmethod
    def _rate_limit(cls, table: dict) -> RateLimitConfig:
        """Build a ``RateLimitConfig`` from a TOML table."""
        return RateLimitConfig(
            **cls._present(table, "rps", "concurrent", "retry_after")
        )

    @classmethod
    def _source(cls, source_id: str, table: dict) -> SourceConfig:
        """Build the ``SourceConfig`` for the ``[sources.<source_id>]`` table."""
        return SourceConfig(
            source_id=source_id,
            # These two fields have no dataclass default; a missing key
            # becomes the empty string.
            base_url=table.get("base_url", ""),
            fetch_strategy=table.get("fetch_strategy", ""),
            rate_limit=cls._rate_limit(table.get("rate_limit", {})),
            **cls._present(
                table, "doc_formats", "coverage_from", "rss_feed", "browse_url"
            ),
        )

    @classmethod
    def _storage(cls, table: dict) -> StorageConfig:
        """Build the ``StorageConfig`` for the ``[storage]`` table.

        The four sub-paths are always derived from ``data_dir``; keys such
        as ``docs_dir`` in the TOML table are not read.
        """
        data_dir = Path(table.get("data_dir", StorageConfig().data_dir))
        return StorageConfig(
            data_dir=data_dir,
            docs_dir=data_dir / "docs",
            raw_dir=data_dir / "raw",
            meta_db_path=data_dir / "meta.db",
            chromadb_dir=data_dir / "chromadb",
            **cls._present(
                table, "neo4j_uri", "neo4j_user", "neo4j_password", "neo4j_database"
            ),
        )

    @classmethod
    def load(cls, path: str | Path = "config.toml") -> AppConfig:
        """Load the configuration from a TOML file.

        Args:
            path: Location of the TOML file.

        Returns:
            A config built from the file.  If the file does not exist, a
            config with all defaults is returned instead.

        Raises:
            tomllib.TOMLDecodeError: If the file is not valid TOML.
            OSError: If the file exists but cannot be opened (for example a
                permission error or a directory at *path*).
        """
        # Open first and handle "missing" afterwards: a separate exists()
        # check could race with the file being removed before the open.
        try:
            handle = Path(path).open("rb")
        except (FileNotFoundError, NotADirectoryError):
            return cls()
        with handle:
            raw = tomllib.load(handle)

        config = cls()

        # Per-source settings, keyed by the table name under [sources].
        for source_id, table in raw.get("sources", {}).items():
            config.sources[source_id] = cls._source(source_id, table)

        # Parse standalone rate limits (fallback)
        for name, table in raw.get("rate_limits", {}).items():
            config.rate_limits[name] = cls._rate_limit(table)

        config.storage = cls._storage(raw.get("storage", {}))
        config.llm = LLMConfig(
            **cls._present(
                raw.get("llm", {}),
                "anthropic_api_key",
                "openai_api_key",
                "extraction_model",
                "embedding_model",
                "embedding_batch_size",
            )
        )
        config.telegram = TelegramConfig(
            **cls._present(raw.get("telegram", {}), "bot_token", "chat_id", "enabled")
        )
        config.server = ServerConfig(
            **cls._present(
                raw.get("server", {}),
                "host",
                "port",
                "graph_backend",
                "default_max_tokens",
            )
        )

        # ``config`` already carries the default user agent from ``cls()``.
        if "user_agent" in raw:
            config.user_agent = raw["user_agent"]
        return config