Source layer (5 court sources), processing pipeline (parse/extract/chunk/embed/graph), property graph with 8 node types, juror subgraph queries with 6 personas, orchestrator with bootstrap/watch/backfill/audit/process modes, 170 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
143 lines
4.6 KiB
Python
143 lines
4.6 KiB
Python
"""Configuration loader for AuCourtIngest."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tomllib
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
|
|
@dataclass
class RateLimitConfig:
    """Rate-limit settings, used per source and in ``AppConfig.rate_limits``.

    Units are not established in this module; the notes on each field are
    assumptions based on the field names and should be confirmed against
    the code that consumes these values.
    """

    # Presumably the maximum requests per second -- TODO confirm with consumer.
    rps: float = 1.0
    # Presumably the maximum number of simultaneous requests -- TODO confirm.
    concurrent: int = 1
    # Presumably a back-off delay in seconds -- TODO confirm.
    retry_after: int = 60
|
|
|
|
|
|
@dataclass
class SourceConfig:
    """Settings describing one configured source.

    ``source_id``, ``base_url`` and ``fetch_strategy`` are required; every
    other field has a default. The meaning of ``fetch_strategy`` values is
    defined by whatever consumes this config, not by this module.
    """

    # Required fields (no defaults).
    source_id: str
    base_url: str
    fetch_strategy: str

    # Each instance gets its own RateLimitConfig / list via default_factory,
    # so mutating one source's settings never affects another.
    rate_limit: RateLimitConfig = field(default_factory=RateLimitConfig)
    doc_formats: list[str] = field(default_factory=lambda: ["html"])

    # Optional settings; None means "not configured".
    # NOTE(review): coverage_from looks like a starting year -- confirm.
    coverage_from: int | None = None
    rss_feed: str | None = None
    browse_url: str | None = None
|
|
|
|
|
|
@dataclass
class StorageConfig:
    """Filesystem locations and Neo4j connection settings.

    The path defaults all live under ``data/``. ``neo4j_password`` is
    excluded from the generated ``repr`` so that printing or logging a
    config object does not leak the credential; it is still a normal
    constructor argument, attribute and equality-compared field.
    """

    # Filesystem layout.
    data_dir: Path = Path("data")
    docs_dir: Path = Path("data/docs")
    raw_dir: Path = Path("data/raw")
    meta_db_path: Path = Path("data/meta.db")
    chromadb_dir: Path = Path("data/chromadb")

    # Neo4j connection.
    neo4j_uri: str = "bolt://localhost:7687"
    neo4j_user: str = "neo4j"
    # repr=False keeps the secret out of logs and tracebacks.
    neo4j_password: str = field(default="password", repr=False)
    neo4j_database: str = "au_legal"
|
|
|
|
|
|
@dataclass
class LLMConfig:
    """API credentials and model names for extraction and embedding.

    Both API keys are excluded from the generated ``repr`` so that printing
    or logging a config object does not leak them; they remain ordinary
    constructor arguments, attributes and equality-compared fields.
    """

    # Secrets: repr=False keeps them out of logs and tracebacks.
    anthropic_api_key: str = field(default="", repr=False)
    openai_api_key: str = field(default="", repr=False)

    # Model identifiers and embedding batch size.
    extraction_model: str = "claude-haiku-4-5-20251001"
    embedding_model: str = "text-embedding-3-small"
    embedding_batch_size: int = 100
|
|
|
|
|
|
@dataclass
class TelegramConfig:
    """Telegram bot settings; ``enabled`` defaults to False.

    ``bot_token`` is excluded from the generated ``repr`` so that printing
    or logging a config object does not leak it; it remains an ordinary
    constructor argument, attribute and equality-compared field.
    """

    # Secret: repr=False keeps it out of logs and tracebacks.
    bot_token: str = field(default="", repr=False)
    chat_id: str = ""
    enabled: bool = False
|
|
|
|
|
|
@dataclass
class AppConfig:
    """Root config — loaded from config.toml.

    Holds the per-source settings, standalone rate limits, storage, LLM and
    Telegram sections, plus the ``user_agent`` string. Use :meth:`load` to
    build an instance from a TOML file.
    """

    sources: dict[str, SourceConfig] = field(default_factory=dict)
    rate_limits: dict[str, RateLimitConfig] = field(default_factory=dict)
    storage: StorageConfig = field(default_factory=StorageConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    telegram: TelegramConfig = field(default_factory=TelegramConfig)
    user_agent: str = "AuCourtIngest/0.1 (legal research)"

    @staticmethod
    def _rate_limit_from(table: dict[str, object]) -> RateLimitConfig:
        """Build a RateLimitConfig from a TOML table, defaulting missing keys."""
        defaults = RateLimitConfig()
        return RateLimitConfig(
            rps=table.get("rps", defaults.rps),
            concurrent=table.get("concurrent", defaults.concurrent),
            retry_after=table.get("retry_after", defaults.retry_after),
        )

    @classmethod
    def load(cls, path: str | Path = "config.toml") -> AppConfig:
        """Load configuration from a TOML file.

        Args:
            path: Location of the TOML file. Defaults to ``config.toml``
                relative to the current working directory.

        Returns:
            A populated config. If the file does not exist, a config with
            all default values is returned instead.

        Raises:
            tomllib.TOMLDecodeError: If the file is not valid TOML.
            OSError: If the file exists but cannot be opened (anything
                other than "file not found").
        """
        # Open first and handle absence afterwards: a separate exists()
        # check would leave a window in which the file can disappear and
        # turn "use defaults" into an unhandled FileNotFoundError.
        try:
            with Path(path).open("rb") as f:
                raw = tomllib.load(f)
        except FileNotFoundError:
            return cls()

        # Start from defaults; they double as the fallback values below so
        # the defaults are defined in exactly one place (the dataclasses).
        config = cls()

        # Per-source settings; each source may carry its own rate_limit table.
        for src_id, src_raw in raw.get("sources", {}).items():
            config.sources[src_id] = SourceConfig(
                source_id=src_id,
                base_url=src_raw.get("base_url", ""),
                fetch_strategy=src_raw.get("fetch_strategy", ""),
                rate_limit=cls._rate_limit_from(src_raw.get("rate_limit", {})),
                doc_formats=src_raw.get("doc_formats", ["html"]),
                coverage_from=src_raw.get("coverage_from"),
                rss_feed=src_raw.get("rss_feed"),
                browse_url=src_raw.get("browse_url"),
            )

        # Standalone rate limits (fallback), keyed by their table name.
        for name, table in raw.get("rate_limits", {}).items():
            config.rate_limits[name] = cls._rate_limit_from(table)

        # Storage: only data_dir is read from the file; the other paths are
        # always derived from it.
        stg = raw.get("storage", {})
        stg_defaults = config.storage
        data_dir = Path(stg.get("data_dir", stg_defaults.data_dir))
        config.storage = StorageConfig(
            data_dir=data_dir,
            docs_dir=data_dir / "docs",
            raw_dir=data_dir / "raw",
            meta_db_path=data_dir / "meta.db",
            chromadb_dir=data_dir / "chromadb",
            neo4j_uri=stg.get("neo4j_uri", stg_defaults.neo4j_uri),
            neo4j_user=stg.get("neo4j_user", stg_defaults.neo4j_user),
            neo4j_password=stg.get("neo4j_password", stg_defaults.neo4j_password),
            neo4j_database=stg.get("neo4j_database", stg_defaults.neo4j_database),
        )

        # LLM
        llm = raw.get("llm", {})
        llm_defaults = config.llm
        config.llm = LLMConfig(
            anthropic_api_key=llm.get(
                "anthropic_api_key", llm_defaults.anthropic_api_key
            ),
            openai_api_key=llm.get("openai_api_key", llm_defaults.openai_api_key),
            extraction_model=llm.get(
                "extraction_model", llm_defaults.extraction_model
            ),
            embedding_model=llm.get("embedding_model", llm_defaults.embedding_model),
            embedding_batch_size=llm.get(
                "embedding_batch_size", llm_defaults.embedding_batch_size
            ),
        )

        # Telegram
        tg = raw.get("telegram", {})
        tg_defaults = config.telegram
        config.telegram = TelegramConfig(
            bot_token=tg.get("bot_token", tg_defaults.bot_token),
            chat_id=tg.get("chat_id", tg_defaults.chat_id),
            enabled=tg.get("enabled", tg_defaults.enabled),
        )

        config.user_agent = raw.get("user_agent", config.user_agent)

        return config
|