# aucourt-ingest/tests/test_graph_builder.py
#
# 466 lines
# 16 KiB
# Python
# Raw Normal View History

"""Tests for GraphBuilder + InMemoryGraphDB."""
from __future__ import annotations
import math
import pytest
from aucourt_ingest.models import CaseMeta, Chunk, Verdict, MatterType
from aucourt_ingest.processing.graph_builder import (
GraphBuilder, _cosine_similarity,
CORROBORATE_THRESHOLD, CONTRADICT_THRESHOLD,
)
from aucourt_ingest.storage.in_memory_graph_db import InMemoryGraphDB
from aucourt_ingest.storage.graph_db import _node_id, _rel_id
def _make_meta(
    mnc="[2019] NSWSC 1234",
    court="NSWSC",
    judges=None,
    charges=None,
    inadmissible=None,
    is_appeal=False,
    appeal_of="",
    verdict=Verdict.GUILTY,
    exoneration_flag=False,
) -> CaseMeta:
    """Build a ``CaseMeta`` for tests, with overridable defaults.

    ``judges`` and ``charges`` fall back to a single default entry only when
    they are ``None``; an explicit empty list is passed through unchanged.
    ``inadmissible`` is replaced by an empty list whenever it is falsy.
    The matter type, delivery date and jurisdiction are fixed.
    """
    # Only None triggers the defaults, so callers can still request "no judges".
    if judges is None:
        judges = ["Judge Smith"]
    if charges is None:
        charges = ["murder"]
    excluded = inadmissible if inadmissible else []

    return CaseMeta(
        case_name=f"Test v State ({mnc})",
        mnc=mnc,
        court=court,
        judge=judges,
        charges=charges,
        inadmissible_evidence=excluded,
        is_appeal=is_appeal,
        appeal_of=appeal_of,
        verdict=verdict,
        exoneration_flag=exoneration_flag,
        matter_type=MatterType.CRIMINAL,
        date_delivered="2019-06-15",
        jurisdiction="NSW",
    )
def _make_chunk(chunk_id="c0", chunk_type="testimony", sequence=0,
                text="Some text", embedding=None, doc_id="[2019] NSWSC 1") -> Chunk:
    """Build a ``Chunk`` for tests; ``token_count`` is always set to 50."""
    fields = {
        "chunk_id": chunk_id,
        "doc_id": doc_id,
        "chunk_type": chunk_type,
        "sequence": sequence,
        "text": text,
        "token_count": 50,
        "embedding": embedding,
    }
    return Chunk(**fields)
# ── Cosine similarity ──
class TestCosineSimilarity:
    """Expectations for the ``_cosine_similarity`` helper."""

    def test_identical_vectors(self):
        """A vector compared with itself should score approximately 1.0."""
        vec = [0.5, 0.3, 0.8, 0.1]
        score = _cosine_similarity(vec, vec)
        assert score == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Perpendicular unit vectors should score approximately 0.0."""
        score = _cosine_similarity([1, 0], [0, 1])
        assert score == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Anti-parallel unit vectors should score approximately -1.0."""
        score = _cosine_similarity([1, 0], [-1, 0])
        assert score == pytest.approx(-1.0)

    def test_empty_vectors(self):
        """Two empty inputs should yield exactly 0.0."""
        score = _cosine_similarity([], [])
        assert score == 0.0

    def test_zero_vectors(self):
        """An all-zero operand should yield exactly 0.0."""
        score = _cosine_similarity([0, 0], [1, 0])
        assert score == 0.0

    def test_unequal_lengths(self):
        """Inputs of length 2 and 3 should yield exactly 0.0."""
        score = _cosine_similarity([1, 2], [1, 2, 3])
        assert score == 0.0

    def test_mismatched_lengths(self):
        """A non-empty input against an empty one should yield exactly 0.0."""
        score = _cosine_similarity([1, 2], [])
        assert score == 0.0
# ── _node_id / _rel_id helpers ──
class TestHelperFunctions:
    """Expectations for the ``_node_id`` and ``_rel_id`` id helpers."""

    def test_node_id_mnc(self):
        """An ``mnc`` property is expected to become the id suffix."""
        node_id = _node_id("Case", {"mnc": "[2019] NSWSC 1234"})
        assert node_id == "Case:[2019] NSWSC 1234"

    def test_node_id_id(self):
        """An ``id`` property is expected to become the id suffix."""
        node_id = _node_id("Judge", {"id": "j1"})
        assert node_id == "Judge:j1"

    def test_node_id_hash_deterministic(self):
        """Without ``mnc`` or ``id``, repeated calls must agree and add a suffix."""
        prefix = "Judge:"
        props = {"name": "Smith", "court": "NSWSC"}
        first = _node_id("Judge", props)
        second = _node_id("Judge", props)
        assert first == second
        assert first.startswith(prefix)
        # Something must follow the label prefix.
        assert len(first) > len(prefix)

    def test_rel_id(self):
        """Relationship ids are expected in ``from-[TYPE]->to`` form."""
        expected = "from123-[HEARD_BY]->to456"
        assert _rel_id("from123", "to456", "HEARD_BY") == expected
# ── InMemoryGraphDB ──
class TestInMemoryGraphDB:
    """Behavioural tests for the ``InMemoryGraphDB`` storage backend."""

    @pytest.fixture
    def db(self):
        """Provide a fresh, empty in-memory graph for each test."""
        return InMemoryGraphDB()

    @pytest.mark.asyncio
    async def test_create_and_get_node(self, db):
        """A created node can be fetched back by its ``Label:mnc`` id."""
        await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        fetched = await db.get_node("Case:[2019] NSWSC 1")
        assert fetched is not None
        assert fetched.label == "Case"
        assert fetched.properties["mnc"] == "[2019] NSWSC 1"

    @pytest.mark.asyncio
    async def test_create_node_dedup(self, db):
        """Creating the same node twice should return one id and store one node."""
        props = {"mnc": "[2019] NSWSC 1"}
        first_id = await db.create_node("Case", dict(props))
        second_id = await db.create_node("Case", dict(props))
        assert first_id == second_id
        assert await db.node_count("Case") == 1

    @pytest.mark.asyncio
    async def test_node_count_by_label(self, db):
        """``node_count`` filters by label and counts everything without one."""
        await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        await db.create_node("Case", {"mnc": "[2020] NSWSC 2"})
        await db.create_node("Judge", {"name": "Smith"})
        case_total = await db.node_count("Case")
        assert case_total == 2
        judge_total = await db.node_count("Judge")
        assert judge_total == 1
        overall = await db.node_count()
        assert overall == 3

    @pytest.mark.asyncio
    async def test_query_nodes(self, db):
        """``query_nodes`` should return only nodes matching the property filter."""
        await db.create_node("Case", {"mnc": "[2019] NSWSC 1", "court": "NSWSC"})
        await db.create_node("Case", {"mnc": "[2020] VSC 2", "court": "VSC"})
        matches = await db.query_nodes("Case", {"court": "NSWSC"})
        assert len(matches) == 1
        assert matches[0].properties["mnc"] == "[2019] NSWSC 1"

    @pytest.mark.asyncio
    async def test_relationship(self, db):
        """A created relationship is retrievable from its source node."""
        case_id = await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        judge_id = await db.create_node("Judge", {"name": "Smith"})
        await db.create_relationship(case_id, judge_id, "HEARD_BY")
        found = await db.get_relationships(case_id, "HEARD_BY")
        assert len(found) == 1
        edge = found[0]
        assert edge.to_id == judge_id
        assert edge.rel_type == "HEARD_BY"

    @pytest.mark.asyncio
    async def test_relationship_count(self, db):
        """``relationship_count`` works both with and without a type filter."""
        case_id = await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        smith_id = await db.create_node("Judge", {"name": "Smith"})
        jones_id = await db.create_node("Judge", {"name": "Jones"})
        await db.create_relationship(case_id, smith_id, "HEARD_BY")
        await db.create_relationship(case_id, jones_id, "HEARD_BY")
        typed_total = await db.relationship_count("HEARD_BY")
        assert typed_total == 2
        overall = await db.relationship_count()
        assert overall == 2

    @pytest.mark.asyncio
    async def test_neighbors(self, db):
        """``neighbors`` should list the target ids reached via the given type."""
        case_id = await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        judge_id = await db.create_node("Judge", {"name": "Smith"})
        await db.create_relationship(case_id, judge_id, "HEARD_BY")
        adjacent = await db.neighbors(case_id, "HEARD_BY")
        assert adjacent == [judge_id]

    @pytest.mark.asyncio
    async def test_close_clears(self, db):
        """After ``close`` the node count is expected to be zero."""
        await db.create_node("Case", {"mnc": "[2019] NSWSC 1"})
        await db.close()
        remaining = await db.node_count()
        assert remaining == 0
# ── GraphBuilder.build_case ──
class TestBuildCase:
    """Tests for ``GraphBuilder.build_case`` against an in-memory graph."""

    @pytest.fixture
    def builder(self):
        """Return a ``(GraphBuilder, InMemoryGraphDB)`` pair sharing one fresh db."""
        db = InMemoryGraphDB()
        return GraphBuilder(db), db

    @pytest.mark.asyncio
    async def test_single_judge_single_charge(self, builder):
        """One judge and one charge should give one node and one edge of each kind."""
        graph_builder, db = builder
        meta = _make_meta(judges=["Judge Smith"], charges=["murder"])
        case_id = await graph_builder.build_case(meta)
        assert case_id.startswith("Case:")
        assert await db.node_count("Case") == 1
        assert await db.node_count("Judge") == 1
        assert await db.node_count("Charge") == 1
        assert await db.relationship_count("HEARD_BY") == 1
        assert await db.relationship_count("CHARGED_WITH") == 1

    @pytest.mark.asyncio
    async def test_multiple_judges(self, builder):
        """Each listed judge should get its own node and ``HEARD_BY`` edge."""
        graph_builder, db = builder
        meta = _make_meta(judges=["Judge Smith", "Judge Jones"])
        await graph_builder.build_case(meta)
        assert await db.node_count("Judge") == 2
        assert await db.relationship_count("HEARD_BY") == 2

    @pytest.mark.asyncio
    async def test_multiple_charges(self, builder):
        """Each listed charge should get its own node and ``CHARGED_WITH`` edge."""
        graph_builder, db = builder
        meta = _make_meta(charges=["murder", "assault", "robbery"])
        await graph_builder.build_case(meta)
        assert await db.node_count("Charge") == 3
        assert await db.relationship_count("CHARGED_WITH") == 3

    @pytest.mark.asyncio
    async def test_inadmissible_evidence_creates_ruling(self, builder):
        """Each inadmissible-evidence entry should yield a ``Ruling`` node and edge."""
        graph_builder, db = builder
        meta = _make_meta(inadmissible=["hearsay statement", "tainted identification"])
        await graph_builder.build_case(meta)
        assert await db.node_count("Ruling") == 2
        assert await db.relationship_count("HAS_RULING") == 2

    @pytest.mark.asyncio
    async def test_appeal_creates_appeals_edge(self, builder):
        """An appeal should link to a second ``Case`` node for the appealed matter."""
        graph_builder, db = builder
        meta = _make_meta(
            mnc="[2020] NSWSC 5678",
            is_appeal=True,
            appeal_of="[2019] NSWSC 1234",
        )
        case_id = await graph_builder.build_case(meta)
        assert await db.node_count("Case") == 2
        assert await db.relationship_count("APPEALS") == 1
        rels = await db.get_relationships(case_id, "APPEALS")
        assert len(rels) == 1
        target = await db.get_node(rels[0].to_id)
        # Guard first so a dangling edge fails with a clear assertion
        # rather than an AttributeError on None.
        assert target is not None
        assert target.properties["mnc"] == "[2019] NSWSC 1234"

    @pytest.mark.asyncio
    async def test_case_properties(self, builder):
        """The ``Case`` node should carry mnc, court, verdict and exoneration flag."""
        graph_builder, db = builder
        meta = _make_meta()
        case_id = await graph_builder.build_case(meta)
        node = await db.get_node(case_id)
        # Same guard as test_create_and_get_node: fail clearly if the node is absent.
        assert node is not None
        assert node.properties["mnc"] == "[2019] NSWSC 1234"
        assert node.properties["court"] == "NSWSC"
        assert node.properties["verdict"] == "guilty"
        assert node.properties["exoneration_flag"] is False

    @pytest.mark.asyncio
    async def test_no_judges_no_charges(self, builder):
        """Empty judge and charge lists should create no such nodes or edges."""
        graph_builder, db = builder
        meta = _make_meta(judges=[], charges=[])
        await graph_builder.build_case(meta)
        assert await db.node_count("Judge") == 0
        assert await db.node_count("Charge") == 0
        assert await db.relationship_count("HEARD_BY") == 0
        assert await db.relationship_count("CHARGED_WITH") == 0
# ── GraphBuilder.build_chunks ──
class TestBuildChunks:
    """Tests for ``GraphBuilder.build_chunks`` against an in-memory graph."""

    @pytest.fixture
    def builder(self):
        """Return a ``(GraphBuilder, InMemoryGraphDB)`` pair sharing one fresh db."""
        graph = InMemoryGraphDB()
        return GraphBuilder(graph), graph

    @pytest.mark.asyncio
    async def test_chunk_nodes_created(self, builder):
        """Three chunks in should mean a return value of 3 and three ``Chunk`` nodes."""
        graph_builder, graph = builder
        opening = _make_chunk("c0", "opening", 0, "Opening statement")
        testimony = _make_chunk("c1", "testimony", 1, "Witness testimony")
        closing = _make_chunk("c2", "closing", 2, "Closing argument")
        created = await graph_builder.build_chunks(
            "[2019] NSWSC 1", [opening, testimony, closing]
        )
        assert created == 3
        assert await graph.node_count("Chunk") == 3

    @pytest.mark.asyncio
    async def test_follows_edges(self, builder):
        """Three sequential chunks are expected to produce two ``FOLLOWS`` edges."""
        graph_builder, graph = builder
        sequence = [
            _make_chunk("c0", "opening", 0),
            _make_chunk("c1", "testimony", 1),
            _make_chunk("c2", "closing", 2),
        ]
        await graph_builder.build_chunks("[2019] NSWSC 1", sequence)
        follows_total = await graph.relationship_count("FOLLOWS")
        assert follows_total == 2

    @pytest.mark.asyncio
    async def test_single_chunk_no_edges(self, builder):
        """A lone chunk has nothing to follow, so no ``FOLLOWS`` edge is expected."""
        graph_builder, graph = builder
        only_chunk = _make_chunk("c0", "opening", 0)
        await graph_builder.build_chunks("[2019] NSWSC 1", [only_chunk])
        follows_total = await graph.relationship_count("FOLLOWS")
        assert follows_total == 0

    @pytest.mark.asyncio
    async def test_no_chunks(self, builder):
        """An empty chunk list should report zero chunks built."""
        graph_builder, _ = builder
        created = await graph_builder.build_chunks("[2019] NSWSC 1", [])
        assert created == 0
# ── GraphBuilder.build_similarity_edges ──
class TestBuildSimilarityEdges:
    """Tests for ``GraphBuilder.build_similarity_edges``."""

    @pytest.fixture
    def builder(self):
        """Return a ``(GraphBuilder, InMemoryGraphDB)`` pair sharing one fresh db."""
        graph = InMemoryGraphDB()
        return GraphBuilder(graph), graph

    @pytest.mark.asyncio
    async def test_corroborates_edge(self, builder):
        """Near-identical embeddings should produce one ``CORROBORATES`` edge."""
        graph_builder, graph = builder
        base = [0.1, 0.9, 0.3, 0.7]
        # A tiny uniform offset keeps the two vectors almost parallel.
        nudged = [value + 0.001 for value in base]
        pair = [
            _make_chunk("c0", "testimony", 0, embedding=base),
            _make_chunk("c1", "testimony", 1, embedding=nudged),
        ]
        created = await graph_builder.build_similarity_edges("[2019] NSWSC 1", pair)
        assert created == 1
        assert await graph.relationship_count("CORROBORATES") == 1

    @pytest.mark.asyncio
    async def test_contradicts_edge(self, builder):
        """Orthogonal embeddings should produce one ``CONTRADICTS`` edge."""
        graph_builder, graph = builder
        pair = [
            _make_chunk("c0", "exhibit", 0, embedding=[1.0, 0.0, 0.0, 0.0]),
            _make_chunk("c1", "exhibit", 1, embedding=[0.0, 1.0, 0.0, 0.0]),
        ]
        created = await graph_builder.build_similarity_edges("[2019] NSWSC 1", pair)
        assert created == 1
        assert await graph.relationship_count("CONTRADICTS") == 1

    @pytest.mark.asyncio
    async def test_no_edge_when_sim_between_thresholds(self, builder):
        """A mid-range similarity should produce no edge at all."""
        graph_builder, _ = builder
        # Cosine similarity of these two is about 0.56: neither corroborating
        # nor contradicting.
        pair = [
            _make_chunk("c0", "testimony", 0, embedding=[0.7, 0.3, 0.5, 0.0]),
            _make_chunk("c1", "testimony", 1, embedding=[0.3, 0.7, 0.1, 0.5]),
        ]
        created = await graph_builder.build_similarity_edges("[2019] NSWSC 1", pair)
        assert created == 0

    @pytest.mark.asyncio
    async def test_type_filter(self, builder):
        """With ``types=["testimony"]`` only the testimony pair yields an edge."""
        graph_builder, _ = builder
        base = [0.1, 0.9, 0.3, 0.7]
        testimony_a = _make_chunk("c0", "testimony", 0, embedding=base)
        testimony_b = _make_chunk(
            "c1", "testimony", 1, embedding=[value + 0.001 for value in base]
        )
        ruling = _make_chunk(
            "c2", "ruling", 2, embedding=[value + 0.001 for value in base]
        )
        # The ruling chunk is just as similar, but the filter should exclude it.
        created = await graph_builder.build_similarity_edges(
            "[2019] NSWSC 1", [testimony_a, testimony_b, ruling], types=["testimony"]
        )
        assert created == 1

    @pytest.mark.asyncio
    async def test_no_embedding_skipped(self, builder):
        """A chunk whose embedding is ``None`` should not take part in any edge."""
        graph_builder, _ = builder
        pair = [
            _make_chunk("c0", "testimony", 0, embedding=[1, 0, 0, 0]),
            _make_chunk("c1", "testimony", 1, embedding=None),
        ]
        created = await graph_builder.build_similarity_edges("[2019] NSWSC 1", pair)
        assert created == 0

    @pytest.mark.asyncio
    async def test_empty_chunks(self, builder):
        """An empty chunk list should produce zero edges."""
        graph_builder, _ = builder
        created = await graph_builder.build_similarity_edges("[2019] NSWSC 1", [])
        assert created == 0

    @pytest.mark.asyncio
    async def test_weight_stored(self, builder):
        """A ``CORROBORATES`` edge should carry a ``weight`` at or above the threshold."""
        graph_builder, graph = builder
        base = [0.1, 0.9, 0.3, 0.7]
        pair = [
            _make_chunk("c0", "testimony", 0, embedding=base),
            _make_chunk("c1", "testimony", 1, embedding=[value + 0.001 for value in base]),
        ]
        await graph_builder.build_similarity_edges("[2019] NSWSC 1", pair)
        found = await graph.get_relationships("c0", "CORROBORATES")
        assert len(found) == 1
        edge_props = found[0].properties
        assert "weight" in edge_props
        assert edge_props["weight"] >= CORROBORATE_THRESHOLD
# ── GraphBuilder.build_full ──
class TestBuildFull:
    """End-to-end tests for ``GraphBuilder.build_full`` and its summary dict."""

    @pytest.mark.asyncio
    async def test_full_build_summary(self):
        """The summary should report node and edge counts for every kind built."""
        graph = InMemoryGraphDB()
        graph_builder = GraphBuilder(graph)
        meta = _make_meta(
            judges=["Judge Smith", "Judge Jones"],
            charges=["murder", "assault"],
            inadmissible=["hearsay"],
        )
        base = [0.1, 0.9, 0.3, 0.7]
        nudged = [value + 0.001 for value in base]
        chunks = [
            _make_chunk("c0", "testimony", 0, "Testimony text", embedding=base),
            _make_chunk("c1", "testimony", 1, "Similar testimony", embedding=nudged),
            # The closing chunk has no embedding.
            _make_chunk("c2", "closing", 2, "Closing text"),
        ]

        summary = await graph_builder.build_full(meta, chunks)

        assert summary["mnc"] == "[2019] NSWSC 1234"

        node_counts = summary["nodes"]
        assert node_counts["Case"] == 1
        assert node_counts["Judge"] == 2
        assert node_counts["Charge"] == 2
        assert node_counts["Ruling"] == 1
        assert node_counts["Chunk"] == 3

        edge_counts = summary["edges"]
        assert edge_counts["HEARD_BY"] == 2
        assert edge_counts["CHARGED_WITH"] == 2
        assert edge_counts["HAS_RULING"] == 1
        assert edge_counts["FOLLOWS"] == 2
        assert edge_counts["CORROBORATES"] == 1
        assert edge_counts["CONTRADICTS"] == 0

    @pytest.mark.asyncio
    async def test_full_build_appeal(self):
        """An appeal with no chunks should still report two cases and one ``APPEALS`` edge."""
        graph = InMemoryGraphDB()
        graph_builder = GraphBuilder(graph)
        appeal_meta = _make_meta(
            mnc="[2020] NSWSC 5678",
            is_appeal=True,
            appeal_of="[2019] NSWSC 1234",
        )
        summary = await graph_builder.build_full(appeal_meta, [])
        assert summary["nodes"]["Case"] == 2
        assert summary["edges"]["APPEALS"] == 1
# ── GraphDB Protocol conformance ──
class TestProtocolConformance:
    """Name-level check that ``InMemoryGraphDB`` exposes the ``GraphDB`` public API."""

    def test_in_memory_conforms(self):
        """Every public attribute name on ``GraphDB`` must exist on ``InMemoryGraphDB``.

        Only names are compared (via ``dir``); signatures are not checked.
        All missing names are reported together rather than stopping at the first.
        """
        from aucourt_ingest.storage.graph_db import GraphDB

        proto_methods = {name for name in dir(GraphDB) if not name.startswith("_")}
        impl_methods = set(dir(InMemoryGraphDB))
        # Sorted so the failure message is stable between runs.
        missing = sorted(proto_methods - impl_methods)
        assert not missing, f"InMemoryGraphDB missing {', '.join(missing)}"