aaronAI/scripts/experiments/e1_per_source_predicates.py

#!/usr/bin/env python3
"""E1 corrected metric — count distinct predicate names on edges originating from each episode."""
import json
import subprocess
from pathlib import Path

EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"

def query(group_id, cypher):
    """Run a Cypher statement through ``redis-cli`` and return its raw output.

    The statement is sent as ``GRAPH.QUERY <group_id> <cypher>`` via
    ``docker exec falkordb redis-cli``.

    Args:
        group_id: Graph key passed as the first ``GRAPH.QUERY`` argument.
        cypher: Cypher query text.

    Returns:
        The captured standard output of the command, as text. The exit code
        and standard error are not inspected, so a failed command simply
        yields whatever (possibly empty) stdout it produced.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout

def get_episode_uuid(group_id, episode_name):
    """Look up the UUID for a given episode name in a given group.

    Args:
        group_id: Graph key handed to ``query``.
        episode_name: Value compared against ``e.name`` on ``Episodic`` nodes.

    Returns:
        The first line of the query output that looks like a UUID
        (36 characters containing exactly four hyphens), or ``None`` when
        no such line is present.
    """
    # Escape backslashes first, then single quotes. Escaping only the quotes
    # leaves a name containing a backslash (for instance one ending in "\")
    # able to swallow the closing quote or be read as an escape sequence,
    # which makes the lookup fail silently.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for raw_line in output.split("\n"):
        candidate = raw_line.strip()
        # Cheap UUID shape check; header and statistics lines do not match it.
        if len(candidate) == 36 and candidate.count("-") == 4:
            return candidate
    return None

def count_predicates_for_episode(group_id, uuid):
    """Return the number of distinct ``r.name`` values on this episode's edges.

    Only ``RELATES_TO`` edges whose ``r.episodes`` list contains ``uuid`` are
    considered.

    Args:
        group_id: Graph key handed to ``query``.
        uuid: Episode UUID, interpolated into the Cypher text as a quoted string.

    Returns:
        The first all-digit line of the query output as an ``int``, or ``0``
        when the output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    raw = query(group_id, cypher)
    tokens = (row.strip() for row in raw.split("\n"))
    # The count is the first purely numeric line; header/statistics lines are skipped.
    return next((int(token) for token in tokens if token.isdigit()), 0)

def count_total_edges_for_episode(group_id, uuid):
    """Return how many ``RELATES_TO`` edges list this episode in ``r.episodes``.

    Args:
        group_id: Graph key handed to ``query``.
        uuid: Episode UUID, interpolated into the Cypher text as a quoted string.

    Returns:
        The first all-digit line of the query output as an ``int``, or ``0``
        when the output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    response = query(group_id, cypher)
    numeric_lines = [s for s in map(str.strip, response.split("\n")) if s.isdigit()]
    if not numeric_lines:
        return 0
    return int(numeric_lines[0])

# Load the sample description; only its "selected" list of episodes is used.
with open(SAMPLE_FILE, encoding="utf-8") as f:
    sample = json.load(f)
selected = sample["selected"]

print("E1 corrected per-source comparison — predicates per episode by edge origin\n")
print(f"{'Source':<60} {'A.edges':>8} {'A.preds':>8} {'B.edges':>8} {'B.preds':>8}")
print("-" * 100)

# Running totals across all episodes: "a" is the "aaron" graph, "b" is the
# "aaron_cascade_test" graph.
a_pred_total = 0
b_pred_total = 0
a_edge_total = 0
b_edge_total = 0
records = []

for ep in selected:
    name = ep["name"]
    a_uuid = get_episode_uuid("aaron", name)
    b_uuid = get_episode_uuid("aaron_cascade_test", name)

    # An episode that cannot be found in a graph contributes zero to that side.
    a_edges = count_total_edges_for_episode("aaron", a_uuid) if a_uuid else 0
    a_preds = count_predicates_for_episode("aaron", a_uuid) if a_uuid else 0
    b_edges = count_total_edges_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0
    b_preds = count_predicates_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0

    # Truncate long names so the table columns stay aligned.
    display = name if len(name) <= 58 else name[:55] + "..."
    print(f"{display:<60} {a_edges:>8} {a_preds:>8} {b_edges:>8} {b_preds:>8}")

    records.append({
        "name": name, "bucket": ep["bucket"],
        "a_edges": a_edges, "a_preds": a_preds,
        "b_edges": b_edges, "b_preds": b_preds,
    })
    a_pred_total += a_preds
    b_pred_total += b_preds
    a_edge_total += a_edges
    b_edge_total += b_edges

print("-" * 100)
n = len(selected)
# Guard the means: with an empty sample the totals are all zero and dividing
# by n would raise ZeroDivisionError. Dividing by 1 reports a mean of 0.0.
divisor = n or 1
print(f"\nAggregate (n={n}):")
print(f"  Edges:      A total={a_edge_total} mean={a_edge_total/divisor:.1f}  B total={b_edge_total} mean={b_edge_total/divisor:.1f}")
print(f"  Predicates: A total={a_pred_total} mean={a_pred_total/divisor:.1f}  B total={b_pred_total} mean={b_pred_total/divisor:.1f}")
# Relative deltas are only defined when the baseline (A) total is non-zero.
if a_pred_total > 0:
    print(f"  Predicate delta: B vs A = {(b_pred_total-a_pred_total)/a_pred_total*100:+.1f}%")
if a_edge_total > 0:
    print(f"  Edge delta:      B vs A = {(b_edge_total-a_edge_total)/a_edge_total*100:+.1f}%")

# Per-bucket
print("\nPer-bucket:")
for bucket in ["high", "mid", "low", "document"]:
    bucket_records = [r for r in records if r["bucket"] == bucket]
    if not bucket_records:
        continue
    bn = len(bucket_records)
    a_p = sum(r["a_preds"] for r in bucket_records)
    b_p = sum(r["b_preds"] for r in bucket_records)
    a_e = sum(r["a_edges"] for r in bucket_records)
    b_e = sum(r["b_edges"] for r in bucket_records)
    # Report 0% when the baseline has no predicates rather than dividing by zero.
    delta = ((b_p-a_p)/a_p*100) if a_p > 0 else 0
    print(f"  [{bucket:>8}] n={bn}  A.preds={a_p:>3} B.preds={b_p:>3} ({delta:+.0f}%)  A.edges={a_e:>3} B.edges={b_e:>3}")

# Persist the per-episode rows and the aggregate totals next to the sample file.
output_path = EXPERIMENTS / "cascade_reextract_corrected_comparison.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump({"per_source": records,
               "aggregate": {"a_preds": a_pred_total, "b_preds": b_pred_total,
                             "a_edges": a_edge_total, "b_edges": b_edge_total}}, f, indent=2)
print(f"\nSaved to {output_path}")