156 lines
5.5 KiB
Python
156 lines
5.5 KiB
Python
"""
E1.4 per-source predicate diversity comparison — fixed version.

Looks up episode UUIDs by name in both the production and cascade graphs.
"""
|
|
import json
|
|
from collections import defaultdict
|
|
from falkordb import FalkorDB
|
|
|
|
# JSON file loaded by main(); its 'results' list supplies the sources to compare.
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
|
|
# Name of the graph selected as the production (baseline) side of the comparison.
PRODUCTION_GROUP = "aaron"
|
|
# Name of the graph selected as the cascade side of the comparison.
CASCADE_GROUP = "aaron_cascade_e14"
|
|
|
|
def get_predicates_for_episode(graph, episode_uuid):
    """Count the distinct relation names on edges that cite an episode.

    Args:
        graph: Graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        episode_uuid: UUID matched against each ``RELATES_TO`` edge's
            ``episodes`` list.

    Returns:
        The first column of the first result row, or 0 when the query
        yields no rows.
    """
    cypher = """
    MATCH ()-[r:RELATES_TO]->()
    WHERE $uuid IN r.episodes
    RETURN count(DISTINCT r.name) AS predicate_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
|
|
|
def get_edge_count_for_episode(graph, episode_uuid):
    """Count the ``RELATES_TO`` edges that cite an episode.

    Args:
        graph: Graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        episode_uuid: UUID matched against each edge's ``episodes`` list.

    Returns:
        The first column of the first result row, or 0 when the query
        yields no rows.
    """
    cypher = """
    MATCH ()-[r:RELATES_TO]->()
    WHERE $uuid IN r.episodes
    RETURN count(r) AS edge_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
|
|
|
def find_episode_uuid(graph, source_name):
    """Look up the UUID of an ``Episodic`` node by its name.

    Args:
        graph: Graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        source_name: Value matched against the node's ``name`` property.

    Returns:
        The ``uuid`` of one matching node (the query uses ``LIMIT 1``), or
        ``None`` when no node matches.
    """
    cypher = """
    MATCH (e:Episodic {name: $name})
    RETURN e.uuid AS uuid
    LIMIT 1
    """
    matches = graph.query(cypher, {"name": source_name}).result_set
    if matches:
        return matches[0][0]
    return None
|
|
|
|
def _pct_change(before, after):
    """Return the percentage change from ``before`` to ``after``.

    Returns 0.0 when ``before`` is 0 so callers never divide by zero.
    """
    if not before:
        return 0.0
    return (after - before) / before * 100


def _collect_comparisons(prod_graph, cascade_graph, sources):
    """Build one comparison record per source found in both graphs.

    Args:
        prod_graph: Graph handle for the production side.
        cascade_graph: Graph handle for the cascade side.
        sources: Iterable of dicts with at least ``'name'`` and ``'bucket'``.

    Returns:
        ``(comparisons, missing_prod, missing_cascade)``: the list of
        per-source records plus the number of sources skipped because the
        episode was not found in production or in cascade respectively.
    """
    comparisons = []
    missing_prod = 0
    missing_cascade = 0
    for src in sources:
        name = src['name']
        bucket = src['bucket']

        prod_uuid = find_episode_uuid(prod_graph, name)
        if not prod_uuid:
            missing_prod += 1
            print(f" WARN: missing in production: {name}")
            continue

        # Only query the cascade graph once production is known to have the
        # episode; a source missing from production is skipped either way.
        cascade_uuid = find_episode_uuid(cascade_graph, name)
        if not cascade_uuid:
            missing_cascade += 1
            print(f" WARN: missing in cascade: {name}")
            continue

        prod_preds = get_predicates_for_episode(prod_graph, prod_uuid)
        cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid)
        prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid)
        cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid)

        comparisons.append({
            "name": name,
            "bucket": bucket,
            "prod_preds": prod_preds,
            "cascade_preds": cascade_preds,
            "delta_preds": cascade_preds - prod_preds,
            "prod_edges": prod_edges,
            "cascade_edges": cascade_edges,
            "delta_edges": cascade_edges - prod_edges,
        })
    return comparisons, missing_prod, missing_cascade


def _print_source_table(comparisons):
    """Print one row per source, ordered by bucket and then name."""
    print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}")
    print("-" * 115)
    for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])):
        name = c['name']
        # Names longer than the 58-character column are cut and marked with '..'.
        name_short = (name[:55] + '..') if len(name) > 58 else name
        preds_str = f"{c['prod_preds']}→{c['cascade_preds']}"
        edges_str = f"{c['prod_edges']}→{c['cascade_edges']}"
        # The predicate delta is padded to 6 characters to line up with the
        # header's Δ column.
        print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:<+6d} {edges_str:<14} {c['delta_edges']:+d}")


def _print_bucket_summary(comparisons):
    """Print predicate/edge totals and outcome counts for each bucket."""
    print()
    print("=" * 115)
    print("PER-BUCKET AGGREGATION")
    print("=" * 115)
    by_bucket = defaultdict(list)
    for c in comparisons:
        by_bucket[c['bucket']].append(c)

    known = ['high', 'mid', 'low', 'document']
    # Buckets outside the known set are reported after them rather than being
    # dropped, so the per-bucket section covers every source in the aggregate.
    extras = sorted((b for b in by_bucket if b not in known), key=str)
    for bucket in [*known, *extras]:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        sum_pp = sum(c['prod_preds'] for c in items)
        sum_cp = sum(c['cascade_preds'] for c in items)
        sum_pe = sum(c['prod_edges'] for c in items)
        sum_ce = sum(c['cascade_edges'] for c in items)
        positive = sum(1 for c in items if c['delta_preds'] > 0)
        negative = sum(1 for c in items if c['delta_preds'] < 0)
        flat = sum(1 for c in items if c['delta_preds'] == 0)
        pct_pred = _pct_change(sum_pp, sum_cp)
        pct_edge = _pct_change(sum_pe, sum_ce)
        print(f"\n{str(bucket).upper()} (n={n}):")
        print(f" Predicates: {sum_pp} → {sum_cp} ({pct_pred:+.1f}%)")
        print(f" Edges: {sum_pe} → {sum_ce} ({pct_edge:+.1f}%)")
        print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative")


def _print_totals(comparisons):
    """Print predicate/edge totals across all compared sources."""
    print()
    print("=" * 115)
    print(f"AGGREGATE (n={len(comparisons)})")
    print("=" * 115)
    total_pp = sum(c['prod_preds'] for c in comparisons)
    total_cp = sum(c['cascade_preds'] for c in comparisons)
    total_pe = sum(c['prod_edges'] for c in comparisons)
    total_ce = sum(c['cascade_edges'] for c in comparisons)
    # _pct_change guards against a zero production total, which would
    # otherwise raise ZeroDivisionError here.
    print(f" Predicates: {total_pp} → {total_cp} ({_pct_change(total_pp, total_cp):+.1f}%)")
    print(f" Edges: {total_pe} → {total_ce} ({_pct_change(total_pe, total_ce):+.1f}%)")


def main():
    """Compare per-source predicate and edge counts between two graphs.

    Loads the source list from ``E14_RESULTS`` (entries of ``'results'`` that
    contain a ``'submit_result'`` key), looks each source up by name in the
    production and cascade graphs, prints a per-source table, per-bucket
    totals and overall totals, and writes the per-source records to a JSON
    file. Returns early, without writing the file, when no source is present
    in both graphs.
    """
    db = FalkorDB(host='localhost', port=6379)
    prod_graph = db.select_graph(PRODUCTION_GROUP)
    cascade_graph = db.select_graph(CASCADE_GROUP)

    with open(E14_RESULTS, encoding="utf-8") as f:
        e14 = json.load(f)

    sources = [r for r in e14['results'] if 'submit_result' in r]
    print(f"Analyzing {len(sources)} sources...")
    print()

    comparisons, missing_prod, missing_cascade = _collect_comparisons(
        prod_graph, cascade_graph, sources
    )

    if missing_prod or missing_cascade:
        print()
        print(f"Missing: {missing_prod} prod, {missing_cascade} cascade")
        print()

    if not comparisons:
        print("No comparable sources found. Aborting.")
        return

    _print_source_table(comparisons)
    _print_bucket_summary(comparisons)
    _print_totals(comparisons)

    out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(comparisons, f, indent=2)
    print()
    print(f"Saved to {out_path}")
|
|
|
|
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|