""" E1.4 per-source predicate diversity comparison — fixed version. Looks up episode uuids by name in both production and cascade graphs. """ import json from collections import defaultdict from falkordb import FalkorDB E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json" PRODUCTION_GROUP = "aaron" CASCADE_GROUP = "aaron_cascade_e14" def get_predicates_for_episode(graph, episode_uuid): query = """ MATCH ()-[r:RELATES_TO]->() WHERE $uuid IN r.episodes RETURN count(DISTINCT r.name) AS predicate_count """ result = graph.query(query, {"uuid": episode_uuid}) rows = result.result_set return rows[0][0] if rows else 0 def get_edge_count_for_episode(graph, episode_uuid): query = """ MATCH ()-[r:RELATES_TO]->() WHERE $uuid IN r.episodes RETURN count(r) AS edge_count """ result = graph.query(query, {"uuid": episode_uuid}) rows = result.result_set return rows[0][0] if rows else 0 def find_episode_uuid(graph, source_name): query = """ MATCH (e:Episodic {name: $name}) RETURN e.uuid AS uuid LIMIT 1 """ result = graph.query(query, {"name": source_name}) rows = result.result_set return rows[0][0] if rows else None def main(): db = FalkorDB(host='localhost', port=6379) prod_graph = db.select_graph(PRODUCTION_GROUP) cascade_graph = db.select_graph(CASCADE_GROUP) with open(E14_RESULTS) as f: e14 = json.load(f) sources = [r for r in e14['results'] if 'submit_result' in r] print(f"Analyzing {len(sources)} sources...") print() comparisons = [] missing_prod = 0 missing_cascade = 0 for src in sources: name = src['name'] bucket = src['bucket'] prod_uuid = find_episode_uuid(prod_graph, name) cascade_uuid = find_episode_uuid(cascade_graph, name) if not prod_uuid: missing_prod += 1 print(f" WARN: missing in production: {name}") continue if not cascade_uuid: missing_cascade += 1 print(f" WARN: missing in cascade: {name}") continue prod_preds = get_predicates_for_episode(prod_graph, prod_uuid) cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid) prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid) cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid) comparisons.append({ "name": name, "bucket": bucket, "prod_preds": prod_preds, "cascade_preds": cascade_preds, "delta_preds": cascade_preds - prod_preds, "prod_edges": prod_edges, "cascade_edges": cascade_edges, "delta_edges": cascade_edges - prod_edges, }) if missing_prod or missing_cascade: print() print(f"Missing: {missing_prod} prod, {missing_cascade} cascade") print() if not comparisons: print("No comparable sources found. Aborting.") return # Per-source detail print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}") print("-" * 115) for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])): name_short = (c['name'][:55] + '..') if len(c['name']) > 58 else c['name'] preds_str = f"{c['prod_preds']}→{c['cascade_preds']}" edges_str = f"{c['prod_edges']}→{c['cascade_edges']}" print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:+d} {edges_str:<14} {c['delta_edges']:+d}") # Per-bucket aggregation print() print("=" * 115) print("PER-BUCKET AGGREGATION") print("=" * 115) by_bucket = defaultdict(list) for c in comparisons: by_bucket[c['bucket']].append(c) for bucket in ['high', 'mid', 'low', 'document']: items = by_bucket.get(bucket, []) if not items: continue n = len(items) sum_pp = sum(c['prod_preds'] for c in items) sum_cp = sum(c['cascade_preds'] for c in items) sum_pe = sum(c['prod_edges'] for c in items) sum_ce = sum(c['cascade_edges'] for c in items) positive = sum(1 for c in items if c['delta_preds'] > 0) negative = sum(1 for c in items if c['delta_preds'] < 0) flat = sum(1 for c in items if c['delta_preds'] == 0) pct_pred = ((sum_cp - sum_pp) / sum_pp * 100) if sum_pp else 0 pct_edge = ((sum_ce - sum_pe) / sum_pe * 100) if sum_pe else 0 print(f"\n{bucket.upper()} (n={n}):") print(f" Predicates: {sum_pp} → {sum_cp} ({pct_pred:+.1f}%)") print(f" Edges: {sum_pe} → {sum_ce} ({pct_edge:+.1f}%)") print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative") # Aggregate print() print("=" * 115) print(f"AGGREGATE (n={len(comparisons)})") print("=" * 115) total_pp = sum(c['prod_preds'] for c in comparisons) total_cp = sum(c['cascade_preds'] for c in comparisons) total_pe = sum(c['prod_edges'] for c in comparisons) total_ce = sum(c['cascade_edges'] for c in comparisons) print(f" Predicates: {total_pp} → {total_cp} ({(total_cp-total_pp)/total_pp*100:+.1f}%)") print(f" Edges: {total_pe} → {total_ce} ({(total_ce-total_pe)/total_pe*100:+.1f}%)") out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json" with open(out_path, "w") as f: json.dump(comparisons, f, indent=2) print() print(f"Saved to {out_path}") if __name__ == "__main__": main()