#!/usr/bin/env python3
"""
E1.8 Phase 2 — Evaluate

Pulls predicate counts from FalkorDB for each group_id and compares.
Run after e1_8_taxfree_cascade.py completes.
"""
import json
import subprocess
import sys
from pathlib import Path
from typing import Optional

RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
EVAL_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"

GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
GROUP_PROD = "aaron"
GROUP_E14 = "aaron_cascade_e14"


def query(group_id, cypher):
    """Run ``cypher`` against graph ``group_id`` and return redis-cli's stdout.

    The query is executed as ``docker exec falkordb redis-cli GRAPH.QUERY``.
    Arguments are passed as a list (no shell), so the Cypher text reaches
    redis-cli verbatim.

    A non-zero exit status is reported on stderr but does not raise: callers
    treat unparseable output as "not found" / zero, and the warning makes that
    case distinguishable from a genuinely empty result.
    """
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(
            f"warning: GRAPH.QUERY on {group_id!r} exited with "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout


def escape_cypher_string(value: str) -> str:
    """Escape ``value`` for use inside a single-quoted Cypher string literal.

    Backslashes are doubled first so that the backslash added in front of each
    single quote is not itself re-escaped.
    """
    return value.replace("\\", "\\\\").replace("'", "\\'")


def parse_uuid(output: str) -> Optional[str]:
    """Return the first line of ``output`` that looks like a UUID, else None.

    A line qualifies when, after stripping, it is 36 characters long and
    contains exactly four hyphens.
    """
    for line in output.split("\n"):
        candidate = line.strip()
        if len(candidate) == 36 and candidate.count("-") == 4:
            return candidate
    return None


def parse_count(output: str) -> int:
    """Return the first all-digit line of ``output`` as an int, or 0 if none."""
    for line in output.split("\n"):
        candidate = line.strip()
        if candidate.isdigit():
            return int(candidate)
    return 0


def percent_delta(value, base):
    """Return the percentage change of ``value`` relative to ``base``.

    Returns 0 when ``base`` is not positive, since no relative change can be
    computed against an empty baseline.
    """
    if base > 0:
        return (value - base) / base * 100
    return 0


def decision_message(improvement) -> str:
    """Return the decision-rule verdict line for ``improvement``.

    ``improvement`` is in percentage points. Thresholds: >= 20 strong,
    >= 5 partial, >= 0 marginal, otherwise negative.
    """
    if improvement >= 20:
        return f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free"
    if improvement >= 5:
        return f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement"
    if improvement >= 0:
        return f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)"
    return f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard"


def get_episode_uuid(group_id, episode_name):
    """Look up the uuid of the Episodic node named ``episode_name``.

    Returns the uuid string, or None when the query output contains no
    UUID-shaped line.
    """
    # The name is interpolated into a quoted literal, so it must be escaped;
    # names containing an apostrophe would otherwise break the query.
    safe = escape_cypher_string(episode_name)
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    return parse_uuid(query(group_id, cypher))


def count_preds(group_id, uuid):
    """Count distinct RELATES_TO edge names whose ``episodes`` include ``uuid``."""
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    return parse_count(query(group_id, cypher))


def count_edges(group_id, uuid):
    """Count RELATES_TO edges whose ``episodes`` include ``uuid``."""
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    return parse_count(query(group_id, cypher))


def eval_source(name, groups):
    """Collect predicate and edge counts for episode ``name`` in each group.

    ``groups`` maps a short label to a group_id. The returned dict holds
    ``name`` plus ``<label>_preds`` and ``<label>_edges`` per label; both are
    None when the episode's uuid could not be found in that group.
    """
    result = {"name": name}
    for label, group_id in groups.items():
        uuid = get_episode_uuid(group_id, name)
        if uuid:
            result[f"{label}_preds"] = count_preds(group_id, uuid)
            result[f"{label}_edges"] = count_edges(group_id, uuid)
        else:
            result[f"{label}_preds"] = None
            result[f"{label}_edges"] = None
    return result


def run():
    """Evaluate both sub-samples, print the comparison tables, save the results.

    Reads RESULTS_PATH, queries FalkorDB for every listed source, prints
    per-source and aggregate deltas plus the decision rule, and writes the
    collected records to EVAL_PATH as JSON.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)

    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)

    a_records = []
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")

        # Missing episodes (None) are displayed and compared as 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = percent_delta(e14, prod)
        tf_delta = percent_delta(tf, prod)

        display = (name[:53] + "..") if len(name) > 55 else name
        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)
        eval_results["subsample_a"].append(r)

    # Aggregate Sub-sample A. These means are kept under their own names
    # because the decision rule below must use Sub-sample A values only.
    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    a_mean_e14_delta = None
    a_mean_tf_delta = None
    if valid:
        a_mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid) / len(valid)
        a_mean_tf_delta = sum(r["tf_delta_vs_prod"] for r in valid) / len(valid)
        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
        print(f" E1.4 cascade mean delta vs prod: {a_mean_e14_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {a_mean_tf_delta:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {a_mean_tf_delta - a_mean_e14_delta:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)

    b_records = []
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")

        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = percent_delta(std, base)
        tf_delta = percent_delta(tf, base)

        display = (name[:53] + "..") if len(name) > 55 else name
        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)
        eval_results["subsample_b"].append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        b_mean_std_delta = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
        b_mean_tf_delta = sum(r["tf_delta_vs_base"] for r in valid_b) / len(valid_b)
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {b_mean_std_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {b_mean_tf_delta:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
            m_tf = sum(r["tf_delta_vs_base"] for r in br) / len(br)
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation — taxonomy-free vs E1.4 cascade on Sub-sample A
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    if valid:
        improvement = a_mean_tf_delta - a_mean_e14_delta
        print(decision_message(improvement))

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")


if __name__ == "__main__":
    run()