add experiment scripts and results; watcher.py latest changes

2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""
+E1.8 Phase 2 — Evaluate
+Pulls predicate counts from FalkorDB for each group_id and compares.
+Run after e1_8_taxfree_cascade.py completes.
+"""
+
+import json, subprocess
+from pathlib import Path
+
+RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
+EVAL_PATH    = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
+
+GROUP_TAXFREE  = "aaron_e18_taxfree"
+GROUP_BASELINE = "aaron_e18_baseline"
+GROUP_STANDARD = "aaron_e18_standard"
+GROUP_PROD     = "aaron"
+GROUP_E14      = "aaron_cascade_e14"
+
+
+def query(group_id, cypher):
+    result = subprocess.run(
+        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
+        capture_output=True, text=True
+    )
+    return result.stdout
+
+
+def get_episode_uuid(group_id, episode_name):
+    safe = episode_name.replace("'", "\'")
+    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
+    output = query(group_id, cypher)
+    for line in output.split("\n"):
+        line = line.strip()
+        if len(line) == 36 and line.count("-") == 4:
+            return line
+    return None
+
+
+def count_preds(group_id, uuid):
+    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
+    output = query(group_id, cypher)
+    for line in output.split("\n"):
+        line = line.strip()
+        if line.isdigit():
+            return int(line)
+    return 0
+
+
+def count_edges(group_id, uuid):
+    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
+    output = query(group_id, cypher)
+    for line in output.split("\n"):
+        line = line.strip()
+        if line.isdigit():
+            return int(line)
+    return 0
+
+
+def eval_source(name, groups):
+    result = {"name": name}
+    for label, group_id in groups.items():
+        uuid = get_episode_uuid(group_id, name)
+        if uuid:
+            result[f"{label}_preds"] = count_preds(group_id, uuid)
+            result[f"{label}_edges"] = count_edges(group_id, uuid)
+        else:
+            result[f"{label}_preds"] = None
+            result[f"{label}_edges"] = None
+    return result
+
+
+def run():
+    print("E1.8 — Evaluation phase")
+    print("=" * 60)
+
+    results = json.loads(RESULTS_PATH.read_text())
+    eval_results = {"subsample_a": [], "subsample_b": []}
+
+    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
+    print("\nSub-sample A")
+    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
+    print("-" * 90)
+
+    a_records = []
+    for item in results["subsample_a"]:
+        name = item["name"]
+        r = eval_source(name, {
+            "prod": GROUP_PROD,
+            "e14": GROUP_E14,
+            "tf": GROUP_TAXFREE,
+        })
+        r["bucket"] = item["bucket"]
+        r["taxfree_metadata"] = item.get("taxfree_metadata")
+        r["e14_delta_preds"] = item.get("e14_delta_preds")
+
+        prod = r.get("prod_preds") or 0
+        e14 = r.get("e14_preds") or 0
+        tf = r.get("tf_preds") or 0
+        e14_delta = ((e14 - prod) / prod * 100) if prod > 0 else 0
+        tf_delta  = ((tf  - prod) / prod * 100) if prod > 0 else 0
+
+        display = name[:53] + ".." if len(name) > 55 else name
+        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")
+
+        r["tf_delta_vs_prod"] = tf_delta
+        r["e14_delta_vs_prod"] = e14_delta
+        a_records.append(r)
+        eval_results["subsample_a"].append(r)
+
+    # Aggregate Sub-sample A
+    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
+    if valid:
+        mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid) / len(valid)
+        mean_tf_delta  = sum(r["tf_delta_vs_prod"]  for r in valid) / len(valid)
+        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
+        print(f"  E1.4 cascade mean delta vs prod: {mean_e14_delta:+.1f}%")
+        print(f"  Taxonomy-free mean delta vs prod: {mean_tf_delta:+.1f}%")
+        print(f"  Taxonomy-free vs E1.4 cascade: {mean_tf_delta - mean_e14_delta:+.1f}pp")
+
+    # Sub-sample B — all three conditions
+    print("\n\nSub-sample B")
+    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
+    print("-" * 90)
+
+    b_records = []
+    for item in results["subsample_b"]:
+        name = item["name"]
+        r = eval_source(name, {
+            "base": GROUP_BASELINE,
+            "std":  GROUP_STANDARD,
+            "tf":   GROUP_TAXFREE,
+        })
+        r["bucket"] = item["bucket"]
+        r["taxfree_metadata"] = item.get("taxfree_metadata")
+        r["standard_metadata"] = item.get("standard_metadata")
+
+        base = r.get("base_preds") or 0
+        std  = r.get("std_preds")  or 0
+        tf   = r.get("tf_preds")   or 0
+        std_delta = ((std - base) / base * 100) if base > 0 else 0
+        tf_delta  = ((tf  - base) / base * 100) if base > 0 else 0
+
+        display = name[:53] + ".." if len(name) > 55 else name
+        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")
+
+        r["std_delta_vs_base"] = std_delta
+        r["tf_delta_vs_base"]  = tf_delta
+        b_records.append(r)
+        eval_results["subsample_b"].append(r)
+
+    # Aggregate Sub-sample B
+    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
+    if valid_b:
+        mean_std_delta = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
+        mean_tf_delta  = sum(r["tf_delta_vs_base"]  for r in valid_b) / len(valid_b)
+        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
+        print(f"  Standard cascade mean delta vs baseline: {mean_std_delta:+.1f}%")
+        print(f"  Taxonomy-free mean delta vs baseline:    {mean_tf_delta:+.1f}%")
+
+        # By bucket
+        print("\nPer-bucket (Sub-sample B):")
+        for bucket in ["high", "mid", "document"]:
+            br = [r for r in valid_b if r["bucket"] == bucket]
+            if not br:
+                continue
+            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
+            m_tf  = sum(r["tf_delta_vs_base"]  for r in br) / len(br)
+            print(f"  [{bucket:>8}] n={len(br)}  std={m_std:+.0f}%  tf={m_tf:+.0f}%")
+
+    # Decision rule evaluation
+    print("\n" + "=" * 60)
+    print("DECISION RULE:")
+    if valid:
+        improvement = mean_tf_delta - mean_e14_delta
+        if improvement >= 20:
+            print(f"  ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
+        elif improvement >= 5:
+            print(f"  ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
+        elif improvement >= 0:
+            print(f"  ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
+        else:
+            print(f"  ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")
+
+    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
+    print(f"\nEval saved to {EVAL_PATH}")
+
+
+# Run the evaluation only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run()