Files
aaronAI/scripts/experiments/e1_8_eval.py
T

191 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
E1.8 Phase 2 — Evaluate
Pulls predicate counts from FalkorDB for each group_id and compares.
Run after e1_8_taxfree_cascade.py completes.
"""
import json, subprocess
from pathlib import Path
# Input file: run() parses this JSON and iterates its "subsample_a" and
# "subsample_b" lists.
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
# Output file: run() writes the per-source evaluation records here as JSON.
EVAL_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
# Graph names passed as the key argument of GRAPH.QUERY.
# GROUP_TAXFREE is evaluated in both sub-samples; GROUP_BASELINE and
# GROUP_STANDARD are the other two conditions of Sub-sample B.
GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
# Sub-sample A compares the taxonomy-free graph against these two graphs
# (labelled "prod" and "e14" in run()).
GROUP_PROD = "aaron"
GROUP_E14 = "aaron_cascade_e14"
def query(group_id, cypher):
    """Run a Cypher statement against one graph and return the raw output.

    The statement is executed by shelling out to ``redis-cli`` inside the
    ``falkordb`` Docker container, as ``GRAPH.QUERY <group_id> <cypher>``.

    Args:
        group_id: Name of the graph to query.
        cypher: Cypher text, passed as a single argv element (no shell).

    Returns:
        The command's captured standard output as text. The exit status and
        stderr are not inspected, so a failed command simply yields whatever
        (possibly empty) stdout it produced.
    """
    command = [
        "docker", "exec", "falkordb",
        "redis-cli", "GRAPH.QUERY", group_id, cypher,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
def get_episode_uuid(group_id, episode_name):
    """Look up the UUID of the Episodic node with the given name.

    Args:
        group_id: Name of the graph to search.
        episode_name: Exact value of the node's ``name`` property.

    Returns:
        The first line of the query output that is 36 characters long and
        contains exactly four hyphens (the UUID), or ``None`` when the output
        has no such line.
    """
    # Escape backslashes first, then single quotes, so the name cannot close
    # the quoted Cypher literal early. The earlier replace("'", "\'") was a
    # no-op: inside a Python string literal, \' is just a single quote.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in output.split("\n"):
        line = line.strip()
        # The output carries other lines besides the value; keep only the
        # UUID-shaped one.
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
def count_preds(group_id, uuid):
    """Count the distinct RELATES_TO edge names that list an episode.

    Args:
        group_id: Name of the graph to query.
        uuid: Episode UUID that must appear in the edge's ``episodes`` list.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    statement = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    stripped_lines = (raw.strip() for raw in query(group_id, statement).split("\n"))
    # The first purely numeric line is taken as the count.
    return next((int(text) for text in stripped_lines if text.isdigit()), 0)
def count_edges(group_id, uuid):
    """Count the RELATES_TO edges that list an episode.

    Args:
        group_id: Name of the graph to query.
        uuid: Episode UUID that must appear in the edge's ``episodes`` list.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    statement = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    raw_output = query(group_id, statement)
    numeric_lines = [
        candidate
        for candidate in map(str.strip, raw_output.split("\n"))
        if candidate.isdigit()
    ]
    if not numeric_lines:
        return 0
    # Only the first numeric line is the count; later ones are ignored.
    return int(numeric_lines[0])
def eval_source(name, groups):
    """Collect predicate and edge counts for one source across several graphs.

    Args:
        name: Episode name to look up in every graph.
        groups: Mapping of short label -> graph name.

    Returns:
        A dict holding ``"name"`` plus, for each label, ``"<label>_preds"``
        and ``"<label>_edges"``. Both are ``None`` when the episode's UUID
        could not be found in that graph.
    """
    summary = {"name": name}
    for label, graph in groups.items():
        episode_uuid = get_episode_uuid(graph, name)
        if not episode_uuid:
            # Episode not found in this graph: record the gap explicitly.
            preds = edges = None
        else:
            preds = count_preds(graph, episode_uuid)
            edges = count_edges(graph, episode_uuid)
        summary[f"{label}_preds"] = preds
        summary[f"{label}_edges"] = edges
    return summary
def _pct_delta(value, reference):
    """Return the percent change of *value* relative to *reference*.

    Yields 0 when *reference* is not positive, so a missing or empty
    reference count never causes a division by zero.
    """
    return ((value - reference) / reference * 100) if reference > 0 else 0


def _mean_of(records, key):
    """Return the mean of ``record[key]`` across a non-empty list of records."""
    return sum(record[key] for record in records) / len(records)


def run():
    """Evaluate both sub-samples, print the comparison tables, and save them.

    Reads the source lists from ``RESULTS_PATH``, gathers per-graph predicate
    and edge counts for every source via ``eval_source``, prints a table and
    aggregate for each sub-sample, applies the decision rule to Sub-sample A,
    and writes all per-source records to ``EVAL_PATH`` as JSON.

    Raises:
        Whatever ``Path.read_text`` / ``json.loads`` raise when the results
        file is missing or malformed, and ``KeyError`` when an expected key
        is absent from it.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)
    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)
    a_records = []
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")
        # Missing counts (None) are shown and computed as 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = _pct_delta(e14, prod)
        tf_delta = _pct_delta(tf, prod)
        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")
        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)
        eval_results["subsample_a"].append(r)

    # Aggregate Sub-sample A
    # Only sources with a non-zero prod AND taxonomy-free count are averaged.
    # NOTE(review): the E1.4 count is not part of this filter, so a source
    # missing from the E1.4 graph still contributes e14=0 (-100%) to its mean.
    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    # Computed once here and reused by the decision rule. Sub-sample B keeps
    # its own aggregate names so it cannot overwrite these values.
    improvement = None
    if valid:
        a_mean_e14_delta = _mean_of(valid, "e14_delta_vs_prod")
        a_mean_tf_delta = _mean_of(valid, "tf_delta_vs_prod")
        improvement = a_mean_tf_delta - a_mean_e14_delta
        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
        print(f" E1.4 cascade mean delta vs prod: {a_mean_e14_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {a_mean_tf_delta:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {improvement:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)
    b_records = []
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")
        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = _pct_delta(std, base)
        tf_delta = _pct_delta(tf, base)
        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")
        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)
        eval_results["subsample_b"].append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        b_mean_std_delta = _mean_of(valid_b, "std_delta_vs_base")
        b_mean_tf_delta = _mean_of(valid_b, "tf_delta_vs_base")
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {b_mean_std_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {b_mean_tf_delta:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = _mean_of(br, "std_delta_vs_base")
            m_tf = _mean_of(br, "tf_delta_vs_base")
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    # The rule is defined on Sub-sample A only: taxonomy-free vs E1.4 cascade,
    # both measured against the same prod baseline.
    if improvement is not None:
        if improvement >= 20:
            print(f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
        elif improvement >= 5:
            print(f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
        elif improvement >= 0:
            print(f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
        else:
            print(f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")
# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()