#!/usr/bin/env python3
"""
E1.8 Phase 2 — Evaluate

Pulls predicate counts from FalkorDB for each group_id and compares them.
Run after e1_8_taxfree_cascade.py completes.
"""
|
|
|
|
import json, subprocess
|
|
from pathlib import Path
|
|
|
|
# Directory holding the experiment's input and output JSON files.
_EXPERIMENTS_DIR = Path.home() / "aaronai" / "experiments"

# Input read by run() and output written by run().
RESULTS_PATH = _EXPERIMENTS_DIR / "e1_8_results.json"
EVAL_PATH = _EXPERIMENTS_DIR / "e1_8_eval.json"

# Group ids; each one is passed to GRAPH.QUERY as the graph key.
GROUP_TAXFREE = "aaron_e18_taxfree"    # reported as "tf" / taxonomy-free
GROUP_BASELINE = "aaron_e18_baseline"  # reported as "base"
GROUP_STANDARD = "aaron_e18_standard"  # reported as "std" / standard cascade
GROUP_PROD = "aaron"                   # reported as "prod"
GROUP_E14 = "aaron_cascade_e14"        # reported as "e14c" / E1.4 cascade
|
|
|
|
|
|
def query(group_id, cypher):
    """Run a Cypher query against one graph and return redis-cli's raw stdout.

    The query is executed as ``redis-cli GRAPH.QUERY <group_id> <cypher>``
    inside the ``falkordb`` Docker container.

    Args:
        group_id: Graph key passed to ``GRAPH.QUERY``.
        cypher: Cypher query text, passed as a single argv entry (no shell).

    Returns:
        The command's captured standard output as text. The output is not
        parsed here; callers scan it line by line.
    """
    import sys  # local import: only needed for the failure warning below

    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # A failed docker/redis-cli call may leave stdout empty, which the
        # line-scanning callers cannot tell apart from "no rows". Surface the
        # failure on stderr but keep returning stdout so callers still work.
        print(
            f"warning: GRAPH.QUERY on {group_id!r} exited with "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout
|
|
|
|
|
|
def get_episode_uuid(group_id, episode_name):
    """Look up the uuid of the ``Episodic`` node whose name is ``episode_name``.

    Args:
        group_id: Graph key to query.
        episode_name: Exact value to match against ``e.name``.

    Returns:
        The first output line that looks like a uuid (36 characters with
        exactly four dashes), or ``None`` when no such line is found.
    """
    # Escape backslashes first, then single quotes, so the name cannot
    # terminate the single-quoted Cypher string literal. The previous
    # ``replace("'", "\'")`` was a no-op because "\'" is just "'" in Python.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in output.split("\n"):
        line = line.strip()
        # Shape check only: skips the column header and statistics lines.
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
|
|
|
|
|
|
def count_preds(group_id, uuid):
    """Count distinct ``RELATES_TO`` edge names whose ``episodes`` contain ``uuid``.

    Args:
        group_id: Graph key to query.
        uuid: Episode uuid interpolated into the Cypher query.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    cypher = (
        "MATCH ()-[r:RELATES_TO]->() "
        f"WHERE '{uuid}' IN r.episodes "
        "RETURN count(distinct r.name) AS p"
    )
    stripped_lines = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    # The first purely numeric line is taken as the count.
    return next((int(token) for token in stripped_lines if token.isdigit()), 0)
|
|
|
|
|
|
def count_edges(group_id, uuid):
    """Count ``RELATES_TO`` edges whose ``episodes`` contain ``uuid``.

    Args:
        group_id: Graph key to query.
        uuid: Episode uuid interpolated into the Cypher query.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    cypher = (
        "MATCH ()-[r:RELATES_TO]->() "
        f"WHERE '{uuid}' IN r.episodes "
        "RETURN count(r) AS n"
    )
    stripped_lines = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    # The first purely numeric line is taken as the count.
    return next((int(token) for token in stripped_lines if token.isdigit()), 0)
|
|
|
|
|
|
def eval_source(name, groups):
    """Collect predicate and edge counts for one episode across several graphs.

    Args:
        name: Episode name to look up in every graph.
        groups: Mapping of short label -> group id.

    Returns:
        A dict with ``"name"`` plus ``"<label>_preds"`` and ``"<label>_edges"``
        for every label; both are ``None`` when the episode uuid is not found
        in that graph.
    """
    summary = {"name": name}
    for label, group_id in groups.items():
        preds_key, edges_key = f"{label}_preds", f"{label}_edges"
        episode_uuid = get_episode_uuid(group_id, name)
        if not episode_uuid:
            # Episode absent from this graph: record explicit gaps.
            summary[preds_key] = None
            summary[edges_key] = None
            continue
        summary[preds_key] = count_preds(group_id, episode_uuid)
        summary[edges_key] = count_edges(group_id, episode_uuid)
    return summary
|
|
|
|
|
|
def _pct_delta(value, reference):
    """Return the percent change of ``value`` vs ``reference`` (0 if reference <= 0)."""
    return ((value - reference) / reference * 100) if reference > 0 else 0


def _mean_of(records, key):
    """Return the arithmetic mean of ``record[key]`` over a non-empty list."""
    return sum(record[key] for record in records) / len(records)


def _display_name(name):
    """Shorten names longer than 55 characters to 53 characters plus ``..``."""
    return name[:53] + ".." if len(name) > 55 else name


def run():
    """Evaluate both sub-samples, print the comparison tables, and save the results.

    Reads ``RESULTS_PATH`` (JSON with ``subsample_a`` and ``subsample_b`` lists
    whose items carry at least ``name`` and ``bucket``), queries the graphs for
    per-source predicate and edge counts, prints per-source and aggregate
    deltas, applies the decision rule to Sub-sample A, and writes every
    per-source record to ``EVAL_PATH`` as JSON.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)

    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)

    a_records = eval_results["subsample_a"]
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")

        # Missing counts (None) are displayed and compared as 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = _pct_delta(e14, prod)
        tf_delta = _pct_delta(tf, prod)

        display = _display_name(name)
        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)

    # Aggregate Sub-sample A
    # NOTE(review): a record whose e14 count is missing still enters the mean
    # with an E1.4 delta of -100%; confirm whether that is intended.
    valid_a = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    mean_e14_delta_a = mean_tf_delta_a = None
    if valid_a:
        mean_e14_delta_a = _mean_of(valid_a, "e14_delta_vs_prod")
        mean_tf_delta_a = _mean_of(valid_a, "tf_delta_vs_prod")
        print(f"\nAggregate Sub-sample A (n={len(valid_a)}):")
        print(f" E1.4 cascade mean delta vs prod: {mean_e14_delta_a:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {mean_tf_delta_a:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {mean_tf_delta_a - mean_e14_delta_a:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)

    b_records = eval_results["subsample_b"]
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")

        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = _pct_delta(std, base)
        tf_delta = _pct_delta(tf, base)

        display = _display_name(name)
        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        # Distinct names on purpose: reusing the Sub-sample A variable here
        # used to leak Sub-sample B's mean into the decision rule below.
        mean_std_delta_b = _mean_of(valid_b, "std_delta_vs_base")
        mean_tf_delta_b = _mean_of(valid_b, "tf_delta_vs_base")
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {mean_std_delta_b:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {mean_tf_delta_b:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = _mean_of(br, "std_delta_vs_base")
            m_tf = _mean_of(br, "tf_delta_vs_base")
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation — based on Sub-sample A only
    # (taxonomy-free vs E1.4 cascade, both measured against prod).
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    if valid_a:
        improvement = mean_tf_delta_a - mean_e14_delta_a
        if improvement >= 20:
            print(f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
        elif improvement >= 5:
            print(f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
        elif improvement >= 0:
            print(f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
        else:
            print(f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the evaluation only when executed as a script, not on import.
    run()
|