add experiment scripts and results; watcher.py latest changes
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.8 Phase 2 — Evaluate
Pulls per-episode predicate and edge counts from FalkorDB for each group_id
and compares them across conditions.
Run after e1_8_taxfree_cascade.py completes.
|
||||
"""
|
||||
|
||||
import json, subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# Directory holding this experiment's input and output JSON files.
_EXPERIMENTS_DIR = Path.home() / "aaronai" / "experiments"

# Input: results JSON read by run(); per the module docstring it is produced
# by e1_8_taxfree_cascade.py.
RESULTS_PATH = _EXPERIMENTS_DIR / "e1_8_results.json"
# Output: evaluation JSON written by run().
EVAL_PATH = _EXPERIMENTS_DIR / "e1_8_eval.json"

# Graph names compared by the evaluation; each is passed to GRAPH.QUERY as the
# graph key.
GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
GROUP_PROD = "aaron"
GROUP_E14 = "aaron_cascade_e14"
|
||||
|
||||
|
||||
def query(group_id, cypher):
    """Run a Cypher query against a graph and return redis-cli's raw stdout.

    The query is executed as ``redis-cli GRAPH.QUERY <group_id> <cypher>``
    inside the ``falkordb`` Docker container. Arguments are passed as an argv
    list, so no shell quoting is involved.

    Args:
        group_id: Graph key passed to GRAPH.QUERY.
        cypher: Cypher query text.

    Returns:
        The captured standard output as text. A non-zero exit status does not
        raise; a warning with the captured stderr is written to this process's
        stderr and whatever stdout was produced is still returned.
    """
    import sys  # local import: only needed for the failure warning below

    proc = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        # Surface failures instead of letting callers silently parse an empty
        # result as "not found" / zero.
        print(
            f"warning: GRAPH.QUERY on {group_id!r} exited with status "
            f"{proc.returncode}: {proc.stderr.strip()}",
            file=sys.stderr,
        )
    return proc.stdout
|
||||
|
||||
|
||||
def get_episode_uuid(group_id, episode_name):
    """Look up the uuid of the Episodic node whose name equals ``episode_name``.

    Args:
        group_id: Graph key to query.
        episode_name: Exact value to match against ``e.name``.

    Returns:
        The first output line that looks like a UUID (36 characters with
        exactly four dashes), or ``None`` when no such line is present.
    """
    # Escape for a single-quoted Cypher string literal. Backslashes go first so
    # the backslash added in front of each quote is not itself doubled.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in output.split("\n"):
        line = line.strip()
        # redis-cli output also contains header/statistics lines; keep only
        # the line shaped like a canonical UUID.
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
|
||||
|
||||
|
||||
def count_preds(group_id, uuid):
    """Count distinct relationship names attached to one episode.

    Queries RELATES_TO relationships whose ``episodes`` list contains ``uuid``
    and asks for ``count(distinct r.name)``.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    stripped_lines = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    # The first purely numeric line is taken as the count.
    return next((int(text) for text in stripped_lines if text.isdigit()), 0)
|
||||
|
||||
|
||||
def count_edges(group_id, uuid):
    """Count RELATES_TO relationships attached to one episode.

    Queries RELATES_TO relationships whose ``episodes`` list contains ``uuid``
    and asks for ``count(r)``.

    Returns:
        The first all-digit line of the query output as an int, or 0 when the
        output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    stripped_lines = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    # The first purely numeric line is taken as the count.
    return next((int(text) for text in stripped_lines if text.isdigit()), 0)
|
||||
|
||||
|
||||
def eval_source(name, groups):
    """Collect predicate and edge counts for one source across several graphs.

    Args:
        name: Episode name to look up in every graph.
        groups: Mapping of short label -> graph key.

    Returns:
        A dict with ``"name"`` plus ``"<label>_preds"`` and ``"<label>_edges"``
        for each label; both are ``None`` when the episode uuid was not found
        in that graph.
    """
    summary = {"name": name}
    for label, group_id in groups.items():
        preds_key = f"{label}_preds"
        edges_key = f"{label}_edges"
        episode_uuid = get_episode_uuid(group_id, name)
        if not episode_uuid:
            # No uuid found for this name in this graph.
            summary[preds_key] = None
            summary[edges_key] = None
            continue
        summary[preds_key] = count_preds(group_id, episode_uuid)
        summary[edges_key] = count_edges(group_id, episode_uuid)
    return summary
|
||||
|
||||
|
||||
def _pct_delta(value, reference):
    """Return the percent change of value relative to reference (0 if reference <= 0)."""
    return ((value - reference) / reference * 100) if reference > 0 else 0


def _display_name(name):
    """Shorten names longer than 55 characters so they fit the table's first column."""
    return name[:53] + ".." if len(name) > 55 else name


def run():
    """Evaluate the E1.8 run: print comparison tables and save the results.

    Reads RESULTS_PATH (JSON with ``subsample_a`` and ``subsample_b`` lists of
    items carrying at least ``name`` and ``bucket``), queries the graphs for
    per-source predicate/edge counts, prints per-source and aggregate deltas,
    applies the decision rule to Sub-sample A, and writes every per-source
    record to EVAL_PATH as JSON.

    Raises:
        FileNotFoundError / json.JSONDecodeError / KeyError: when the results
        file is missing, malformed, or lacks the expected keys.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)

    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)

    a_records = []
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")

        # Missing counts (None) are shown and computed as 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = _pct_delta(e14, prod)
        tf_delta = _pct_delta(tf, prod)

        print(f"{_display_name(name):<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)
        eval_results["subsample_a"].append(r)

    # Aggregate Sub-sample A
    # NOTE(review): only prod and tf counts are required here, so a source whose
    # e14 count is missing still enters the mean with an e14 delta of -100%.
    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    # Taxonomy-free minus E1.4 cascade, in percentage points, on Sub-sample A.
    # Computed here and kept under its own name so the Sub-sample B aggregates
    # below cannot overwrite the value the decision rule depends on.
    improvement = None
    if valid:
        mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid) / len(valid)
        mean_tf_delta = sum(r["tf_delta_vs_prod"] for r in valid) / len(valid)
        improvement = mean_tf_delta - mean_e14_delta
        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
        print(f" E1.4 cascade mean delta vs prod: {mean_e14_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {mean_tf_delta:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {improvement:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)

    b_records = []
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")

        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = _pct_delta(std, base)
        tf_delta = _pct_delta(tf, base)

        print(f"{_display_name(name):<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)
        eval_results["subsample_b"].append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        mean_std_delta_b = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
        mean_tf_delta_b = sum(r["tf_delta_vs_base"] for r in valid_b) / len(valid_b)
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {mean_std_delta_b:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {mean_tf_delta_b:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
            m_tf = sum(r["tf_delta_vs_base"] for r in br) / len(br)
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation (Sub-sample A only)
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    if improvement is not None:
        if improvement >= 20:
            print(f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
        elif improvement >= 5:
            print(f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
        elif improvement >= 0:
            print(f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
        else:
            print(f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")
|
||||
|
||||
|
||||
# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user