add experiment scripts and results; watcher.py latest changes

This commit is contained in:
2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
+204
View File
@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
E1.9 Phase 1 — Retroactive validation
For each E1.8 source, query the production graph with frame_relationships
to get a coverage score, then check whether the routing tier prediction
matches the actual best-performing condition from E1.8.
No API spend required — uses existing E1.8 data and Graphiti search only.
"""
import json, requests
from pathlib import Path
# Base URL of the Graphiti service; get_coverage_score() calls its /search endpoint.
GRAPHITI_URL = "http://localhost:8001"
# E1.8 evaluation records (per-source prediction scores), read by run().
E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
# E1.8 ingest results; run() reads each source's taxfree_metadata from here.
E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
# Output file written by run() with the validation rate and per-source rows.
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"
# Routing thresholds applied to the coverage score by assign_tier()
HIGH_THRESHOLD = 0.70 # coverage >= this routes to "baseline"
LOW_THRESHOLD = 0.40 # coverage below this routes to "taxfree" (taxonomy-free); in between is "standard"
def get_coverage_score(query, group_id="aaron"):
    """Query the production graph and return a coverage score from the hit count.

    The score is ``min(hits / 3, 1.0)``: 0 hits -> 0.0, 1 -> 0.33, 2 -> 0.67,
    3 or more -> 1.0. The hit count is used because Graphiti fulltext search
    returns score=0 for every hit, so per-hit scores carry no information.

    Args:
        query: Search text (the source's frame_relationships). Anything that
            is not a non-blank string is treated as "nothing to search for".
        group_id: Graph partition passed through to the search endpoint.

    Returns:
        A float in [0.0, 1.0]. Returns 0.0 for an unusable query and also when
        the search request fails (the failure is printed, not raised).
    """
    # Guard the type as well as emptiness: a truthy non-string (e.g. a list
    # loaded from JSON metadata) would otherwise raise on .strip() outside the
    # try block and abort the whole run.
    if not isinstance(query, str) or not query.strip():
        return 0.0
    try:
        resp = requests.get(
            f"{GRAPHITI_URL}/search",
            params={"query": query, "limit": 3, "group_id": group_id},
            timeout=30,
        )
        resp.raise_for_status()
        # "or []" also covers an explicit `"results": null` in the payload.
        results = resp.json().get("results") or []
        return min(len(results) / 3.0, 1.0)
    except Exception as e:
        # Deliberate best-effort: one failed lookup must not stop the batch,
        # so report it and score the source as uncovered.
        print(f" Search error: {e}")
        return 0.0
def assign_tier(coverage_score, *, high=None, low=None):
    """Map a coverage score to a routing tier name.

    Args:
        coverage_score: Score as returned by get_coverage_score().
        high: Inclusive lower bound for the "baseline" tier. Defaults to the
            module-level HIGH_THRESHOLD.
        low: Inclusive lower bound for the "standard" tier. Defaults to the
            module-level LOW_THRESHOLD.

    Returns:
        "baseline" when the score is >= high, "standard" when it is >= low,
        otherwise "taxfree".
    """
    # Resolve defaults at call time so edits to the module constants are
    # honoured, while callers can still sweep thresholds explicitly.
    if high is None:
        high = HIGH_THRESHOLD
    if low is None:
        low = LOW_THRESHOLD
    if coverage_score >= high:
        return "baseline"
    if coverage_score >= low:
        return "standard"
    return "taxfree"
# Record keys holding each condition's E1.8 score, per sub-sample.
_E18_SCORE_KEYS = {
    "a": {"baseline": "prod_preds", "standard": "e14_preds", "taxfree": "tf_preds"},
    "b": {"baseline": "base_preds", "standard": "std_preds", "taxfree": "tf_preds"},
}


def best_condition_from_e18(record, subsample):
    """Return the condition that scored highest for this source in E1.8.

    Sub-sample A compares prod (baseline), e14 (standard cascade) and
    tf (taxfree); sub-sample B compares base, std and tf.

    Args:
        record: One E1.8 evaluation record (a dict of ``*_preds`` scores).
            Missing or falsy scores count as 0.
        subsample: "a" selects the sub-sample A keys; any other value
            selects the sub-sample B keys.

    Returns:
        "taxfree", "standard" or "baseline", or "unknown" when the best
        score is 0. Ties resolve in that order (taxfree first, baseline last).
    """
    keys = _E18_SCORE_KEYS["a"] if subsample == "a" else _E18_SCORE_KEYS["b"]
    baseline = record.get(keys["baseline"]) or 0
    standard = record.get(keys["standard"]) or 0
    taxfree = record.get(keys["taxfree"]) or 0

    best_score = max(baseline, standard, taxfree)
    if best_score == 0:
        # Nothing scored for this source, so there is no winner to compare to.
        return "unknown"
    # Check order is the tie-break: taxfree beats standard beats baseline.
    if taxfree == best_score:
        return "taxfree"
    if standard == best_score:
        return "standard"
    return "baseline"
def _build_frame_lookup(e18_ingest):
    """Map source name -> frame_relationships text from the E1.8 ingest results."""
    lookup = {}
    for key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                lookup[item["name"]] = meta.get("frame_relationships", "")
    return lookup


def _evaluate_subsample(records, subsample, fr_lookup):
    """Print the per-source table for one sub-sample and return its result rows."""
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)
    rows = []
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        # Both branches of the match marker used to be empty strings, so the
        # column was always blank. Rows with no E1.8 winner are not scored,
        # so they get a neutral marker rather than a miss.
        if actual_best == "unknown":
            mark = "-"
        else:
            mark = "✓" if matched else "✗"
        display = name[:48] + ".." if len(name) > 50 else name
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {mark}")
        rows.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return rows


def run():
    """Run the E1.9 Phase 1 retroactive validation and save the results.

    Loads the E1.8 evaluation and ingest files, scores each source's
    frame_relationships text against the production graph, assigns a routing
    tier, and compares it with the condition that actually performed best in
    E1.8. Prints a per-source table, the validation rate with a verdict, the
    mismatches and the coverage distribution, then writes everything to
    RESULTS_PATH as JSON.

    Sources whose best E1.8 condition is "unknown" are listed but excluded
    from the validation rate.

    Raises:
        OSError: If an input file cannot be read or the output written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If the evaluation file lacks "subsample_a"/"subsample_b"
            or a record lacks "name".
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))
    fr_lookup = _build_frame_lookup(e18_ingest)

    results = []
    for subsample in ("a", "b"):
        records = e18_eval[f"subsample_{subsample}"]
        results.extend(_evaluate_subsample(records, subsample, fr_lookup))

    # Only sources with a known E1.8 winner count toward the rate.
    scored = [r for r in results if r["actual_best"] != "unknown"]
    total = len(scored)
    correct = sum(1 for r in scored if r["match"])

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if total == 0:
        # A 0% rate over zero sources says nothing about the signal.
        print("! NO SCORABLE SOURCES — every E1.8 record had an unknown best condition")
    elif rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in scored if not r["match"]]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # bucket may be None (record.get); None rejects a "<8" format spec.
            print(f" [{str(r['bucket']):<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print(f"\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
        high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
        mid = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
        low = sum(1 for s in scores if s < LOW_THRESHOLD)
        print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")
    else:
        # Avoid ZeroDivisionError / min() of an empty list on empty input.
        print(" (no sources evaluated)")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"\nSaved to {RESULTS_PATH}")
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    run()