add experiment scripts and results; watcher.py latest changes
This commit is contained in:
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.9 Phase 1 — Retroactive validation

For each E1.8 source, query the production graph with frame_relationships
to get a coverage score, then check whether the routing tier prediction
matches the actual best-performing condition from E1.8.

No API spend required — uses existing E1.8 data and Graphiti search only.
|
||||
"""
|
||||
|
||||
import json, requests
|
||||
from pathlib import Path
|
||||
|
||||
# Base URL of the Graphiti service; get_coverage_score() queries its /search endpoint.
GRAPHITI_URL = "http://localhost:8001"
|
||||
# E1.8 evaluation JSON read by run(); it is indexed by "subsample_a" / "subsample_b".
E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
|
||||
# E1.8 ingest JSON read by run() to look up each source's frame_relationships text.
E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
|
||||
# Destination file for the JSON summary written at the end of run().
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"
|
||||
|
||||
# Routing thresholds
|
||||
# Coverage scores at or above this value are assigned the "baseline" tier.
HIGH_THRESHOLD = 0.70 # baseline
|
||||
# Coverage scores below this value are assigned the "taxfree" tier; scores from
# this value up to (but excluding) HIGH_THRESHOLD are assigned "standard".
LOW_THRESHOLD = 0.40 # taxonomy-free
|
||||
|
||||
|
||||
def get_coverage_score(query, group_id="aaron", limit=3):
    """Return a 0.0-1.0 coverage score for *query* from the production graph.

    The score is the number of search hits divided by ``limit``, capped at
    1.0. With the default ``limit=3`` this gives 0 = no results,
    0.33 = 1 result, 0.66 = 2 results, 1.0 = 3+ results.
    The hit count is used because Graphiti fulltext search returns score=0
    for all hits.

    Args:
        query: Search text. ``None``, empty or whitespace-only input scores
            0.0 without contacting the service.
        group_id: Value sent as the ``group_id`` search parameter.
        limit: Maximum number of hits requested; also the hit count that
            maps to a score of 1.0.

    Returns:
        A float between 0.0 and 1.0. Request failures and malformed
        responses are reported on stdout and scored as 0.0.

    Raises:
        ValueError: If ``limit`` is less than 1.
    """
    if limit < 1:
        raise ValueError(f"limit must be >= 1, got {limit}")
    if not query or not query.strip():
        return 0.0

    # Only the network call and JSON decoding are expected to fail here, so
    # only those errors are treated as "no coverage". Anything else is a
    # programming error and is allowed to surface.
    try:
        resp = requests.get(
            f"{GRAPHITI_URL}/search",
            params={"query": query, "limit": limit, "group_id": group_id},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        print(f" Search error: {e}")
        return 0.0

    # Validate the payload shape explicitly instead of relying on a broad
    # except to catch AttributeError/TypeError from unexpected JSON.
    if not isinstance(payload, dict):
        print(f" Search error: unexpected response type {type(payload).__name__}")
        return 0.0
    results = payload.get("results", [])
    if not isinstance(results, list):
        print(f" Search error: unexpected 'results' type {type(results).__name__}")
        return 0.0

    return min(len(results) / limit, 1.0)
|
||||
|
||||
|
||||
def assign_tier(coverage_score, high_threshold=None, low_threshold=None):
    """Map a coverage score to a routing tier name.

    Args:
        coverage_score: Numeric score to classify.
        high_threshold: Inclusive lower bound of the "baseline" tier.
            Defaults to the module-level ``HIGH_THRESHOLD``.
        low_threshold: Inclusive lower bound of the "standard" tier.
            Defaults to the module-level ``LOW_THRESHOLD``.

    Returns:
        "baseline" when the score is at or above the high threshold,
        "standard" when it is at or above the low threshold, and
        "taxfree" otherwise.
    """
    # Resolve defaults at call time so that changes to the module constants
    # are honoured and callers can sweep thresholds without patching globals.
    high = HIGH_THRESHOLD if high_threshold is None else high_threshold
    low = LOW_THRESHOLD if low_threshold is None else low_threshold

    if coverage_score >= high:
        return "baseline"
    if coverage_score >= low:
        return "standard"
    return "taxfree"
|
||||
|
||||
|
||||
def best_condition_from_e18(record, subsample):
    """Determine which condition actually performed best for a source in E1.8.

    Sub-sample A compares ``prod_preds`` (baseline), ``e14_preds`` (standard
    cascade) and ``tf_preds`` (taxfree). Any other ``subsample`` value is
    treated as sub-sample B, which compares ``base_preds``, ``std_preds``
    and ``tf_preds``.

    Args:
        record: Dict holding the ``*_preds`` values for one source. Missing
            keys and ``None`` values count as 0.
        subsample: "a" for sub-sample A; anything else selects the B keys.

    Returns:
        "taxfree", "standard" or "baseline" for the highest-scoring
        condition, or "unknown" when the highest value is 0. Ties are
        resolved in the order taxfree, standard, baseline.
    """
    # The two sub-samples differ only in the keys used for the baseline and
    # standard conditions; the taxfree key is shared.
    if subsample == "a":
        baseline_key, standard_key = "prod_preds", "e14_preds"
    else:
        baseline_key, standard_key = "base_preds", "std_preds"

    baseline = record.get(baseline_key) or 0
    standard = record.get(standard_key) or 0
    taxfree = record.get("tf_preds") or 0

    best_score = max(baseline, standard, taxfree)
    if best_score == 0:
        return "unknown"

    # Tie-break priority: taxfree first, then standard, baseline last.
    if taxfree == best_score:
        return "taxfree"
    if standard == best_score:
        return "standard"
    return "baseline"
|
||||
|
||||
|
||||
def _build_fr_lookup(ingest):
    """Return a ``{source name: frame_relationships}`` dict from ingest data."""
    lookup = {}
    for key in ("subsample_a", "subsample_b"):
        for item in ingest.get(key, []):
            # Items without taxfree metadata are left out, so their lookup
            # later falls back to an empty query (coverage 0.0).
            meta = item.get("taxfree_metadata", {})
            if meta:
                lookup[item["name"]] = meta.get("frame_relationships", "")
    return lookup


def _validate_subsample(records, subsample, fr_lookup):
    """Print the per-source table for one sub-sample and score its routing.

    Returns ``(rows, correct, total)``: the result dicts for every record,
    the number of tier predictions that matched, and the number of records
    whose actual best condition was known.
    """
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)

    rows = []
    correct = 0
    total = 0
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best

        # Records with no usable E1.8 outcome are listed but not scored.
        if actual_best != "unknown":
            total += 1
            if matched:
                correct += 1

        display = name[:48] + ".." if len(name) > 50 else name
        mark = "✓" if matched else "✗"
        # The 'tier' and 'predicted' columns both show the assigned tier.
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {mark}")

        rows.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return rows, correct, total


def run():
    """Run the E1.9 Phase 1 retroactive validation end to end.

    Loads the E1.8 evaluation and ingest JSON files, scores every source in
    sub-samples A and B, prints the per-source tables, the validation rate,
    the mismatches and the coverage distribution, and writes the summary to
    ``RESULTS_PATH``.

    Raises:
        KeyError: If the evaluation data lacks ``subsample_a`` or
            ``subsample_b``, or a record lacks ``name``.
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))

    fr_lookup = _build_fr_lookup(e18_ingest)

    results = []
    correct = 0
    total = 0
    for subsample in ("a", "b"):
        rows, hits, scored = _validate_subsample(
            e18_eval[f"subsample_{subsample}"], subsample, fr_lookup
        )
        results.extend(rows)
        correct += hits
        total += scored

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in results if not r["match"] and r["actual_best"] != "unknown"]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # A missing bucket is None, which rejects the '<8' format spec;
            # render it as a placeholder instead of raising TypeError.
            bucket = "-" if r["bucket"] is None else str(r["bucket"])
            print(f" [{bucket:<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
    else:
        # Mean/min/max are undefined for an empty run; say so rather than
        # crashing before the results file is written.
        print(" (no sources evaluated)")
    tiers = [r["predicted_tier"] for r in results]
    high = tiers.count("baseline")
    mid = tiers.count("standard")
    low = tiers.count("taxfree")
    print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"\nSaved to {RESULTS_PATH}")
|
||||
|
||||
|
||||
# Run the validation only when this file is executed as a script.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user