add experiment scripts and results; watcher.py latest changes

2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+E1.9 Phase 1 — Retroactive validation
+For each E1.8 source, query the production graph with frame_relationships
+to get a coverage score, then check whether the routing tier prediction
+matches the actual best-performing condition from E1.8.
+No API spend required — uses existing E1.8 data and Graphiti search only.
+"""
+
+import json, requests
+from pathlib import Path
+
+# Base URL of the Graphiti search service queried for coverage scores.
+GRAPHITI_URL = "http://localhost:8001"
+
+# Every experiment input and output lives in one directory under the home dir.
+_EXPERIMENTS_DIR = Path.home() / "aaronai" / "experiments"
+E18_PATH = _EXPERIMENTS_DIR / "e1_8_eval.json"
+E18_INGEST_PATH = _EXPERIMENTS_DIR / "e1_8_results.json"
+RESULTS_PATH = _EXPERIMENTS_DIR / "e1_9_retroactive.json"
+
+# Routing thresholds (consumed by assign_tier)
+HIGH_THRESHOLD = 0.70  # coverage at or above this routes to "baseline"
+LOW_THRESHOLD = 0.40  # coverage below this routes to "taxfree"
+
+
+def get_coverage_score(query, group_id="aaron"):
+    """Return a 0.0-1.0 coverage score for `query` from the production graph.
+
+    The score is the number of search hits (the request asks for at most 3)
+    divided by 3 and capped at 1.0: no hits -> 0.0, one hit -> 1/3,
+    two hits -> 2/3, three or more -> 1.0.
+    Hit count is used because Graphiti fulltext search returns score=0 for
+    all hits.
+
+    An empty or whitespace-only query scores 0.0 without contacting the
+    service. Any error during the request or while reading the response is
+    printed and also scored as 0.0.
+    """
+    # Nothing to search for: skip the HTTP round-trip entirely.
+    if not (query and query.strip()):
+        return 0.0
+
+    search_params = {"query": query, "limit": 3, "group_id": group_id}
+    try:
+        response = requests.get(
+            f"{GRAPHITI_URL}/search",
+            params=search_params,
+            timeout=30,
+        )
+        response.raise_for_status()
+        hits = response.json().get("results", [])
+        return min(len(hits) / 3.0, 1.0)
+    except Exception as e:
+        # Best effort: a failed search is reported and counted as no coverage.
+        print(f"    Search error: {e}")
+        return 0.0
+
+
+def assign_tier(coverage_score):
+    """Map a coverage score to a routing tier name.
+
+    Returns "baseline" when the score is at least HIGH_THRESHOLD, "standard"
+    when it is at least LOW_THRESHOLD, and "taxfree" otherwise.
+    """
+    # Checked from the highest floor down; the first floor reached wins.
+    tier_floors = (
+        (HIGH_THRESHOLD, "baseline"),
+        (LOW_THRESHOLD, "standard"),
+    )
+    for floor, tier_name in tier_floors:
+        if coverage_score >= floor:
+            return tier_name
+    return "taxfree"
+
+
+def best_condition_from_e18(record, subsample):
+    """
+    Name the condition with the highest `*_preds` value for one E1.8 record.
+
+    Sub-sample "a" reads prod_preds (baseline), e14_preds (standard cascade)
+    and tf_preds (taxfree). Any other `subsample` value reads base_preds,
+    std_preds and tf_preds. Missing or falsy values count as 0.
+
+    Returns "unknown" when the best value is 0. Ties are resolved in the
+    order taxfree, then standard, then baseline.
+    """
+    if subsample == "a":
+        baseline_key, standard_key = "prod_preds", "e14_preds"
+    else:
+        baseline_key, standard_key = "base_preds", "std_preds"
+
+    baseline_n = record.get(baseline_key) or 0
+    standard_n = record.get(standard_key) or 0
+    taxfree_n = record.get("tf_preds") or 0
+
+    top = max(baseline_n, standard_n, taxfree_n)
+    if top == 0:
+        return "unknown"
+
+    # Tie-break order matters: taxfree is checked before standard.
+    for label, value in (("taxfree", taxfree_n), ("standard", standard_n)):
+        if value == top:
+            return label
+    return "baseline"
+
+def _score_subsample(records, subsample, fr_lookup):
+    """Score one sub-sample, print its table, and return the result rows.
+
+    records:   E1.8 eval records for this sub-sample (dicts with a "name").
+    subsample: "a" or "b"; passed to best_condition_from_e18 and stored.
+    fr_lookup: source name -> frame_relationships text used as the query.
+    """
+    print(f"\nSub-sample {subsample.upper()}")
+    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
+    print("-" * 95)
+
+    rows = []
+    for record in records:
+        name = record["name"]
+        fr = fr_lookup.get(name, "")
+        coverage = get_coverage_score(fr)
+        tier = assign_tier(coverage)
+        actual_best = best_condition_from_e18(record, subsample)
+        matched = tier == actual_best
+        mark = "✓" if matched else "✗"
+        display = name[:48] + ".." if len(name) > 50 else name
+        # The "tier" and "predicted" columns both show the assigned tier.
+        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {mark}")
+        rows.append({
+            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
+            "frame_relationships": fr, "coverage_score": coverage,
+            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
+        })
+    return rows
+
+
+def run():
+    """Run the retroactive validation, print a report, and save the results.
+
+    Reads the E1.8 eval and ingest JSON files, scores each source's
+    frame_relationships text against the graph, compares the routed tier
+    with the best E1.8 condition, and writes everything to RESULTS_PATH.
+    Sources whose best condition is "unknown" are listed but excluded from
+    the validation rate.
+
+    Raises KeyError if the eval file lacks "subsample_a" or "subsample_b",
+    or if a record has no "name".
+    """
+    print("E1.9 Phase 1 — Retroactive validation")
+    print("=" * 60)
+
+    # Read as UTF-8 explicitly so the platform default encoding is not used.
+    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
+    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))
+
+    # Build frame_relationships lookup from ingest results
+    fr_lookup = {}
+    for key in ("subsample_a", "subsample_b"):
+        for item in e18_ingest.get(key, []):
+            meta = item.get("taxfree_metadata", {})
+            if meta:
+                fr_lookup[item["name"]] = meta.get("frame_relationships", "")
+
+    results = []
+    for subsample in ("a", "b"):
+        results.extend(
+            _score_subsample(e18_eval[f"subsample_{subsample}"], subsample, fr_lookup)
+        )
+
+    # Only sources with a known best condition count toward the rate.
+    scored = [r for r in results if r["actual_best"] != "unknown"]
+    total = len(scored)
+    correct = sum(1 for r in scored if r["match"])
+
+    # Summary
+    rate = correct / total * 100 if total > 0 else 0
+    print(f"\n{'=' * 60}")
+    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
+    print()
+    if total == 0:
+        # A 0% rate over zero sources says nothing about the signal.
+        print("? NO DATA — no source had a known best condition")
+        print("  Nothing to validate; check the E1.8 input files.")
+    elif rate >= 70:
+        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
+        print("  Proceed to Phase 2 (new ingestion with routing)")
+    elif rate >= 50:
+        print("~ MARGINAL — adjust thresholds before Phase 2")
+        print("  Review mismatch patterns below")
+    else:
+        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
+        print("  may not be the right signal. Consider active_frames fallback.")
+
+    # Mismatch analysis
+    mismatches = [r for r in scored if not r["match"]]
+    if mismatches:
+        print(f"\nMismatches ({len(mismatches)}):")
+        for r in mismatches:
+            # bucket may be None (record.get), and None rejects a width spec.
+            bucket = "-" if r["bucket"] is None else str(r["bucket"])
+            print(f"  [{bucket:<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")
+
+    # Coverage score distribution
+    scores = [r["coverage_score"] for r in results]
+    print(f"\nCoverage score distribution:")
+    if scores:
+        print(f"  Mean: {sum(scores)/len(scores):.2f}")
+        print(f"  Min:  {min(scores):.2f}")
+        print(f"  Max:  {max(scores):.2f}")
+    else:
+        # Empty input: mean/min/max are undefined, so report that instead.
+        print("  (no sources evaluated)")
+    high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
+    mid  = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
+    low  = sum(1 for s in scores if s < LOW_THRESHOLD)
+    print(f"  Tier distribution: baseline={high} standard={mid} taxfree={low}")
+
+    # Save
+    output = {
+        "validation_rate": rate,
+        "correct": correct,
+        "total": total,
+        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
+        "results": results,
+    }
+    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
+    print(f"\nSaved to {RESULTS_PATH}")
+
+
+# Run the validation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    run()