#!/usr/bin/env python3
"""
E1.9 Phase 1 — Retroactive validation

For each E1.8 source, query the production graph with frame_relationships to
get a coverage score, then check whether the routing tier prediction matches
the actual best-performing condition from E1.8.

No API spend required — uses existing E1.8 data and Graphiti search only.
"""
import json
from pathlib import Path

import requests

GRAPHITI_URL = "http://localhost:8001"
E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"

# Routing thresholds
HIGH_THRESHOLD = 0.70  # at or above -> "baseline"
LOW_THRESHOLD = 0.40   # below -> "taxfree" (taxonomy-free); in between -> "standard"

# E1.8 prediction-count keys per sub-sample, ordered (baseline, standard, taxfree).
_SUBSAMPLE_A_KEYS = ("prod_preds", "e14_preds", "tf_preds")
_SUBSAMPLE_B_KEYS = ("base_preds", "std_preds", "tf_preds")

_TABLE_HEADER = (
    f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}"
)


def get_coverage_score(query: str, group_id: str = "aaron") -> float:
    """Query the production graph and return a coverage score from the hit count.

    Score: 0 = no results, 0.33 = 1 result, 0.66 = 2 results, 1.0 = 3+ results.
    Uses result count because Graphiti fulltext search returns score=0 for all hits.

    Args:
        query: Search text (the source's frame_relationships). Empty, blank or
            None yields 0.0 without issuing a request.
        group_id: Graph group passed through to the search endpoint.

    Returns:
        A float in [0.0, 1.0]. Any search failure is reported on stdout and
        scored as 0.0, so a failed search is indistinguishable from "no
        coverage" in the results.
    """
    if not query or not query.strip():
        return 0.0
    try:
        resp = requests.get(
            f"{GRAPHITI_URL}/search",
            params={"query": query, "limit": 3, "group_id": group_id},
            timeout=30,
        )
        resp.raise_for_status()
        results = resp.json().get("results", [])
        return min(len(results) / 3.0, 1.0)
    except Exception as e:
        # Deliberate best-effort boundary: one failed search (network error,
        # bad status, unexpected payload) must not abort the validation run.
        print(f" Search error: {e}")
        return 0.0


def assign_tier(coverage_score: float) -> str:
    """Map a coverage score to a routing tier.

    Returns "baseline" at or above HIGH_THRESHOLD, "standard" at or above
    LOW_THRESHOLD, and "taxfree" otherwise.
    """
    if coverage_score >= HIGH_THRESHOLD:
        return "baseline"
    if coverage_score >= LOW_THRESHOLD:
        return "standard"
    return "taxfree"


def best_condition_from_e18(record: dict, subsample: str) -> str:
    """Determine which condition actually performed best for this source in E1.8.

    Sub-sample A: compare prod (baseline), e14 (standard cascade), tf (taxfree)
    Sub-sample B: compare base, std, tf

    Args:
        record: One E1.8 evaluation record; missing or falsy counts are
            treated as 0.
        subsample: "a" selects the sub-sample A keys; any other value selects
            the sub-sample B keys.

    Returns:
        "taxfree", "standard" or "baseline" for the highest count, or
        "unknown" when the highest count is 0. Ties are resolved in the order
        taxfree, then standard, then baseline.
    """
    keys = _SUBSAMPLE_A_KEYS if subsample == "a" else _SUBSAMPLE_B_KEYS
    baseline, standard, taxfree = (record.get(key) or 0 for key in keys)

    best_score = max(baseline, standard, taxfree)
    if best_score == 0:
        return "unknown"
    if taxfree == best_score:
        return "taxfree"
    if standard == best_score:
        return "standard"
    return "baseline"


def _build_frame_lookup(e18_ingest: dict) -> dict:
    """Map source name -> frame_relationships text from the E1.8 ingest results."""
    lookup = {}
    # Sub-sample B is processed second, so it wins if a name appears in both.
    for subsample_key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(subsample_key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                lookup[item["name"]] = meta.get("frame_relationships", "")
    return lookup


def _validate_subsample(records: list, subsample: str, fr_lookup: dict) -> list:
    """Score every record of one sub-sample, print its table, return result rows."""
    print(f"\nSub-sample {subsample.upper()}")
    print(_TABLE_HEADER)
    print("-" * 95)

    rows = []
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        mark = "✓" if matched else "✗"

        display = name[:48] + ".." if len(name) > 50 else name
        # The "tier" and "predicted" columns both show the assigned tier.
        print(
            f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} "
            f"{actual_best:<10} {mark}"
        )

        rows.append({
            "name": name,
            "subsample": subsample,
            "bucket": record.get("bucket"),
            "frame_relationships": fr,
            "coverage_score": coverage,
            "predicted_tier": tier,
            "actual_best": actual_best,
            "match": matched,
        })
    return rows


def run():
    """Run the retroactive validation, print the report and save it as JSON.

    Reads E18_PATH and E18_INGEST_PATH, queries the Graphiti search endpoint
    once per source with non-empty frame_relationships, and writes the outcome
    to RESULTS_PATH. Sources whose actual best condition is "unknown" are
    listed in the results but excluded from the validation rate.
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)

    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))

    # Build frame_relationships lookup from ingest results
    fr_lookup = _build_frame_lookup(e18_ingest)

    results = []
    for subsample in ("a", "b"):
        results.extend(
            _validate_subsample(e18_eval[f"subsample_{subsample}"], subsample, fr_lookup)
        )

    # Only sources with a known best condition count toward the rate.
    scored = [r for r in results if r["actual_best"] != "unknown"]
    total = len(scored)
    correct = sum(1 for r in scored if r["match"])

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()

    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in scored if not r["match"]]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # bucket may be None (record.get); None rejects the '<8' format
            # spec with a TypeError, so stringify it first.
            bucket = str(r["bucket"])
            print(
                f" [{bucket:<8}] cov={r['coverage_score']:.2f} "
                f"predicted={r['predicted_tier']} actual={r['actual_best']} "
                f"| {r['name'][:50]}"
            )

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
    else:
        # Empty evaluation file: mean/min/max are undefined, so say so
        # instead of raising.
        print(" (no sources evaluated)")
    high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
    mid = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
    low = sum(1 for s in scores if s < LOW_THRESHOLD)
    print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"\nSaved to {RESULTS_PATH}")


if __name__ == "__main__":
    run()