# aaronAI/scripts/experiments/e1_9_retroactive.py

#!/usr/bin/env python3
"""
E1.9 Phase 1 — Retroactive validation
For each E1.8 source, query the production graph with frame_relationships
to get a coverage score, then check whether the routing tier prediction
matches the actual best-performing condition from E1.8.
No API spend required — uses existing E1.8 data and Graphiti search only.
"""

import json, requests
from pathlib import Path

# Base URL of the Graphiti service; get_coverage_score() issues GET {GRAPHITI_URL}/search.
GRAPHITI_URL = "http://localhost:8001"
# E1.8 evaluation records (the per-source *_preds fields), read by run().
E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
# E1.8 ingest results; run() takes taxfree_metadata.frame_relationships from each item.
E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
# Destination of the JSON report written at the end of run().
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"

# Routing thresholds applied by assign_tier() to a coverage score:
#   score >= HIGH_THRESHOLD                  -> "baseline"
#   LOW_THRESHOLD <= score < HIGH_THRESHOLD  -> "standard"
#   score <  LOW_THRESHOLD                   -> "taxfree"
HIGH_THRESHOLD = 0.70   # at or above this: route to baseline
LOW_THRESHOLD  = 0.40   # below this: route to taxonomy-free


def get_coverage_score(query, group_id="aaron"):
    """Return a 0.0-1.0 coverage score for *query* against the production graph.

    The score is the number of search hits divided by the requested limit (3),
    capped at 1.0: 0 hits -> 0.0, 1 hit -> ~0.33, 2 hits -> ~0.67, 3+ hits -> 1.0.
    The hit count is used rather than a relevance score because, per the
    original author's note, Graphiti fulltext search returns score=0 for
    every hit.

    Args:
        query: Search text; an empty, whitespace-only or None query scores 0.0
            without contacting the service.
        group_id: Graphiti group to search in.

    Returns:
        The coverage score as a float. Request failures, HTTP error statuses,
        undecodable JSON and unexpected response shapes are reported on stdout
        and scored as 0.0, so one bad lookup does not abort a whole run.
    """
    # Single source of truth: the request limit and the divisor must agree.
    max_hits = 3

    if not query or not query.strip():
        return 0.0

    # Keep the try body to the calls that can legitimately fail at runtime;
    # programming errors elsewhere should surface instead of scoring 0.0.
    try:
        resp = requests.get(
            f"{GRAPHITI_URL}/search",
            params={"query": query, "limit": max_hits, "group_id": group_id},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode errors on requests versions where the
        # decode error is not a RequestException subclass.
        print(f"    Search error: {e}")
        return 0.0

    results = payload.get("results", []) if isinstance(payload, dict) else None
    if not isinstance(results, list):
        # e.g. a non-object payload or `"results": null`
        print(f"    Search error: unexpected response shape ({type(payload).__name__})")
        return 0.0

    return min(len(results) / max_hits, 1.0)


def assign_tier(coverage_score):
    """Map a coverage score to a routing tier name.

    Returns "baseline" when the score is at least HIGH_THRESHOLD, "standard"
    when it is at least LOW_THRESHOLD, and "taxfree" otherwise.
    """
    # Checked from the highest floor down; the first floor reached wins.
    floors = (
        (HIGH_THRESHOLD, "baseline"),
        (LOW_THRESHOLD, "standard"),
    )
    for floor, tier_name in floors:
        if coverage_score >= floor:
            return tier_name
    return "taxfree"


def best_condition_from_e18(record, subsample):
    """Name the condition with the highest E1.8 score for one source.

    For sub-sample "a" the compared fields are prod_preds (baseline),
    e14_preds (standard cascade) and tf_preds (taxfree); for any other
    sub-sample value they are base_preds, std_preds and tf_preds.
    Missing or falsy fields count as 0.

    Returns "taxfree", "standard" or "baseline", or "unknown" when the
    highest score is 0. Ties are resolved in favour of taxfree, then
    standard, then baseline.
    """
    if subsample == "a":
        baseline_key, standard_key = "prod_preds", "e14_preds"
    else:
        baseline_key, standard_key = "base_preds", "std_preds"

    baseline_score = record.get(baseline_key) or 0
    standard_score = record.get(standard_key) or 0
    taxfree_score = record.get("tf_preds") or 0

    top = max(baseline_score, standard_score, taxfree_score)
    if top == 0:
        return "unknown"

    # Order encodes the tie-break; baseline is what remains.
    for label, score in (("taxfree", taxfree_score), ("standard", standard_score)):
        if score == top:
            return label
    return "baseline"


def _build_fr_lookup(e18_ingest):
    """Return {source name: frame_relationships} from the E1.8 ingest results."""
    lookup = {}
    # Sub-sample A is scanned first, so a name present in both keeps B's value.
    for key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                lookup[item["name"]] = meta.get("frame_relationships", "")
    return lookup


def _evaluate_subsample(records, subsample, fr_lookup):
    """Score, route and print each record of one sub-sample; return its result dicts."""
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)

    rows = []
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        mark = "✓" if matched else "✗"
        display = name[:48] + ".." if len(name) > 50 else name
        # The routed tier is printed under both the "tier" and "predicted" headers.
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {mark}")
        rows.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return rows


def run():
    """Run the retroactive validation and write the JSON report.

    Loads the E1.8 evaluation and ingest files, computes a coverage score and
    routing tier for every source in sub-samples A and B, compares the tier
    with the best-performing E1.8 condition, prints a per-source table, a
    verdict, the mismatches and the score distribution, and saves everything
    to RESULTS_PATH.

    Sources whose best E1.8 condition is "unknown" are listed in the table
    but excluded from the validation rate.

    Raises:
        OSError: If an input file cannot be read or the report cannot be written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If the evaluation file lacks "subsample_a"/"subsample_b",
            or a record lacks "name".
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)

    # JSON interchange is UTF-8; do not depend on the platform locale.
    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))

    fr_lookup = _build_fr_lookup(e18_ingest)

    results = []
    for subsample in ("a", "b"):
        records = e18_eval[f"subsample_{subsample}"]
        results.extend(_evaluate_subsample(records, subsample, fr_lookup))

    # Only sources with a known best condition count towards the rate.
    scored = [r for r in results if r["actual_best"] != "unknown"]
    total = len(scored)
    correct = sum(1 for r in scored if r["match"])

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print("  Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print("  Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print("  may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in scored if not r["match"]]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # bucket comes from record.get() and may be None, which cannot
            # take a width format spec directly; str() keeps strings as-is.
            print(f"  [{str(r['bucket']):<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f"  Mean: {sum(scores)/len(scores):.2f}")
        print(f"  Min:  {min(scores):.2f}")
        print(f"  Max:  {max(scores):.2f}")
        high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
        mid  = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
        low  = sum(1 for s in scores if s < LOW_THRESHOLD)
        print(f"  Tier distribution: baseline={high} standard={mid} taxfree={low}")
    else:
        # Both sub-samples empty: mean/min/max are undefined.
        print("  (no sources evaluated)")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"\nSaved to {RESULTS_PATH}")


# Execute the validation only when run as a script, not when imported.
if __name__ == "__main__":
    run()