#!/usr/bin/env python3
"""
E1.9 Phase 1 — Retroactive validation

For each E1.8 source, query the production graph with frame_relationships
to get a coverage score, then check whether the routing tier prediction
matches the actual best-performing condition from E1.8.

No API spend required — uses existing E1.8 data and Graphiti search only.
"""
|
|
|
|
import json, requests
|
|
from pathlib import Path
|
|
|
|
# Base URL of the Graphiti search service queried for coverage scores.
GRAPHITI_URL = "http://localhost:8001"

# The E1.8 inputs and the E1.9 output all live in one experiments directory.
_EXPERIMENTS_DIR = Path.home() / "aaronai" / "experiments"
E18_PATH = _EXPERIMENTS_DIR / "e1_8_eval.json"  # per-source E1.8 evaluation records
E18_INGEST_PATH = _EXPERIMENTS_DIR / "e1_8_results.json"  # E1.8 ingest output (taxfree metadata)
RESULTS_PATH = _EXPERIMENTS_DIR / "e1_9_retroactive.json"  # written by run()

# Routing thresholds
HIGH_THRESHOLD = 0.70  # coverage >= this routes to "baseline"
LOW_THRESHOLD = 0.40  # coverage below this routes to "taxfree" (taxonomy-free)
|
|
|
|
|
|
def get_coverage_score(query, group_id="aaron", limit=3):
    """Return a 0.0-1.0 coverage score for *query* against the production graph.

    The score is the number of search hits divided by ``limit``, capped at
    1.0. With the default ``limit=3``: 0 hits -> 0.0, 1 hit -> 0.33,
    2 hits -> 0.67, 3+ hits -> 1.0. Result count is used because Graphiti
    fulltext search returns score=0 for all hits.

    Args:
        query: Search text. Empty, whitespace-only, or non-string values
            score 0.0 without issuing a request.
        group_id: Graphiti group to search.
        limit: Maximum number of hits requested; also the hit count that
            maps to a score of 1.0.

    Returns:
        A float in [0.0, 1.0]. A failed search is reported on stdout and
        scored as 0.0 so that one bad source does not abort the whole run.

    Raises:
        ValueError: If ``limit`` is less than 1.
    """
    if limit < 1:
        raise ValueError(f"limit must be >= 1, got {limit}")

    # Guard before calling .strip(): metadata may hold None or a non-string.
    if not isinstance(query, str) or not query.strip():
        return 0.0

    try:
        resp = requests.get(
            f"{GRAPHITI_URL}/search",
            params={"query": query, "limit": limit, "group_id": group_id},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable bodies are best-effort: score 0.
        print(f" Search error: {e}")
        return 0.0

    results = payload.get("results") if isinstance(payload, dict) else None
    if not isinstance(results, (list, tuple)):
        # Missing, null, or malformed "results" is treated as no coverage.
        return 0.0
    return min(len(results) / limit, 1.0)
|
|
|
|
|
|
def assign_tier(coverage_score, high_threshold=None, low_threshold=None):
    """Map a coverage score to a routing tier name.

    Args:
        coverage_score: Coverage score to classify.
        high_threshold: Scores at or above this are "baseline". Defaults to
            the module-level ``HIGH_THRESHOLD``.
        low_threshold: Scores at or above this (but below ``high_threshold``)
            are "standard". Defaults to the module-level ``LOW_THRESHOLD``.

    Returns:
        "baseline", "standard", or "taxfree" (anything below ``low_threshold``).
    """
    # Resolve defaults at call time so edits to the module constants are honoured.
    if high_threshold is None:
        high_threshold = HIGH_THRESHOLD
    if low_threshold is None:
        low_threshold = LOW_THRESHOLD

    if coverage_score >= high_threshold:
        return "baseline"
    if coverage_score >= low_threshold:
        return "standard"
    return "taxfree"
|
|
|
|
|
|
def best_condition_from_e18(record, subsample):
    """Determine which condition actually performed best for a source in E1.8.

    Sub-sample A compares prod (baseline), e14 (standard cascade) and
    tf (taxfree); sub-sample B compares base, std and tf.

    Args:
        record: E1.8 evaluation record. Missing or null ``*_preds`` values
            count as 0.
        subsample: "a" (case-insensitive) selects the sub-sample A keys; any
            other value selects the sub-sample B keys.

    Returns:
        "taxfree", "standard" or "baseline" for the highest-scoring
        condition, or "unknown" when the highest score is 0. Ties are
        resolved in favour of taxfree, then standard, then baseline.
    """
    if str(subsample).strip().lower() == "a":
        baseline_key, standard_key = "prod_preds", "e14_preds"
    else:
        baseline_key, standard_key = "base_preds", "std_preds"

    # Tuple order encodes the tie-break priority: first match on the max wins.
    scores = (
        ("taxfree", record.get("tf_preds") or 0),
        ("standard", record.get(standard_key) or 0),
        ("baseline", record.get(baseline_key) or 0),
    )
    best_score = max(score for _, score in scores)
    if best_score == 0:
        return "unknown"
    return next(label for label, score in scores if score == best_score)
|
|
|
|
|
|
def _build_fr_lookup(e18_ingest):
    """Map each source name to its frame_relationships text from the E1.8 ingest data."""
    fr_lookup = {}
    for key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                fr_lookup[item["name"]] = meta.get("frame_relationships", "")
    return fr_lookup


def _validate_subsample(records, subsample, fr_lookup):
    """Score every record of one sub-sample, print its table, and return the result rows."""
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)

    rows = []
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        # Rows whose actual best is "unknown" show ✗ here but are excluded
        # from the validation rate computed by run().
        marker = "✓" if matched else "✗"
        display = name[:48] + ".." if len(name) > 50 else name
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {marker}")
        rows.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return rows


def run():
    """Run the retroactive validation and write the results JSON.

    Reads the E1.8 evaluation and ingest files, scores each source's
    frame_relationships with ``get_coverage_score``, compares the tier from
    ``assign_tier`` with the best E1.8 condition, prints per-source tables,
    a validation-rate verdict, the mismatches and the score distribution,
    and saves everything to ``RESULTS_PATH``.

    Sources whose best E1.8 condition is "unknown" are listed but excluded
    from the validation rate.
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)

    e18_eval = json.loads(E18_PATH.read_text(encoding="utf-8"))
    e18_ingest = json.loads(E18_INGEST_PATH.read_text(encoding="utf-8"))

    # Build frame_relationships lookup from ingest results
    fr_lookup = _build_fr_lookup(e18_ingest)

    results = []
    for subsample in ("a", "b"):
        records = e18_eval[f"subsample_{subsample}"]
        results.extend(_validate_subsample(records, subsample, fr_lookup))

    # Summary
    scored = [r for r in results if r["actual_best"] != "unknown"]
    total = len(scored)
    correct = sum(1 for r in scored if r["match"])
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in scored if not r["match"]]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # A missing bucket is None, which cannot take a "<8" format spec.
            bucket = str(r["bucket"] or "-")
            print(f" [{bucket:<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
    else:
        # Empty inputs: mean/min/max are undefined, so say so instead of crashing.
        print(" No sources evaluated")
    # Count the tiers actually assigned rather than re-deriving them from thresholds.
    high = sum(1 for r in results if r["predicted_tier"] == "baseline")
    mid = sum(1 for r in results if r["predicted_tier"] == "standard")
    low = sum(1 for r in results if r["predicted_tier"] == "taxfree")
    print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2), encoding="utf-8")
    print(f"\nSaved to {RESULTS_PATH}")
|
|
|
|
|
|
# Run the validation only when executed as a script, not on import.
if __name__ == "__main__":
    run()
|