add experiment scripts and results; watcher.py latest changes

2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""
+E1.8 Phase 2 — Evaluate
+Pulls predicate counts from FalkorDB for each group_id and compares.
+Run after e1_8_taxfree_cascade.py completes.
+"""
+
+import json, subprocess
+from pathlib import Path
+
+# RESULTS_PATH is the JSON file run() reads; EVAL_PATH is the JSON file run() writes.
+RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
+EVAL_PATH    = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
+
+# Graph names passed as the GRAPH.QUERY key by query(), one per compared condition.
+GROUP_TAXFREE  = "aaron_e18_taxfree"
+GROUP_BASELINE = "aaron_e18_baseline"
+GROUP_STANDARD = "aaron_e18_standard"
+GROUP_PROD     = "aaron"
+GROUP_E14      = "aaron_cascade_e14"
+
+
+def query(group_id, cypher):
+    """Run a Cypher query through redis-cli inside the "falkordb" container.
+
+    Args:
+        group_id: Graph name passed as the GRAPH.QUERY key.
+        cypher: Cypher query text.
+
+    Returns:
+        The command's captured stdout as text (empty if the command failed).
+    """
+    result = subprocess.run(
+        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
+        capture_output=True, text=True
+    )
+    # subprocess.run does not check the exit status by default. Without this
+    # warning a failed command is indistinguishable from an empty result,
+    # which callers turn into "episode not found" or a count of 0.
+    if result.returncode != 0:
+        print(f"  WARNING: GRAPH.QUERY on {group_id} exited {result.returncode}: {result.stderr.strip()}")
+    return result.stdout
+
+
+def get_episode_uuid(group_id, episode_name):
+    """Return the uuid of the Episodic node named *episode_name*, or None.
+
+    Args:
+        group_id: Graph to search.
+        episode_name: Exact value of the node's ``name`` property.
+
+    Returns:
+        The first output line that looks like a UUID (36 characters with
+        four dashes), or None when no such line is present.
+    """
+    # Escape backslashes first, then single quotes, so the name cannot end
+    # the Cypher string literal early. The previous "\'" literal is just "'"
+    # in Python, which made the replace a no-op.
+    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
+    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
+    output = query(group_id, cypher)
+    for line in output.split("\n"):
+        line = line.strip()
+        if len(line) == 36 and line.count("-") == 4:
+            return line
+    return None
+
+
+def count_preds(group_id, uuid):
+    """Count distinct RELATES_TO edge names whose ``episodes`` list contains *uuid*.
+
+    Returns the first all-digit line of the query output as an int, or 0 when
+    the output contains no such line.
+    """
+    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
+    tokens = (raw.strip() for raw in query(group_id, cypher).split("\n"))
+    return next((int(token) for token in tokens if token.isdigit()), 0)
+
+
+def count_edges(group_id, uuid):
+    """Count RELATES_TO edges whose ``episodes`` list contains *uuid*.
+
+    Returns the first all-digit line of the query output as an int, or 0 when
+    the output contains no such line.
+    """
+    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
+    tokens = (raw.strip() for raw in query(group_id, cypher).split("\n"))
+    return next((int(token) for token in tokens if token.isdigit()), 0)
+
+
+def eval_source(name, groups):
+    """Collect predicate and edge counts for one source across several graphs.
+
+    Args:
+        name: Episode name to look up in every graph.
+        groups: Mapping of short label -> graph name.
+
+    Returns:
+        A dict with "name" plus "<label>_preds" and "<label>_edges" for each
+        label; both are None when the episode is not found in that graph.
+    """
+    record = {"name": name}
+    for label, group_id in groups.items():
+        episode_uuid = get_episode_uuid(group_id, name)
+        found = bool(episode_uuid)
+        record[f"{label}_preds"] = count_preds(group_id, episode_uuid) if found else None
+        record[f"{label}_edges"] = count_edges(group_id, episode_uuid) if found else None
+    return record
+
+
+def run():
+    """Evaluate E1.8 and save per-source predicate and edge counts.
+
+    Reads the records in RESULTS_PATH, looks up each source's counts with
+    eval_source(), prints a per-source table and aggregate deltas for
+    Sub-samples A and B, prints the decision rule, and writes the collected
+    records to EVAL_PATH as JSON.
+
+    The Sub-sample A and Sub-sample B aggregates live in separate variables:
+    the decision rule compares taxonomy-free against the E1.4 cascade on
+    Sub-sample A, so the B means must not overwrite the A means.
+    """
+    print("E1.8 — Evaluation phase")
+    print("=" * 60)
+
+    results = json.loads(RESULTS_PATH.read_text())
+    eval_results = {"subsample_a": [], "subsample_b": []}
+
+    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
+    print("\nSub-sample A")
+    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
+    print("-" * 90)
+
+    a_records = eval_results["subsample_a"]
+    for item in results["subsample_a"]:
+        name = item["name"]
+        r = eval_source(name, {
+            "prod": GROUP_PROD,
+            "e14": GROUP_E14,
+            "tf": GROUP_TAXFREE,
+        })
+        r["bucket"] = item["bucket"]
+        r["taxfree_metadata"] = item.get("taxfree_metadata")
+        r["e14_delta_preds"] = item.get("e14_delta_preds")
+
+        # Missing counts (None) are displayed and computed as 0.
+        prod = r.get("prod_preds") or 0
+        e14 = r.get("e14_preds") or 0
+        tf = r.get("tf_preds") or 0
+        e14_delta = ((e14 - prod) / prod * 100) if prod > 0 else 0
+        tf_delta  = ((tf  - prod) / prod * 100) if prod > 0 else 0
+
+        display = name[:53] + ".." if len(name) > 55 else name
+        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")
+
+        r["tf_delta_vs_prod"] = tf_delta
+        r["e14_delta_vs_prod"] = e14_delta
+        a_records.append(r)
+
+    # Aggregate Sub-sample A
+    # NOTE(review): a source with prod and tf counts but no e14 count still
+    # enters the mean with an E1.4 delta of -100% — confirm this is intended.
+    valid_a = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
+    a_mean_e14_delta = None
+    a_mean_tf_delta = None
+    if valid_a:
+        a_mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid_a) / len(valid_a)
+        a_mean_tf_delta  = sum(r["tf_delta_vs_prod"]  for r in valid_a) / len(valid_a)
+        print(f"\nAggregate Sub-sample A (n={len(valid_a)}):")
+        print(f"  E1.4 cascade mean delta vs prod: {a_mean_e14_delta:+.1f}%")
+        print(f"  Taxonomy-free mean delta vs prod: {a_mean_tf_delta:+.1f}%")
+        print(f"  Taxonomy-free vs E1.4 cascade: {a_mean_tf_delta - a_mean_e14_delta:+.1f}pp")
+
+    # Sub-sample B — all three conditions
+    print("\n\nSub-sample B")
+    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
+    print("-" * 90)
+
+    b_records = eval_results["subsample_b"]
+    for item in results["subsample_b"]:
+        name = item["name"]
+        r = eval_source(name, {
+            "base": GROUP_BASELINE,
+            "std":  GROUP_STANDARD,
+            "tf":   GROUP_TAXFREE,
+        })
+        r["bucket"] = item["bucket"]
+        r["taxfree_metadata"] = item.get("taxfree_metadata")
+        r["standard_metadata"] = item.get("standard_metadata")
+
+        base = r.get("base_preds") or 0
+        std  = r.get("std_preds")  or 0
+        tf   = r.get("tf_preds")   or 0
+        std_delta = ((std - base) / base * 100) if base > 0 else 0
+        tf_delta  = ((tf  - base) / base * 100) if base > 0 else 0
+
+        display = name[:53] + ".." if len(name) > 55 else name
+        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")
+
+        r["std_delta_vs_base"] = std_delta
+        r["tf_delta_vs_base"]  = tf_delta
+        b_records.append(r)
+
+    # Aggregate Sub-sample B
+    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
+    if valid_b:
+        b_mean_std_delta = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
+        b_mean_tf_delta  = sum(r["tf_delta_vs_base"]  for r in valid_b) / len(valid_b)
+        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
+        print(f"  Standard cascade mean delta vs baseline: {b_mean_std_delta:+.1f}%")
+        print(f"  Taxonomy-free mean delta vs baseline:    {b_mean_tf_delta:+.1f}%")
+
+        # By bucket
+        print("\nPer-bucket (Sub-sample B):")
+        for bucket in ["high", "mid", "document"]:
+            br = [r for r in valid_b if r["bucket"] == bucket]
+            if not br:
+                continue
+            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
+            m_tf  = sum(r["tf_delta_vs_base"]  for r in br) / len(br)
+            print(f"  [{bucket:>8}] n={len(br)}  std={m_std:+.0f}%  tf={m_tf:+.0f}%")
+
+    # Decision rule evaluation — uses the Sub-sample A means only.
+    print("\n" + "=" * 60)
+    print("DECISION RULE:")
+    if valid_a:
+        improvement = a_mean_tf_delta - a_mean_e14_delta
+        if improvement >= 20:
+            print(f"  ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
+        elif improvement >= 5:
+            print(f"  ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
+        elif improvement >= 0:
+            print(f"  ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
+        else:
+            print(f"  ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")
+
+    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
+    print(f"\nEval saved to {EVAL_PATH}")
+
+
+# Run the evaluation only when this file is executed as a script.
+if __name__ == "__main__":
+    run()
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""
+E1.8 Phase 1 — Ingest
+Runs taxonomy-free and standard cascade ingestion for Sub-samples A and B.
+Run this first, then run e1_8_eval.py to pull predicate counts.
+"""
+
+import os, json, time, psycopg2, requests
+from pathlib import Path
+from dotenv import load_dotenv
+
+load_dotenv(Path.home() / "aaronai" / ".env", override=True)
+
+# PostgreSQL connection string from the environment; None when PG_DSN is unset.
+PG_DSN = os.getenv("PG_DSN")
+# Base URL that ingest() posts episodes to.
+GRAPHITI_URL = "http://localhost:8001"
+# Progress file: rewritten by save() after each source and re-read by run() to resume.
+RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
+
+# group_id values sent with each ingested episode, one per experimental condition.
+GROUP_TAXFREE  = "aaron_e18_taxfree"
+GROUP_BASELINE = "aaron_e18_baseline"
+GROUP_STANDARD = "aaron_e18_standard"
+
+# Prompt prefix for the taxonomy-free condition; run_mistral() appends the
+# document text directly after it. The four schema keys are the ones
+# build_taxfree_orientation() reads from the model's reply.
+TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
+
+Do not summarize content. Do not extract entities. Do not assign a single category label.
+
+Instead, describe:
+- What domains or frames are active in this content (there may be several simultaneously)
+- How those frames relate to each other in this specific document
+- What kind of relational content a knowledge graph extractor should look for
+
+Output JSON only. No prose, no explanation, no markdown.
+
+Schema:
+{
+  "active_frames": ["<frame 1>", "<frame 2>", ...],
+  "frame_relationships": "<one sentence describing how the frames interact in this document>",
+  "extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
+  "one_sentence_summary": "<one sentence describing what the document is about>"
+}
+
+Document:
+"""
+
+# Prompt prefix for the standard-cascade condition; run_mistral() appends the
+# document text directly after it. build_standard_orientation() reads only
+# domain_class, primary_format, one_sentence_summary and two content_signals
+# flags from the reply; the remaining schema fields are stored but not used
+# to build the orientation.
+STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
+
+Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
+
+Output JSON only. No prose, no explanation, no markdown code fences.
+
+Schema:
+{
+  "language": "<ISO 639-1 code>",
+  "char_length": <integer>,
+  "primary_format": "<prose|slides|code|structured|mixed>",
+  "structural_signals": {
+    "has_headings": <boolean>,
+    "has_bullet_lists": <boolean>,
+    "has_numbered_lists": <boolean>,
+    "has_tables": <boolean>,
+    "has_code_blocks": <boolean>,
+    "has_dates": <boolean>
+  },
+  "content_signals": {
+    "has_named_people": <boolean>,
+    "has_institutional_language": <boolean>,
+    "has_technical_terminology": <boolean>,
+    "has_first_person": <boolean>,
+    "has_quotations": <boolean>
+  },
+  "domain_class": "<technical|administrative|educational|personal|conversational>",
+  "one_sentence_summary": "<one sentence describing what the document is about>"
+}
+
+Document:
+"""
+
+# Sources ingested under the taxonomy-free condition only. "name" is used both
+# as the embeddings.source lookup key and as the episode name; "bucket" is
+# copied into the results file unchanged.
+SUBSAMPLE_A = [
+    {"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
+    {"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
+    {"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
+    {"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
+    {"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
+    {"name": "Claude: Research Statement Restructure", "bucket": "mid"},
+    {"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
+    {"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
+    {"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
+    {"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
+    {"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
+    {"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
+    {"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
+]
+
+# Sources ingested under all three conditions (baseline, standard, taxonomy-free).
+# Same entry shape as SUBSAMPLE_A.
+SUBSAMPLE_B = [
+    {"name": "ChatGPT: Job application comparison", "bucket": "high"},
+    {"name": "ChatGPT: External review for tenure", "bucket": "high"},
+    {"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
+    {"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
+    {"name": "ChatGPT: Analyze business plan", "bucket": "high"},
+    {"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
+    {"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
+    {"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
+    {"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
+    {"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
+    {"name": "NO thesis proposal.pdf", "bucket": "document"},
+    {"name": "PWM.pdf", "bucket": "document"},
+    {"name": "Will_It_Print.pdf", "bucket": "document"},
+    {"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
+    {"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
+]
+
+
+def get_pg():
+    """Open and return a new psycopg2 connection using PG_DSN."""
+    connection = psycopg2.connect(PG_DSN)
+    return connection
+
+
+def get_document_text(source_name):
+    """Return up to 12000 characters of stored text for *source_name*.
+
+    Fetches the first 20 rows of ``embeddings`` for the source (ordered by
+    id), joins their ``document`` values with single spaces, and truncates
+    the result. Returns an empty string when the source has no rows.
+    """
+    pg = get_pg()
+    try:
+        cur = pg.cursor()
+        try:
+            cur.execute("SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20", (source_name,))
+            rows = cur.fetchall()
+        finally:
+            cur.close()
+    finally:
+        # Close the connection even when the query raises, so a failed
+        # lookup does not leak a connection per source.
+        pg.close()
+    # NULL documents would make str.join raise; skip them.
+    return " ".join(r[0] for r in rows if r[0] is not None)[:12000]
+
+
+def run_mistral(prompt_prefix, doc_text, label=""):
+    """Ask the local model for JSON metadata about a document.
+
+    Posts ``prompt_prefix + doc_text`` to the generate endpoint on
+    localhost:11434 (presumably an Ollama server — confirm) with JSON output
+    requested, and parses the reply.
+
+    Args:
+        prompt_prefix: Instruction text placed before the document.
+        doc_text: Document text appended to the prompt.
+        label: Short tag used only in progress messages.
+
+    Returns:
+        The parsed JSON object as a dict. If the reply cannot be parsed, or
+        parses to something other than an object, a dict with an "error" key
+        and the first 200 characters of the raw reply under "raw".
+
+    Raises:
+        requests.HTTPError: If the server responds with an error status.
+    """
+    print(f"    → Mistral {label} running...", flush=True)
+    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
+    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
+    resp.raise_for_status()
+    raw = resp.json().get("response", "{}")
+    print(f"    → Mistral {label} done ({len(raw)} chars)", flush=True)
+    try:
+        parsed = json.loads(raw)
+    except (ValueError, TypeError):
+        return {"error": "parse_failed", "raw": raw[:200]}
+    # Callers use .get() on the result, so valid JSON that is not an object
+    # (a list, string or number) is reported as an error instead of returned.
+    if not isinstance(parsed, dict):
+        return {"error": "not_an_object", "raw": raw[:200]}
+    return parsed
+
+
+def build_taxfree_orientation(meta):
+    """Format taxonomy-free metadata as a one-line orientation string.
+
+    Args:
+        meta: Dict parsed from the model reply. Missing or null fields are
+            rendered as empty text.
+
+    Returns:
+        "Active frames: ... Frame relationships: ... Extraction focus: ...
+        Summary: ..." built from the corresponding metadata fields.
+    """
+    raw_frames = meta.get("active_frames")
+    if isinstance(raw_frames, str):
+        # A bare string would otherwise be joined character by character.
+        raw_frames = [raw_frames]
+    elif not isinstance(raw_frames, (list, tuple)):
+        # null or any other non-list value: treat as no frames.
+        raw_frames = []
+    # str() so a non-string item does not make join() raise.
+    frames = ", ".join(str(frame) for frame in raw_frames)
+    rel = meta.get("frame_relationships") or ""
+    orient = meta.get("extraction_orientation") or ""
+    summary = meta.get("one_sentence_summary") or ""
+    return f"Active frames: {frames}. Frame relationships: {rel} Extraction focus: {orient} Summary: {summary}"
+
+
+def build_standard_orientation(meta):
+    """Format standard-cascade metadata as a multi-line orientation string.
+
+    Args:
+        meta: Dict parsed from the model reply.
+
+    Returns:
+        Five "key: value" lines: domain_class, primary_format,
+        one_sentence_summary, has_named_people and has_technical_terminology.
+        Missing fields fall back to "unknown", an empty summary, or False.
+    """
+    dc = meta.get("domain_class", "unknown")
+    pf = meta.get("primary_format", "unknown")
+    summary = meta.get("one_sentence_summary", "")
+    cs = meta.get("content_signals")
+    if not isinstance(cs, dict):
+        # The model may return null or a scalar here; treat it as no signals
+        # instead of failing on .get().
+        cs = {}
+    return (f"domain_class: {dc}\nprimary_format: {pf}\none_sentence_summary: {summary}\n"
+            f"has_named_people: {cs.get('has_named_people', False)}\n"
+            f"has_technical_terminology: {cs.get('has_technical_terminology', False)}")
+
+
+def ingest(source_name, doc_text, orientation, group_id):
+    """POST a single episode to the bulk-episode endpoint under *group_id*.
+
+    The content is capped at 12000 characters, *orientation* is sent as the
+    episode's source_description, and the timestamp is a fixed value.
+
+    Raises:
+        requests.HTTPError: If the endpoint responds with an error status.
+    """
+    episode = {
+        "name": source_name,
+        "content": doc_text[:12000],
+        "source_description": orientation,
+        "timestamp": "2026-04-28T00:00:00",
+    }
+    response = requests.post(
+        f"{GRAPHITI_URL}/episodes/bulk",
+        json={"episodes": [episode], "group_id": group_id},
+        timeout=300,
+    )
+    response.raise_for_status()
+
+
+def save(results):
+    """Write *results* to RESULTS_PATH as indented JSON.
+
+    The data is written to a sibling temporary file and then renamed over
+    RESULTS_PATH, so an interrupted run cannot leave a truncated file that
+    the resume logic would later fail to parse.
+    """
+    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
+    tmp_path.write_text(json.dumps(results, indent=2))
+    tmp_path.replace(RESULTS_PATH)
+
+
+def run():
+    """Ingest Sub-samples A and B and record per-source metadata.
+
+    Sub-sample A sources are ingested once, into GROUP_TAXFREE. Sub-sample B
+    sources are ingested three times: with an empty orientation into
+    GROUP_BASELINE, with the standard orientation into GROUP_STANDARD, and
+    with the taxonomy-free orientation into GROUP_TAXFREE. After each source
+    the accumulated results are saved, and sources already present in
+    RESULTS_PATH are skipped on the next run.
+    """
+    print("E1.8 — Ingest phase")
+    print("=" * 60)
+
+    # Load existing results if resuming
+    if RESULTS_PATH.exists():
+        results = json.loads(RESULTS_PATH.read_text())
+        done_a = {r["name"] for r in results.get("subsample_a", [])}
+        done_b = {r["name"] for r in results.get("subsample_b", [])}
+        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
+    else:
+        results = {"subsample_a": [], "subsample_b": []}
+        done_a, done_b = set(), set()
+
+    # E1.4 per-source numbers, indexed by name, copied into the Sub-sample A records.
+    # The file must exist; a missing file raises before any ingestion starts.
+    e14_data = json.loads((Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json").read_text())
+    e14_by_name = {s["name"]: s for s in e14_data}
+
+    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
+    print("\nSub-sample A — taxonomy-free ingestion only")
+    for item in SUBSAMPLE_A:
+        name = item["name"]
+        if name in done_a:
+            print(f"  SKIP (done): {name}")
+            continue
+        print(f"\n  {name}")
+        doc_text = get_document_text(name)
+        if not doc_text:
+            # Sources without text are not recorded, so they are retried on every run.
+            print(f"  SKIP — no text")
+            continue
+
+        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
+        print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
+        orientation = build_taxfree_orientation(tf_meta)
+
+        try:
+            ingest(name, doc_text, orientation, GROUP_TAXFREE)
+            time.sleep(3)
+            print(f"  ingested to {GROUP_TAXFREE}")
+        except Exception as e:
+            # A failed ingest is not recorded, so this source is retried on resume.
+            print(f"  ingest failed: {e}")
+            continue
+
+        e14 = e14_by_name.get(name, {})
+        results["subsample_a"].append({
+            "name": name,
+            "bucket": item["bucket"],
+            "taxfree_metadata": tf_meta,
+            "taxfree_orientation": orientation,
+            "e14_prod_preds": e14.get("prod_preds"),
+            "e14_cascade_preds": e14.get("cascade_preds"),
+            "e14_delta_preds": e14.get("delta_preds"),
+            "e14_prod_edges": e14.get("prod_edges"),
+            "e14_cascade_edges": e14.get("cascade_edges"),
+            "e14_delta_edges": e14.get("delta_edges"),
+        })
+        save(results)
+
+    # Sub-sample B — all three conditions
+    # NOTE(review): unlike Sub-sample A, a failed ingest here only prints a
+    # message; the entry is still appended and saved below, so a resumed run
+    # skips the source even if one or more conditions were never ingested.
+    print("\nSub-sample B — all three conditions")
+    for item in SUBSAMPLE_B:
+        name = item["name"]
+        if name in done_b:
+            print(f"  SKIP (done): {name}")
+            continue
+        print(f"\n  {name} ({item['bucket']})")
+        doc_text = get_document_text(name)
+        if not doc_text:
+            print(f"  SKIP — no text")
+            continue
+
+        entry = {"name": name, "bucket": item["bucket"],
+                 "taxfree_metadata": None, "standard_metadata": None}
+
+        # Baseline
+        try:
+            ingest(name, doc_text, "", GROUP_BASELINE)
+            time.sleep(3)
+            print(f"  baseline ingested")
+        except Exception as e:
+            print(f"  baseline failed: {e}")
+
+        # Standard
+        # run_mistral() is outside the try block: an HTTP error from the
+        # model server stops the whole run before this entry is saved.
+        std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
+        entry["standard_metadata"] = std_meta
+        try:
+            ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD)
+            time.sleep(3)
+            print(f"  standard ingested, domain_class={std_meta.get('domain_class','?')}")
+        except Exception as e:
+            print(f"  standard failed: {e}")
+
+        # Taxonomy-free
+        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
+        entry["taxfree_metadata"] = tf_meta
+        print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
+        try:
+            ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE)
+            time.sleep(3)
+            print(f"  taxfree ingested")
+        except Exception as e:
+            print(f"  taxfree failed: {e}")
+
+        results["subsample_b"].append(entry)
+        save(results)
+
+    print("\n" + "=" * 60)
+    print(f"Ingest complete. Results at {RESULTS_PATH}")
+    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")
+
+
+# Run the ingest phase only when this file is executed as a script.
+if __name__ == "__main__":
+    run()
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+E1.9 Phase 1 — Retroactive validation
+For each E1.8 source, query the production graph with frame_relationships
+to get a coverage score, then check whether the routing tier prediction
+matches the actual best-performing condition from E1.8.
+No API spend required — uses existing E1.8 data and Graphiti search only.
+"""
+
+import json, requests
+from pathlib import Path
+
+# Base URL of the search endpoint queried by get_coverage_score().
+GRAPHITI_URL = "http://localhost:8001"
+# Inputs read by run(): per-source counts and the ingest-phase metadata.
+E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
+E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
+# Output written by run().
+RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"
+
+# Routing thresholds used by assign_tier(); scores in between route to "standard".
+HIGH_THRESHOLD = 0.70   # coverage >= this routes to "baseline"
+LOW_THRESHOLD  = 0.40   # coverage below this routes to "taxfree"
+
+
+def get_coverage_score(query, group_id="aaron"):
+    """Score how well the graph already covers *query*, from 0.0 to 1.0.
+
+    The score is the number of search hits (the search is limited to 3)
+    divided by 3: no hits give 0.0, one hit 1/3, two hits 2/3, three hits
+    1.0. Hit count is used because Graphiti fulltext search returns score=0
+    for all hits.
+
+    An empty or whitespace-only query returns 0.0 without searching, and any
+    error during the search is printed and also scored as 0.0.
+    """
+    if not (query and query.strip()):
+        return 0.0
+    try:
+        response = requests.get(
+            f"{GRAPHITI_URL}/search",
+            params={"query": query, "limit": 3, "group_id": group_id},
+            timeout=30
+        )
+        response.raise_for_status()
+        hits = response.json().get("results", [])
+        return min(len(hits) / 3.0, 1.0)
+    except Exception as e:
+        print(f"    Search error: {e}")
+        return 0.0
+
+
+def assign_tier(coverage_score):
+    """Map a coverage score to a routing tier name.
+
+    Returns "baseline" at or above HIGH_THRESHOLD, "standard" at or above
+    LOW_THRESHOLD, and "taxfree" otherwise.
+    """
+    tier = "taxfree"
+    if coverage_score >= LOW_THRESHOLD:
+        tier = "standard"
+    if coverage_score >= HIGH_THRESHOLD:
+        tier = "baseline"
+    return tier
+
+
+def best_condition_from_e18(record, subsample):
+    """
+    Name the condition with the highest predicate count for one E1.8 record.
+
+    Sub-sample "a" compares prod_preds (baseline), e14_preds (standard) and
+    tf_preds (taxfree); any other sub-sample compares base_preds, std_preds
+    and tf_preds. Missing or null counts are treated as 0. Ties go to
+    "taxfree" first, then "standard". Returns "unknown" when the highest
+    count is 0.
+    """
+    baseline_key, standard_key = ("prod_preds", "e14_preds") if subsample == "a" else ("base_preds", "std_preds")
+
+    baseline_count = record.get(baseline_key) or 0
+    standard_count = record.get(standard_key) or 0
+    taxfree_count = record.get("tf_preds") or 0
+
+    top = max(baseline_count, standard_count, taxfree_count)
+    if top == 0:
+        return "unknown"
+    if taxfree_count == top:
+        return "taxfree"
+    if standard_count == top:
+        return "standard"
+    return "baseline"
+
+
+def _validate_subsample(records, subsample, fr_lookup):
+    """Print the validation table for one sub-sample; return (rows, correct, total)."""
+    print(f"\nSub-sample {subsample.upper()}")
+    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
+    print("-" * 95)
+
+    rows = []
+    correct = 0
+    total = 0
+    for record in records:
+        name = record["name"]
+        fr = fr_lookup.get(name, "")
+        coverage = get_coverage_score(fr)
+        tier = assign_tier(coverage)
+        actual_best = best_condition_from_e18(record, subsample)
+        matched = tier == actual_best
+        match = "✓" if matched else "✗"
+        # Sources whose best condition is "unknown" are left out of the rate.
+        if actual_best != "unknown":
+            total += 1
+            if matched:
+                correct += 1
+        display = name[:48] + ".." if len(name) > 50 else name
+        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {match}")
+        rows.append({
+            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
+            "frame_relationships": fr, "coverage_score": coverage,
+            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
+        })
+    return rows, correct, total
+
+
+def run():
+    """Check whether coverage-based routing predicts the best E1.8 condition.
+
+    For every source in the E1.8 evaluation file, searches the graph with the
+    source's frame_relationships text, converts the coverage score to a tier,
+    and compares that tier with the condition that produced the most
+    predicates. Prints per-source tables, the validation rate, mismatches and
+    the score distribution, then writes everything to RESULTS_PATH.
+    """
+    print("E1.9 Phase 1 — Retroactive validation")
+    print("=" * 60)
+
+    e18_eval   = json.loads(E18_PATH.read_text())
+    e18_ingest = json.loads(E18_INGEST_PATH.read_text())
+
+    # Build frame_relationships lookup from ingest results
+    fr_lookup = {}
+    for key in ("subsample_a", "subsample_b"):
+        for item in e18_ingest.get(key, []):
+            meta = item.get("taxfree_metadata")
+            if isinstance(meta, dict) and meta:
+                fr = meta.get("frame_relationships", "")
+                # The search helper expects text; anything else counts as no query.
+                fr_lookup[item["name"]] = fr if isinstance(fr, str) else ""
+
+    results = []
+    correct = 0
+    total = 0
+    for subsample in ("a", "b"):
+        rows, n_correct, n_total = _validate_subsample(
+            e18_eval[f"subsample_{subsample}"], subsample, fr_lookup
+        )
+        results.extend(rows)
+        correct += n_correct
+        total += n_total
+
+    # Summary
+    rate = correct / total * 100 if total > 0 else 0
+    print(f"\n{'=' * 60}")
+    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
+    print()
+    if rate >= 70:
+        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
+        print("  Proceed to Phase 2 (new ingestion with routing)")
+    elif rate >= 50:
+        print("~ MARGINAL — adjust thresholds before Phase 2")
+        print("  Review mismatch patterns below")
+    else:
+        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
+        print("  may not be the right signal. Consider active_frames fallback.")
+
+    # Mismatch analysis
+    mismatches = [r for r in results if not r["match"] and r["actual_best"] != "unknown"]
+    if mismatches:
+        print(f"\nMismatches ({len(mismatches)}):")
+        for r in mismatches:
+            # str() because a missing bucket (None) does not accept the "<8" format spec.
+            print(f"  [{str(r['bucket']):<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")
+
+    # Coverage score distribution
+    scores = [r["coverage_score"] for r in results]
+    print(f"\nCoverage score distribution:")
+    if scores:
+        print(f"  Mean: {sum(scores)/len(scores):.2f}")
+        print(f"  Min:  {min(scores):.2f}")
+        print(f"  Max:  {max(scores):.2f}")
+    else:
+        # With no records, the mean would divide by zero and min()/max() would raise.
+        print("  (no sources evaluated)")
+    high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
+    mid  = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
+    low  = sum(1 for s in scores if s < LOW_THRESHOLD)
+    print(f"  Tier distribution: baseline={high} standard={mid} taxfree={low}")
+
+    # Save
+    output = {
+        "validation_rate": rate,
+        "correct": correct,
+        "total": total,
+        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
+        "results": results,
+    }
+    RESULTS_PATH.write_text(json.dumps(output, indent=2))
+    print(f"\nSaved to {RESULTS_PATH}")
+
+
+# Run the validation only when this file is executed as a script.
+if __name__ == "__main__":
+    run()