add experiment scripts and results; watcher.py latest changes

2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
+import json
+import re
+import subprocess
+from pathlib import Path
+
+# Directory under the user's home that holds the experiment inputs/outputs.
+EXPERIMENTS = Path.home() / "aaronai" / "experiments"
+# E1's sample file; main() reads its "selected" entries and excludes them by name.
+E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
+# Destination JSON file for the sample selected by this script.
+OUTPUT = EXPERIMENTS / "e14_sample.json"
+
+# Number of episodes to pick per bucket (8 + 8 + 8 + 6 = 30 in total).
+TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}
+
+def query_episode_counts():
+    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
+             "RETURN e.name AS name, count(distinct n) AS entities "
+             "ORDER BY entities DESC")
+    result = subprocess.run(
+        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
+        capture_output=True, text=True
+    )
+    lines = [l for l in result.stdout.split("\n") if l.strip()]
+    episodes = []
+    i = 0
+    while i < len(lines):
+        if lines[i] == "name":
+            i += 2
+            continue
+        if lines[i].startswith("Cached") or lines[i].startswith("Query"):
+            break
+        if i + 1 < len(lines):
+            try:
+                count = int(lines[i + 1])
+                episodes.append({"name": lines[i], "entities": count})
+                i += 2
+            except ValueError:
+                i += 1
+        else:
+            i += 1
+    return episodes
+
+
+def is_document(name):
+    return any(name.lower().endswith(ext) for ext in (".pdf", ".docx", ".pptx", ".txt", ".md"))
+
+
+def doc_subtype(name):
+    """Categorize document by likely subtype."""
+    s = name.lower()
+    if "syllabus" in s or "ind study" in s or "_is" in s:
+        return "academic"
+    if "annual" in s or "report" in s or "_ar20" in s or "rtpcc" in s or "novo" in s:
+        return "reference"
+    if "cv" in s or "resume" in s or "application" in s or "cover letter" in s:
+        return "reference"
+    if "marquee" in s or "pptx" in s.lower() or "presentation" in s:
+        return "creative"
+    return "other"
+
+
+def main():
+    print("Fetching episode entity counts from Tier 1 graph...")
+    episodes = query_episode_counts()
+    print(f"Got {len(episodes)} episodes")
+
+    # Load E1's sample to exclude
+    with open(E1_SAMPLE_FILE) as f:
+        e1_sample = json.load(f)
+    e1_names = {ep["name"] for ep in e1_sample["selected"]}
+    print(f"Excluding {len(e1_names)} sources from E1")
+
+    # Quartile boundaries
+    counts = sorted([e["entities"] for e in episodes], reverse=True)
+    n = len(counts)
+    top_q = counts[n // 4]
+    bottom_q = counts[3 * n // 4]
+    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")
+
+    # Filter out E1 and bucket
+    available = [e for e in episodes if e["name"] not in e1_names]
+
+    high = [e for e in available if e["entities"] >= top_q and not is_document(e["name"])]
+    mid = [e for e in available if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
+    low = [e for e in available if e["entities"] <= bottom_q and not is_document(e["name"])]
+    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]
+
+    print(f"\nAvailable after E1 exclusion:")
+    print(f"  High-density: {len(high)}")
+    print(f"  Mid-density:  {len(mid)}")
+    print(f"  Low-density:  {len(low)}")
+    print(f"  Documents:    {len(docs)}")
+
+    # For high/mid/low: take from middle of bucket (avoids edge cases)
+    def pick(bucket, n):
+        if len(bucket) < n:
+            print(f"  WARNING: only {len(bucket)} available, asked for {n}")
+            return bucket
+        mid_idx = len(bucket) // 2
+        start = max(0, mid_idx - n // 2)
+        return bucket[start:start + n]
+
+    selected = []
+    for ep in pick(high, TARGETS["high"]):
+        ep["bucket"] = "high"
+        selected.append(ep)
+    for ep in pick(mid, TARGETS["mid"]):
+        ep["bucket"] = "mid"
+        selected.append(ep)
+    for ep in pick(low, TARGETS["low"]):
+        ep["bucket"] = "low"
+        selected.append(ep)
+
+    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
+    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
+    docs_by_subtype = {}
+    for ep in docs:
+        st = doc_subtype(ep["name"])
+        ep["subtype"] = st
+        docs_by_subtype.setdefault(st, []).append(ep)
+    print(f"\n  Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")
+
+    # Pick from middle of each subtype bucket
+    for subtype, target in doc_targets.items():
+        sub_docs = docs_by_subtype.get(subtype, [])
+        picked = pick(sub_docs, target)
+        for ep in picked:
+            ep["bucket"] = "document"
+            selected.append(ep)
+
+    # If we're short on documents (e.g., subtype underrepresented), fill from "other"
+    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
+    if doc_count < TARGETS["document"]:
+        shortage = TARGETS["document"] - doc_count
+        leftover = [e for e in docs if e["name"] not in {s["name"] for s in selected}]
+        for ep in leftover[:shortage]:
+            ep["bucket"] = "document"
+            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
+            selected.append(ep)
+
+    print(f"\nSelected {len(selected)} episodes for E1.4:")
+    for ep in selected:
+        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
+        print(f"  [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e  {ep['name']}")
+
+    with open(OUTPUT, "w") as f:
+        json.dump({
+            "metadata": {
+                "purpose": "E1.4 cascade re-extraction replication (n=30)",
+                "exclusions": "E1's 10 sources",
+                "stratification": {**TARGETS, "document_subtypes": doc_targets},
+                "quartile_top": top_q,
+                "quartile_bottom": bottom_q,
+            },
+            "selected": selected,
+        }, f, indent=2)
+    print(f"\nSaved to {OUTPUT}")
+
+
+# Run the sample selection only when this file is executed as a script.
+if __name__ == "__main__":
+    main()