# aaronAI/scripts/e14_select_sample.py

#!/usr/bin/env python3
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
import json
import re
import subprocess
from pathlib import Path

# Directory under the user's home that holds experiment inputs and outputs.
EXPERIMENTS = Path.home().joinpath("aaronai", "experiments")
# E1's earlier sample file; the sources it lists are excluded from this run.
E1_SAMPLE_FILE = EXPERIMENTS.joinpath("cascade_reextract_sample.json")
# Where main() writes the E1.4 selection.
OUTPUT = EXPERIMENTS.joinpath("e14_sample.json")

# Number of episodes to draw from each stratum (8 + 8 + 8 + 6 = 30).
TARGETS = dict(high=8, mid=8, low=8, document=6)

def query_episode_counts():
    """Fetch the number of distinct linked entities for every episode.

    Runs a Cypher query against the ``aaron`` graph by invoking ``redis-cli``
    inside the ``falkordb`` Docker container, then parses the plain-text
    reply, which is read as alternating name / count lines.

    Returns:
        list[dict]: One ``{"name": str, "entities": int}`` dict per parsed
        row, in the order the rows were printed (the query sorts by entity
        count, descending).

    Raises:
        RuntimeError: If the ``docker exec`` command exits with a non-zero
            status; the message carries the command's stderr.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Surface the failure here instead of returning an empty list that
        # only blows up later, far from the real cause.
        raise RuntimeError(
            f"GRAPH.QUERY via docker exec failed (exit {result.returncode}): "
            f"{result.stderr.strip()}"
        )

    lines = [line for line in result.stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            # Header row: skip the two column titles ("name", "entities").
            i += 2
            continue
        if lines[i].startswith("Cached") or lines[i].startswith("Query"):
            # Presumably the trailing execution statistics; nothing after
            # this point is treated as data.
            break
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
                episodes.append({"name": lines[i], "entities": count})
                i += 2
            except ValueError:
                # Next line is not a count; advance one line to resynchronise.
                i += 1
        else:
            i += 1
    return episodes


def is_document(name):
    """Return True when *name* ends with a recognised document extension.

    The check is case-insensitive and covers .pdf, .docx, .pptx, .txt and .md.
    """
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)


def doc_subtype(name):
    """Classify a document name into a subtype by keyword matching.

    The lower-cased name is searched for substrings; the first subtype in
    the table with a matching keyword wins, so "academic" takes precedence
    over "reference", which takes precedence over "creative". Names that
    match nothing are "other".
    """
    lowered = name.lower()
    keyword_table = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo",
                       "cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for subtype, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return subtype
    return "other"


def main():
    """Select the stratified E1.4 sample and write it to OUTPUT as JSON.

    Steps:
      1. Fetch per-episode entity counts via query_episode_counts().
      2. Drop every episode whose name appears in E1's sample file.
      3. Split non-document episodes into high / mid / low buckets using the
         entity counts found at the 1/4 and 3/4 positions of the descending
         count list; documents with at least 5 entities form their own pool.
      4. Take TARGETS[...] episodes from the middle of each density bucket.
      5. Take 2 documents per subtype (academic, creative, reference), then
         top up from any remaining documents if still short.
      6. Print a summary and dump the selection plus metadata to OUTPUT.

    Raises:
        SystemExit: If the graph query returned no episodes.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # The quartile lookups below index into the count list and would
        # otherwise fail with an unhelpful IndexError.
        raise SystemExit("No episodes returned from the graph query; nothing to sample.")

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE, encoding="utf-8") as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries (counts are sorted high -> low)
    counts = sorted((e["entities"] for e in episodes), reverse=True)
    n = len(counts)
    top_q = counts[n // 4]
    bottom_q = counts[3 * n // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket. Each episode lands in exactly one bucket:
    # when top_q == bottom_q an episode sitting on that value would otherwise
    # qualify as both "high" and "low" and could be selected twice.
    min_doc_entities = 5
    high, mid, low, docs = [], [], [], []
    for ep in episodes:
        if ep["name"] in e1_names:
            continue
        entities = ep["entities"]
        if is_document(ep["name"]):
            # Documents below the threshold are not sampled at all.
            if entities >= min_doc_entities:
                docs.append(ep)
        elif entities >= top_q:
            high.append(ep)
        elif entities > bottom_q:
            mid.append(ep)
        else:
            low.append(ep)

    print(f"\nAvailable after E1 exclusion:")
    print(f"  High-density: {len(high)}")
    print(f"  Mid-density:  {len(mid)}")
    print(f"  Low-density:  {len(low)}")
    print(f"  Documents:    {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    def pick(bucket, want):
        if len(bucket) < want:
            print(f"  WARNING: only {len(bucket)} available, asked for {want}")
            return bucket
        mid_idx = len(bucket) // 2
        start = max(0, mid_idx - want // 2)
        return bucket[start:start + want]

    selected = []
    for label, bucket in (("high", high), ("mid", mid), ("low", low)):
        for ep in pick(bucket, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)
    print(f"\n  Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        for ep in pick(docs_by_subtype.get(subtype, []), target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill from
    # whatever documents have not been chosen yet, in query order.
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the lookup set once rather than once per candidate.
        chosen_names = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in chosen_names]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f"  [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e  {ep['name']}")

    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")


# Run the selection only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()