# aaronAI/scripts/experiments/e1_select_sample.py

#!/usr/bin/env python3
"""E1 sample selection — pick 10 episodes from Tier 1 stratified by density and type."""
import json
import os
import subprocess
from pathlib import Path
from collections import defaultdict

# Directory (under the current user's home) that the sample file is written into.
# NOTE(review): the folder name is lowercase "aaronai" here — confirm it matches
# the on-disk directory on case-sensitive filesystems.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# JSON file this script writes the selected episodes and their metadata to.
OUTPUT = EXPERIMENTS / "cascade_reextract_sample.json"

# Get all Tier 1 episodes with their entity counts via FalkorDB
def query_episode_counts():
    """Fetch every Episodic node's name and distinct-Entity count from FalkorDB.

    Runs a Cypher query through ``redis-cli`` inside the ``falkordb`` docker
    container against the ``aaron`` graph and parses the plain-text reply.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts, in the order the
        rows were returned (the query orders by entity count, descending).

    Raises:
        RuntimeError: if the ``docker exec`` command exits with a non-zero
            status (e.g. the container is not running).
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    # Fail loudly here; otherwise a failed command yields an empty list and the
    # caller crashes later with an unrelated-looking error.
    if result.returncode != 0:
        raise RuntimeError(
            f"FalkorDB query failed (exit {result.returncode}): "
            f"{result.stderr.strip()}"
        )
    return _parse_episode_rows(result.stdout)


def _parse_episode_rows(stdout):
    """Parse raw ``redis-cli`` GRAPH.QUERY text into ``{"name", "entities"}`` dicts."""
    lines = [line for line in stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        # Skip the column-header pair only when both header lines are present,
        # so an episode that happens to be called "name" is not discarded.
        if lines[i:i + 2] == ["name", "entities"]:
            i += 2
            continue
        # Stop at the statistics trailer. Match the full labels so episodes
        # whose names merely begin with "Cached"/"Query" do not end parsing.
        if lines[i].startswith(("Cached execution:",
                                "Query internal execution time:")):
            break
        # Each episode: name on one line, count on the next.
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
            except ValueError:
                # Not a name/count pair; advance one line to resynchronise.
                i += 1
                continue
            episodes.append({"name": lines[i], "entities": count})
            i += 2
        else:
            i += 1
    return episodes

# Load every episode with its entity count; the rest of the script stratifies
# and samples from this list.
print("Fetching episode entity counts from FalkorDB...")
episodes = query_episode_counts()
print(f"Got {len(episodes)} episodes")

# Classify by density bucket and type
def is_document(name):
    """Return True when *name* ends with a known document file extension.

    The match is case-insensitive and looks only at the end of the name.
    """
    lowered = name.lower()
    return lowered.endswith((".pdf", ".docx", ".pptx", ".txt", ".md"))

# Compute quartile boundaries from the entity counts
if not episodes:
    # Without this guard the quartile lookups below fail with a bare IndexError.
    raise SystemExit("No episodes returned from FalkorDB; cannot select a sample.")

# Counts are sorted descending, so the index one quarter of the way in is the
# upper-quartile boundary and three quarters in is the lower-quartile boundary.
counts = sorted((e["entities"] for e in episodes), reverse=True)
n = len(counts)
top_q = counts[n // 4]      # 25th percentile from top
bottom_q = counts[3 * n // 4]  # 75th percentile from top

print(f"\nQuartile boundaries: top={top_q}+, middle=({bottom_q+1}-{top_q-1}), bottom=0-{bottom_q}")

# Density buckets apply to non-document episodes only; documents form their
# own stratum and must have at least 5 entities to qualify.
conversations = [e for e in episodes if not is_document(e["name"])]
high = [e for e in conversations if e["entities"] >= top_q]
mid = [e for e in conversations if bottom_q < e["entities"] < top_q]
low = [e for e in conversations if e["entities"] <= bottom_q]
docs = [e for e in episodes if is_document(e["name"]) and e["entities"] >= 5]

print(f"High-density conversations: {len(high)}")
print(f"Mid-density conversations:  {len(mid)}")
print(f"Low-density conversations:  {len(low)}")
print(f"Documents (≥5 entities):    {len(docs)}")

# Deterministic selection — take from middle of each bucket to avoid edge cases
def pick(bucket, n):
    """Return *n* consecutive items centred on the middle of *bucket*.

    When the bucket holds fewer than *n* items it is returned unchanged
    (the same list object, not a copy).
    """
    size = len(bucket)
    if size < n:
        return bucket
    # Centre the window on the midpoint, clamped so it never starts before 0.
    first = max(size // 2 - n // 2, 0)
    return bucket[first:first + n]

# Fixed quota per stratum: 3 high, 3 mid, 2 low, 2 documents
# (fewer when a bucket holds less than its quota).
selected = [
    *pick(high, 3),
    *pick(mid, 3),
    *pick(low, 2),
    *pick(docs, 2),
]

# Tag each with its bucket
def bucket_for(ep):
    """Return the stratum label for *ep*: "document", "high", "mid" or "low".

    Documents are labelled by name alone; every other episode is labelled by
    comparing its entity count with the module-level quartile boundaries.
    """
    if is_document(ep["name"]):
        return "document"
    density = ep["entities"]
    if density >= top_q:
        return "high"
    return "mid" if density > bottom_q else "low"

# Annotate each chosen episode with the stratum it falls into.
for episode in selected:
    episode["bucket"] = bucket_for(episode)

print(f"\nSelected {len(selected)} episodes for E1:")
for episode in selected:
    print(f"  [{episode['bucket']:>8}] {episode['entities']:>3}e  {episode['name']}")

# Save selection together with the parameters that produced it
payload = {
    "metadata": {
        "purpose": "E1 cascade re-extraction sample (n=10)",
        "stratification": "density buckets + document subset",
        "quartile_top": top_q,
        "quartile_bottom": bottom_q,
        "total_tier1_episodes": len(episodes),
    },
    "selected": selected,
}
with open(OUTPUT, "w") as out_file:
    json.dump(payload, out_file, indent=2)

print(f"\nSaved to {OUTPUT}")