#!/usr/bin/env python3
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
import json
import re
import subprocess
from pathlib import Path

EXPERIMENTS = Path.home() / "aaronai" / "experiments"
E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
OUTPUT = EXPERIMENTS / "e14_sample.json"

# Number of episodes to select per stratum (sums to 30).
TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}

# File extensions that mark an episode name as a document source.
_DOCUMENT_EXTENSIONS = (".pdf", ".docx", ".pptx", ".txt", ".md")


def _parse_episode_rows(stdout):
    """Parse raw ``redis-cli`` output into ``{"name", "entities"}`` dicts.

    The output is read as alternating lines: an episode name followed by
    its integer entity count. Blank lines are ignored.
    """
    lines = [line for line in stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current == "name":
            # Column header pair ("name", "entities"): skip both lines.
            i += 2
            continue
        if current.startswith(("Cached", "Query")):
            # Trailing statistics lines: no more result rows follow.
            break
        if i + 1 >= len(lines):
            i += 1
            continue
        try:
            count = int(lines[i + 1])
        except ValueError:
            # Next line is not a count; resynchronise one line further on.
            i += 1
        else:
            episodes.append({"name": current, "entities": count})
            i += 2
    return episodes


def query_episode_counts():
    """Return every Episodic node with its distinct-entity count.

    Runs a Cypher query against the ``aaron`` graph through
    ``docker exec falkordb redis-cli GRAPH.QUERY`` and parses the output.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in the order
        the query returned them (``ORDER BY entities DESC``).

    Raises:
        RuntimeError: if the ``docker exec`` command exits non-zero.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Fail loudly instead of silently continuing with an empty sample.
        raise RuntimeError(
            f"GRAPH.QUERY failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return _parse_episode_rows(result.stdout)


def is_document(name):
    """Return True if *name* ends with a known document extension (case-insensitive)."""
    return name.lower().endswith(_DOCUMENT_EXTENSIONS)


def doc_subtype(name):
    """Categorize document by likely subtype.

    Matching is a case-insensitive substring heuristic on the file name;
    the first matching group wins. Returns one of ``"academic"``,
    ``"reference"``, ``"creative"`` or ``"other"``.
    """
    s = name.lower()
    if "syllabus" in s or "ind study" in s or "_is" in s:
        return "academic"
    if ("annual" in s or "report" in s or "_ar20" in s or "rtpcc" in s
            or "novo" in s or "cv" in s or "resume" in s
            or "application" in s or "cover letter" in s):
        return "reference"
    if "marquee" in s or "pptx" in s or "presentation" in s:
        return "creative"
    return "other"


def _pick_middle(bucket, size):
    """Return *size* consecutive items from the middle of *bucket*.

    Taking the middle avoids the extreme ends of a bucket. If the bucket
    holds fewer than *size* items, a warning is printed and the whole
    bucket is returned.
    """
    if len(bucket) < size:
        print(f" WARNING: only {len(bucket)} available, asked for {size}")
        return bucket
    mid_idx = len(bucket) // 2
    start = max(0, mid_idx - size // 2)
    return bucket[start:start + size]


def main():
    """Select the E1.4 sample and write it to ``OUTPUT`` as JSON.

    Steps: fetch entity counts, drop episodes already used by E1, bucket
    the rest into high/mid/low density (non-documents) and documents
    (at least 5 entities), pick per ``TARGETS``, then save the selection
    together with its metadata.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # Quartile boundaries are undefined for an empty result set.
        raise SystemExit("No episodes returned from the graph query; nothing to sample.")

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE, encoding="utf-8") as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries (computed over all episodes, before exclusion)
    counts = sorted([e["entities"] for e in episodes], reverse=True)
    total = len(counts)
    top_q = counts[total // 4]
    bottom_q = counts[3 * total // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket
    available = [e for e in episodes if e["name"] not in e1_names]
    high = [e for e in available if e["entities"] >= top_q and not is_document(e["name"])]
    mid = [e for e in available if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
    low = [e for e in available if e["entities"] <= bottom_q and not is_document(e["name"])]
    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]

    print(f"\nAvailable after E1 exclusion:")
    print(f" High-density: {len(high)}")
    print(f" Mid-density: {len(mid)}")
    print(f" Low-density: {len(low)}")
    print(f" Documents: {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    selected = []
    for label, bucket in (("high", high), ("mid", mid), ("low", low)):
        for ep in _pick_middle(bucket, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)

    print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        sub_docs = docs_by_subtype.get(subtype, [])
        for ep in _pick_middle(sub_docs, target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill
    # from the remaining documents, which includes the "other" subtype
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the lookup set once rather than once per candidate.
        selected_names = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in selected_names]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")

    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")


if __name__ == "__main__":
    main()