3f7fba7e0e
Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2, base_class, cascade, cost_test, briefing, consistency, token series). Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py, tier1_migration.py — under the bespoke decision both target retired substrate work). Removes 19 .bak* files from disk (gitignored, never tracked; git history is the durable record of every prior version). The 11 production scripts remain in scripts/. All systemd ExecStart paths, api.py subprocess calls, and cron jobs continue to resolve correctly — verified by grep against /etc/systemd/system/aaronai-*.service, scripts/ references in api.py, and the user crontab. Track 1 inventory cross-cutting finding: scripts/ mixed 11 production files with 32 experimental scripts and ~20 .bak files. After this commit a clean-room reader can identify the live workers from a directory listing alone. Found by Track 1 inventory 2026-05-02. See ~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning. After commit, run: 1. git log --oneline -3 — show the new commit on top 2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
161 lines
5.8 KiB
Python
161 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
|
|
import json
|
|
import re
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
# Directory under the user's home that holds experiment inputs and outputs.
EXPERIMENTS = Path.home().joinpath("aaronai", "experiments")
# E1's sample file; the episodes listed under its "selected" key are excluded.
E1_SAMPLE_FILE = EXPERIMENTS.joinpath("cascade_reextract_sample.json")
# Destination of the E1.4 sample written by main().
OUTPUT = EXPERIMENTS.joinpath("e14_sample.json")

# Number of episodes to draw per stratum (8 + 8 + 8 + 6 = 30).
TARGETS = dict(high=8, mid=8, low=8, document=6)
|
|
|
|
def query_episode_counts():
    """Fetch the number of distinct entities linked to each episode.

    Runs a Cypher query against the ``aaron`` graph by invoking
    ``redis-cli GRAPH.QUERY`` inside the ``falkordb`` Docker container, then
    parses the line-oriented text reply (one value per line: an episode name
    followed by its entity count).

    Returns:
        list[dict]: one ``{"name": str, "entities": int}`` per parsed row, in
        reply order (the query sorts by entity count, descending).

    Raises:
        RuntimeError: if the ``docker exec`` command exits with a non-zero
            status, so a failed query is not mistaken for an empty graph.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Previously a failed command silently produced an empty list, which
        # only surfaced later as an unrelated IndexError in the caller.
        raise RuntimeError(
            f"GRAPH.QUERY failed (exit {result.returncode}): "
            f"{result.stderr.strip()}"
        )

    lines = [line for line in result.stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            # Header: skip both column names ("name" and "entities").
            i += 2
            continue
        if lines[i].startswith(("Cached", "Query")):
            # Presumably the reply's trailing statistics lines; stop parsing.
            # NOTE(review): an episode whose name starts with "Cached" or
            # "Query" would also end the parse here — confirm names cannot.
            break
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
            except ValueError:
                # Next line is not a count: resynchronise one line further on.
                i += 1
            else:
                episodes.append({"name": lines[i], "entities": count})
                i += 2
        else:
            # Dangling last line with no count after it.
            i += 1
    return episodes
|
|
|
|
|
|
def is_document(name):
    """Return True when *name* ends with a known document extension.

    The comparison is case-insensitive; the recognised extensions are
    .pdf, .docx, .pptx, .txt and .md.
    """
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)
|
|
|
|
|
|
def doc_subtype(name):
    """Guess a document's subtype from keywords in its file name.

    The name is lower-cased and checked against keyword groups in a fixed
    order; the first group containing a keyword that occurs anywhere in the
    name decides the result.

    Returns:
        str: "academic", "reference" or "creative", or "other" when no
        keyword matches.
    """
    lowered = name.lower()
    # Order matters: earlier rules take precedence over later ones.
    keyword_rules = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo")),
        ("reference", ("cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for label, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "other"
|
|
|
|
|
|
def main():
    """Select the stratified E1.4 sample and write it to ``OUTPUT``.

    Steps:
      1. Fetch per-episode entity counts and drop the episodes already used
         by E1 (names listed under "selected" in ``E1_SAMPLE_FILE``).
      2. Split non-document episodes into high / mid / low density buckets
         using the counts found at the 1/4 and 3/4 positions of the
         descending count list; documents with at least 5 entities form a
         separate pool.
      3. Take ``TARGETS`` episodes from the middle of each density bucket and
         two documents per subtype (academic, creative, reference), topping
         up from the remaining documents if the document target is not met.
      4. Print the selection and dump it, with metadata, as JSON.

    Raises:
        SystemExit: if the graph query returned no episodes.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # The quartile lookup below would otherwise fail with a bare
        # IndexError that says nothing about the real cause.
        raise SystemExit("No episodes returned from the graph; nothing to sample.")

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE, encoding="utf-8") as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries (computed over ALL episodes, before exclusion)
    counts = sorted((e["entities"] for e in episodes), reverse=True)
    n = len(counts)
    top_q = counts[n // 4]
    bottom_q = counts[3 * n // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket
    available = [e for e in episodes if e["name"] not in e1_names]
    non_docs = [e for e in available if not is_document(e["name"])]

    high = [e for e in non_docs if e["entities"] >= top_q]
    mid = [e for e in non_docs if bottom_q < e["entities"] < top_q]
    # The extra "< top_q" only matters when both boundaries coincide: it keeps
    # tied episodes out of the low bucket so none can be selected twice.
    low = [e for e in non_docs if e["entities"] <= bottom_q and e["entities"] < top_q]
    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]

    print("\nAvailable after E1 exclusion:")
    print(f" High-density: {len(high)}")
    print(f" Mid-density: {len(mid)}")
    print(f" Low-density: {len(low)}")
    print(f" Documents: {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    def pick(bucket, want):
        """Return `want` consecutive items centred on the middle of `bucket`.

        If the bucket holds fewer than `want` items, warn and return it whole.
        """
        if len(bucket) < want:
            print(f" WARNING: only {len(bucket)} available, asked for {want}")
            return bucket
        start = max(0, len(bucket) // 2 - want // 2)
        return bucket[start:start + want]

    selected = []
    for label, bucket in (("high", high), ("mid", mid), ("low", low)):
        for ep in pick(bucket, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)
    print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        for ep in pick(docs_by_subtype.get(subtype, []), target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill from
    # whatever documents have not been chosen yet, in graph order.
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the lookup set once instead of once per candidate document.
        chosen_names = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in chosen_names]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")

    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")
|
|
|
|
|
|
# Script entry point: run the selection only when this file is executed
# directly, so that importing the module does not trigger it.
if __name__ == "__main__":
    main()
|