Files
aaron 3f7fba7e0e scripts/: separate production from experimental and deprecated
Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2,
base_class, cascade, cost_test, briefing, consistency, token series).
Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py,
tier1_migration.py — under the bespoke decision both target retired
substrate work).
Removes 19 .bak* files from disk (gitignored, never tracked; git history
is the durable record of every prior version).

The 11 production scripts remain in scripts/. All systemd ExecStart paths,
api.py subprocess calls, and cron jobs continue to resolve correctly —
verified by grep against /etc/systemd/system/aaronai-*.service, scripts/
references in api.py, and the user crontab.

Track 1 inventory cross-cutting finding: scripts/ mixed 11 production
files with 32 experimental scripts and ~20 .bak files. After this commit
a clean-room reader can identify the live workers from a directory listing
alone.

Found by Track 1 inventory 2026-05-02. See
~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning.

After commit, run:
1. git log --oneline -3 — show the new commit on top
2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
2026-05-02 23:28:24 +00:00

135 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""E1 metrics comparison — A (Tier 1 aaron) vs B (cascade aaron_cascade_test) on the 10 sample sources."""
import json
import subprocess
from pathlib import Path
# Directory under the user's home that holds the experiment input/output files.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# JSON sample read by main(); its "selected" list names the sources to compare.
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
# JSON file to which main() writes the per-source results and aggregates.
COMPARISON_FILE = EXPERIMENTS / "cascade_reextract_comparison.json"
def query(group_id, cypher):
    """Run a Cypher query against one graph and return the raw text output.

    The query is executed as ``docker exec falkordb redis-cli GRAPH.QUERY``;
    the arguments are passed as a list, so no shell is involved.

    Args:
        group_id: Name of the graph to query (e.g. ``"aaron"``).
        cypher: Cypher query text.

    Returns:
        Whatever the command wrote to stdout, as text.
    """
    import sys  # local import: only needed for the failure warning below

    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher],
        capture_output=True,
        text=True,
        check=False,
    )
    if result.returncode != 0:
        # Without this, a failed command is indistinguishable from a query
        # whose answer is 0, because parse_int_result() falls back to 0 when
        # it finds no integer line. Warn, but keep returning stdout so callers
        # behave as before.
        print(
            f"warning: GRAPH.QUERY on {group_id!r} exited with status "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout
def parse_int_result(output):
    """Parse a single-integer result from redis-cli GRAPH.QUERY output.

    Args:
        output: Raw text output of the command.

    Returns:
        The first line (after stripping whitespace) that consists solely of
        decimal digits, converted to ``int``; 0 when no such line exists.
    """
    for raw_line in output.split("\n"):
        candidate = raw_line.strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, for which int() raises ValueError.
        # An empty string is not decimal, so blank lines are skipped too.
        if candidate.isdecimal():
            return int(candidate)
    return 0
def parse_string_list(output):
    """Extract the data rows from redis-cli output as a list of strings.

    Blank lines are ignored. Everything up to and including the first line
    that looks like a column header is dropped; each following line is
    collected until a footer line (starting with "Cached" or
    "Query internal") is reached.

    Column names are not known in advance, so a header is recognised
    heuristically: shorter than 60 characters and free of the characters
    ``{``, ``}``, ``[`` and ``]``.
    """
    footer_prefixes = ("Cached", "Query internal")
    structural_chars = set("{}[]")
    rows = []
    header_seen = False
    for raw in output.split("\n"):
        text = raw.strip()
        if not text:
            continue
        if text.startswith(footer_prefixes):
            # Footer reached (even if no header was ever found): stop.
            break
        if header_seen:
            rows.append(text)
            continue
        # Still looking for the header row; it is consumed, not collected.
        if len(text) < 60 and structural_chars.isdisjoint(text):
            header_seen = True
    return rows
def _cypher_string_literal(value):
    """Return *value* as a double-quoted Cypher string literal, escaped."""
    # Backslashes first, so the backslashes added for quotes are not doubled.
    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def metrics_for_source(group_id, source_name):
    """Get metrics for one source's episode in one group_id.

    Runs three count queries against the graph named ``group_id`` for the
    ``Episodic`` node whose ``name`` equals ``source_name``.

    Args:
        group_id: Name of the graph to query.
        source_name: Value of the episode's ``name`` property. It is escaped
            before being embedded, so names containing ``"`` or ``\\`` do
            not break the query.

    Returns:
        A dict with integer values under the keys ``"entities"``,
        ``"edges"`` and ``"rel_types"``.
    """
    episode = f"(e:Episodic {{name: {_cypher_string_literal(source_name)}}})"

    # Total entities connected to this episode
    entities = parse_int_result(
        query(group_id, f"MATCH {episode}-[]-(n:Entity) RETURN count(distinct n) AS entities")
    )
    # Total edges from this episode (all relationship types)
    edges = parse_int_result(
        query(group_id, f"MATCH {episode}-[r]-() RETURN count(r) AS edges")
    )
    # Distinct relationship types in edges from entities of this episode
    rel_types = parse_int_result(
        query(
            group_id,
            f"MATCH {episode}-[]-(n:Entity)-[r]-() "
            f"RETURN count(distinct type(r)) AS types",
        )
    )
    return {"entities": entities, "edges": edges, "rel_types": rel_types}
def _mean(total, count):
    """Return total / count, or 0.0 when there is nothing to average."""
    return total / count if count else 0.0


def _pct_delta(baseline, other, spec):
    """Format the change from baseline to other as a percentage string.

    ``spec`` is the numeric format spec (e.g. ``"+.1f"``). Returns ``"n/a"``
    when the baseline is zero, where a relative change is undefined.
    """
    if not baseline:
        return "n/a"
    return f"{format((other - baseline) / baseline * 100, spec)}%"


def main():
    """Compare per-source graph metrics between groups A and B and save them.

    Reads the ``"selected"`` list from SAMPLE_FILE, collects entity, edge and
    relationship-type counts for each source in the ``aaron`` (A) and
    ``aaron_cascade_test`` (B) graphs, prints a per-source table, overall and
    per-bucket aggregates and the global relationship-type counts, then
    writes the results to COMPARISON_FILE as JSON.

    Percentage deltas are shown as ``n/a`` when the A-side baseline is zero,
    and means as 0.0 when the sample is empty, instead of raising
    ZeroDivisionError.
    """
    with open(SAMPLE_FILE, encoding="utf-8") as f:
        sample = json.load(f)
    selected = sample["selected"]

    print(f"E1 metrics comparison — {len(selected)} sources, A=aaron vs B=aaron_cascade_test\n")
    print(f"{'Source':<60} {'A.ent':>6} {'B.ent':>6} {'A.edg':>6} {'B.edg':>6} {'A.typ':>6} {'B.typ':>6}")
    print("-" * 110)

    results = []
    for ep in selected:
        name = ep["name"]
        bucket = ep["bucket"]
        a = metrics_for_source("aaron", name)
        b = metrics_for_source("aaron_cascade_test", name)
        record = {
            "name": name, "bucket": bucket,
            "a_entities": a["entities"], "b_entities": b["entities"],
            "a_edges": a["edges"], "b_edges": b["edges"],
            "a_rel_types": a["rel_types"], "b_rel_types": b["rel_types"],
        }
        results.append(record)
        # Truncate name for display
        display_name = name if len(name) <= 58 else name[:55] + "..."
        print(f"{display_name:<60} {a['entities']:>6} {b['entities']:>6} {a['edges']:>6} {b['edges']:>6} {a['rel_types']:>6} {b['rel_types']:>6}")

    # Aggregates
    print("\n" + "=" * 110)
    n = len(results)
    a_ent_sum = sum(r["a_entities"] for r in results)
    b_ent_sum = sum(r["b_entities"] for r in results)
    a_edge_sum = sum(r["a_edges"] for r in results)
    b_edge_sum = sum(r["b_edges"] for r in results)
    a_types_sum = sum(r["a_rel_types"] for r in results)
    b_types_sum = sum(r["b_rel_types"] for r in results)
    print(f"\nAggregate (n={n}):")
    print(f" Entities: A mean={_mean(a_ent_sum, n):.1f} B mean={_mean(b_ent_sum, n):.1f} delta={_pct_delta(a_ent_sum, b_ent_sum, '+.1f')}")
    print(f" Edges: A mean={_mean(a_edge_sum, n):.1f} B mean={_mean(b_edge_sum, n):.1f} delta={_pct_delta(a_edge_sum, b_edge_sum, '+.1f')}")
    print(f" Rel types: A mean={_mean(a_types_sum, n):.1f} B mean={_mean(b_types_sum, n):.1f} delta={_pct_delta(a_types_sum, b_types_sum, '+.1f')}")

    # Global predicate diversity check (unique types in each group_id)
    print("\nGlobal predicate diversity:")
    a_global = parse_int_result(query("aaron", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    b_global = parse_int_result(query("aaron_cascade_test", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    print(f" A (aaron): {a_global} distinct relationship types across whole graph")
    print(f" B (aaron_cascade_test): {b_global} distinct relationship types across whole graph")

    # Per-bucket
    print("\nPer-bucket aggregates:")
    for bucket in ["high", "mid", "low", "document"]:
        bucket_results = [r for r in results if r["bucket"] == bucket]
        if not bucket_results:
            continue
        bn = len(bucket_results)
        a_e = sum(r["a_entities"] for r in bucket_results) / bn
        b_e = sum(r["b_entities"] for r in bucket_results) / bn
        a_ed = sum(r["a_edges"] for r in bucket_results) / bn
        b_ed = sum(r["b_edges"] for r in bucket_results) / bn
        print(f" [{bucket:>8}] n={bn} A.ent={a_e:.1f} B.ent={b_e:.1f} ({_pct_delta(a_e, b_e, '+.0f')}) "
              f"A.edg={a_ed:.1f} B.edg={b_ed:.1f} ({_pct_delta(a_ed, b_ed, '+.0f')})")

    with open(COMPARISON_FILE, "w", encoding="utf-8") as f:
        json.dump({
            "results": results,
            "aggregate": {
                "a_entities_total": a_ent_sum, "b_entities_total": b_ent_sum,
                "a_edges_total": a_edge_sum, "b_edges_total": b_edge_sum,
                "global_predicate_diversity": {"a": a_global, "b": b_global},
            },
        }, f, indent=2)
    print(f"\nSaved to {COMPARISON_FILE}")
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()