# aaronAI/scripts/experiments/frame_distribution_report.py

"""Read-only analysis of Stage 2 frame data via stage2_frames_v.

Produces eight sections (frequency, hygiene, per-doc count, co-occurrence,
folder cross-tab, worker-version split, data-gap accounting, corpus coverage)
and writes a JSON sidecar for diffing across runs.

Usage: venv/bin/python3 scripts/experiments/frame_distribution_report.py
"""
import json
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from itertools import combinations
from pathlib import Path

import psycopg2
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into os.environ
# before anything below runs; main() reads PG_DSN from the environment.
load_dotenv()

# JSON sidecar destination: ~/aaronai/experiments/, with the local calendar
# date (taken once, at import time) embedded in the file name.
OUT_PATH = Path.home().joinpath(
    "aaronai",
    "experiments",
    f"frame_distribution_{datetime.now().strftime('%Y-%m-%d')}.json",
)
# How many of the most frequent frames feed the co-occurrence and cross-tab
# sections; revisit after seeing the long tail.
TOP_K = 20


def normalize(label):
    """Return the canonical form of a frame label for collision detection.

    The label is lower-cased, underscores are treated as spaces, every run of
    whitespace is collapsed to a single space, and the ends are trimmed.

    Args:
        label: Raw frame label string.

    Returns:
        The normalized label string (may be empty).
    """
    # Trim only after underscores have become spaces: stripping first would
    # turn "_foo_" into " foo " and hide its collision with "foo".
    return re.sub(r"\s+", " ", label.lower().replace("_", " ")).strip()


def folder_bin(source):
    """Map a source identifier to a coarse file-type bin.

    stage_3_queue stores bare filenames, so the bin reflects what kind of
    file the source is rather than where it sits in the directory tree.

    Returns one of: "unknown", "conversation", "voice_note", "dream_output",
    "markdown", "pdf", "docx", "pptx", "txt", "other".
    """
    if not source:
        return "unknown"

    # Conversation prefixes are matched case-sensitively on the raw value.
    # Such sources bypass Stage 2/3 and will not appear here.
    if re.match(r"^(Claude|ChatGPT|Aaron AI):", source):
        return "conversation"

    lowered = source.lower()

    # Filename patterns are checked before extensions, so e.g. a voice note
    # (whose name ends in ".md") is not binned as plain markdown.
    name_patterns = (
        (r"\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-voice\.md$", "voice_note"),
        (r"\d{4}-\d{2}-\d{2}-(nrem|early-rem|late-rem|synthesis|lucid)", "dream_output"),
    )
    for pattern, label in name_patterns:
        if re.search(pattern, lowered):
            return label

    # Legacy and modern Office extensions share a bin.
    extension_bins = (
        ((".md",), "markdown"),
        ((".pdf",), "pdf"),
        ((".docx", ".doc"), "docx"),
        ((".pptx", ".ppt"), "pptx"),
        ((".txt",), "txt"),
    )
    for suffixes, label in extension_bins:
        if lowered.endswith(suffixes):
            return label

    return "other"


def fetch_rows(cur):
    """Load one dict per row of stage2_frames_v whose active_frames is a list.

    Args:
        cur: Open DB-API cursor; it is used for one query and left open.

    Returns:
        A list of dicts with keys "source", "char_length", "frames" (falsy
        entries dropped, the rest converted to str), "worker_version", and
        "raw_keys" (sorted keys of raw_metadata when it is a dict, else []).
        Rows whose active_frames is not a list are skipped.
    """
    cur.execute("""
        SELECT source, char_length, active_frames, worker_version, raw_metadata
        FROM stage2_frames_v
    """)
    return [
        {
            "source": src,
            "char_length": length,
            "frames": [str(item) for item in frame_list if item],
            "worker_version": version,
            "raw_keys": sorted(meta.keys()) if isinstance(meta, dict) else [],
        }
        for src, length, frame_list, version, meta in cur.fetchall()
        if isinstance(frame_list, list)
    ]


def section_frequency(rows):
    """Count how many times each raw frame label occurs across all rows.

    Args:
        rows: Iterable of dicts, each with a "frames" list of labels.

    Returns:
        Counter mapping label -> total occurrences (repeats within a single
        row are each counted).
    """
    return Counter(label for row in rows for label in row["frames"])


def section_hygiene(frequency):
    """Find raw labels that become identical once normalized.

    Args:
        frequency: Mapping of raw label -> occurrence count.

    Returns:
        Dict mapping each normalized form shared by two or more raw labels to
        its list of (raw_label, count) tuples.
    """
    by_normalized = defaultdict(list)
    for label, occurrences in frequency.items():
        by_normalized[normalize(label)].append((label, occurrences))
    # Only normalized forms with more than one raw spelling are collisions.
    return {
        canonical: variants
        for canonical, variants in by_normalized.items()
        if len(variants) > 1
    }


def section_per_doc_count(rows):
    """Build a histogram of how many frames each row carries.

    Returns:
        Counter mapping frame count -> number of rows with that many frames.
    """
    histogram = Counter()
    for row in rows:
        histogram[len(row["frames"])] += 1
    return histogram


def section_cooccurrence(rows, top_frames):
    """Count, for each unordered pair of top frames, the rows containing both.

    Args:
        rows: Iterable of dicts, each with a "frames" list of labels.
        top_frames: Labels to consider; every other label is ignored.

    Returns:
        Counter keyed by (a, b) tuples with a < b; each value is the number
        of rows whose frames include both labels.
    """
    top_set = set(top_frames)
    pair_counts = Counter()
    for r in rows:
        # De-duplicate per row first: a label repeated within one doc must
        # not yield a self-pair (a, a) or count that doc twice for a pair.
        present = sorted({f for f in r["frames"] if f in top_set})
        # combinations() over a sorted list emits each pair once, as (a, b)
        # with a before b, so the key is independent of frame order.
        pair_counts.update(combinations(present, 2))
    return pair_counts


def section_folder_crosstab(rows, top_frames):
    """Cross-tabulate top frames against the type bin of each row's source.

    Args:
        rows: Iterable of dicts with "source" and "frames" keys.
        top_frames: Labels to tabulate; other labels are ignored.

    Returns:
        (table, bin_totals) where table is a defaultdict mapping
        frame -> Counter of bin -> occurrences, and bin_totals is a Counter
        of bin -> number of rows (all rows, whatever frames they carry).
    """
    wanted = set(top_frames)
    per_frame = defaultdict(Counter)  # frame -> bin -> count
    rows_per_bin = Counter()
    for row in rows:
        kind = folder_bin(row["source"])
        rows_per_bin[kind] += 1
        for label in row["frames"]:
            if label not in wanted:
                continue
            per_frame[label][kind] += 1
    return per_frame, rows_per_bin


def section_worker_versions(rows):
    """Split rows by worker version and tally raw-metadata key shapes.

    A falsy worker_version (None, "") is reported as "unknown".

    Returns:
        (version_counts, shapes_by_version) where version_counts is a Counter
        of version -> rows, and shapes_by_version is a defaultdict mapping
        version -> Counter of tuple(raw_keys) -> rows.
    """
    version_counts = Counter()
    shapes_by_version = defaultdict(Counter)
    for row in rows:
        version = row["worker_version"] or "unknown"
        version_counts[version] += 1
        # Lists are unhashable, so the key set is stored as a tuple.
        shapes_by_version[version][tuple(row["raw_keys"])] += 1
    return version_counts, shapes_by_version


def section_data_gap(cur):
    """Summarize completed Stage 2 docs shorter than 2000 chars.

    The report treats these as the docs that never had frames extracted.

    Args:
        cur: Open DB-API cursor; it is used for one query and left open.

    Returns:
        Dict with "count", "by_type_bin" (type bin -> docs), "char_length"
        (min / max / median, each None when nothing matched) and
        "sample_sources" (the first ten sources as returned by the query).
    """
    cur.execute("""
        SELECT source, char_length
        FROM stage_2_queue
        WHERE completed_at IS NOT NULL AND char_length < 2000
    """)
    short_docs = cur.fetchall()

    type_counts = Counter()
    for name, _ in short_docs:
        type_counts[folder_bin(name)] += 1

    ordered_lengths = sorted(length for _, length in short_docs)
    if ordered_lengths:
        shortest = ordered_lengths[0]
        longest = ordered_lengths[-1]
        # Upper-middle element when the count is even (no averaging).
        middle = ordered_lengths[len(ordered_lengths) // 2]
    else:
        shortest = longest = middle = None

    return {
        "count": len(short_docs),
        "by_type_bin": dict(type_counts),
        "char_length": {
            "min": shortest,
            "max": longest,
            "median": middle,
        },
        "sample_sources": [name for name, _ in short_docs[:10]],
    }


def section_corpus_coverage(cur):
    """Gather distinct-source counts showing how much of the corpus has frames.

    Runs five count queries, in order: all embeddings sources, conversation
    sources, stage_3_queue sources with stage2_metadata, completed
    stage_2_queue sources under 2000 chars, and failed stage_2_queue sources.

    Args:
        cur: Open DB-API cursor; it is left open.

    Returns:
        Dict of the five counts plus "frame_coverage_pct": sources with
        frames as a percentage of all embeddings sources, to one decimal.
    """

    def count(sql):
        # Each query returns a single row holding a single count.
        cur.execute(sql)
        return cur.fetchone()[0]

    all_sources = count("SELECT count(DISTINCT source) FROM embeddings")
    conversation_sources = count("""
        SELECT count(DISTINCT source) FROM embeddings
        WHERE source LIKE 'Claude:%' OR source LIKE 'ChatGPT:%'
              OR source LIKE 'Aaron AI:%' OR type='aaronai_conversation'
    """)
    framed_sources = count("SELECT count(DISTINCT source) FROM stage_3_queue WHERE stage2_metadata IS NOT NULL")
    short_sources = count("""
        SELECT count(DISTINCT source) FROM stage_2_queue
        WHERE completed_at IS NOT NULL AND char_length < 2000
    """)
    failed_sources = count("""
        SELECT count(DISTINCT source) FROM stage_2_queue
        WHERE failed_at IS NOT NULL
    """)

    # Clamp the denominator to 1 so an empty embeddings table yields 0.0.
    denominator = max(all_sources, 1)
    return {
        "total_distinct_sources_in_embeddings": all_sources,
        "conversations_no_frames_by_design": conversation_sources,
        "files_with_frames": framed_sources,
        "files_short_no_frames": short_sources,
        "files_stage2_failed": failed_sources,
        "frame_coverage_pct": round(100.0 * framed_sources / denominator, 1),
    }


def _run_report(cur):
    """Compute every report section, print it, and write the JSON sidecar.

    Args:
        cur: Open DB-API cursor used for all queries; the caller owns it and
            is responsible for closing it.
    """
    rows = fetch_rows(cur)
    n_docs = len(rows)
    print(f"=== Stage 2 frame distribution report ({n_docs} docs) ===\n")

    # 1. Frequency
    freq = section_frequency(rows)
    print(f"--- 1. Frame frequency ({len(freq)} distinct labels) ---")
    for label, count in freq.most_common(30):
        print(f"  {count:5d}  {label}")
    print()

    # 2. Hygiene (most-used collision groups first, variants by count)
    collisions = section_hygiene(freq)
    print(f"--- 2. Label hygiene (normalized collisions: {len(collisions)}) ---")
    for norm, variants in sorted(collisions.items(), key=lambda kv: -sum(c for _, c in kv[1])):
        variant_str = ", ".join(f"{r!r}:{c}" for r, c in sorted(variants, key=lambda x: -x[1]))
        print(f"  '{norm}': {variant_str}")
    print()

    # 3. Per-doc frame count
    per_doc = section_per_doc_count(rows)
    print("--- 3. Per-doc frame count ---")
    for n in sorted(per_doc):
        print(f"  {n} frames: {per_doc[n]} docs")
    print()

    # 4. Co-occurrence (top-K)
    top_frames = [f for f, _ in freq.most_common(TOP_K)]
    pairs = section_cooccurrence(rows, top_frames)
    print(f"--- 4. Co-occurrence (top-{TOP_K} frames, top-30 pairs) ---")
    for (a, b), count in pairs.most_common(30):
        print(f"  {count:4d}  {a}  ×  {b}")
    print()

    # 5. Folder cross-tab
    crosstab, bin_totals = section_folder_crosstab(rows, top_frames)
    print(f"--- 5. Frame × folder cross-tab (top-{TOP_K} frames) ---")
    print("  bins (with totals): " + ", ".join(f"{b}({n})" for b, n in bin_totals.most_common(10)))
    for f in top_frames:
        row_data = crosstab[f]
        if not row_data:
            continue
        cells = ", ".join(f"{b}={c}" for b, c in row_data.most_common(5))
        print(f"  {f}: {cells}")
    print()

    # 6. Worker versions (with the three most common raw-metadata key shapes)
    versions, keys_by_version = section_worker_versions(rows)
    print("--- 6. Worker version split ---")
    for v, count in versions.most_common():
        print(f"  v{v}: {count} docs")
        top_shapes = keys_by_version[v].most_common(3)
        for keys, kcount in top_shapes:
            print(f"      {kcount} docs with keys={list(keys)}")
    print()

    # 7. Data gap
    gap = section_data_gap(cur)
    print("--- 7. Data-gap accounting (Stage 2 docs <2000 chars; never frame-extracted) ---")
    print(f"  count: {gap['count']}")
    print(f"  char_length: min={gap['char_length']['min']}, median={gap['char_length']['median']}, max={gap['char_length']['max']}")
    print(f"  by type bin: {gap['by_type_bin']}")
    print(f"  sample sources: {gap['sample_sources']}")
    print()

    # 8. Corpus coverage
    coverage = section_corpus_coverage(cur)
    print("--- 8. Corpus-wide frame coverage ---")
    print(f"  total distinct sources in embeddings: {coverage['total_distinct_sources_in_embeddings']}")
    print(f"  conversations (no frames by design): {coverage['conversations_no_frames_by_design']}")
    print(f"  files with frames: {coverage['files_with_frames']}")
    print(f"  files short, no frames: {coverage['files_short_no_frames']}")
    print(f"  files Stage 2 failed: {coverage['files_stage2_failed']}")
    print(f"  frame coverage: {coverage['frame_coverage_pct']}% of corpus")
    print()

    # JSON sidecar: the same data as printed above, in a diffable form.
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "generated_at": datetime.now().isoformat(),
        "n_docs_with_frames": n_docs,
        "n_distinct_labels": len(freq),
        "top_30_frames": freq.most_common(30),
        "label_collisions": {k: list(v) for k, v in collisions.items()},
        "per_doc_frame_count": dict(per_doc),
        "top_30_pairs": [
            {"a": a, "b": b, "count": c}
            for (a, b), c in pairs.most_common(30)
        ],
        "folder_crosstab": {
            f: dict(crosstab[f]) for f in top_frames if crosstab[f]
        },
        "bin_totals": dict(bin_totals),
        "worker_versions": dict(versions),
        "data_gap": gap,
        "corpus_coverage": coverage,
    }
    OUT_PATH.write_text(json.dumps(sidecar, indent=2, default=str))
    print(f"JSON sidecar written: {OUT_PATH}")


def main():
    """Connect to Postgres, run the report, and always release the connection.

    The DSN is read from the PG_DSN environment variable.

    Raises:
        KeyError: if PG_DSN is not set in the environment.
    """
    conn = psycopg2.connect(os.environ["PG_DSN"])
    # Explicit try/finally: the cursor and connection must be closed even when
    # a query, a print, or the sidecar write raises part-way through.
    try:
        cur = conn.cursor()
        try:
            _run_report(cur)
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == "__main__":
    main()