# aaronAI/scripts/experiments/embeddings_backfill_inspection.py

"""Read-only inspection for the embeddings.type / embeddings.created_at backfill (Improvement #2 / A.1).

Produces a survey of every backfill source-of-truth question without writing
to the database. Output is a human-readable report on stdout plus a JSON
sidecar at experiments/embeddings_backfill_inspection_<date>.json.

Sections:
  1. Cohort recap (counts; should match prior investigation).
  2. Cohort A type inference: extension classifier coverage.
  3. created_at inference for cohort A + B-doc-old:
       - rows with metadata.filepath: stat the file, check existence.
       - rows without filepath: lookup source against watcher_state.json.
       - filename-collision shape audit (live+backup, live+archive, ambiguous).
  4. ChatGPT export resolution (Plan A.1 addition #1):
       - existence of /home/aaron/nextcloud/.../ChatGPT Export/.
       - sample 5 B-chatgpt rows; resolve convo_id -> create_time.
  5. Sentinel date discovery (Plan A.1 addition #3):
       - earliest non-NULL created_at per type (already-populated rows are the
         lower bound for when the substrate started carrying timestamps).
       - git log for the pgvector migration commit.
       - any ChromaDB sqlite still on disk.
       - propose a sentinel with reasoning, or flag as arbitrary.
  6. 50-row stratified sample: derived (type, created_at, source) per row.

Usage: venv/bin/python3 scripts/experiments/embeddings_backfill_inspection.py

Read-only. No DB writes. No filesystem writes outside experiments/.
"""
import json
import os
import random
import re
import subprocess
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path

import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv

# Load ~/aaronai/.env into the process environment; this has to run before
# PG_DSN is read from the environment on the next statement.
load_dotenv(Path.home() / "aaronai" / ".env")

# Connection string handed to psycopg2.connect() in get_pg(); None when unset.
PG_DSN = os.getenv("PG_DSN")
# JSON file parsed by load_watcher_state() (path -> mtime mapping).
WATCHER_STATE = Path.home() / "aaronai" / "watcher_state.json"
# Directory globbed for conversations*.json in section 4.
CHATGPT_EXPORT_DIR = Path("/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export")
# Section 3a counts metadata.filepath values that do not sit under this root.
NEXTCLOUD_ROOT = Path("/home/aaron/nextcloud/data/data/aaron/files")
# JSON sidecar written by main(); the date part is the local date at import time
# (datetime.now() is called without a timezone).
OUT_PATH = Path.home() / "aaronai" / "experiments" / f"embeddings_backfill_inspection_{datetime.now().strftime('%Y-%m-%d')}.json"

# File extensions (lowercase, with leading dot) that sections 2 and 6 treat as
# evidence that a row is a 'document'.
SUPPORTED_EXT = {".pdf", ".docx", ".pptx", ".txt", ".md"}

# NOTE(review): no call into the `random` module is visible in this file. The
# sampling queries use SQL `ORDER BY random()`, which runs inside PostgreSQL and
# is not influenced by this seed — confirm whether reproducible samples are
# actually expected.
random.seed(20260503)


# ─── Helpers ────────────────────────────────────────────────────────────────

def get_pg():
    """Open a read-only PostgreSQL connection whose cursors return dict-like rows.

    The connection is built from the module-level ``PG_DSN`` and uses
    ``RealDictCursor`` as its default cursor factory, so every fetched row can
    be indexed by column name.

    Returns:
        An open psycopg2 connection with its session marked read-only.

    Raises:
        RuntimeError: if ``PG_DSN`` was not found in the environment.
        psycopg2.Error: if the connection attempt itself fails.
    """
    # Without this guard psycopg2 raises TypeError("missing dsn and no
    # parameters"), which says nothing about the missing environment variable.
    if PG_DSN is None:
        raise RuntimeError("PG_DSN is not set; define it in ~/aaronai/.env or the environment")
    conn = psycopg2.connect(PG_DSN, cursor_factory=RealDictCursor)
    # This script promises "No DB writes"; let the server enforce that promise.
    conn.set_session(readonly=True)
    return conn


def header(title):
    """Print a top-level section banner: blank line, rule, title, rule."""
    rule = "=" * 70
    # An empty first item plus sep="\n" yields the leading blank line.
    print("", rule, title, rule, sep="\n")


def sub(title):
    """Print a sub-section marker line, preceded by one blank line."""
    print("\n--- {} ---".format(title))


def fmt_ts_from_unix(ts):
    """Format a unix timestamp as an ISO-8601 UTC string with a ``Z`` suffix.

    Args:
        ts: seconds since the epoch, as a number or a numeric string
            (watcher state stores unix timestamps as strings).

    Returns:
        A string such as ``"1970-01-01T00:00:00Z"``, or ``None`` when ``ts``
        cannot be interpreted as a representable timestamp.
    """
    try:
        moment = datetime.fromtimestamp(float(ts), tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        # TypeError/ValueError: not numeric (None, "abc", NaN);
        # OverflowError/OSError/ValueError: outside the platform's range.
        return None
    return moment.isoformat().replace("+00:00", "Z")


def fmt_ts_from_st_mtime(p):
    """Return a file's modification time as an ISO-8601 UTC string (``Z`` suffix).

    Args:
        p: a ``pathlib.Path`` (or any object with a compatible ``stat()``).

    Returns:
        A string such as ``"2024-01-01T12:00:00Z"``, or ``None`` when the file
        cannot be stat'ed (missing, permission denied, ...) or its mtime is not
        representable as a datetime.
    """
    try:
        mtime = p.stat().st_mtime
        moment = datetime.fromtimestamp(mtime, tz=timezone.utc)
    except (OSError, OverflowError, ValueError):
        # OSError covers FileNotFoundError / PermissionError from stat();
        # the others cover an mtime outside the platform's datetime range.
        return None
    return moment.isoformat().replace("+00:00", "Z")


def load_watcher_state():
    """Load watcher_state.json and index its entries by path and by basename.

    Returns:
        A ``(by_path, by_name)`` tuple:
          - ``by_path``: the parsed JSON mapping itself (path -> mtime value).
          - ``by_name``: ``defaultdict(list)`` mapping each path's basename to
            a list of ``(path, mtime)`` pairs; more than one pair means the
            basename is ambiguous (a filename collision).

    Raises:
        OSError: if ``WATCHER_STATE`` cannot be read.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the locale's default, so a
    # non-ASCII path in the file parses the same under any locale (section 4
    # reads its JSON the same way).
    state = json.loads(WATCHER_STATE.read_text(encoding="utf-8"))
    by_name = defaultdict(list)
    for path, mtime in state.items():
        by_name[Path(path).name].append((path, mtime))
    return state, by_name


def classify_collision_shape(paths):
    """Label a group of same-named paths by how many look live vs. old copies.

    A path is "backup-like" when its lowercased form contains ``.bak``,
    ``/backup`` or ``backups/``; "archive-like" when it contains ``/archive/``;
    and "live" when it is neither.

    Returns one of:
       - 'live+backup'           : one live path, the rest backup-like only
       - 'live+archive'          : one live path, the rest archive-like only
       - 'live+mixed-old'        : one live path plus both backup- and
                                   archive-like paths
       - 'multi-live'            : two or more live paths
       - 'all-archive-or-backup' : no live path at all (also for empty input)
       - 'other'                 : one live path and nothing else
    """
    live_count = 0
    backup_count = 0
    archive_count = 0
    for candidate in paths:
        lowered = candidate.lower()
        backup_like = ".bak" in lowered or "/backup" in lowered or "backups/" in lowered
        archive_like = "/archive/" in lowered
        if backup_like:
            backup_count += 1
        if archive_like:
            archive_count += 1
        if not (backup_like or archive_like):
            live_count += 1

    if live_count >= 2:
        return "multi-live"
    if live_count == 0:
        return "all-archive-or-backup"

    # Exactly one live path from here on.
    if backup_count and not archive_count:
        return "live+backup"
    if archive_count and not backup_count:
        return "live+archive"
    if backup_count or archive_count:
        return "live+mixed-old"
    return "other"


# ─── Section 1: Cohort recap ────────────────────────────────────────────────

def section_1_cohort_recap(cur):
    """Print section 1: table-wide NULL counts and the (type, ca_null) cohorts.

    Args:
        cur: an open cursor whose rows are addressable by column name.

    Returns:
        ``{"overall": <single totals row>, "cohorts": <list of cohort rows>}``.
    """
    header("1. COHORT RECAP")

    # One-row summary of how many rows miss type and/or created_at.
    cur.execute("""
        SELECT
          COUNT(*) AS total,
          COUNT(*) FILTER (WHERE type IS NULL) AS type_null,
          COUNT(*) FILTER (WHERE created_at IS NULL) AS ca_null,
          COUNT(*) FILTER (WHERE type IS NULL AND created_at IS NULL) AS both_null,
          COUNT(*) FILTER (WHERE type IS NOT NULL AND created_at IS NOT NULL) AS both_set
        FROM embeddings;
    """)
    totals = cur.fetchone()
    summary_line = (
        f"Total: {totals['total']}  type_null: {totals['type_null']}  "
        f"ca_null: {totals['ca_null']}  both_null: {totals['both_null']}  "
        f"both_set: {totals['both_set']}"
    )
    print(summary_line)

    # Breakdown per type, split by whether created_at is missing.
    cur.execute("""
        SELECT type, created_at IS NULL AS ca_null, COUNT(*) AS n
        FROM embeddings GROUP BY type, ca_null ORDER BY type NULLS LAST, ca_null;
    """)
    breakdown = cur.fetchall()
    sub("Per-(type, ca_null) cohorts")
    for cohort in breakdown:
        type_label = cohort['type'] or 'NULL'
        ca_flag = str(cohort['ca_null'])
        print(f"  type={type_label:<22}  ca_null={ca_flag:<5}  n={cohort['n']}")

    return {"overall": totals, "cohorts": breakdown}


# ─── Section 2: Cohort A type inference ─────────────────────────────────────

def section_2_type_inference(cur):
    """Print section 2: how many NULL-type rows an extension classifier covers.

    Rows whose ``type`` is NULL are grouped by the lowercased extension of
    ``source`` (the text from the last ``.`` to the end). An extension listed
    in ``SUPPORTED_EXT`` counts as classifiable as 'document'; anything else,
    including rows without an extension, counts as unclassifiable.

    Args:
        cur: an open cursor whose rows are addressable by column name.

    Returns:
        ``{"by_ext": <rows>, "classified": <int>, "unclassifiable": <int>}``.
    """
    header("2. COHORT A TYPE INFERENCE (extension classifier)")
    # Raw string: the regex contains "\." which is an invalid escape sequence
    # in a normal Python literal (a SyntaxWarning on recent interpreters).
    cur.execute(r"""
        SELECT LOWER(SUBSTRING(source FROM '\.[^.]+$')) AS ext, COUNT(*) AS rows
        FROM embeddings WHERE type IS NULL
        GROUP BY ext ORDER BY rows DESC;
    """)
    by_ext = cur.fetchall()

    classified = 0
    unknown = 0
    print("NULL-type rows by extension:")
    for r in by_ext:
        # ext is None when source has no extension; None is never in the set.
        supported = r["ext"] in SUPPORTED_EXT
        if supported:
            classified += r["rows"]
        else:
            unknown += r["rows"]
        flag = "OK" if supported else "??"
        print(f"  {flag}  {r['ext'] or '(none)':<8}  rows={r['rows']}")
    print(f"\nClassified as 'document' via extension: {classified}")
    print(f"Unclassifiable (no SUPPORTED extension): {unknown}")
    return {"by_ext": by_ext, "classified": classified, "unclassifiable": unknown}


# ─── Section 3: created_at inference ────────────────────────────────────────

def section_3_created_at_inference(cur):
    """Print section 3: where created_at could come from for file-derived rows.

    Three sub-reports are produced:
      3a. Rows with ``metadata.filepath``: does the file exist, and is the
          path under ``NEXTCLOUD_ROOT``?
      3b. Rows without a filepath (type NULL or 'document'): how many distinct
          ``source`` values map to exactly one, several, or no entries in
          watcher_state.json when looked up as a basename.
      3c. Every basename collision in watcher_state.json, classified by
          ``classify_collision_shape`` with up to three samples per shape.

    Args:
        cur: an open cursor whose rows are addressable by column name.

    Returns:
        A JSON-friendly dict with the counts and samples printed above.
    """
    def mtime_as_float(raw):
        """Numeric mtime for ordering; unparseable values sort as the oldest."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            return float("-inf")

    header("3. CREATED_AT INFERENCE — file-derived rows")
    by_path, by_name = load_watcher_state()
    # A basename tracked under more than one path is a collision.
    collisions = {name: entries for name, entries in by_name.items() if len(entries) > 1}
    print(f"watcher_state.json: {len(by_path)} tracked paths, "
          f"{len(by_name)} distinct filenames, "
          f"{len(collisions)} filename collisions")

    # 3a. Rows with metadata.filepath: probe stat()
    sub("3a. Rows with metadata.filepath — stat probe")
    cur.execute("""
        SELECT id, source, metadata->>'filepath' AS filepath
        FROM embeddings
        WHERE created_at IS NULL AND metadata->>'filepath' IS NOT NULL;
    """)
    rows_with_fp = cur.fetchall()
    fp_exists = 0
    fp_missing = 0
    fp_outside_root = 0
    sample_resolved = []
    # Many rows can point at the same file; stat each distinct path only once.
    exists_cache = {}
    for r in rows_with_fp:
        p = Path(r["filepath"])
        # Compare path components, not string prefixes: a sibling directory
        # such as ".../files_versions" must not count as inside ".../files".
        if p != NEXTCLOUD_ROOT and NEXTCLOUD_ROOT not in p.parents:
            fp_outside_root += 1
        on_disk = exists_cache.get(p)
        if on_disk is None:
            on_disk = exists_cache[p] = p.exists()
        if on_disk:
            fp_exists += 1
            if len(sample_resolved) < 5:
                sample_resolved.append({
                    "id": r["id"], "source": r["source"],
                    "filepath": str(p), "mtime": fmt_ts_from_st_mtime(p),
                })
        else:
            fp_missing += 1
    print(f"  rows with metadata.filepath: {len(rows_with_fp)}")
    print(f"    exists on disk:      {fp_exists}")
    print(f"    missing on disk:     {fp_missing}")
    print(f"    outside Nextcloud root: {fp_outside_root}")
    print("  Sample of 5 resolved mtimes:")
    for s in sample_resolved:
        print(f"    {s['id']:<15} {s['source'][:60]:<60}  mtime={s['mtime']}")

    # 3b. Rows without metadata.filepath: watcher_state lookup
    sub("3b. Rows without metadata.filepath — watcher_state lookup")
    # Explicit parentheses: same rows as the earlier
    # "... AND type IS NULL OR (type='document' AND ...)" form, without
    # depending on AND/OR precedence.
    cur.execute("""
        SELECT id, source FROM embeddings
        WHERE created_at IS NULL
          AND metadata->>'filepath' IS NULL
          AND (type IS NULL OR type = 'document');
    """)
    rows_no_fp = cur.fetchall()
    # Distinct source values, looked up as basenames in watcher_state.
    basenames_to_resolve = sorted({r["source"] for r in rows_no_fp if r["source"]})
    n_resolved_unique = 0
    n_collision_unique = 0
    n_unfound = 0
    for name in basenames_to_resolve:
        # .get() so the defaultdict is not populated with empty entries.
        hits = len(by_name.get(name, ()))
        if hits == 0:
            n_unfound += 1
        elif hits == 1:
            n_resolved_unique += 1
        else:
            n_collision_unique += 1
    print(f"  rows without filepath: {len(rows_no_fp)}")
    print(f"  distinct source basenames to resolve: {len(basenames_to_resolve)}")
    print(f"    unique watcher_state hit (no collision): {n_resolved_unique}")
    print(f"    collision in watcher_state (>1 path):    {n_collision_unique}")
    print(f"    not in watcher_state at all:             {n_unfound}")

    # 3c. Collision-shape audit
    sub("3c. Collision-shape audit — all collisions in watcher_state")
    shape_counts = Counter()
    rows_affected_by_shape = Counter()
    # Map from basename to count of NULL-ca rows that need it (rows_no_fp)
    rows_no_fp_by_name = Counter(r["source"] for r in rows_no_fp)
    sample_per_shape = defaultdict(list)
    for name, paths_mtimes in collisions.items():
        shape = classify_collision_shape([p for p, _ in paths_mtimes])
        shape_counts[shape] += 1
        rows_affected_by_shape[shape] += rows_no_fp_by_name.get(name, 0)
        if len(sample_per_shape[shape]) < 3:
            # Newest first; a malformed mtime sorts last instead of raising.
            newest_first = sorted(paths_mtimes, key=lambda pm: -mtime_as_float(pm[1]))
            sample_per_shape[shape].append({
                "name": name,
                "rows_no_fp_using_this_name": rows_no_fp_by_name.get(name, 0),
                "candidates": [
                    {"path": p, "mtime": fmt_ts_from_unix(m)}
                    for p, m in newest_first
                ],
            })
    print(f"  collisions in watcher_state: {len(collisions)}")
    print("  shape breakdown:")
    for shape, n in shape_counts.most_common():
        print(f"    {shape:<22}  collisions={n:<4}  rows_affected={rows_affected_by_shape[shape]}")
    print("\n  Up-to-3 sample collisions per shape (sorted by mtime desc):")
    for shape, samples in sample_per_shape.items():
        print(f"  [{shape}]")
        for s in samples:
            print(f"    {s['name']}  (rows_no_fp using this name: {s['rows_no_fp_using_this_name']})")
            for c in s["candidates"]:
                print(f"        {c['mtime']}  {c['path']}")

    return {
        "watcher_state_paths": len(by_path),
        "watcher_state_basenames": len(by_name),
        "watcher_state_collisions": len(collisions),
        "rows_with_filepath": {
            "total": len(rows_with_fp),
            "exists": fp_exists, "missing": fp_missing,
            "outside_root": fp_outside_root,
            "sample": sample_resolved,
        },
        "rows_without_filepath": {
            "total": len(rows_no_fp),
            "distinct_basenames": len(basenames_to_resolve),
            "unique_hit": n_resolved_unique,
            "collision_hit": n_collision_unique,
            "unfound": n_unfound,
        },
        "collision_shapes": {
            "total": len(collisions),
            "shape_counts": dict(shape_counts),
            "rows_affected_by_shape": dict(rows_affected_by_shape),
            "samples": dict(sample_per_shape),
        },
    }


# ─── Section 4: ChatGPT export resolution ───────────────────────────────────

def section_4_chatgpt_export(cur):
    """Print section 4: can B-chatgpt rows recover created_at from the export?

    Builds a ``conversation id -> create_time`` index from every
    ``conversations*.json`` file in ``CHATGPT_EXPORT_DIR``, resolves a random
    sample of up to 5 ``chatgpt_conversation`` rows lacking ``created_at``
    against it, and estimates coverage for the whole cohort.

    Args:
        cur: an open cursor whose rows are addressable by column name.

    Returns:
        ``{"export_dir_exists": False, "files": []}`` when the export
        directory is absent; otherwise a dict with the export file listing,
        index size, per-row sample results and full-cohort coverage counts.
    """
    header("4. CHATGPT EXPORT RESOLUTION (Plan addition #1)")
    print(f"Probing: {CHATGPT_EXPORT_DIR}")
    if not CHATGPT_EXPORT_DIR.exists():
        print("  NOT FOUND — plan on sentinel for entire B-chatgpt cohort.")
        return {"export_dir_exists": False, "files": []}
    files = sorted(CHATGPT_EXPORT_DIR.glob("conversations*.json"))
    # Stat each file once and reuse the result for the report and the JSON.
    file_info = [
        {"name": f.name, "size": f.stat().st_size, "mtime": fmt_ts_from_st_mtime(f)}
        for f in files
    ]
    print(f"  found {len(files)} export file(s):")
    for info in file_info:
        print(f"    {info['name']}  size={info['size']:,}  mtime={info['mtime']}")

    # Build convo_id -> create_time index from all export files.
    print("\nLoading export(s) to build convo_id -> create_time index...")
    convo_index = {}
    for f in files:
        try:
            data = json.loads(f.read_text(encoding="utf-8"))
        except Exception as e:
            # Best-effort survey: report the unreadable file and keep going.
            print(f"  failed to parse {f.name}: {e}")
            continue
        if not isinstance(data, list):
            print(f"  skipping {f.name}: expected a JSON list of conversations, "
                  f"got {type(data).__name__}")
            continue
        for convo in data:
            if not isinstance(convo, dict):
                continue
            cid = convo.get("id") or convo.get("conversation_id")
            ct = convo.get("create_time")
            if cid and ct is not None:
                convo_index[cid] = ct
    print(f"  indexed {len(convo_index)} conversations across {len(files)} export files")

    # Sample 5 chatgpt_conversation rows; resolve.
    cur.execute("""
        SELECT id, source FROM embeddings
        WHERE type='chatgpt_conversation' AND created_at IS NULL
        ORDER BY random() LIMIT 5;
    """)
    sample = cur.fetchall()
    sub("Sample of 5 B-chatgpt rows: convo lookup")
    # IDs look like chatgpt_<uuid>_<idx>; uuid extends until last underscore.
    id_pattern = re.compile(r"^chatgpt_(.+)_(\d+)$")
    resolved = 0
    sample_results = []
    for r in sample:
        m = id_pattern.match(r["id"])
        cid = m.group(1) if m else None
        ct = convo_index.get(cid)
        # Same formatting as every other timestamp in the report.
        ct_iso = fmt_ts_from_unix(ct) if ct is not None else None
        if ct_iso:
            resolved += 1
        sample_results.append({
            "id": r["id"], "source": r["source"], "convo_id": cid,
            "create_time": ct, "create_time_iso": ct_iso,
            "resolved": ct_iso is not None,
        })
        print(f"  {r['id']}  cid={cid}")
        print(f"     -> create_time={ct}  iso={ct_iso}")

    # The cohort may hold fewer than 5 rows (or none), so judge against the
    # number of rows actually sampled rather than a fixed 5.
    n_sampled = len(sample)
    if n_sampled == 0:
        verdict = "NO ROWS SAMPLED — B-chatgpt cohort is empty, nothing to resolve."
    elif resolved == n_sampled:
        verdict = "PROCEED with re-derive for full cohort."
    else:
        verdict = "PARTIAL — plan re-derive + sentinel for unresolved."
    print(f"\nResolved {resolved}/{n_sampled}. {verdict}")

    # Estimate full-cohort coverage by counting how many B-chatgpt convo_ids appear in the index.
    cur.execute("""
        SELECT DISTINCT regexp_replace(id, '^chatgpt_(.+)_\\d+$', '\\1') AS cid
        FROM embeddings WHERE type='chatgpt_conversation' AND created_at IS NULL;
    """)
    distinct_cids = [r["cid"] for r in cur.fetchall()]
    in_index = sum(1 for c in distinct_cids if c in convo_index)
    print(f"Full-cohort coverage estimate: {in_index} / {len(distinct_cids)} distinct convo_ids "
          f"resolvable from export.")
    return {
        "export_dir_exists": True,
        "files": file_info,
        "convo_index_size": len(convo_index),
        "sample_results": sample_results,
        "sample_resolved": resolved,
        "full_cohort": {
            "distinct_convo_ids": len(distinct_cids),
            "resolvable_from_export": in_index,
            "unresolvable": len(distinct_cids) - in_index,
        },
    }


# ─── Section 5: Sentinel date discovery ─────────────────────────────────────

def _git_log_lines(extra_args):
    """Run ``git log --all`` in ~/aaronai and return its output lines.

    Args:
        extra_args: arguments appended after the fixed ``--format`` option.

    Raises:
        RuntimeError: if git exits with a non-zero status.
        OSError, subprocess.SubprocessError: if git cannot be started or
            exceeds the 10-second timeout.
    """
    proc = subprocess.run(
        ["git", "log", "--all", "--format=%H %ci %s", *extra_args],
        cwd=str(Path.home() / "aaronai"), capture_output=True, text=True, timeout=10,
    )
    if proc.returncode != 0:
        # Surface the failure instead of reporting an empty history.
        raise RuntimeError(proc.stderr.strip() or f"git exited with status {proc.returncode}")
    return proc.stdout.strip().splitlines()


def section_5_sentinel(cur):
    """Print section 5: evidence for a sentinel created_at, plus the proposal.

    Gathers the earliest/latest populated ``created_at`` per type, git history
    touching the pgvector migration, and any ChromaDB sqlite files still under
    ``~/aaronai``, then prints the proposed sentinel date with its reasoning.
    The proposal itself is a fixed value; it is not computed from the findings.

    Args:
        cur: an open cursor whose rows are addressable by column name.

    Returns:
        A dict with ``earliest_per_type``, ``git_findings``,
        ``chromadb_candidates``, ``proposed_sentinel`` and ``reasoning``.
    """
    header("5. SENTINEL DATE DISCOVERY (Plan addition #3)")
    git_errors = (OSError, subprocess.SubprocessError, RuntimeError)

    # 5a. Earliest non-NULL created_at per type: lower bound on substrate age.
    sub("5a. Earliest non-NULL created_at per type")
    cur.execute("""
        SELECT type, MIN(created_at) AS earliest, MAX(created_at) AS latest, COUNT(*) AS rows
        FROM embeddings WHERE created_at IS NOT NULL GROUP BY type ORDER BY type;
    """)
    rows = cur.fetchall()
    for r in rows:
        # type may be NULL, and None rejects a width spec. str() on the
        # timestamp matters too: a datetime would read "<32" as a strftime
        # pattern and print it literally.
        print(f"  {r['type'] or 'NULL':<22}  earliest={str(r['earliest']):<32}  latest={r['latest']}")

    # 5b. git log for the pgvector-migration commit.
    sub("5b. Git log — pgvector migration commits")
    git_findings = []
    try:
        migration_lines = _git_log_lines(
            ["--", "deprecated/migrate_to_pgvector.py", "scripts/migrate_to_pgvector.py"]
        )
    except git_errors as e:
        print(f"  git log failed: {e}")
    else:
        for line in migration_lines:
            print(f"  {line}")
            git_findings.append(line)
    # Also: when did the api/ingest scripts cut over to pgvector?
    try:
        grep_lines = _git_log_lines(["--grep=pgvector", "-i"])
    except git_errors as e:
        print(f"  git log (pgvector grep) failed: {e}")
    else:
        print("\n  Commits mentioning pgvector:")
        for line in grep_lines[:10]:
            print(f"    {line}")
            git_findings.append(line)

    # 5c. ChromaDB sqlite still on disk?
    sub("5c. ChromaDB dump on disk?")
    candidates = []
    seen_paths = set()
    for root in [Path.home() / "aaronai", Path.home() / "aaronai" / "db"]:
        if root.exists():
            for p in root.rglob("chroma*.sqlite*"):
                # The second root lies inside the first, so the recursive
                # glob would otherwise list files under db/ twice.
                if str(p) in seen_paths:
                    continue
                seen_paths.add(str(p))
                candidates.append({"path": str(p), "mtime": fmt_ts_from_st_mtime(p)})
    if candidates:
        for c in candidates:
            print(f"  found: {c['path']}  mtime={c['mtime']}")
    else:
        print("  no ChromaDB sqlite found under ~/aaronai")

    # 5d. Propose sentinel.
    sub("5d. Sentinel proposal")
    # Earliest doc cutover: per query, document=2026-04-30. Migration commit f78b830 was
    # 2026-04-26. Most defensible sentinel for "rows that entered pgvector before NOW()
    # writes were canonical" = the migration commit date.
    proposed = "2026-04-26T00:00:00Z"
    reasoning = (
        "git f78b830 'Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, "
        "dream.py' is dated 2026-04-26. The earliest type='document' row with a non-NULL "
        "created_at lands 2026-04-30 (the F11 canonical-encoding cutover). Rows with NULL "
        "created_at all predate F11 and most predate the pgvector cutover itself. "
        "2026-04-26 is the date the ChromaDB->pgvector migration script was committed, "
        "so any row currently in the embeddings table with NULL created_at must have been "
        "ingested on or after that date (when the table came into existence in current form). "
        "It is the tightest defensible upper bound on 'the row entered pgvector before "
        "timestamps were tracked', so it is the right sentinel."
    )
    print(f"  Proposed sentinel: {proposed}")
    print(f"  Reasoning: {reasoning}")

    return {
        "earliest_per_type": rows,
        "git_findings": git_findings,
        "chromadb_candidates": candidates,
        "proposed_sentinel": proposed,
        "reasoning": reasoning,
    }


# ─── Section 6: 50-row stratified sample ────────────────────────────────────

def _infer_created_at(filepath, source, by_name, sentinel_iso):
    """Derive a created_at candidate for a row that has none.

    Sources are tried in order: the mtime of ``filepath`` on disk, then the
    watcher_state entries recorded under ``source`` as a basename (the newest
    one when several paths share it), then the sentinel.

    Returns:
        ``(iso_timestamp, provenance_label)``.
    """
    def mtime_or_oldest(raw):
        # An unparseable mtime must not abort the report; rank it oldest.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return float("-inf")

    if filepath:
        # None means the file is missing or cannot be stat'ed; fall through to
        # the next source rather than reporting an empty "filepath_stat".
        stamped = fmt_ts_from_st_mtime(Path(filepath))
        if stamped is not None:
            return stamped, "filepath_stat"

    # .get() so the defaultdict is not populated by the lookup.
    candidates = by_name.get(source) if source else None
    if candidates:
        if len(candidates) == 1:
            return fmt_ts_from_unix(candidates[0][1]), "watcher_state_unique"
        # take most recent
        latest = max(candidates, key=lambda entry: mtime_or_oldest(entry[1]))
        return (fmt_ts_from_unix(latest[1]),
                f"watcher_state_collision_pick_latest_of_{len(candidates)}")

    return sentinel_iso, "sentinel"


def section_6_stratified_sample(cur, sentinel_iso):
    """Print section 6: a 50-row stratified sample with derived values.

    For each of six fixed cohorts a random sample is drawn and, per row, the
    type and created_at the backfill would assign are shown beside the
    existing values. Existing values are kept as-is ("preserved"); a missing
    type becomes 'document' when ``source`` ends in a ``SUPPORTED_EXT``
    extension and '?' otherwise.

    Args:
        cur: an open cursor whose rows are addressable by column name.
        sentinel_iso: fallback created_at used when nothing else resolves.

    Returns:
        A list with one dict per sampled row (cohort label, id, source,
        existing and inferred type/created_at, and the inference provenance).
    """
    header("6. 50-ROW STRATIFIED SAMPLE — derived (type, created_at, source)")
    _by_path, by_name = load_watcher_state()
    doc_suffixes = tuple(SUPPORTED_EXT)

    cohorts = [
        ("A (type NULL, ca NULL)", "type IS NULL AND created_at IS NULL", 10),
        ("B-doc-old (type='document', ca NULL)", "type='document' AND created_at IS NULL", 10),
        ("B-chatgpt (type='chatgpt_conversation', ca NULL)", "type='chatgpt_conversation' AND created_at IS NULL", 10),
        ("C-doc-new (type='document', ca set)", "type='document' AND created_at IS NOT NULL", 10),
        ("C-claude (type='claude_conversation', ca set)", "type='claude_conversation' AND created_at IS NOT NULL", 5),
        ("C-aaronai (type='aaronai_conversation', ca set)", "type='aaronai_conversation' AND created_at IS NOT NULL", 5),
    ]

    samples = []
    for label, predicate, n in cohorts:
        sub(f"{label}  (sample size: {n})")
        # The predicate is one of the fixed literals above, never user input,
        # so interpolating it into the statement is safe.
        cur.execute(f"""
            SELECT id, source, type, created_at, metadata
            FROM embeddings WHERE {predicate}
            ORDER BY random() LIMIT %s;
        """, (n,))
        for r in cur.fetchall():
            row_meta = r["metadata"] or {}
            source = r["source"]
            looks_like_doc = (source or "").lower().endswith(doc_suffixes)
            inferred_type = r["type"] or ("document" if looks_like_doc else "?")
            if r["created_at"]:
                inferred_ca, inferred_ca_source = r["created_at"], "preserved"
            else:
                inferred_ca, inferred_ca_source = _infer_created_at(
                    row_meta.get("filepath"), source, by_name, sentinel_iso
                )
            print(f"  id={r['id']:<22} src={(source or '')[:38]:<38}")
            print(f"     existing: type={r['type']!r:<22}  ca={r['created_at']!r}")
            print(f"     inferred: type={inferred_type!r:<22}  ca={inferred_ca!r}  ({inferred_ca_source})")
            samples.append({
                "cohort": label, "id": r["id"], "source": source,
                "existing_type": r["type"], "existing_ca": r["created_at"],
                "inferred_type": inferred_type, "inferred_ca": inferred_ca,
                "inferred_ca_source": inferred_ca_source,
            })
    return samples


# ─── Driver ─────────────────────────────────────────────────────────────────

def main():
    """Run all six report sections, then write the JSON sidecar to OUT_PATH.

    Each section prints its part of the report to stdout and returns a
    JSON-friendly structure, stored under ``section_1`` .. ``section_6``.
    Section 6 receives the sentinel proposed by section 5.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()

        out = {"generated_at": datetime.now(timezone.utc).isoformat()}
        out["section_1"] = section_1_cohort_recap(cur)
        out["section_2"] = section_2_type_inference(cur)
        out["section_3"] = section_3_created_at_inference(cur)
        out["section_4"] = section_4_chatgpt_export(cur)
        out["section_5"] = section_5_sentinel(cur)
        sentinel_iso = out["section_5"]["proposed_sentinel"]
        out["section_6"] = section_6_stratified_sample(cur, sentinel_iso)
    finally:
        # Release the connection even when a section raises part-way through.
        pg.close()

    # JSON sidecar — strip non-serializables.
    def _serialize(o):
        if isinstance(o, datetime):
            return o.isoformat()
        # Anything else json cannot encode natively is stored as its str().
        return str(o)

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUT_PATH.write_text(json.dumps(out, indent=2, default=_serialize))
    print(f"\nJSON sidecar written: {OUT_PATH}")


if __name__ == "__main__":
    main()