"""Read-only inspection for the embeddings.type / embeddings.created_at backfill
(Improvement #2 / A.1).

Produces a survey of every backfill source-of-truth question without writing to
the database. Output is a human-readable report on stdout plus a JSON sidecar at
experiments/embeddings_backfill_inspection_<YYYY-MM-DD>.json.

Sections:
  1. Cohort recap (counts; should match prior investigation).
  2. Cohort A type inference: extension classifier coverage.
  3. created_at inference for cohort A + B-doc-old:
     - rows with metadata.filepath: stat the file, check existence.
     - rows without filepath: lookup source against watcher_state.json.
     - filename-collision shape audit (live+backup, live+archive, ambiguous).
  4. ChatGPT export resolution (Plan A.1 addition #1):
     - existence of /home/aaron/nextcloud/.../ChatGPT Export/.
     - sample 5 B-chatgpt rows; resolve convo_id -> create_time.
  5. Sentinel date discovery (Plan A.1 addition #3):
     - earliest non-NULL created_at per type (already-populated rows are the
       lower bound for when the substrate started carrying timestamps).
     - git log for the pgvector migration commit.
     - any ChromaDB sqlite still on disk.
     - propose a sentinel with reasoning, or flag as arbitrary.
  6. 50-row stratified sample: derived (type, created_at, source) per row.

Usage:
    venv/bin/python3 scripts/experiments/embeddings_backfill_inspection.py

Read-only. No DB writes. No filesystem writes outside experiments/.
"""
import json
import os
import random
import re
import subprocess
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path

import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv

load_dotenv(Path.home() / "aaronai" / ".env")

PG_DSN = os.getenv("PG_DSN")
WATCHER_STATE = Path.home() / "aaronai" / "watcher_state.json"
CHATGPT_EXPORT_DIR = Path("/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export")
NEXTCLOUD_ROOT = Path("/home/aaron/nextcloud/data/data/aaron/files")
OUT_PATH = (
    Path.home() / "aaronai" / "experiments"
    / f"embeddings_backfill_inspection_{datetime.now().strftime('%Y-%m-%d')}.json"
)
SUPPORTED_EXT = {".pdf", ".docx", ".pptx", ".txt", ".md"}

# NOTE: this seeds Python's RNG only. Every sample in this script is drawn by
# PostgreSQL via ORDER BY random(), which this seed does not make reproducible.
random.seed(20260503)


# ─── Helpers ────────────────────────────────────────────────────────────────
def get_pg():
    """Open a psycopg2 connection to PG_DSN whose cursors return dict-like rows.

    Raises:
        RuntimeError: if PG_DSN is not set in the environment / .env file.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN is not set (expected in ~/aaronai/.env or the environment)")
    return psycopg2.connect(PG_DSN, cursor_factory=RealDictCursor)


def header(title):
    """Print a top-level section banner."""
    bar = "=" * 70
    print(f"\n{bar}\n{title}\n{bar}")


def sub(title):
    """Print a sub-section heading."""
    print(f"\n--- {title} ---")


def _iso_utc(epoch_seconds):
    """Format a unix timestamp (number or numeric string) as ISO-8601 UTC with a 'Z' suffix."""
    stamp = datetime.fromtimestamp(float(epoch_seconds), tz=timezone.utc)
    return stamp.isoformat().replace("+00:00", "Z")


def _mtime_as_float(ts):
    """Return ts as a float, or -inf when it is not numeric, so sorting never raises."""
    try:
        return float(ts)
    except (TypeError, ValueError):
        return float("-inf")


def fmt_ts_from_unix(ts):
    """Watcher state stores unix timestamps as strings.

    Returns the ISO-8601 UTC string for ts, or None when ts cannot be converted.
    """
    try:
        return _iso_utc(ts)
    except (TypeError, ValueError, OverflowError, OSError):
        return None


def fmt_ts_from_st_mtime(p):
    """Return the mtime of Path p as an ISO-8601 UTC string, or None if it cannot be stat'ed."""
    try:
        return _iso_utc(p.stat().st_mtime)
    except (OSError, ValueError, OverflowError):
        return None


def load_watcher_state():
    """Returns (path -> mtime_str), and (basename -> [(path, mtime_str), ...]).

    A missing watcher_state.json is reported on stderr and treated as an empty
    state so the remaining sections of the report can still run.
    """
    try:
        state = json.loads(WATCHER_STATE.read_text())
    except FileNotFoundError:
        print(f"WARNING: {WATCHER_STATE} not found; treating watcher state as empty",
              file=sys.stderr)
        state = {}
    by_path = state
    by_name = defaultdict(list)
    for path, mtime in state.items():
        by_name[Path(path).name].append((path, mtime))
    return by_path, by_name


def classify_collision_shape(paths):
    """Categorize a filename-collision group.

    A path is "backup-like" if it contains '.bak', '/backup' or 'backups/', and
    "archive" if it contains '/archive/' (both case-insensitive); anything else
    is "live". Returns one of:
      - 'live+backup'           : exactly one live path, >=1 backup, no archive
      - 'live+archive'          : exactly one live path, >=1 archive, no backup
      - 'live+mixed-old'        : exactly one live path plus both backups and archives
      - 'multi-live'            : >=2 live paths
      - 'all-archive-or-backup' : no live path at all
      - 'other'                 : anything else (e.g. a single live path on its own)
    """
    def is_backup(p):
        s = p.lower()
        return ".bak" in s or "/backup" in s or "backups/" in s

    def is_archive(p):
        return "/archive/" in p.lower()

    # A path can be both backup-like and archive; it is then counted in both lists.
    backups = [p for p in paths if is_backup(p)]
    archives = [p for p in paths if is_archive(p)]
    live = [p for p in paths if not is_backup(p) and not is_archive(p)]

    if len(live) == 1:
        if backups and not archives:
            return "live+backup"
        if archives and not backups:
            return "live+archive"
        if backups or archives:
            return "live+mixed-old"
        return "other"
    if len(live) >= 2:
        return "multi-live"
    return "all-archive-or-backup"


# ─── Section 1: Cohort recap ────────────────────────────────────────────────
def section_1_cohort_recap(cur):
    """Print NULL/non-NULL counts for type and created_at; return them for the sidecar."""
    header("1. COHORT RECAP")
    cur.execute("""
        SELECT COUNT(*) AS total,
               COUNT(*) FILTER (WHERE type IS NULL) AS type_null,
               COUNT(*) FILTER (WHERE created_at IS NULL) AS ca_null,
               COUNT(*) FILTER (WHERE type IS NULL AND created_at IS NULL) AS both_null,
               COUNT(*) FILTER (WHERE type IS NOT NULL AND created_at IS NOT NULL) AS both_set
        FROM embeddings;
    """)
    overall = cur.fetchone()
    print(f"Total: {overall['total']} type_null: {overall['type_null']} "
          f"ca_null: {overall['ca_null']} both_null: {overall['both_null']} "
          f"both_set: {overall['both_set']}")

    cur.execute("""
        SELECT type, created_at IS NULL AS ca_null, COUNT(*) AS n
        FROM embeddings
        GROUP BY type, ca_null
        ORDER BY type NULLS LAST, ca_null;
    """)
    cohorts = cur.fetchall()
    sub("Per-(type, ca_null) cohorts")
    for r in cohorts:
        print(f" type={r['type'] or 'NULL':<22} ca_null={r['ca_null']!s:<5} n={r['n']}")
    return {"overall": overall, "cohorts": cohorts}


# ─── Section 2: Cohort A type inference ─────────────────────────────────────
def section_2_type_inference(cur):
    """Group NULL-type rows by the extension of `source` and report classifier coverage."""
    header("2. COHORT A TYPE INFERENCE (extension classifier)")
    # Raw string: the regex backslash must reach PostgreSQL untouched.
    cur.execute(r"""
        SELECT LOWER(SUBSTRING(source FROM '\.[^.]+$')) AS ext, COUNT(*) AS rows
        FROM embeddings
        WHERE type IS NULL
        GROUP BY ext
        ORDER BY rows DESC;
    """)
    by_ext = cur.fetchall()
    # ext is NULL (None) for sources without a dot; None is never in SUPPORTED_EXT.
    classified = sum(r["rows"] for r in by_ext if r["ext"] in SUPPORTED_EXT)
    unknown = sum(r["rows"] for r in by_ext if r["ext"] not in SUPPORTED_EXT)
    print("NULL-type rows by extension:")
    for r in by_ext:
        flag = "OK" if r["ext"] in SUPPORTED_EXT else "??"
        print(f" {flag} {r['ext'] or '(none)':<8} rows={r['rows']}")
    print(f"\nClassified as 'document' via extension: {classified}")
    print(f"Unclassifiable (no SUPPORTED extension): {unknown}")
    return {"by_ext": by_ext, "classified": classified, "unclassifiable": unknown}


# ─── Section 3: created_at inference ────────────────────────────────────────
def section_3_created_at_inference(cur):
    """Survey how created_at could be derived for file-backed rows.

    3a stats metadata.filepath where present, 3b looks the remaining rows up in
    watcher_state.json by `source` basename, 3c audits basename collisions.
    Returns a dict of the counts and samples for the JSON sidecar.
    """
    header("3. CREATED_AT INFERENCE — file-derived rows")
    by_path, by_name = load_watcher_state()
    print(f"watcher_state.json: {len(by_path)} tracked paths, "
          f"{len(by_name)} distinct filenames, "
          f"{sum(1 for v in by_name.values() if len(v) > 1)} filename collisions")

    # 3a. Rows with metadata.filepath: probe stat()
    sub("3a. Rows with metadata.filepath — stat probe")
    cur.execute("""
        SELECT id, source, metadata->>'filepath' AS filepath
        FROM embeddings
        WHERE created_at IS NULL AND metadata->>'filepath' IS NOT NULL;
    """)
    rows_with_fp = cur.fetchall()
    fp_exists = 0
    fp_missing = 0
    fp_outside_root = 0
    sample_resolved = []
    for r in rows_with_fp:
        p = Path(r["filepath"])
        # Compare path components, not string prefixes, so a sibling directory
        # such as ".../files_backup" is not mistaken for being under the root.
        if p != NEXTCLOUD_ROOT and NEXTCLOUD_ROOT not in p.parents:
            fp_outside_root += 1
        if p.exists():
            fp_exists += 1
            if len(sample_resolved) < 5:
                sample_resolved.append({
                    "id": r["id"],
                    "source": r["source"],
                    "filepath": str(p),
                    "mtime": fmt_ts_from_st_mtime(p),
                })
        else:
            fp_missing += 1
    print(f" rows with metadata.filepath: {len(rows_with_fp)}")
    print(f" exists on disk: {fp_exists}")
    print(f" missing on disk: {fp_missing}")
    print(f" outside Nextcloud root: {fp_outside_root}")
    print(" Sample of 5 resolved mtimes:")
    for s in sample_resolved:
        print(f" {s['id']:<15} {s['source'][:60]:<60} mtime={s['mtime']}")

    # 3b. Rows without metadata.filepath: watcher_state lookup
    sub("3b. Rows without metadata.filepath — watcher_state lookup")
    # Cohort A (type NULL) plus B-doc-old (type='document'), both without a filepath.
    cur.execute("""
        SELECT id, source
        FROM embeddings
        WHERE created_at IS NULL
          AND metadata->>'filepath' IS NULL
          AND (type IS NULL OR type = 'document');
    """)
    rows_no_fp = cur.fetchall()
    # Distinct source basenames to look up
    basenames_to_resolve = sorted({r["source"] for r in rows_no_fp if r["source"]})
    n_resolved_unique = sum(1 for n in basenames_to_resolve if len(by_name.get(n, [])) == 1)
    n_collision_unique = sum(1 for n in basenames_to_resolve if len(by_name.get(n, [])) > 1)
    n_unfound = sum(1 for n in basenames_to_resolve if n not in by_name)
    print(f" rows without filepath: {len(rows_no_fp)}")
    print(f" distinct source basenames to resolve: {len(basenames_to_resolve)}")
    print(f" unique watcher_state hit (no collision): {n_resolved_unique}")
    print(f" collision in watcher_state (>1 path): {n_collision_unique}")
    print(f" not in watcher_state at all: {n_unfound}")

    # 3c. Collision-shape audit
    sub("3c. Collision-shape audit — all collisions in watcher_state")
    collisions = {name: list(entries) for name, entries in by_name.items() if len(entries) > 1}
    shape_counts = Counter()
    rows_affected_by_shape = Counter()
    # Map from basename to count of NULL-ca rows that need it (rows_no_fp)
    rows_no_fp_by_name = Counter(r["source"] for r in rows_no_fp)
    sample_per_shape = defaultdict(list)
    for name, paths_mtimes in collisions.items():
        shape = classify_collision_shape([p for p, _ in paths_mtimes])
        shape_counts[shape] += 1
        rows_affected_by_shape[shape] += rows_no_fp_by_name.get(name, 0)
        if len(sample_per_shape[shape]) < 3:
            # Newest candidate first; non-numeric mtimes sort last instead of raising.
            newest_first = sorted(paths_mtimes, key=lambda pm: -_mtime_as_float(pm[1]))
            sample_per_shape[shape].append({
                "name": name,
                "rows_no_fp_using_this_name": rows_no_fp_by_name.get(name, 0),
                "candidates": [
                    {"path": p, "mtime": fmt_ts_from_unix(m)} for p, m in newest_first
                ],
            })
    print(f" collisions in watcher_state: {len(collisions)}")
    print(" shape breakdown:")
    for shape, n in shape_counts.most_common():
        print(f" {shape:<22} collisions={n:<4} rows_affected={rows_affected_by_shape[shape]}")
    print("\n Up-to-3 sample collisions per shape (sorted by mtime desc):")
    for shape, samples in sample_per_shape.items():
        print(f" [{shape}]")
        for s in samples:
            print(f" {s['name']} (rows_no_fp using this name: {s['rows_no_fp_using_this_name']})")
            for c in s["candidates"]:
                print(f" {c['mtime']} {c['path']}")

    return {
        "watcher_state_paths": len(by_path),
        "watcher_state_basenames": len(by_name),
        "watcher_state_collisions": len(collisions),
        "rows_with_filepath": {
            "total": len(rows_with_fp),
            "exists": fp_exists,
            "missing": fp_missing,
            "outside_root": fp_outside_root,
            "sample": sample_resolved,
        },
        "rows_without_filepath": {
            "total": len(rows_no_fp),
            "distinct_basenames": len(basenames_to_resolve),
            "unique_hit": n_resolved_unique,
            "collision_hit": n_collision_unique,
            "unfound": n_unfound,
        },
        "collision_shapes": {
            "total": len(collisions),
            "shape_counts": dict(shape_counts),
            "rows_affected_by_shape": dict(rows_affected_by_shape),
            "samples": dict(sample_per_shape),
        },
    }


# ─── Section 4: ChatGPT export resolution ───────────────────────────────────
def section_4_chatgpt_export(cur):
    """Check whether B-chatgpt rows can recover created_at from the ChatGPT export files.

    Builds a convo_id -> create_time index from conversations*.json, resolves a
    random sample of rows against it, and estimates coverage for the full cohort.
    """
    header("4. CHATGPT EXPORT RESOLUTION (Plan addition #1)")
    print(f"Probing: {CHATGPT_EXPORT_DIR}")
    if not CHATGPT_EXPORT_DIR.exists():
        print(" NOT FOUND — plan on sentinel for entire B-chatgpt cohort.")
        return {"export_dir_exists": False, "files": []}

    files = sorted(CHATGPT_EXPORT_DIR.glob("conversations*.json"))
    # Stat each file once; the same facts feed the report and the sidecar.
    file_info = [
        {"name": f.name, "size": f.stat().st_size, "mtime": fmt_ts_from_st_mtime(f)}
        for f in files
    ]
    print(f" found {len(files)} export file(s):")
    for info in file_info:
        print(f" {info['name']} size={info['size']:,} mtime={info['mtime']}")

    # Build convo_id -> create_time index from all export files.
    print("\nLoading export(s) to build convo_id -> create_time index...")
    convo_index = {}
    for f in files:
        try:
            data = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:  # ValueError covers JSON and UTF-8 decode errors
            print(f" failed to parse {f.name}: {e}")
            continue
        if not isinstance(data, list):
            print(f" failed to parse {f.name}: expected a JSON list, got {type(data).__name__}")
            continue
        for convo in data:
            if not isinstance(convo, dict):
                continue
            cid = convo.get("id") or convo.get("conversation_id")
            ct = convo.get("create_time")
            if cid and ct is not None:
                convo_index[cid] = ct
    print(f" indexed {len(convo_index)} conversations across {len(files)} export files")

    # Sample 5 chatgpt_conversation rows; resolve.
    cur.execute("""
        SELECT id, source
        FROM embeddings
        WHERE type='chatgpt_conversation' AND created_at IS NULL
        ORDER BY random()
        LIMIT 5;
    """)
    sample = cur.fetchall()
    sub("Sample of 5 B-chatgpt rows: convo lookup")
    resolved = 0
    sample_results = []
    for r in sample:
        # IDs look like chatgpt_<convo_id>_<n>; the convo id extends until the last underscore.
        m = re.match(r"^chatgpt_(.+)_(\d+)$", r["id"])
        cid = m.group(1) if m else None
        ct = convo_index.get(cid)
        ct_iso = fmt_ts_from_unix(ct) if ct is not None else None
        if ct_iso:
            resolved += 1
        sample_results.append({
            "id": r["id"],
            "source": r["source"],
            "convo_id": cid,
            "create_time": ct,
            "create_time_iso": ct_iso,
            "resolved": ct_iso is not None,
        })
        print(f" {r['id']} cid={cid}")
        print(f" -> create_time={ct} iso={ct_iso}")

    # Judge against the rows actually sampled; the cohort may hold fewer than 5.
    if not sample:
        verdict = "NO ROWS — nothing to re-derive."
    elif resolved == len(sample):
        verdict = "PROCEED with re-derive for full cohort."
    else:
        verdict = "PARTIAL — plan re-derive + sentinel for unresolved."
    print(f"\nResolved {resolved}/{len(sample)}. {verdict}")

    # Estimate full-cohort coverage by counting how many B-chatgpt convo_ids appear in the index.
    cur.execute(r"""
        SELECT DISTINCT regexp_replace(id, '^chatgpt_(.+)_\d+$', '\1') AS cid
        FROM embeddings
        WHERE type='chatgpt_conversation' AND created_at IS NULL;
    """)
    distinct_cids = [r["cid"] for r in cur.fetchall()]
    in_index = sum(1 for c in distinct_cids if c in convo_index)
    print(f"Full-cohort coverage estimate: {in_index} / {len(distinct_cids)} distinct convo_ids "
          f"resolvable from export.")

    return {
        "export_dir_exists": True,
        "files": file_info,
        "convo_index_size": len(convo_index),
        "sample_results": sample_results,
        "sample_resolved": resolved,
        "full_cohort": {
            "distinct_convo_ids": len(distinct_cids),
            "resolvable_from_export": in_index,
            "unresolvable": len(distinct_cids) - in_index,
        },
    }


# ─── Section 5: Sentinel date discovery ─────────────────────────────────────
def _git_log(extra_args):
    """Run `git log --all` in ~/aaronai with extra_args and return its stdout lines.

    Raises RuntimeError when git exits non-zero, so the failure is reported
    instead of looking like "no matching commits".
    """
    proc = subprocess.run(
        ["git", "log", "--all", "--format=%H %ci %s", *extra_args],
        cwd=str(Path.home() / "aaronai"),
        capture_output=True,
        text=True,
        timeout=10,
    )
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or f"git exited with status {proc.returncode}")
    return proc.stdout.strip().splitlines()


def section_5_sentinel(cur):
    """Gather evidence for a sentinel created_at value and print the proposal."""
    header("5. SENTINEL DATE DISCOVERY (Plan addition #3)")

    # 5a. Earliest non-NULL created_at per type: lower bound on substrate age.
    sub("5a. Earliest non-NULL created_at per type")
    cur.execute("""
        SELECT type, MIN(created_at) AS earliest, MAX(created_at) AS latest, COUNT(*) AS rows
        FROM embeddings
        WHERE created_at IS NOT NULL
        GROUP BY type
        ORDER BY type;
    """)
    rows = cur.fetchall()
    for r in rows:
        # type may be NULL, and a datetime must be stringified before padding:
        # datetime.__format__ treats the format spec as a strftime pattern.
        print(f" {r['type'] or 'NULL':<22} earliest={r['earliest']!s:<32} latest={r['latest']}")

    # 5b. git log for the pgvector-migration commit.
    sub("5b. Git log — pgvector migration commits")
    git_findings = []
    try:
        migration_lines = _git_log(
            ["--", "deprecated/migrate_to_pgvector.py", "scripts/migrate_to_pgvector.py"]
        )
    except (OSError, subprocess.SubprocessError, RuntimeError) as e:
        print(f" git log failed: {e}")
    else:
        for line in migration_lines:
            print(f" {line}")
            git_findings.append(line)

    # Also: when did the api/ingest scripts cut over to pgvector?
    try:
        grep_lines = _git_log(["--grep=pgvector", "-i"])
    except (OSError, subprocess.SubprocessError, RuntimeError) as e:
        print(f" git log (pgvector grep) failed: {e}")
    else:
        print("\n Commits mentioning pgvector:")
        for line in grep_lines[:10]:
            print(f" {line}")
            git_findings.append(line)

    # 5c. ChromaDB sqlite still on disk?
    sub("5c. ChromaDB dump on disk?")
    candidates = []
    seen_paths = set()  # ~/aaronai/db is inside ~/aaronai, so rglob visits it twice
    for root in [Path.home() / "aaronai", Path.home() / "aaronai" / "db"]:
        if not root.exists():
            continue
        for p in root.rglob("chroma*.sqlite*"):
            if str(p) in seen_paths:
                continue
            seen_paths.add(str(p))
            candidates.append({"path": str(p), "mtime": fmt_ts_from_st_mtime(p)})
    if candidates:
        for c in candidates:
            print(f" found: {c['path']} mtime={c['mtime']}")
    else:
        print(" no ChromaDB sqlite found under ~/aaronai")

    # 5d. Propose sentinel.
    sub("5d. Sentinel proposal")
    # Earliest doc cutover: per query, document=2026-04-30. Migration commit f78b830 was
    # 2026-04-26. Most defensible sentinel for "rows that entered pgvector before NOW()
    # writes were canonical" = the migration commit date.
    proposed = "2026-04-26T00:00:00Z"
    reasoning = (
        "git f78b830 'Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, "
        "dream.py' is dated 2026-04-26. The earliest type='document' row with a non-NULL "
        "created_at lands 2026-04-30 (the F11 canonical-encoding cutover). Rows with NULL "
        "created_at all predate F11 and most predate the pgvector cutover itself. "
        "2026-04-26 is the date the ChromaDB->pgvector migration script was committed, "
        "so any row currently in the embeddings table with NULL created_at must have been "
        "ingested on or after that date (when the table came into existence in current form). "
        "It is the tightest defensible upper bound on 'the row entered pgvector before "
        "timestamps were tracked', so it is the right sentinel."
    )
    print(f" Proposed sentinel: {proposed}")
    print(f" Reasoning: {reasoning}")

    return {
        "earliest_per_type": rows,
        "git_findings": git_findings,
        "chromadb_candidates": candidates,
        "proposed_sentinel": proposed,
        "reasoning": reasoning,
    }


# ─── Section 6: 50-row stratified sample ────────────────────────────────────
def section_6_stratified_sample(cur, sentinel_iso):
    """Sample rows from each cohort and show the (type, created_at) a backfill would derive.

    created_at is kept when already set; otherwise it is derived, in order, from
    the mtime of metadata.filepath, the watcher_state entry for `source` (newest
    entry on a basename collision), or sentinel_iso as the last resort.
    """
    header("6. 50-ROW STRATIFIED SAMPLE — derived (type, created_at, source)")
    by_path, by_name = load_watcher_state()
    # (label, SQL predicate, sample size). Predicates are fixed literals, not user input.
    cohorts = [
        ("A (type NULL, ca NULL)",
         "type IS NULL AND created_at IS NULL", 10),
        ("B-doc-old (type='document', ca NULL)",
         "type='document' AND created_at IS NULL", 10),
        ("B-chatgpt (type='chatgpt_conversation', ca NULL)",
         "type='chatgpt_conversation' AND created_at IS NULL", 10),
        ("C-doc-new (type='document', ca set)",
         "type='document' AND created_at IS NOT NULL", 10),
        ("C-claude (type='claude_conversation', ca set)",
         "type='claude_conversation' AND created_at IS NOT NULL", 5),
        ("C-aaronai (type='aaronai_conversation', ca set)",
         "type='aaronai_conversation' AND created_at IS NOT NULL", 5),
    ]
    supported_suffixes = tuple(SUPPORTED_EXT)
    samples = []
    for label, predicate, n in cohorts:
        sub(f"{label} (sample size: {n})")
        cur.execute(f"""
            SELECT id, source, type, created_at, metadata
            FROM embeddings
            WHERE {predicate}
            ORDER BY random()
            LIMIT %s;
        """, (n,))
        rows = cur.fetchall()
        for r in rows:
            row_meta = r["metadata"] or {}
            fp = row_meta.get("filepath")
            if r["type"]:
                inferred_type = r["type"]
            elif (r["source"] or "").lower().endswith(supported_suffixes):
                inferred_type = "document"
            else:
                inferred_type = "?"

            inferred_ca = r["created_at"]
            inferred_ca_source = "preserved" if inferred_ca else None
            if not inferred_ca:
                if fp and Path(fp).exists():
                    inferred_ca = fmt_ts_from_st_mtime(Path(fp))
                    inferred_ca_source = "filepath_stat"
                elif r["source"] and r["source"] in by_name:
                    candidates = by_name[r["source"]]
                    if len(candidates) == 1:
                        inferred_ca = fmt_ts_from_unix(candidates[0][1])
                        inferred_ca_source = "watcher_state_unique"
                    else:
                        # take most recent
                        latest = max(candidates, key=lambda pm: _mtime_as_float(pm[1]))
                        inferred_ca = fmt_ts_from_unix(latest[1])
                        inferred_ca_source = (
                            f"watcher_state_collision_pick_latest_of_{len(candidates)}"
                        )
                else:
                    inferred_ca = sentinel_iso
                    inferred_ca_source = "sentinel"

            print(f" id={r['id']:<22} src={(r['source'] or '')[:38]:<38}")
            print(f" existing: type={r['type']!r:<22} ca={r['created_at']!r}")
            print(f" inferred: type={inferred_type!r:<22} ca={inferred_ca!r} ({inferred_ca_source})")
            samples.append({
                "cohort": label,
                "id": r["id"],
                "source": r["source"],
                "existing_type": r["type"],
                "existing_ca": r["created_at"],
                "inferred_type": inferred_type,
                "inferred_ca": inferred_ca,
                "inferred_ca_source": inferred_ca_source,
            })
    return samples


# ─── Driver ─────────────────────────────────────────────────────────────────
def main():
    """Run all six sections against one read-only connection and write the JSON sidecar."""
    pg = get_pg()
    try:
        # Enforce the "no DB writes" promise at the session level.
        pg.set_session(readonly=True)
        cur = pg.cursor()
        out = {"generated_at": datetime.now(timezone.utc).isoformat()}
        out["section_1"] = section_1_cohort_recap(cur)
        out["section_2"] = section_2_type_inference(cur)
        out["section_3"] = section_3_created_at_inference(cur)
        out["section_4"] = section_4_chatgpt_export(cur)
        out["section_5"] = section_5_sentinel(cur)
        sentinel_iso = out["section_5"]["proposed_sentinel"]
        out["section_6"] = section_6_stratified_sample(cur, sentinel_iso)
    finally:
        # Release the connection even when a section raises.
        pg.close()

    # JSON sidecar — strip non-serializables.
    def _serialize(o):
        if isinstance(o, datetime):
            return o.isoformat()
        return str(o)

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUT_PATH.write_text(json.dumps(out, indent=2, default=_serialize))
    print(f"\nJSON sidecar written: {OUT_PATH}")


if __name__ == "__main__":
    main()