"""One-off: remove embeddings rows that no longer correspond to a file on disk. Two passes: 1. Modern rows (metadata.filepath set): check each filepath, delete if missing. 2. Legacy rows (metadata.filepath null): build a set of all basenames present anywhere under NEXTCLOUD_PATH, then delete rows whose `source` basename isn't in that set. Default mode is a dry-run (counts + sample paths, no writes). Pass --apply to actually delete. """ import os import sys from pathlib import Path from collections import defaultdict from dotenv import load_dotenv load_dotenv(Path.home() / "aaronai" / ".env", override=True) import psycopg2 NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files") APPLY = "--apply" in sys.argv def get_pg(): return psycopg2.connect(os.environ["PG_DSN"]) def scan_modern_orphans(): """Rows with metadata.filepath whose file doesn't exist on disk.""" pg = get_pg() cur = pg.cursor() cur.execute( "SELECT id, source, metadata->>'filepath' AS filepath " "FROM embeddings WHERE metadata->>'filepath' IS NOT NULL" ) orphans = [] by_source = defaultdict(int) for row in cur.fetchall(): fp = row[2] if fp and not Path(fp).exists(): orphans.append(row) by_source[row[1]] += 1 pg.close() return orphans, by_source def scan_legacy_orphans(): """Rows without metadata.filepath whose basename isn't anywhere under NEXTCLOUD_PATH. Restricted to type='document' so conversations and memory snapshots (which are synthetic sources, not files on disk) aren't flagged as orphans. Walks the filesystem once to build the basename set.""" print(f" walking {NEXTCLOUD_PATH} to build basename index...") on_disk = set() for p in NEXTCLOUD_PATH.rglob("*"): if p.is_file(): on_disk.add(p.name) print(f" {len(on_disk):,} files on disk") pg = get_pg() cur = pg.cursor() cur.execute( "SELECT id, source FROM embeddings " "WHERE metadata->>'filepath' IS NULL AND type = 'document'" ) orphans = [] by_source = defaultdict(int) for row in cur.fetchall(): if row[1] not in on_disk: orphans.append(row) by_source[row[1]] += 1 pg.close() return orphans, by_source def delete_rows(ids): pg = get_pg() cur = pg.cursor() cur.execute("DELETE FROM embeddings WHERE id = ANY(%s)", (list(ids),)) deleted = cur.rowcount pg.commit() pg.close() return deleted def main(): print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}") print(f"Target: {NEXTCLOUD_PATH}") print() print("Pass 1 — modern rows (metadata.filepath set):") modern, modern_by_src = scan_modern_orphans() print(f" {len(modern):,} orphan rows across {len(modern_by_src):,} files") for src, n in sorted(modern_by_src.items(), key=lambda kv: -kv[1])[:10]: print(f" {n:>4} chunks — {src}") print() print("Pass 2 — legacy rows (no metadata.filepath):") legacy, legacy_by_src = scan_legacy_orphans() print(f" {len(legacy):,} orphan rows across {len(legacy_by_src):,} files") for src, n in sorted(legacy_by_src.items(), key=lambda kv: -kv[1])[:10]: print(f" {n:>4} chunks — {src}") print() total = len(modern) + len(legacy) if total == 0: print("Nothing to delete.") return if not APPLY: print(f"Dry-run only. Re-run with --apply to delete {total:,} rows.") return print(f"Deleting {total:,} orphan rows...") n1 = delete_rows([r[0] for r in modern]) if modern else 0 n2 = delete_rows([r[0] for r in legacy]) if legacy else 0 print(f" modern: {n1:,} legacy: {n2:,} total: {n1 + n2:,}") if __name__ == "__main__": main()