# aaronAI/scripts/sweep_orphans.py

"""One-off: remove embeddings rows that no longer correspond to a file on disk.

Two passes:
  1. Modern rows (metadata.filepath set): check each filepath, delete if missing.
  2. Legacy rows (metadata.filepath null, type='document' only): build a set
     of all basenames present anywhere under NEXTCLOUD_PATH, then delete rows
     whose `source` value (compared as-is) isn't in that set.

Default mode is a dry-run (counts + sample paths, no writes). Pass --apply to
actually delete.
"""

import os
import sys
from pathlib import Path
from collections import defaultdict

from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

import psycopg2

# Root of the Nextcloud file tree; the legacy pass walks it to index basenames.
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
# True when --apply appears on the command line; otherwise the script only reports.
APPLY = "--apply" in sys.argv


def get_pg():
    """Open a new PostgreSQL connection using the DSN in the PG_DSN env var.

    Raises:
        KeyError: if PG_DSN is not set in the environment.
    """
    dsn = os.environ["PG_DSN"]
    connection = psycopg2.connect(dsn)
    return connection


def scan_modern_orphans():
    """Find rows whose recorded metadata.filepath no longer exists on disk.

    Returns:
        A tuple ``(orphans, by_source)``. ``orphans`` is a list of
        ``(id, source, filepath)`` rows whose filepath is missing on disk;
        ``by_source`` maps each ``source`` value to its number of orphan rows.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute(
            "SELECT id, source, metadata->>'filepath' AS filepath "
            "FROM embeddings WHERE metadata->>'filepath' IS NOT NULL"
        )
        rows = cur.fetchall()
    finally:
        # Close the connection even if the query fails; the rows are already
        # in memory, so the filesystem checks below don't need it open.
        pg.close()

    orphans = []
    by_source = defaultdict(int)
    for row in rows:
        fp = row[2]
        # An empty-string filepath is skipped rather than treated as missing.
        if fp and not Path(fp).exists():
            orphans.append(row)
            by_source[row[1]] += 1
    return orphans, by_source


def scan_legacy_orphans():
    """Find legacy document rows whose source is not a filename on disk.

    Considers rows with no metadata.filepath, restricted to type='document'
    so conversations and memory snapshots (synthetic sources, not files on
    disk) aren't flagged as orphans. Walks NEXTCLOUD_PATH once to build the
    set of basenames, then flags rows whose ``source`` is not in that set.

    Returns:
        A tuple ``(orphans, by_source)``. ``orphans`` is a list of
        ``(id, source)`` rows; ``by_source`` maps each ``source`` value to
        its number of orphan rows.

    Raises:
        FileNotFoundError: if NEXTCLOUD_PATH is not an existing directory.
    """
    # A missing root makes rglob yield nothing, which would flag every legacy
    # document row as an orphan. Refuse to scan instead of reporting (and,
    # under --apply, deleting) everything.
    if not NEXTCLOUD_PATH.is_dir():
        raise FileNotFoundError(
            f"{NEXTCLOUD_PATH} is not a directory; refusing to scan for "
            "legacy orphans against an empty file index"
        )

    print(f"  walking {NEXTCLOUD_PATH} to build basename index...")
    on_disk = {p.name for p in NEXTCLOUD_PATH.rglob("*") if p.is_file()}
    print(f"  {len(on_disk):,} files on disk")

    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute(
            "SELECT id, source FROM embeddings "
            "WHERE metadata->>'filepath' IS NULL AND type = 'document'"
        )
        rows = cur.fetchall()
    finally:
        # Close the connection even if the query fails.
        pg.close()

    orphans = []
    by_source = defaultdict(int)
    for row in rows:
        # NOTE(review): `source` is compared as-is, so this presumably relies
        # on legacy rows storing a bare filename — confirm against the writer.
        if row[1] not in on_disk:
            orphans.append(row)
            by_source[row[1]] += 1
    return orphans, by_source


def delete_rows(ids):
    """Delete the embeddings rows with the given ids in one transaction.

    Args:
        ids: iterable of embeddings.id values to delete.

    Returns:
        The number of rows deleted (0 when ``ids`` is empty, in which case no
        database connection is opened).
    """
    ids = list(ids)
    if not ids:
        return 0

    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute("DELETE FROM embeddings WHERE id = ANY(%s)", (ids,))
        deleted = cur.rowcount
        pg.commit()
    finally:
        # Always release the connection; if we get here without committing,
        # closing discards the pending delete.
        pg.close()
    return deleted


def main():
    """Run both orphan scans, print a summary, and delete only under --apply.

    Each pass prints its orphan row count and the ten sources with the most
    orphan rows. Without --apply the function stops after the report.
    """

    def run_pass(title, scan):
        # Print the heading before scanning so any scan progress output
        # appears underneath it, then summarise what the scan returned.
        print(title)
        rows, counts = scan()
        print(f"  {len(rows):,} orphan rows across {len(counts):,} files")
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for source, chunk_count in ranked[:10]:
            print(f"    {chunk_count:>4} chunks — {source}")
        print()
        return rows

    mode = "APPLY (destructive)" if APPLY else "DRY-RUN (no writes)"
    print(f"Mode: {mode}")
    print(f"Target: {NEXTCLOUD_PATH}")
    print()

    modern = run_pass(
        "Pass 1 — modern rows (metadata.filepath set):", scan_modern_orphans
    )
    legacy = run_pass(
        "Pass 2 — legacy rows (no metadata.filepath):", scan_legacy_orphans
    )

    total = len(modern) + len(legacy)
    if total == 0:
        print("Nothing to delete.")
        return

    if not APPLY:
        print(f"Dry-run only. Re-run with --apply to delete {total:,} rows.")
        return

    print(f"Deleting {total:,} orphan rows...")
    deleted_modern = 0
    if modern:
        deleted_modern = delete_rows([row[0] for row in modern])
    deleted_legacy = 0
    if legacy:
        deleted_legacy = delete_rows([row[0] for row in legacy])
    deleted_total = deleted_modern + deleted_legacy
    print(
        f"  modern: {deleted_modern:,}  legacy: {deleted_legacy:,}  "
        f"total: {deleted_total:,}"
    )


if __name__ == "__main__":
    main()