watcher: handle deletes; sweep_orphans cleans existing phantom chunks

watcher.py now listens for on_deleted events and treats on_moved events whose destination falls outside NEXTCLOUD_PATH (Nextcloud trashbin, moves to other volumes) as deletes. Both cases call delete_embeddings_for_path (DELETE WHERE metadata.filepath = ...) and remove_from_state, which drops the file from watcher_state.json so its mtime is no longer tracked as known. Matching is by metadata.filepath, not source basename, so files that share a name across folders don't collide. scripts/sweep_orphans.py is the one-time cleanup for chunks the watcher missed before this fix. It runs two passes: (1) Modern pass: rows with metadata.filepath whose file no longer exists. (2) Legacy pass: rows with NULL filepath and type='document' whose basename isn't anywhere on disk; the type='document' restriction skips conversations and memory snapshots (synthetic sources, not files on disk). The first run cleaned 629 rows: 628 from moved-file duplicates (e.g., BirdAI docs that traveled across Journal/, Library/, Journal/Projects/BirdAI/) plus the AARON_NELSON_BIO.pdf phantom Aaron flagged.
2026-05-20 02:52:00 +00:00
parent 9bb083f065
commit 10bb29290a
2 changed files with 192 additions and 1 deletions
@@ -0,0 +1,123 @@
+"""One-off: remove embeddings rows that no longer correspond to a file on disk.
+
+Two passes:
+  1. Modern rows (metadata.filepath set): check each filepath, delete if missing.
+  2. Legacy rows (metadata.filepath null, type='document'): build a set of all
+     basenames present anywhere under NEXTCLOUD_PATH, then delete rows whose
+     `source` basename isn't in that set. Legacy rows of other types are left
+     alone.
+
+Default mode is a dry-run (counts + sample paths, no writes). Pass --apply to
+actually delete.
+"""
+
+import os
+import sys
+from pathlib import Path
+from collections import defaultdict
+
+from dotenv import load_dotenv
+load_dotenv(Path.home() / "aaronai" / ".env", override=True)
+
+import psycopg2
+
+# Root of the Nextcloud files tree; the legacy pass walks it to build its
+# basename index, and main() prints it as the sweep target.
+NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
+# True only when --apply appears on the command line; otherwise main() stops
+# after reporting (dry-run).
+APPLY = "--apply" in sys.argv
+
+
+def get_pg():
+    return psycopg2.connect(os.environ["PG_DSN"])
+
+
+def scan_modern_orphans():
+    """Rows with metadata.filepath whose file doesn't exist on disk."""
+    pg = get_pg()
+    cur = pg.cursor()
+    cur.execute(
+        "SELECT id, source, metadata->>'filepath' AS filepath "
+        "FROM embeddings WHERE metadata->>'filepath' IS NOT NULL"
+    )
+    orphans = []
+    by_source = defaultdict(int)
+    for row in cur.fetchall():
+        fp = row[2]
+        if fp and not Path(fp).exists():
+            orphans.append(row)
+            by_source[row[1]] += 1
+    pg.close()
+    return orphans, by_source
+
+
+def scan_legacy_orphans():
+    """Rows without metadata.filepath whose basename isn't anywhere under
+    NEXTCLOUD_PATH. Restricted to type='document' so conversations and memory
+    snapshots (which are synthetic sources, not files on disk) aren't flagged
+    as orphans. Walks the filesystem once to build the basename set."""
+    print(f"  walking {NEXTCLOUD_PATH} to build basename index...")
+    on_disk = set()
+    for p in NEXTCLOUD_PATH.rglob("*"):
+        if p.is_file():
+            on_disk.add(p.name)
+    print(f"  {len(on_disk):,} files on disk")
+
+    pg = get_pg()
+    cur = pg.cursor()
+    cur.execute(
+        "SELECT id, source FROM embeddings "
+        "WHERE metadata->>'filepath' IS NULL AND type = 'document'"
+    )
+    orphans = []
+    by_source = defaultdict(int)
+    for row in cur.fetchall():
+        if row[1] not in on_disk:
+            orphans.append(row)
+            by_source[row[1]] += 1
+    pg.close()
+    return orphans, by_source
+
+
+def delete_rows(ids):
+    pg = get_pg()
+    cur = pg.cursor()
+    cur.execute("DELETE FROM embeddings WHERE id = ANY(%s)", (list(ids),))
+    deleted = cur.rowcount
+    pg.commit()
+    pg.close()
+    return deleted
+
+
+def main():
+    print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}")
+    print(f"Target: {NEXTCLOUD_PATH}")
+    print()
+
+    print("Pass 1 — modern rows (metadata.filepath set):")
+    modern, modern_by_src = scan_modern_orphans()
+    print(f"  {len(modern):,} orphan rows across {len(modern_by_src):,} files")
+    for src, n in sorted(modern_by_src.items(), key=lambda kv: -kv[1])[:10]:
+        print(f"    {n:>4} chunks — {src}")
+    print()
+
+    print("Pass 2 — legacy rows (no metadata.filepath):")
+    legacy, legacy_by_src = scan_legacy_orphans()
+    print(f"  {len(legacy):,} orphan rows across {len(legacy_by_src):,} files")
+    for src, n in sorted(legacy_by_src.items(), key=lambda kv: -kv[1])[:10]:
+        print(f"    {n:>4} chunks — {src}")
+    print()
+
+    total = len(modern) + len(legacy)
+    if total == 0:
+        print("Nothing to delete.")
+        return
+
+    if not APPLY:
+        print(f"Dry-run only. Re-run with --apply to delete {total:,} rows.")
+        return
+
+    print(f"Deleting {total:,} orphan rows...")
+    n1 = delete_rows([r[0] for r in modern]) if modern else 0
+    n2 = delete_rows([r[0] for r in legacy]) if legacy else 0
+    print(f"  modern: {n1:,}  legacy: {n2:,}  total: {n1 + n2:,}")
+
+
+# Run the sweep only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()