watcher: handle deletes; sweep_orphans cleans existing phantom chunks
watcher.py now listens for on_deleted events and treats on_moved destinations that fall outside NEXTCLOUD_PATH (Nextcloud trashbin, moves to other volumes) as deletes. Both cases call delete_embeddings_for_path (DELETE WHERE metadata.filepath = ...) and remove_from_state to drop the file from watcher_state.json so it isn't carried as known-mtime. Match is by metadata.filepath, not source basename, so files that share a name across folders don't collide.

scripts/sweep_orphans.py is the one-time cleanup for chunks the watcher missed before this fix:
- Modern pass: rows with metadata.filepath whose file no longer exists.
- Legacy pass: rows with NULL filepath and type='document' whose basename isn't anywhere on disk. The type='document' restriction skips conversations and memory snapshots (synthetic sources, not files on disk).

First run cleaned 629 rows: 628 from moved-file duplicates (e.g., BirdAI docs that traveled across Journal/, Library/, Journal/Projects/BirdAI/) plus the AARON_NELSON_BIO.pdf phantom Aaron flagged.
This commit is contained in:
+69
-1
@@ -123,6 +123,42 @@ def resolve_ingest_failure(source: str):
|
||||
log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")
|
||||
|
||||
|
||||
def delete_embeddings_for_path(filepath: Path):
    """Delete every embeddings row recorded for ``filepath``.

    Rows are matched on the ``filepath`` key of the ``metadata`` column
    (``metadata->>'filepath'``) against ``str(filepath)``, so files that
    share a basename in different folders are not confused with each other.
    Legacy rows that carry no filepath metadata are not touched here; they
    are left for sweep_orphans.py.

    Best-effort: any exception (connection, query, commit) is logged as a
    warning and swallowed. The connection obtained from ``get_pg()`` is
    always closed once it has been opened.
    """
    delete_sql = "DELETE FROM embeddings WHERE metadata->>'filepath' = %s"
    try:
        conn = get_pg()
        try:
            cursor = conn.cursor()
            cursor.execute(delete_sql, (str(filepath),))
            removed = cursor.rowcount
            conn.commit()
            # Stay quiet when nothing matched (file was never ingested).
            if removed:
                log.info(f"Deleted {removed} chunks for removed file: {filepath}")
        finally:
            conn.close()
    except Exception as e:
        log.warning(f"Could not delete embeddings for {filepath} (non-fatal): {e}")
|
||||
|
||||
|
||||
def remove_from_state(filepath: Path):
    """Forget ``filepath`` in the persisted watcher state.

    Loads the state via ``load_state()``, removes the entry keyed by
    ``str(filepath)`` and writes the state back with ``save_state()``. When
    the path has no entry, nothing is written. This keeps a deleted file
    from being carried as a 'known mtime' indefinitely.

    Best-effort: any exception is logged as a warning and swallowed.
    """
    try:
        state = load_state()
        entry = str(filepath)
        if entry not in state:
            # Nothing recorded for this path; skip the write entirely.
            return
        del state[entry]
        save_state(state)
    except Exception as e:
        log.warning(f"Could not update state for deleted {filepath} (non-fatal): {e}")
|
||||
|
||||
|
||||
# NOTE(review): presumably the top-level folder names under the watched tree
# that the watcher skips (likely consulted by _should_ignore) — confirm
# against the code that reads this set.
IGNORED_TOP_FOLDERS = {"Drafts"}
|
||||
|
||||
|
||||
@@ -350,15 +386,47 @@ class IngestHandler(FileSystemEventHandler):
|
||||
def on_moved(self, event):
    """Handle a file move/rename event.

    A move has two sides, handled independently:

    * Source side: the file no longer exists at ``event.src_path``. When
      that path has a supported suffix, its embeddings rows and its
      watcher-state entry are purged on a daemon thread. This happens both
      when the destination is outside NEXTCLOUD_PATH (e.g. the Nextcloud
      trashbin at /home/aaron/nextcloud/data/data/aaron/files_trashbin/)
      and when the file is renamed or moved inside the tree. Without the
      in-tree purge, the chunks stored under the old path would stay behind
      as phantom duplicates of the re-ingested destination.
    * Destination side: when ``event.dest_path`` is inside NEXTCLOUD_PATH,
      has a supported suffix and is not ignored, the handler is flagged as
      pending and the event time is recorded.

    Nextcloud WebDAV uploads write a ``.part`` temp file and then rename it
    to the final name; for those the source suffix is not in SUPPORTED, so
    only the destination side applies.

    Directory events are ignored.
    """
    if event.is_directory:
        return

    src = Path(event.src_path)
    dest = Path(event.dest_path)

    # relative_to raises ValueError when dest is not under NEXTCLOUD_PATH.
    try:
        dest.relative_to(NEXTCLOUD_PATH)
    except ValueError:
        dest_in_tree = False
    else:
        dest_in_tree = True

    # Source side: the old path is gone from the corpus either way.
    if src != dest and src.suffix.lower() in SUPPORTED:
        if dest_in_tree:
            log.info(f"Event: moved within tree {src} -> {dest}")
        else:
            log.info(f"Event: moved out of tree {src} -> {dest}")

        def _purge_source():
            delete_embeddings_for_path(src)
            remove_from_state(src)

        # Run off the observer thread so DB / state I/O cannot stall event
        # delivery.
        threading.Thread(target=_purge_source, daemon=True).start()

    if not dest_in_tree:
        # The file left the watched corpus; nothing to ingest.
        return

    # Destination side: schedule ingestion of the final path.
    if dest.suffix.lower() not in SUPPORTED or self._should_ignore(dest):
        return
    log.info(f"Event: moved -> {dest}")
    self.pending = True
    self.last_event = time.time()
|
||||
|
||||
def on_deleted(self, event):
    """Handle a file deletion event.

    Directory events and files whose suffix is not in SUPPORTED are
    ignored. For a supported file the deletion is logged, then its
    embeddings rows and its watcher-state entry are removed on a daemon
    thread.
    """
    if event.is_directory:
        return

    removed = Path(event.src_path)
    if removed.suffix.lower() not in SUPPORTED:
        return

    log.info(f"Event: deleted {removed}")

    def _purge():
        # Same order as before: database rows first, then the state entry.
        delete_embeddings_for_path(removed)
        remove_from_state(removed)

    worker = threading.Thread(target=_purge, daemon=True)
    worker.start()
|
||||
|
||||
def on_closed(self, event):
|
||||
# FileClosedEvent fires on the final file after Nextcloud completes write.
|
||||
# Belt-and-suspenders catch for any write pattern not caught by on_moved.
|
||||
|
||||
Reference in New Issue
Block a user