From b9eea6cb622fce12820616a5acb254ec9d81bd48 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Mon, 4 May 2026 16:19:56 +0000 Subject: [PATCH] watcher.py: extend lockfile filter to catch UTF-8-mangled ~$ prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three rows in ingest_failures were Office lockfile leftovers whose filename starts with ~� (~ followed by U+FFFD, the Unicode replacement character) instead of ~$. Somewhere in the Nextcloud sync chain the $ byte was lost or replaced; the file now lives on disk as a real file with this corrupted name. The watcher's ("~$", ".") prefix filter didn't match, so each cycle tried to ingest these as pptx, hit BadZipFile inside python-pptx (lockfiles aren't real Office documents), and they ended up permanently in ingest_failures. Three filter sites in watcher.py applied the lockfile prefix check: - ingest_file() at :127 - get_changed_files() at :200 - IngestHandler._should_ignore() at :290 All three now match ("~$", "~", ".") — broadened to catch any tilde prefix, not just ~$. The cross-check against pgvector embeddings and disk found zero legitimate tilde-prefixed files in the corpus, so the broader filter has no false-positive risk in this corpus. Cleanup: 3 ingest_failures rows resolved (filepath LIKE '%/~%'). Unresolved count drops 97 → 94. If a fourth filter site is ever added, the right shape is consolidating the lockfile prefix check to a shared function or constant. Three parallel sites with two different tuple orderings is acceptable for now but worth normalizing if the surface grows. 
--- scripts/watcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/watcher.py b/scripts/watcher.py index 97fb188..6e94641 100644 --- a/scripts/watcher.py +++ b/scripts/watcher.py @@ -124,7 +124,7 @@ def resolve_ingest_failure(source: str): def ingest_file(filepath: Path, embedder) -> int: - if filepath.name.startswith(("~$", ".")): + if filepath.name.startswith(("~$", "~", ".")): return 0 if filepath.suffix.lower() not in SUPPORTED: return 0 @@ -197,7 +197,7 @@ def get_changed_files(state: dict) -> list: continue if path.suffix.lower() not in SUPPORTED: continue - if path.name.startswith((".", "~$")): + if path.name.startswith((".", "~$", "~")): continue if "Admin/Backups" in str(path) or "Backups" in path.parts: continue @@ -287,7 +287,7 @@ class IngestHandler(FileSystemEventHandler): self.last_event = 0 def _should_ignore(self, path: Path) -> bool: - if path.name.startswith((".", "~$")): + if path.name.startswith((".", "~$", "~")): return True if "Admin/Backups" in str(path) or "Backups" in path.parts: return True