From f18fb64fe5e9a5a804a60ae5d0f6271d4cbf40fa Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Mon, 4 May 2026 06:24:08 +0000 Subject: [PATCH] watcher.py: exclude generative-graphic folders and zero-byte files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-sample diagnostic of the 128 ingest_failures rows surfaced two folders whose contents are exclusively non-text PDFs (iText-produced generative graphics from Processing sketches and computational design sketches) and three zero-byte test artifacts. None of these have ever produced an embedding chunk, and they have nothing extractable to contribute. Excluding them removes 19 / 128 (15%) of the locked-out failures from the cohort and prevents future versions of the same patterns from re-failing. Folder exclusions use path.parts membership rather than substring matching: a path is skipped only when both folder names appear as complete path components. This avoids false matches if similarly named folders appear elsewhere in the corpus (e.g. an unrelated "Generative Design" or "Computational Design 2017" directory created later that does not also have the second folder name in its path). The existing "Admin/Backups" / "Journal/Media" substring checks are looser, but the new exclusions take the tighter pattern. Zero-byte filter goes in get_changed_files() only — the actual ingestion gate. Adding stat() to _should_ignore() (the FS-event noise filter) would introduce a race where the file is gone between event fire and stat call. As a result, empty files may briefly trigger pending=True but produce no work after debounce; this is cosmetic only. Cleanup applied separately via UPDATE: 19 ingest_failures rows for these paths marked resolved=TRUE. Unresolved-failure count: 129 -> 110. Verified: get_changed_files() with empty state returns 1418 changed files; all 5 excluded probes (2 folder-matched + 3 zero-byte) absent from the result, control file present. Watcher service restarted clean; startup scan reports no missed files. 
--- scripts/watcher.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/watcher.py b/scripts/watcher.py index 01adb4b..97fb188 100644 --- a/scripts/watcher.py +++ b/scripts/watcher.py @@ -203,6 +203,12 @@ def get_changed_files(state: dict) -> list: continue if "Journal/Media" in str(path): continue + if "Generative Design" in path.parts and "Processing" in path.parts: + continue + if "Computational Design 2017" in path.parts and "Student Work" in path.parts: + continue + if path.stat().st_size == 0: + continue if state.get(str(path)) != str(path.stat().st_mtime): changed.append(path) return changed @@ -287,6 +293,10 @@ class IngestHandler(FileSystemEventHandler): return True if "Journal/Media" in str(path): return True + if "Generative Design" in path.parts and "Processing" in path.parts: + return True + if "Computational Design 2017" in path.parts and "Student Work" in path.parts: + return True return False def on_created(self, event):