From e38d283e5920caaf01eeac28c898558caef5b725 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Tue, 5 May 2026 01:42:40 +0000 Subject: [PATCH] watcher.py: exclude 3 image-only pptx files from ingestion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three files in the original ingest_failures cohort have been characterized via direct OCR and confirmed to lack ingestible text: - Presentations/Renders.pptx — 35 PICTURE-shape renders, 33/35 zero-char on OCR, 2 with noise (20 and 29 chars). - Presentations/Ribbon Cutting Slideshow.pptx — 10-slide event photo deck, 9/10 zero-char, 1 with 17 chars of noise. - Academic/DDF555 3D Computational/GH Slicer Notes [Autosaved].pptx — Office autosave duplicate of GH Slicer Notes.pptx; first 9 images byte-identical (sha256) to the canonical file. 2 net-new images contribute 36 noisy chars. Excluding to prevent double-embedding the same content under two source filenames. The exclusion pattern follows the one introduced in f18fb64 (path.parts membership). Folder-level globs were considered and rejected because /Presentations/ also contains successfully embedded text-bearing decks (aaronnelson_3D 4D.pptx, aaronnelson_slideslam.pptx). Exact-name match plus folder-name membership in path.parts (any ancestor component, not only the immediate parent) is applied in both watcher filter sites (get_changed_files and IngestHandler._should_ignore). The fourth file in the cohort, GH Slicer Notes.pptx (the canonical non-autosave version), was confirmed to carry 379 chars of real text (Grasshopper UI / code samples) across 6/9 images. It remains in ingest_failures unresolved, awaiting the eventual ocrmypdf backlog pass. Cleanup: 3 ingest_failures rows resolved (the excluded files). Unresolved count: 94 → 91. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/watcher.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/watcher.py b/scripts/watcher.py index 6e94641..db04703 100644 --- a/scripts/watcher.py +++ b/scripts/watcher.py @@ -207,6 +207,12 @@ def get_changed_files(state: dict) -> list: continue if "Computational Design 2017" in path.parts and "Student Work" in path.parts: continue + if path.name in ("Renders.pptx", "Ribbon Cutting Slideshow.pptx") \ + and "Presentations" in path.parts: + continue + if path.name == "GH Slicer Notes [Autosaved].pptx" \ + and "DDF555 3D Computational" in path.parts: + continue if path.stat().st_size == 0: continue if state.get(str(path)) != str(path.stat().st_mtime): @@ -297,6 +303,12 @@ class IngestHandler(FileSystemEventHandler): return True if "Computational Design 2017" in path.parts and "Student Work" in path.parts: return True + if path.name in ("Renders.pptx", "Ribbon Cutting Slideshow.pptx") \ + and "Presentations" in path.parts: + return True + if path.name == "GH Slicer Notes [Autosaved].pptx" \ + and "DDF555 3D Computational" in path.parts: + return True return False def on_created(self, event):