diff --git a/scripts/corpus_integrity.py b/scripts/corpus_integrity.py index 6130a95..e96b8a5 100644 --- a/scripts/corpus_integrity.py +++ b/scripts/corpus_integrity.py @@ -195,6 +195,21 @@ def run_reconciliation(fix=False): print(f" Queued: {finfo['source']}") else: print(f" Skipped (unreadable): {finfo['source']}") + try: + pg = get_pg() + cur = pg.cursor() + cur.execute(""" + INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at) + VALUES (%s, %s, %s, 2, NOW(), NOW()) + ON CONFLICT (source) DO UPDATE SET + error = EXCLUDED.error, + last_failed_at = NOW() + """, (finfo["source"], finfo["filepath"], + "Empty text — likely scanned, encrypted, or corrupt. Requires manual review or OCR.")) + pg.commit() + pg.close() + except Exception as e: + print(f" WARNING: could not record failure: {e}") print() report = {