corpus_integrity.py: write unreadable files to ingest_failures for UI visibility

This commit is contained in:
2026-04-30 21:59:06 +00:00
parent 74e2c34f43
commit 7822fb1cc1
+15
View File
@@ -195,6 +195,21 @@ def run_reconciliation(fix=False):
print(f" Queued: {finfo['source']}") print(f" Queued: {finfo['source']}")
else: else:
print(f" Skipped (unreadable): {finfo['source']}") print(f" Skipped (unreadable): {finfo['source']}")
try:
    # Best-effort: record the unreadable file in ingest_failures so it is
    # visible outside this console report. A failure here must never abort
    # the reconciliation run, hence the broad except at the bottom.
    pg = get_pg()
    try:
        cur = pg.cursor()
        # Upsert keyed on `source`: a repeat run refreshes the error text and
        # last_failed_at but leaves first_failed_at and retry_count as first
        # written.
        # NOTE(review): retry_count is hard-coded to 2 — presumably so the
        # retry machinery treats the file as exhausted; confirm against the
        # consumer of ingest_failures.
        cur.execute("""
INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
VALUES (%s, %s, %s, 2, NOW(), NOW())
ON CONFLICT (source) DO UPDATE SET
error = EXCLUDED.error,
last_failed_at = NOW()
""", (finfo["source"], finfo["filepath"],
              "Empty text — likely scanned, encrypted, or corrupt. Requires manual review or OCR."))
        pg.commit()
    finally:
        # Always release the connection. Previously close() was skipped when
        # execute()/commit() raised, leaking one connection per failed write.
        pg.close()
except Exception as e:
    print(f" WARNING: could not record failure: {e}")
print() print()
report = { report = {