corpus_integrity.py: write unreadable files to ingest_failures for UI visibility
This commit is contained in:
@@ -195,6 +195,21 @@ def run_reconciliation(fix=False):
|
|||||||
print(f" Queued: {finfo['source']}")
|
print(f" Queued: {finfo['source']}")
|
||||||
else:
|
else:
|
||||||
print(f" Skipped (unreadable): {finfo['source']}")
|
print(f" Skipped (unreadable): {finfo['source']}")
|
||||||
|
try:
|
||||||
|
pg = get_pg()
|
||||||
|
cur = pg.cursor()
|
||||||
|
cur.execute("""
|
||||||
|
INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
|
||||||
|
VALUES (%s, %s, %s, 2, NOW(), NOW())
|
||||||
|
ON CONFLICT (source) DO UPDATE SET
|
||||||
|
error = EXCLUDED.error,
|
||||||
|
last_failed_at = NOW()
|
||||||
|
""", (finfo["source"], finfo["filepath"],
|
||||||
|
"Empty text — likely scanned, encrypted, or corrupt. Requires manual review or OCR."))
|
||||||
|
pg.commit()
|
||||||
|
pg.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(f" WARNING: could not record failure: {e}")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
report = {
|
report = {
|
||||||
|
|||||||
Reference in New Issue
Block a user