corpus integrity: ingest_failures tracking in watcher, reconciliation script, corpus status/retry/reconcile endpoints

This commit is contained in:
2026-04-30 21:54:39 +00:00
parent 655dea6ae5
commit 74e2c34f43
3 changed files with 439 additions and 0 deletions
+36
View File
@@ -99,6 +99,7 @@ def extract_text(path: Path) -> str:
return path.read_text(encoding="utf-8", errors="ignore")
except Exception as e:
log.warning(f"Text extraction failed for {path.name}: {e}")
record_ingest_failure(path, f"Text extraction failed: {e}")
return ""
@@ -141,6 +142,38 @@ def enqueue_stage2(source: str, full_text: str):
log.warning(f"Stage 2 enqueue failed (non-fatal): {e}")
def record_ingest_failure(filepath: Path, error: str) -> None:
    """Record an extraction or ingest failure in ``ingest_failures`` for UI visibility.

    Upserts one row keyed on ``source`` (the file's base name). The first
    failure for a source inserts a row with ``retry_count = 0``; every later
    failure for the same source overwrites ``error``, increments
    ``retry_count``, refreshes ``last_failed_at`` and re-opens the record by
    setting ``resolved = FALSE``.

    This is best-effort bookkeeping: any exception raised while recording is
    logged as a warning and swallowed so it never interrupts ingestion.

    Args:
        filepath: Path of the file that failed. ``filepath.name`` is stored as
            ``source`` and ``str(filepath)`` as ``filepath``.
        error: Human-readable failure description; only the first 1000
            characters are stored.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            # NOTE(review): ``source`` is only the base name, so two files with
            # the same name in different directories share one row -- confirm
            # that this is intended.
            cur.execute("""
                INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
                VALUES (%s, %s, %s, 0, NOW(), NOW())
                ON CONFLICT (source) DO UPDATE SET
                    error = EXCLUDED.error,
                    retry_count = ingest_failures.retry_count + 1,
                    last_failed_at = NOW(),
                    resolved = FALSE
            """, (filepath.name, str(filepath), error[:1000]))
            pg.commit()
        finally:
            # Close even when cursor/execute/commit raises, so a failed write
            # does not leak the connection.
            pg.close()
    except Exception as e:
        log.warning(f"Could not record ingest failure (non-fatal): {e}")
def resolve_ingest_failure(source: str) -> None:
    """Mark a previously failed file as resolved after a successful ingest.

    Sets ``resolved = TRUE`` on every ``ingest_failures`` row whose ``source``
    matches. If no such row exists the UPDATE simply affects nothing.

    This is best-effort bookkeeping: any exception raised while updating is
    logged as a warning and swallowed so it never interrupts ingestion.

    Args:
        source: Value of the ``source`` column identifying the file.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute(
                "UPDATE ingest_failures SET resolved = TRUE WHERE source = %s",
                (source,),
            )
            pg.commit()
        finally:
            # Close even when cursor/execute/commit raises, so a failed update
            # does not leak the connection.
            pg.close()
    except Exception as e:
        log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")
def ingest_file(filepath: Path, embedder) -> int:
if filepath.name.startswith(("~$", ".")):
return 0
@@ -156,6 +189,7 @@ def ingest_file(filepath: Path, embedder) -> int:
embeddings = embedder.encode(chunks).tolist()
except Exception as e:
log.error(f"Embedding failed for {filepath.name}: {e}")
record_ingest_failure(filepath, f"Embedding failed: {e}")
return 0
source = filepath.name
try:
@@ -177,8 +211,10 @@ def ingest_file(filepath: Path, embedder) -> int:
pg.close()
except Exception as e:
log.error(f"pgvector write failed for {filepath.name}: {e}")
record_ingest_failure(filepath, f"pgvector write failed: {e}")
return 0
log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
resolve_ingest_failure(source)
enqueue_stage2(source, text)
return len(chunks)