corpus integrity: ingest_failures tracking in watcher, reconciliation script, corpus status/retry/reconcile endpoints
This commit is contained in:
@@ -99,6 +99,7 @@ def extract_text(path: Path) -> str:
|
||||
return path.read_text(encoding="utf-8", errors="ignore")
|
||||
except Exception as e:
|
||||
log.warning(f"Text extraction failed for {path.name}: {e}")
|
||||
record_ingest_failure(path, f"Text extraction failed: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
@@ -141,6 +142,38 @@ def enqueue_stage2(source: str, full_text: str):
|
||||
log.warning(f"Stage 2 enqueue failed (non-fatal): {e}")
|
||||
|
||||
|
||||
def record_ingest_failure(filepath: Path, error: str):
    """Write extraction or ingest failure to ingest_failures table for UI visibility.

    Upserts one row per ``source`` (the bare file name, ``filepath.name``):
    the first failure inserts a row with ``retry_count = 0``; every later
    failure for the same name overwrites ``error``, increments
    ``retry_count``, refreshes ``last_failed_at`` and clears ``resolved``.

    Best-effort: any exception is logged as a warning and swallowed so that
    failure bookkeeping can never break the ingest path itself.

    Args:
        filepath: Path of the file that failed. Only its name is used as the
            conflict key, so two files with the same name in different
            directories share a single row.
        error: Human-readable failure description; truncated to 1000
            characters before being stored.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute(
                """
                INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
                VALUES (%s, %s, %s, 0, NOW(), NOW())
                ON CONFLICT (source) DO UPDATE SET
                    error = EXCLUDED.error,
                    retry_count = ingest_failures.retry_count + 1,
                    last_failed_at = NOW(),
                    resolved = FALSE
                """,
                (filepath.name, str(filepath), error[:1000]),
            )
            pg.commit()
        finally:
            # Always release the connection, including when execute/commit
            # raises; otherwise each failed write leaks a connection.
            pg.close()
    except Exception as e:
        log.warning(f"Could not record ingest failure (non-fatal): {e}")
|
||||
|
||||
|
||||
def resolve_ingest_failure(source: str):
    """Mark a previously failed file as resolved after successful ingest.

    Sets ``resolved = TRUE`` on the ``ingest_failures`` row whose ``source``
    matches. If no such row exists the UPDATE simply affects nothing.

    Best-effort: any exception is logged as a warning and swallowed so that
    a bookkeeping problem never fails an otherwise successful ingest.

    Args:
        source: Value of the ``source`` column to match.
    """
    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute(
                "UPDATE ingest_failures SET resolved = TRUE WHERE source = %s",
                (source,),
            )
            pg.commit()
        finally:
            # Close the connection even when the UPDATE or commit fails so
            # the connection is not leaked on the error path.
            pg.close()
    except Exception as e:
        log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")
|
||||
|
||||
|
||||
def ingest_file(filepath: Path, embedder) -> int:
|
||||
if filepath.name.startswith(("~$", ".")):
|
||||
return 0
|
||||
@@ -156,6 +189,7 @@ def ingest_file(filepath: Path, embedder) -> int:
|
||||
embeddings = embedder.encode(chunks).tolist()
|
||||
except Exception as e:
|
||||
log.error(f"Embedding failed for {filepath.name}: {e}")
|
||||
record_ingest_failure(filepath, f"Embedding failed: {e}")
|
||||
return 0
|
||||
source = filepath.name
|
||||
try:
|
||||
@@ -177,8 +211,10 @@ def ingest_file(filepath: Path, embedder) -> int:
|
||||
pg.close()
|
||||
except Exception as e:
|
||||
log.error(f"pgvector write failed for {filepath.name}: {e}")
|
||||
record_ingest_failure(filepath, f"pgvector write failed: {e}")
|
||||
return 0
|
||||
log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
|
||||
resolve_ingest_failure(source)
|
||||
enqueue_stage2(source, text)
|
||||
return len(chunks)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user