stage2_worker: v2.1 — terminal failure states + sudo path fix

Three classes of silent failure fixed; the first two now end in a clean
terminal state (failed_at set with a failure reason):

- Mistral timeout: previously left rows in zombie state (started_at set,
  failed_at null, attempts incremented past retry threshold, row invisible
  to selection query). Now sets failed_at with reason
  'mistral_timeout_after_300s'. Surfaced 2026-05-01 when 17 documents
  accumulated in this state during the Stage 3 saga deadlock incident.

- Mistral parse failure: run_mistral returns {'error': 'parse_failed'} on
  JSON decode failure, but process_one wasn't checking for it, so an empty
  orientation ('Active frames: . Frame relationships: ...') was shipped to
  Stage 3.
  This is F22 from the 2026-04-30 code review. Now sets failed_at with
  reason 'mistral_parse_failure'.

- Wedge recovery hammering: consecutive_failures was only reset on a
  successful Ollama restart. With the sudo path bug (also fixed here),
  recovery always failed, so every subsequent failure re-attempted the
  restart. Now resets the counter regardless of the outcome and logs the
  failure visibly.

Also: subprocess.run now uses absolute paths (/usr/bin/sudo,
/bin/systemctl) instead of relying on PATH, fixing the 'No such file or
directory: sudo' error that has broken Stage 2's recover_wedge() since
deployment. F45-adjacent — sudoers entries were added 2026-05-01, but the
PATH issue was masking that fix.

Worker version bumped to 2.1 to match Stage 3's resilience patch level.
This commit is contained in:
2026-05-01 17:28:53 +00:00
parent b936931668
commit da980193dd
+23 -4
View File
@@ -33,7 +33,7 @@ CHAR_LENGTH_THRESHOLD = 2000
REQUEST_TIMEOUT = 300 REQUEST_TIMEOUT = 300
RETRY_ATTEMPTS = 2 RETRY_ATTEMPTS = 2
POLL_INTERVAL = 5 POLL_INTERVAL = 5
WORKER_VERSION = "2.0" WORKER_VERSION = "2.1"
TAXFREE_PROMPT = ( TAXFREE_PROMPT = (
"You are a metadata extraction system. Given a document, describe its content " "You are a metadata extraction system. Given a document, describe its content "
@@ -67,7 +67,10 @@ def write_heartbeat():
def recover_wedge(): def recover_wedge():
log.warning("Mistral wedge detected — restarting Ollama") log.warning("Mistral wedge detected — restarting Ollama")
subprocess.run(["sudo", "systemctl", "restart", "ollama"], capture_output=True) result = subprocess.run(["/usr/bin/sudo", "/bin/systemctl", "restart", "ollama"], capture_output=True, text=True)
if result.returncode != 0:
log.error(f"Ollama restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}")
return False
time.sleep(30) time.sleep(30)
for _ in range(3): for _ in range(3):
try: try:
@@ -146,6 +149,11 @@ def process_one(row):
meta = run_mistral(full_text) meta = run_mistral(full_text)
except requests.exceptions.Timeout: except requests.exceptions.Timeout:
log.warning(f" Mistral timeout on {source}") log.warning(f" Mistral timeout on {source}")
cur.execute(
"UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s",
(f"mistral_timeout_after_{REQUEST_TIMEOUT}s", row_id)
)
pg.commit()
pg.close() pg.close()
return False return False
except Exception as e: except Exception as e:
@@ -156,6 +164,16 @@ def process_one(row):
pg.close() pg.close()
return False return False
if meta.get("error") == "parse_failed":
log.warning(f" Mistral parse failure on {source}: {meta.get('raw', '')[:100]}")
cur.execute(
"UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s",
("mistral_parse_failure", row_id)
)
pg.commit()
pg.close()
return False
frames = meta.get("active_frames", []) frames = meta.get("active_frames", [])
log.info(f" Frames: {frames}") log.info(f" Frames: {frames}")
@@ -209,8 +227,9 @@ def run():
if consecutive_failures >= 2: if consecutive_failures >= 2:
log.warning("Multiple consecutive failures — checking for Mistral wedge") log.warning("Multiple consecutive failures — checking for Mistral wedge")
recovered = recover_wedge() recovered = recover_wedge()
if recovered: if not recovered:
consecutive_failures = 0 log.error("Wedge recovery failed — continuing anyway")
consecutive_failures = 0
time.sleep(10) time.sleep(10)
else: else:
consecutive_failures = 0 consecutive_failures = 0