stage2_worker: v2.1 — terminal failure states + sudo path fix
Three classes of silent failure fixed. The first two now end in a clean
terminal state; the third no longer re-attempts recovery on every failure:
- Mistral timeout: previously left rows in a zombie state (started_at set,
failed_at null, attempts incremented past the retry threshold, and the row
invisible to the selection query). Now sets failed_at with reason
'mistral_timeout_after_300s'. Surfaced 2026-05-01 when 17 documents
accumulated in this state during the Stage 3 saga deadlock incident.
- Mistral parse failure: run_mistral returns {'error': 'parse_failed'} on
JSON decode failure but process_one wasn't checking, so empty orientation
('Active frames: . Frame relationships: ...') was shipped to Stage 3.
This is F22 from the 2026-04-30 code review. Now sets failed_at with
reason 'mistral_parse_failure'.
- Wedge recovery hammering: consecutive_failures was only reset on a
successful Ollama restart. With the sudo path bug (also fixed here),
recovery always failed, so every subsequent failure re-attempted the restart.
Now resets the counter whether or not the restart succeeds, and logs a
failed restart at error level.
Also: subprocess.run now uses absolute paths (/usr/bin/sudo,
/bin/systemctl) instead of relying on PATH, fixing the 'No such file or
directory: sudo' error that had broken Stage 2's recover_wedge() since
deployment. F45-adjacent — sudoers entries were added 2026-05-01 but the
PATH issue was masking that fix.
Worker version bumped to 2.1 to match Stage 3's resilience patch level.
This commit is contained in:
@@ -33,7 +33,7 @@ CHAR_LENGTH_THRESHOLD = 2000
|
|||||||
REQUEST_TIMEOUT = 300
|
REQUEST_TIMEOUT = 300
|
||||||
RETRY_ATTEMPTS = 2
|
RETRY_ATTEMPTS = 2
|
||||||
POLL_INTERVAL = 5
|
POLL_INTERVAL = 5
|
||||||
WORKER_VERSION = "2.0"
|
WORKER_VERSION = "2.1"
|
||||||
|
|
||||||
TAXFREE_PROMPT = (
|
TAXFREE_PROMPT = (
|
||||||
"You are a metadata extraction system. Given a document, describe its content "
|
"You are a metadata extraction system. Given a document, describe its content "
|
||||||
@@ -67,7 +67,10 @@ def write_heartbeat():
|
|||||||
|
|
||||||
def recover_wedge():
|
def recover_wedge():
|
||||||
log.warning("Mistral wedge detected — restarting Ollama")
|
log.warning("Mistral wedge detected — restarting Ollama")
|
||||||
subprocess.run(["sudo", "systemctl", "restart", "ollama"], capture_output=True)
|
result = subprocess.run(["/usr/bin/sudo", "/bin/systemctl", "restart", "ollama"], capture_output=True, text=True)
|
||||||
|
if result.returncode != 0:
|
||||||
|
log.error(f"Ollama restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}")
|
||||||
|
return False
|
||||||
time.sleep(30)
|
time.sleep(30)
|
||||||
for _ in range(3):
|
for _ in range(3):
|
||||||
try:
|
try:
|
||||||
@@ -146,6 +149,11 @@ def process_one(row):
|
|||||||
meta = run_mistral(full_text)
|
meta = run_mistral(full_text)
|
||||||
except requests.exceptions.Timeout:
|
except requests.exceptions.Timeout:
|
||||||
log.warning(f" Mistral timeout on {source}")
|
log.warning(f" Mistral timeout on {source}")
|
||||||
|
cur.execute(
|
||||||
|
"UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s",
|
||||||
|
(f"mistral_timeout_after_{REQUEST_TIMEOUT}s", row_id)
|
||||||
|
)
|
||||||
|
pg.commit()
|
||||||
pg.close()
|
pg.close()
|
||||||
return False
|
return False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -156,6 +164,16 @@ def process_one(row):
|
|||||||
pg.close()
|
pg.close()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if meta.get("error") == "parse_failed":
|
||||||
|
log.warning(f" Mistral parse failure on {source}: {meta.get('raw', '')[:100]}")
|
||||||
|
cur.execute(
|
||||||
|
"UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s",
|
||||||
|
("mistral_parse_failure", row_id)
|
||||||
|
)
|
||||||
|
pg.commit()
|
||||||
|
pg.close()
|
||||||
|
return False
|
||||||
|
|
||||||
frames = meta.get("active_frames", [])
|
frames = meta.get("active_frames", [])
|
||||||
log.info(f" Frames: {frames}")
|
log.info(f" Frames: {frames}")
|
||||||
|
|
||||||
@@ -209,8 +227,9 @@ def run():
|
|||||||
if consecutive_failures >= 2:
|
if consecutive_failures >= 2:
|
||||||
log.warning("Multiple consecutive failures — checking for Mistral wedge")
|
log.warning("Multiple consecutive failures — checking for Mistral wedge")
|
||||||
recovered = recover_wedge()
|
recovered = recover_wedge()
|
||||||
if recovered:
|
if not recovered:
|
||||||
consecutive_failures = 0
|
log.error("Wedge recovery failed — continuing anyway")
|
||||||
|
consecutive_failures = 0
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
else:
|
else:
|
||||||
consecutive_failures = 0
|
consecutive_failures = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user