From da980193ddbffe8eba175821c73a9231aab1ae7c Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Fri, 1 May 2026 17:28:53 +0000 Subject: [PATCH] =?UTF-8?q?stage2=5Fworker:=20v2.1=20=E2=80=94=20terminal?= =?UTF-8?q?=20failure=20states=20+=20sudo=20path=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three classes of silent failure converted to clean terminal states: - Mistral timeout: previously left rows in zombie state (started_at set, failed_at null, attempts incremented past retry threshold, row invisible to selection query). Now sets failed_at with reason 'mistral_timeout_after_300s'. Surfaced 2026-05-01 when 17 documents accumulated in this state during the Stage 3 saga deadlock incident. - Mistral parse failure: run_mistral returns {'error': 'parse_failed'} on JSON decode failure but process_one wasn't checking, so empty orientation ('Active frames: . Frame relationships: ...') was shipped to Stage 3. This is F22 from the 2026-04-30 code review. Now sets failed_at with reason 'mistral_parse_failure'. - Wedge recovery hammering: consecutive_failures was only reset on successful Ollama restart. With the sudo path bug (also fixed here), recovery always failed, so every subsequent failure re-attempted restart. Now resets the counter regardless and logs the failure visibly. Also: subprocess.run now uses absolute paths (/usr/bin/sudo, /bin/systemctl) instead of relying on PATH, fixing the 'No such file or directory: sudo' error that broke Stage 2's recover_wedge() since deployment. F45-adjacent — sudoers entries were added 2026-05-01 but the PATH issue was masking that fix. Worker version bumped to 2.1 to match Stage 3's resilience patch level. --- scripts/stage2_worker.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/scripts/stage2_worker.py b/scripts/stage2_worker.py index 3630bda..fec01c4 100644 --- a/scripts/stage2_worker.py +++ b/scripts/stage2_worker.py @@ -33,7 +33,7 @@ CHAR_LENGTH_THRESHOLD = 2000 REQUEST_TIMEOUT = 300 RETRY_ATTEMPTS = 2 POLL_INTERVAL = 5 -WORKER_VERSION = "2.0" +WORKER_VERSION = "2.1" TAXFREE_PROMPT = ( "You are a metadata extraction system. Given a document, describe its content " @@ -67,7 +67,10 @@ def write_heartbeat(): def recover_wedge(): log.warning("Mistral wedge detected — restarting Ollama") - subprocess.run(["sudo", "systemctl", "restart", "ollama"], capture_output=True) + result = subprocess.run(["/usr/bin/sudo", "/bin/systemctl", "restart", "ollama"], capture_output=True, text=True) + if result.returncode != 0: + log.error(f"Ollama restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}") + return False time.sleep(30) for _ in range(3): try: @@ -146,6 +149,11 @@ def process_one(row): meta = run_mistral(full_text) except requests.exceptions.Timeout: log.warning(f" Mistral timeout on {source}") + cur.execute( + "UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s", + (f"mistral_timeout_after_{REQUEST_TIMEOUT}s", row_id) + ) + pg.commit() pg.close() return False except Exception as e: @@ -156,6 +164,16 @@ def process_one(row): pg.close() return False + if meta.get("error") == "parse_failed": + log.warning(f" Mistral parse failure on {source}: {meta.get('raw', '')[:100]}") + cur.execute( + "UPDATE stage_2_queue SET failed_at = NOW(), failure_reason = %s WHERE id = %s", + ("mistral_parse_failure", row_id) + ) + pg.commit() + pg.close() + return False + frames = meta.get("active_frames", []) log.info(f" Frames: {frames}") @@ -209,8 +227,9 @@ def run(): if consecutive_failures >= 2: log.warning("Multiple consecutive failures — checking for Mistral wedge") recovered = recover_wedge() - if recovered: - consecutive_failures = 0 + if not recovered: + log.error("Wedge recovery failed — continuing anyway") + consecutive_failures = 0 time.sleep(10) else: consecutive_failures = 0