From 1a8e0353f5442e6df69a636001e053e0ae837a0d Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Fri, 1 May 2026 18:40:25 +0000 Subject: [PATCH] =?UTF-8?q?stage3=5Fworker:=20v2.2=20=E2=80=94=20absolute?= =?UTF-8?q?=20sudo/systemctl=20paths,=20error=20logging,=20reset=20failure?= =?UTF-8?q?=20counter=20on=20recovery=20failure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors stage2_worker v2.1 (da98019) resilience fixes: - Absolute paths for /usr/bin/sudo and /bin/systemctl - Log stdout/stderr when sidecar restart fails - Reset consecutive_failures even when wedge recovery fails (prevents permanent stuck state if restart itself is broken) --- scripts/stage3_worker.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/stage3_worker.py b/scripts/stage3_worker.py index 4bba4d0..0e802e0 100644 --- a/scripts/stage3_worker.py +++ b/scripts/stage3_worker.py @@ -44,7 +44,7 @@ HEARTBEAT_FILE = Path("/var/log/aaronai/stage3-heartbeat") RETRY_ATTEMPTS = 2 POLL_INTERVAL = 5 INGEST_TIMEOUT = 600 -WORKER_VERSION = "2.1" +WORKER_VERSION = "2.2" # Match Stage 1 chunking parameters CHUNK_SIZE_WORDS = 500 @@ -73,10 +73,13 @@ def recover_wedge(): Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo for `systemctl restart aaronai-graphiti.service` for the worker's user.""" log.warning("Graphiti wedge detected — restarting sidecar") - subprocess.run( - ["sudo", "systemctl", "restart", "aaronai-graphiti.service"], - capture_output=True + result = subprocess.run( + ["/usr/bin/sudo", "/bin/systemctl", "restart", "aaronai-graphiti.service"], + capture_output=True, text=True ) + if result.returncode != 0: + log.error(f"Sidecar restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}") + return False # Sidecar needs longer than ollama for model loading (sentence-transformers # + BGE reranker + Graphiti library init) time.sleep(45) @@ -256,8 +259,9 @@ def run(): if consecutive_failures >= 2: log.warning("Multiple consecutive failures — checking for Graphiti wedge") recovered = recover_wedge() - if recovered: - consecutive_failures = 0 + if not recovered: + log.error("Wedge recovery failed — continuing anyway") + consecutive_failures = 0 time.sleep(10) else: consecutive_failures = 0