stage3_worker: v2.2 — absolute sudo/systemctl paths, log sidecar restart errors, reset failure counter even when recovery fails

Mirrors stage2_worker v2.1 (da98019) resilience fixes:
- Invoke sudo and systemctl by absolute path (/usr/bin/sudo, /bin/systemctl) instead of by bare command name
- Log stdout/stderr when sidecar restart fails
- Reset consecutive_failures even when wedge recovery fails (prevents a
  permanently stuck state if the restart itself is broken)
This commit is contained in:
2026-05-01 18:40:25 +00:00
parent da980193dd
commit 1a8e0353f5
+10 -6
View File
@@ -44,7 +44,7 @@ HEARTBEAT_FILE = Path("/var/log/aaronai/stage3-heartbeat")
RETRY_ATTEMPTS = 2 RETRY_ATTEMPTS = 2
POLL_INTERVAL = 5 POLL_INTERVAL = 5
INGEST_TIMEOUT = 600 INGEST_TIMEOUT = 600
WORKER_VERSION = "2.1" WORKER_VERSION = "2.2"
# Match Stage 1 chunking parameters # Match Stage 1 chunking parameters
CHUNK_SIZE_WORDS = 500 CHUNK_SIZE_WORDS = 500
@@ -73,10 +73,13 @@ def recover_wedge():
Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo
for `systemctl restart aaronai-graphiti.service` for the worker's user.""" for `systemctl restart aaronai-graphiti.service` for the worker's user."""
log.warning("Graphiti wedge detected — restarting sidecar") log.warning("Graphiti wedge detected — restarting sidecar")
subprocess.run( result = subprocess.run(
["sudo", "systemctl", "restart", "aaronai-graphiti.service"], ["/usr/bin/sudo", "/bin/systemctl", "restart", "aaronai-graphiti.service"],
capture_output=True capture_output=True, text=True
) )
if result.returncode != 0:
log.error(f"Sidecar restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}")
return False
# Sidecar needs longer than ollama for model loading (sentence-transformers # Sidecar needs longer than ollama for model loading (sentence-transformers
# + BGE reranker + Graphiti library init) # + BGE reranker + Graphiti library init)
time.sleep(45) time.sleep(45)
@@ -256,8 +259,9 @@ def run():
if consecutive_failures >= 2: if consecutive_failures >= 2:
log.warning("Multiple consecutive failures — checking for Graphiti wedge") log.warning("Multiple consecutive failures — checking for Graphiti wedge")
recovered = recover_wedge() recovered = recover_wedge()
if recovered: if not recovered:
consecutive_failures = 0 log.error("Wedge recovery failed — continuing anyway")
consecutive_failures = 0
time.sleep(10) time.sleep(10)
else: else:
consecutive_failures = 0 consecutive_failures = 0