stage3_worker: v2.2 — absolute sudo/systemctl paths, error logging, reset failure counter on recovery failure
Mirrors stage2_worker v2.1 (da98019) resilience fixes:
- Absolute paths for /usr/bin/sudo and /bin/systemctl
- Log stdout/stderr when sidecar restart fails
- Reset consecutive_failures even when wedge recovery fails (prevents a
permanently stuck state if the restart itself is broken)
This commit is contained in:
@@ -44,7 +44,7 @@ HEARTBEAT_FILE = Path("/var/log/aaronai/stage3-heartbeat")
|
||||
RETRY_ATTEMPTS = 2
|
||||
POLL_INTERVAL = 5
|
||||
INGEST_TIMEOUT = 600
|
||||
WORKER_VERSION = "2.1"
|
||||
WORKER_VERSION = "2.2"
|
||||
|
||||
# Match Stage 1 chunking parameters
|
||||
CHUNK_SIZE_WORDS = 500
|
||||
@@ -73,10 +73,13 @@ def recover_wedge():
|
||||
Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo
|
||||
for `systemctl restart aaronai-graphiti.service` for the worker's user."""
|
||||
log.warning("Graphiti wedge detected — restarting sidecar")
|
||||
subprocess.run(
|
||||
["sudo", "systemctl", "restart", "aaronai-graphiti.service"],
|
||||
capture_output=True
|
||||
result = subprocess.run(
|
||||
["/usr/bin/sudo", "/bin/systemctl", "restart", "aaronai-graphiti.service"],
|
||||
capture_output=True, text=True
|
||||
)
|
||||
if result.returncode != 0:
|
||||
log.error(f"Sidecar restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}")
|
||||
return False
|
||||
# Sidecar needs longer than ollama for model loading (sentence-transformers
|
||||
# + BGE reranker + Graphiti library init)
|
||||
time.sleep(45)
|
||||
@@ -256,8 +259,9 @@ def run():
|
||||
if consecutive_failures >= 2:
|
||||
log.warning("Multiple consecutive failures — checking for Graphiti wedge")
|
||||
recovered = recover_wedge()
|
||||
if recovered:
|
||||
consecutive_failures = 0
|
||||
if not recovered:
|
||||
log.error("Wedge recovery failed — continuing anyway")
|
||||
consecutive_failures = 0
|
||||
time.sleep(10)
|
||||
else:
|
||||
consecutive_failures = 0
|
||||
|
||||
Reference in New Issue
Block a user