stage3_worker: v2.2 — absolute sudo/systemctl paths, error logging, reset failure counter on recovery failure
Mirrors stage2_worker v2.1 (da98019) resilience fixes:
- Absolute paths for /usr/bin/sudo and /bin/systemctl
- Log stdout/stderr when sidecar restart fails
- Reset consecutive_failures even when wedge recovery fails (prevents a
  permanently stuck state if the restart itself is broken)
This commit is contained in:
@@ -44,7 +44,7 @@ HEARTBEAT_FILE = Path("/var/log/aaronai/stage3-heartbeat")
|
|||||||
RETRY_ATTEMPTS = 2
|
RETRY_ATTEMPTS = 2
|
||||||
POLL_INTERVAL = 5
|
POLL_INTERVAL = 5
|
||||||
INGEST_TIMEOUT = 600
|
INGEST_TIMEOUT = 600
|
||||||
WORKER_VERSION = "2.1"
|
WORKER_VERSION = "2.2"
|
||||||
|
|
||||||
# Match Stage 1 chunking parameters
|
# Match Stage 1 chunking parameters
|
||||||
CHUNK_SIZE_WORDS = 500
|
CHUNK_SIZE_WORDS = 500
|
||||||
@@ -73,10 +73,13 @@ def recover_wedge():
|
|||||||
Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo
|
Mirrors Stage 2's recover_wedge() for ollama. Requires passwordless sudo
|
||||||
for `systemctl restart aaronai-graphiti.service` for the worker's user."""
|
for `systemctl restart aaronai-graphiti.service` for the worker's user."""
|
||||||
log.warning("Graphiti wedge detected — restarting sidecar")
|
log.warning("Graphiti wedge detected — restarting sidecar")
|
||||||
subprocess.run(
|
result = subprocess.run(
|
||||||
["sudo", "systemctl", "restart", "aaronai-graphiti.service"],
|
["/usr/bin/sudo", "/bin/systemctl", "restart", "aaronai-graphiti.service"],
|
||||||
capture_output=True
|
capture_output=True, text=True
|
||||||
)
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
log.error(f"Sidecar restart failed (rc={result.returncode}): stdout={result.stdout!r} stderr={result.stderr!r}")
|
||||||
|
return False
|
||||||
# Sidecar needs longer than ollama for model loading (sentence-transformers
|
# Sidecar needs longer than ollama for model loading (sentence-transformers
|
||||||
# + BGE reranker + Graphiti library init)
|
# + BGE reranker + Graphiti library init)
|
||||||
time.sleep(45)
|
time.sleep(45)
|
||||||
@@ -256,8 +259,9 @@ def run():
|
|||||||
if consecutive_failures >= 2:
|
if consecutive_failures >= 2:
|
||||||
log.warning("Multiple consecutive failures — checking for Graphiti wedge")
|
log.warning("Multiple consecutive failures — checking for Graphiti wedge")
|
||||||
recovered = recover_wedge()
|
recovered = recover_wedge()
|
||||||
if recovered:
|
if not recovered:
|
||||||
consecutive_failures = 0
|
log.error("Wedge recovery failed — continuing anyway")
|
||||||
|
consecutive_failures = 0
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
else:
|
else:
|
||||||
consecutive_failures = 0
|
consecutive_failures = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user