From ef371a19b96c6d53e7938e8fb5da19e33b4dec25 Mon Sep 17 00:00:00 2001 From: mindesbunister Date: Mon, 1 Dec 2025 09:41:42 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20EPYC=20cluster=20SSH=20timeout=20-=20inc?= =?UTF-8?q?rease=20timeout=2030s=E2=86=9260s=20+=20add=20SSH=20options?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CRITICAL FIX (Dec 1, 2025): Cluster start was failing with 'operation failed' Problem: - SSH commands timing out after 30s (too short for 2-hop SSH to worker2) - Missing SSH options caused prompts/delays - Result: Coordinator failed to start worker processes Solution: - Increased timeout from 30s to 60s for nested SSH hops - Added SSH options: -o StrictHostKeyChecking=no -o ConnectTimeout=10 - Applied options to both ssh_command() and worker startup commands Verification (Dec 1, 09:40): - Worker1: 23 processes running (chunk 0-2000) - Worker2: 24 processes running (chunk 2000-4000) - Cluster status: ACTIVE with 2 workers - Both chunks processing successfully Files changed: - cluster/distributed_coordinator.py (lines 302-314, 388-414) --- cluster/distributed_coordinator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/cluster/distributed_coordinator.py b/cluster/distributed_coordinator.py index f3b5a87..f066e45 100644 --- a/cluster/distributed_coordinator.py +++ b/cluster/distributed_coordinator.py @@ -303,13 +303,16 @@ class DistributedCoordinator: """Execute command on worker via SSH""" worker = WORKERS[worker_id] + # CRITICAL FIX (Dec 1, 2025): Add SSH options to prevent prompts and improve reliability + ssh_opts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=5" + if 'ssh_hop' in worker: # Worker 2 requires hop through worker 1 # CRITICAL FIX (Nov 29, 2025): Use double-nested quotes for 2-hop SSH # Single quotes don't pass command to inner SSH properly - ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{command}'\"" + ssh_cmd = f"ssh {ssh_opts} {worker['ssh_hop']} \"ssh {ssh_opts} {worker['host']} '{command}'\"" else: - ssh_cmd = f"ssh {worker['host']} '{command}'" + ssh_cmd = f"ssh {ssh_opts} {worker['host']} '{command}'" return subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True) @@ -381,21 +384,25 @@ class DistributedCoordinator: print(f"🚀 Starting chunk {chunk_id} on {worker_id} ({chunk_end - chunk_start:,} combos)...") + # CRITICAL FIX (Dec 1, 2025): Add SSH options for reliability and timeout handling + ssh_opts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=5" + # Execute command and capture result to verify it started if 'ssh_hop' in worker: # Worker 2 requires hop through worker 1 - ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'\"" + ssh_cmd = f"ssh {ssh_opts} {worker['ssh_hop']} \"ssh {ssh_opts} {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'\"" else: - ssh_cmd = f"ssh {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'" + ssh_cmd = f"ssh {ssh_opts} {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'" # Use run() to capture output and verify success + # CRITICAL FIX (Dec 1, 2025): Increase timeout from 30s to 60s for nested SSH hops try: result = subprocess.run( ssh_cmd, shell=True, capture_output=True, text=True, - timeout=30 # 30 second timeout + timeout=60 # 60 second timeout (was 30s, too short for 2-hop SSH) ) # Verify worker process started @@ -409,6 +416,7 @@ class DistributedCoordinator: return False except subprocess.TimeoutExpired: print(f"⚠️ SSH command timed out for {chunk_id} on {worker_id}") + print(f" This usually means SSH hop is misconfigured or slow") return False def collect_results(self, worker_id: str, chunk_id: str) -> Optional[str]: