diff --git a/cluster/distributed_coordinator.py b/cluster/distributed_coordinator.py index f3b5a87..f066e45 100644 --- a/cluster/distributed_coordinator.py +++ b/cluster/distributed_coordinator.py @@ -303,13 +303,16 @@ class DistributedCoordinator: """Execute command on worker via SSH""" worker = WORKERS[worker_id] + # CRITICAL FIX (Dec 1, 2025): Add SSH options to prevent prompts and improve reliability + ssh_opts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=5" + if 'ssh_hop' in worker: # Worker 2 requires hop through worker 1 # CRITICAL FIX (Nov 29, 2025): Use double-nested quotes for 2-hop SSH # Single quotes don't pass command to inner SSH properly - ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{command}'\"" + ssh_cmd = f"ssh {ssh_opts} {worker['ssh_hop']} \"ssh {ssh_opts} {worker['host']} '{command}'\"" else: - ssh_cmd = f"ssh {worker['host']} '{command}'" + ssh_cmd = f"ssh {ssh_opts} {worker['host']} '{command}'" return subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True) @@ -381,21 +384,25 @@ class DistributedCoordinator: print(f"🚀 Starting chunk {chunk_id} on {worker_id} ({chunk_end - chunk_start:,} combos)...") + # CRITICAL FIX (Dec 1, 2025): Add SSH options for reliability and timeout handling + ssh_opts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=5" + # Execute command and capture result to verify it started if 'ssh_hop' in worker: # Worker 2 requires hop through worker 1 - ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'\"" + ssh_cmd = f"ssh {ssh_opts} {worker['ssh_hop']} \"ssh {ssh_opts} {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'\"" else: - ssh_cmd = f"ssh {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'" + ssh_cmd = f"ssh {ssh_opts} {worker['host']} '{cmd}' && echo 'Started chunk {chunk_id}' || echo 'FAILED'" # Use run() to capture output and verify success + # CRITICAL FIX (Dec 1, 2025): Increase timeout from 30s to 60s for nested SSH hops try: result = subprocess.run( ssh_cmd, shell=True, capture_output=True, text=True, - timeout=30 # 30 second timeout + timeout=60 # 60 second timeout (was 30s, too short for 2-hop SSH) ) # Verify worker process started @@ -409,6 +416,7 @@ class DistributedCoordinator: return False except subprocess.TimeoutExpired: print(f"⚠️ SSH command timed out for {chunk_id} on {worker_id}") + print(f" This usually means SSH hop is misconfigured or slow") return False def collect_results(self, worker_id: str, chunk_id: str) -> Optional[str]: