feat: Add EPYC cluster distributed sweep with web UI

New Features:
- Distributed coordinator orchestrates 2x AMD EPYC servers (16 cores / 32 threads each)
- 64 total threads processing ~12M parameter combinations (70% CPU limit)
- Worker1 (pve-nu-monitor01): Direct SSH access at 10.10.254.106
- Worker2 (bd-host01): 2-hop SSH through worker1 (10.20.254.100)
- Web UI at /cluster shows real-time status and AI recommendations
- API endpoint /api/cluster/status serves cluster metrics (see the poller sketch after this list)
- Auto-refresh every 30s with top strategies and actionable insights
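
A minimal status-poller sketch in Python. Assumptions: the route returns JSON and the dev server listens on port 3001 as noted under Access Documentation below; the exact response fields are not specified in this commit, so the sketch just pretty-prints whatever comes back.

#!/usr/bin/env python3
"""Poll /api/cluster/status and dump the JSON (sketch; field names unspecified here)."""
import json
import time
import urllib.request

STATUS_URL = "http://localhost:3001/api/cluster/status"

def poll_once() -> None:
    # The route's actual schema lives in app/api/cluster/status/route.ts; print it raw here.
    with urllib.request.urlopen(STATUS_URL, timeout=10) as resp:
        print(json.dumps(json.load(resp), indent=2))

if __name__ == "__main__":
    while True:
        poll_once()
        time.sleep(30)  # matches the web UI's 30 s auto-refresh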

Files Added:
- cluster/distributed_coordinator.py (510 lines) - Main orchestrator
- cluster/distributed_worker.py (271 lines) - Worker1 script
- cluster/distributed_worker_bd_clean.py (275 lines) - Worker2 script
- cluster/monitor_bd_host01.sh - Monitoring script
- app/api/cluster/status/route.ts (274 lines) - API endpoint
- app/cluster/page.tsx (258 lines) - Web UI
- cluster/CLUSTER_SETUP.md - Complete setup and access documentation

Technical Details:
- SQLite database tracks chunk assignments
- 10,000 combinations per chunk (1,195 total chunks)
- multiprocessing.Pool with 70% CPU limit (22 worker processes per EPYC; see the sketch after this list)
- SSH/SCP for deployment and result collection
- Handles 2-hop SSH for bd-host01 access
- Results in CSV format with top strategies ranked
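
A quick arithmetic check of the figures above as a Python sketch; the ~1.6 s-per-combination cost is the same assumption the coordinator's ETA estimate uses.

# Back-of-the-envelope check of the chunking and core-limit figures.
import math

total_combos = 4**6 * 1 * 3**3 * 4 * 3**3   # grid in distributed_coordinator.py -> 11,943,936
chunk_size = 10_000
threads_per_epyc = 32
cpu_limit = 0.70

chunks = math.ceil(total_combos / chunk_size)          # 1,195 chunks
workers_per_epyc = int(threads_per_epyc * cpu_limit)   # 22 worker processes per server
eta_hours = (total_combos * 1.6) / (64 * 3600)         # ~83 h, assuming ~1.6 s per combination per core

print(chunks, workers_per_epyc, round(eta_hours, 1))   # 1195 22 82.9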

Access Documentation:
- Worker1: ssh root@10.10.254.106
- Worker2: ssh root@10.10.254.106 "ssh root@10.20.254.100"
- Web UI: http://localhost:3001/cluster
- See CLUSTER_SETUP.md for complete guide

Status: Deployed and operational
Author: mindesbunister
Date: 2025-11-30 13:02:18 +01:00
Parent: 2a8e04fe57
Commit: b77282b560
9 changed files with 2190 additions and 0 deletions

File: cluster/distributed_coordinator.py

@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Distributed Continuous Optimization Coordinator
Extends comprehensive_sweep.py to distribute massive parameter grids
across 2 EPYC servers (64 cores total) for 24/7 strategy discovery.
Architecture:
1. Master generates parameter grid (millions of combinations)
2. Splits into chunks (~10,000 combos per chunk)
3. Distributes chunks to workers via SSH
4. Workers run modified comprehensive_sweep on their chunk
5. Master aggregates results, identifies top performers
6. Master generates next exploration batch (nearby good configs)
7. Repeat forever - continuous improvement
Integration with Existing System:
- Uses simulator.py and MoneyLineInputs from /home/comprehensive_sweep/backtester/
- Preserves comprehensive_sweep.py output format (CSV with 14 params)
- Works with existing .venv and data files on EPYC
- Backwards compatible - can still run comprehensive_sweep.py standalone
"""
import sqlite3
import subprocess
import json
import time
import itertools
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
# Worker Configuration
WORKERS = {
'worker1': {
'host': 'root@10.10.254.106',
'cores': 32, # Full 32 threads available
'workspace': '/home/comprehensive_sweep',
'ssh_key': None, # Use default key
},
'worker2': {
'host': 'root@10.20.254.100',
'cores': 32, # Full 32 threads available
'workspace': '/home/backtest_dual/backtest', # CORRECTED: Actual path on bd-host01
'ssh_hop': 'root@10.10.254.106', # Connect through worker1
'ssh_key': None,
}
}
CLUSTER_DIR = Path(__file__).parent
RESULTS_DIR = CLUSTER_DIR / 'distributed_results'
DB_PATH = CLUSTER_DIR / 'exploration.db'
@dataclass
class ParameterGrid:
"""Full parameter space for comprehensive sweep"""
flip_thresholds: List[float]
ma_gaps: List[float]
adx_mins: List[int]
long_pos_maxs: List[int]
short_pos_mins: List[int]
cooldowns: List[int]
position_sizes: List[int]
tp1_multipliers: List[float]
tp2_multipliers: List[float]
sl_multipliers: List[float]
tp1_close_percents: List[int]
trailing_multipliers: List[float]
vol_mins: List[float]
max_bars_list: List[int]
def total_combinations(self) -> int:
"""Calculate total parameter space size"""
return (
len(self.flip_thresholds) * len(self.ma_gaps) * len(self.adx_mins) *
len(self.long_pos_maxs) * len(self.short_pos_mins) * len(self.cooldowns) *
len(self.position_sizes) * len(self.tp1_multipliers) * len(self.tp2_multipliers) *
len(self.sl_multipliers) * len(self.tp1_close_percents) *
len(self.trailing_multipliers) * len(self.vol_mins) * len(self.max_bars_list)
)
def to_dict(self) -> Dict[str, List]:
"""Convert to dict for JSON serialization"""
return {
'flip_thresholds': self.flip_thresholds,
'ma_gaps': self.ma_gaps,
'adx_mins': self.adx_mins,
'long_pos_maxs': self.long_pos_maxs,
'short_pos_mins': self.short_pos_mins,
'cooldowns': self.cooldowns,
'position_sizes': self.position_sizes,
'tp1_multipliers': self.tp1_multipliers,
'tp2_multipliers': self.tp2_multipliers,
'sl_multipliers': self.sl_multipliers,
'tp1_close_percents': self.tp1_close_percents,
'trailing_multipliers': self.trailing_multipliers,
'vol_mins': self.vol_mins,
'max_bars_list': self.max_bars_list,
}
class ExplorationDatabase:
"""Track all tested strategies and exploration progress"""
def __init__(self, db_path: Path):
self.db_path = db_path
self.init_db()
def init_db(self):
"""Create tables"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
# Strategies table - all tested configurations
c.execute('''
CREATE TABLE IF NOT EXISTS strategies (
id INTEGER PRIMARY KEY AUTOINCREMENT,
param_hash TEXT UNIQUE NOT NULL,
indicator_type TEXT NOT NULL,
params_json TEXT NOT NULL,
trades INTEGER,
win_rate REAL,
total_pnl REAL,
pnl_per_1k REAL,
profit_factor REAL,
max_drawdown REAL,
sharpe_ratio REAL,
tested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
worker_id TEXT,
chunk_id TEXT
)
''')
# Exploration chunks - work distribution tracking
c.execute('''
CREATE TABLE IF NOT EXISTS chunks (
id TEXT PRIMARY KEY,
indicator_type TEXT NOT NULL,
grid_json TEXT NOT NULL,
chunk_start INTEGER NOT NULL,
chunk_end INTEGER NOT NULL,
total_combos INTEGER NOT NULL,
assigned_worker TEXT,
status TEXT DEFAULT 'pending',
started_at TIMESTAMP,
completed_at TIMESTAMP,
best_pnl_in_chunk REAL,
results_csv_path TEXT
)
''')
# Exploration phases - high-level progress
c.execute('''
CREATE TABLE IF NOT EXISTS phases (
id INTEGER PRIMARY KEY AUTOINCREMENT,
phase_name TEXT NOT NULL,
indicator_type TEXT NOT NULL,
grid_json TEXT NOT NULL,
total_combos INTEGER NOT NULL,
completed_combos INTEGER DEFAULT 0,
best_pnl_overall REAL DEFAULT 0,
best_params_json TEXT,
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
estimated_completion TIMESTAMP,
actual_completion TIMESTAMP
)
''')
# Create indexes for fast queries
c.execute('CREATE INDEX IF NOT EXISTS idx_pnl_per_1k ON strategies(pnl_per_1k DESC)')
c.execute('CREATE INDEX IF NOT EXISTS idx_indicator_type ON strategies(indicator_type)')
c.execute('CREATE INDEX IF NOT EXISTS idx_chunk_status ON chunks(status)')
conn.commit()
conn.close()
def record_chunk(self, chunk_id: str, indicator_type: str, grid: ParameterGrid,
chunk_start: int, chunk_end: int, assigned_worker: str) -> None:
"""Record new chunk assigned to worker"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
INSERT INTO chunks (id, indicator_type, grid_json, chunk_start, chunk_end,
total_combos, assigned_worker, status, started_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 'running', ?)
''', (chunk_id, indicator_type, json.dumps(grid.to_dict()), chunk_start, chunk_end,
chunk_end - chunk_start, assigned_worker, datetime.now()))
conn.commit()
conn.close()
def complete_chunk(self, chunk_id: str, results_csv_path: str, best_pnl: float) -> None:
"""Mark chunk as completed with results"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
UPDATE chunks
SET status='completed', completed_at=?, results_csv_path=?, best_pnl_in_chunk=?
WHERE id=?
''', (datetime.now(), results_csv_path, best_pnl, chunk_id))
conn.commit()
conn.close()
def import_results_csv(self, csv_path: str, worker_id: str, chunk_id: str) -> int:
"""Import CSV results from comprehensive_sweep into strategies table"""
import csv
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
imported = 0
with open(csv_path, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
# Create parameter hash for deduplication
params = {k: v for k, v in row.items() if k not in [
'rank', 'trades', 'win_rate', 'total_pnl', 'pnl_per_1k',
'profit_factor', 'max_drawdown', 'sharpe_ratio'
]}
param_hash = hashlib.sha256(json.dumps(params, sort_keys=True).encode()).hexdigest()
try:
c.execute('''
INSERT INTO strategies (
param_hash, indicator_type, params_json,
trades, win_rate, total_pnl, pnl_per_1k,
profit_factor, max_drawdown, sharpe_ratio,
worker_id, chunk_id
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
param_hash, 'v9_moneyline', json.dumps(params),
int(row['trades']), float(row['win_rate']), float(row['total_pnl']),
float(row['pnl_per_1k']), float(row.get('profit_factor', 0)),
float(row.get('max_drawdown', 0)), float(row.get('sharpe_ratio', 0)),
worker_id, chunk_id
))
imported += 1
except sqlite3.IntegrityError:
# Duplicate param_hash - already tested this config
pass
conn.commit()
conn.close()
return imported
def get_top_strategies(self, limit: int = 100) -> List[Dict]:
"""Get top performing strategies across all tested"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
SELECT indicator_type, params_json, trades, win_rate, total_pnl, pnl_per_1k,
profit_factor, max_drawdown, sharpe_ratio, tested_at
FROM strategies
WHERE trades >= 700 -- Statistical significance
AND win_rate >= 0.50 AND win_rate <= 0.70 -- Realistic
AND profit_factor >= 1.2 -- Minimum edge
ORDER BY pnl_per_1k DESC
LIMIT ?
''', (limit,))
rows = c.fetchall()
conn.close()
results = []
for row in rows:
results.append({
'indicator_type': row[0],
'params': json.loads(row[1]),
'trades': row[2],
'win_rate': row[3],
'total_pnl': row[4],
'pnl_per_1k': row[5],
'profit_factor': row[6],
'max_drawdown': row[7],
'sharpe_ratio': row[8],
'tested_at': row[9],
})
return results
class DistributedCoordinator:
"""Coordinates distributed parameter sweeps across EPYC servers"""
def __init__(self):
self.db = ExplorationDatabase(DB_PATH)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def ssh_command(self, worker_id: str, command: str) -> subprocess.CompletedProcess:
"""Execute command on worker via SSH"""
worker = WORKERS[worker_id]
if 'ssh_hop' in worker:
# Worker 2 requires hop through worker 1
# CRITICAL FIX (Nov 29, 2025): Use double-nested quotes for 2-hop SSH
# Single quotes don't pass command to inner SSH properly
ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{command}'\""
else:
ssh_cmd = f"ssh {worker['host']} '{command}'"
return subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True)
def deploy_worker_script(self, worker_id: str) -> bool:
"""Deploy distributed_worker.py to EPYC server"""
worker = WORKERS[worker_id]
script_path = CLUSTER_DIR / 'distributed_worker.py'
# Copy script to worker's comprehensive_sweep directory
target = f"{worker['workspace']}/backtester/scripts/distributed_worker.py"
if 'ssh_hop' in worker:
# Two-hop copy for worker2
print(f"📤 Copying worker script to {worker_id} via hop...")
# Copy to worker1 first
subprocess.run(f"scp {script_path} {WORKERS['worker1']['host']}:/tmp/", shell=True)
# Then copy from worker1 to worker2
self.ssh_command('worker1', f"scp /tmp/distributed_worker.py {worker['host']}:{target}")
else:
print(f"📤 Copying worker script to {worker_id}...")
subprocess.run(f"scp {script_path} {worker['host']}:{target}", shell=True)
print(f"✅ Worker script deployed to {worker_id}")
return True
def assign_chunk(self, worker_id: str, chunk_id: str, grid: ParameterGrid,
chunk_start: int, chunk_end: int) -> bool:
"""Assign parameter chunk to worker for processing"""
worker = WORKERS[worker_id]
# Record in database
self.db.record_chunk(chunk_id, 'v9_moneyline', grid, chunk_start, chunk_end, worker_id)
# Create chunk specification JSON
chunk_spec = {
'chunk_id': chunk_id,
'chunk_start': chunk_start,
'chunk_end': chunk_end,
'grid': grid.to_dict(),
'num_workers': worker['cores'],
}
chunk_json_path = RESULTS_DIR / f"{chunk_id}_spec.json"
with open(chunk_json_path, 'w') as f:
json.dump(chunk_spec, f, indent=2)
# Copy chunk spec to worker
target_json = f"{worker['workspace']}/chunk_{chunk_id}.json"
if 'ssh_hop' in worker:
# Two-hop copy
subprocess.run(f"scp {chunk_json_path} {WORKERS['worker1']['host']}:/tmp/", shell=True)
self.ssh_command('worker1', f"scp /tmp/{chunk_id}_spec.json {worker['host']}:{target_json}")
else:
subprocess.run(f"scp {chunk_json_path} {worker['host']}:{target_json}", shell=True)
# Execute distributed_worker.py on worker
# CRITICAL: Simplified SSH command without bash -c to avoid quoting issues
cmd = (f"cd {worker['workspace']} && "
f"source backtester/.venv/bin/activate && "
f"nohup python3 backtester/scripts/distributed_worker.py {target_json} "
f"> /tmp/{chunk_id}.log 2>&1 &")
print(f"🚀 Starting chunk {chunk_id} on {worker_id} ({chunk_end - chunk_start:,} combos)...")
result = self.ssh_command(worker_id, cmd)
if result.returncode == 0:
print(f"✅ Chunk {chunk_id} assigned to {worker_id}")
return True
else:
print(f"❌ Failed to assign chunk {chunk_id} to {worker_id}: {result.stderr}")
return False
def collect_results(self, worker_id: str, chunk_id: str) -> Optional[str]:
"""Collect CSV results from worker"""
worker = WORKERS[worker_id]
# Check if results file exists on worker
results_csv = f"{worker['workspace']}/chunk_{chunk_id}_results.csv"
        check_cmd = f"test -f {results_csv} && echo exists"  # no inner quotes, so the 2-hop SSH quoting stays intact
result = self.ssh_command(worker_id, check_cmd)
if 'exists' not in result.stdout:
return None # Results not ready yet
# Copy results back to master
local_csv = RESULTS_DIR / f"{chunk_id}_results.csv"
if 'ssh_hop' in worker:
# Two-hop copy back
self.ssh_command('worker1', f"scp {worker['host']}:{results_csv} /tmp/")
subprocess.run(f"scp {WORKERS['worker1']['host']}:/tmp/chunk_{chunk_id}_results.csv {local_csv}", shell=True)
else:
subprocess.run(f"scp {worker['host']}:{results_csv} {local_csv}", shell=True)
print(f"📥 Collected results from {worker_id} chunk {chunk_id}")
# Import into database
imported = self.db.import_results_csv(str(local_csv), worker_id, chunk_id)
print(f"📊 Imported {imported} unique strategies from {chunk_id}")
# Get best P&L from CSV for chunk tracking
import csv
with open(local_csv, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)
best_pnl = max(float(row['pnl_per_1k']) for row in rows) if rows else 0
self.db.complete_chunk(chunk_id, str(local_csv), best_pnl)
return str(local_csv)
def start_comprehensive_exploration(self, chunk_size: int = 10000):
"""Start massive comprehensive parameter sweep"""
print("=" * 80)
print("🚀 DISTRIBUTED COMPREHENSIVE EXPLORATION")
print("=" * 80)
print()
# Define full parameter grid (can be expanded)
grid = ParameterGrid(
flip_thresholds=[0.4, 0.5, 0.6, 0.7],
ma_gaps=[0.20, 0.30, 0.40, 0.50],
adx_mins=[18, 21, 24, 27],
long_pos_maxs=[60, 65, 70, 75],
short_pos_mins=[20, 25, 30, 35],
cooldowns=[1, 2, 3, 4],
position_sizes=[10000], # Fixed for fair comparison
tp1_multipliers=[1.5, 2.0, 2.5],
tp2_multipliers=[3.0, 4.0, 5.0],
sl_multipliers=[2.5, 3.0, 3.5],
tp1_close_percents=[50, 60, 70, 75],
trailing_multipliers=[1.0, 1.5, 2.0],
vol_mins=[0.8, 1.0, 1.2],
max_bars_list=[300, 500, 1000],
)
total_combos = grid.total_combinations()
print(f"📊 Total parameter space: {total_combos:,} combinations")
print(f"📦 Chunk size: {chunk_size:,} combinations per chunk")
print(f"🎯 Total chunks: {(total_combos + chunk_size - 1) // chunk_size:,}")
print(f"⏱️ Estimated time: {(total_combos * 1.6) / (64 * 3600):.1f} hours with 64 cores")
print()
# Deploy worker scripts
for worker_id in WORKERS.keys():
self.deploy_worker_script(worker_id)
print()
print("🔄 Distributing chunks to workers...")
print()
# Split work across workers
chunk_id_counter = 0
chunk_start = 0
active_chunks = {}
worker_list = list(WORKERS.keys()) # ['worker1', 'worker2']
while chunk_start < total_combos:
chunk_end = min(chunk_start + chunk_size, total_combos)
chunk_id = f"v9_chunk_{chunk_id_counter:06d}"
# Round-robin assignment across both workers for balanced load
worker_id = worker_list[chunk_id_counter % len(worker_list)]
if self.assign_chunk(worker_id, chunk_id, grid, chunk_start, chunk_end):
active_chunks[chunk_id] = worker_id
chunk_id_counter += 1
chunk_start = chunk_end
# Don't overwhelm workers - limit to 2 chunks per worker at a time
if len(active_chunks) >= len(WORKERS) * 2:
print(f"⏸️ Pausing chunk assignment - {len(active_chunks)} chunks active")
print(f"⏳ Waiting for chunks to complete...")
break
print()
print(f"✅ Assigned {len(active_chunks)} initial chunks")
print()
print("📊 Monitor progress with: python3 cluster/exploration_status.py")
print("🏆 View top strategies: sqlite3 cluster/exploration.db 'SELECT * FROM strategies ORDER BY pnl_per_1k DESC LIMIT 10'")
def main():
"""Main coordinator entry point"""
import argparse
parser = argparse.ArgumentParser(description='Distributed continuous optimization coordinator')
parser.add_argument('--chunk-size', type=int, default=10000,
help='Number of combinations per chunk (default: 10000)')
parser.add_argument('--continuous', action='store_true',
help='Run continuously (not implemented yet)')
args = parser.parse_args()
coordinator = DistributedCoordinator()
coordinator.start_comprehensive_exploration(chunk_size=args.chunk_size)
if __name__ == '__main__':
main()