feat: Add EPYC cluster distributed sweep with web UI

New Features:
- Distributed coordinator orchestrates 2x AMD EPYC servers (16 cores / 32 threads each)
- 64 total threads processing ~12M parameter combinations (70% CPU limit)
- Worker1 (pve-nu-monitor01): Direct SSH access at 10.10.254.106
- Worker2 (bd-host01): 2-hop SSH through worker1 (10.20.254.100)
- Web UI at /cluster shows real-time status and AI recommendations
- API endpoint /api/cluster/status serves cluster metrics (see the poller sketch after this list)
- Auto-refresh every 30s with top strategies and actionable insights
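
A minimal status-poller sketch in Python. Assumptions: the route returns JSON and the dev server listens on port 3001 as noted under Access Documentation below; the exact response fields are not specified in this commit, so the sketch just pretty-prints whatever comes back.

#!/usr/bin/env python3
"""Poll /api/cluster/status and dump the JSON (sketch; field names unspecified here)."""
import json
import time
import urllib.request

STATUS_URL = "http://localhost:3001/api/cluster/status"

def poll_once() -> None:
    # The route's actual schema lives in app/api/cluster/status/route.ts; print it raw here.
    with urllib.request.urlopen(STATUS_URL, timeout=10) as resp:
        print(json.dumps(json.load(resp), indent=2))

if __name__ == "__main__":
    while True:
        poll_once()
        time.sleep(30)  # matches the web UI's 30 s auto-refresh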

Files Added:
- cluster/distributed_coordinator.py (510 lines) - Main orchestrator
- cluster/distributed_worker.py (271 lines) - Worker1 script
- cluster/distributed_worker_bd_clean.py (275 lines) - Worker2 script
- cluster/monitor_bd_host01.sh - Monitoring script
- app/api/cluster/status/route.ts (274 lines) - API endpoint
- app/cluster/page.tsx (258 lines) - Web UI
- cluster/CLUSTER_SETUP.md - Complete setup and access documentation

Technical Details:
- SQLite database tracks chunk assignments
- 10,000 combinations per chunk (1,195 total chunks)
- multiprocessing.Pool with 70% CPU limit (22 worker processes per EPYC; see the sketch after this list)
- SSH/SCP for deployment and result collection
- Handles 2-hop SSH for bd-host01 access
- Results in CSV format with top strategies ranked
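
A quick arithmetic check of the figures above as a Python sketch; the ~1.6 s-per-combination cost is the same assumption the coordinator's ETA estimate uses.

# Back-of-the-envelope check of the chunking and core-limit figures.
import math

total_combos = 4**6 * 1 * 3**3 * 4 * 3**3   # grid in distributed_coordinator.py -> 11,943,936
chunk_size = 10_000
threads_per_epyc = 32
cpu_limit = 0.70

chunks = math.ceil(total_combos / chunk_size)          # 1,195 chunks
workers_per_epyc = int(threads_per_epyc * cpu_limit)   # 22 worker processes per server
eta_hours = (total_combos * 1.6) / (64 * 3600)         # ~83 h, assuming ~1.6 s per combination per core

print(chunks, workers_per_epyc, round(eta_hours, 1))   # 1195 22 82.9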

Access Documentation:
- Worker1: ssh root@10.10.254.106
- Worker2: ssh root@10.10.254.106 "ssh root@10.20.254.100"
- Web UI: http://localhost:3001/cluster
- See CLUSTER_SETUP.md for complete guide

Status: Deployed and operational
Author: mindesbunister
Date: 2025-11-30 13:02:18 +01:00
Parent: 2a8e04fe57
Commit: b77282b560
9 changed files with 2190 additions and 0 deletions

File: cluster/distributed_coordinator.py

@@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""
Distributed Continuous Optimization Coordinator
Extends comprehensive_sweep.py to distribute massive parameter grids
across 2 EPYC servers (64 cores total) for 24/7 strategy discovery.
Architecture:
1. Master generates parameter grid (millions of combinations)
2. Splits into chunks (~10,000 combos per chunk)
3. Distributes chunks to workers via SSH
4. Workers run modified comprehensive_sweep on their chunk
5. Master aggregates results, identifies top performers
6. Master generates next exploration batch (nearby good configs)
7. Repeat forever - continuous improvement
Integration with Existing System:
- Uses simulator.py and MoneyLineInputs from /home/comprehensive_sweep/backtester/
- Preserves comprehensive_sweep.py output format (CSV with 14 params)
- Works with existing .venv and data files on EPYC
- Backwards compatible - can still run comprehensive_sweep.py standalone
"""
import sqlite3
import subprocess
import json
import time
import itertools
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
# Worker Configuration
WORKERS = {
'worker1': {
'host': 'root@10.10.254.106',
'cores': 32, # Full 32 threads available
'workspace': '/home/comprehensive_sweep',
'ssh_key': None, # Use default key
},
'worker2': {
'host': 'root@10.20.254.100',
'cores': 32, # Full 32 threads available
'workspace': '/home/backtest_dual/backtest', # CORRECTED: Actual path on bd-host01
'ssh_hop': 'root@10.10.254.106', # Connect through worker1
'ssh_key': None,
}
}
CLUSTER_DIR = Path(__file__).parent
RESULTS_DIR = CLUSTER_DIR / 'distributed_results'
DB_PATH = CLUSTER_DIR / 'exploration.db'
@dataclass
class ParameterGrid:
"""Full parameter space for comprehensive sweep"""
flip_thresholds: List[float]
ma_gaps: List[float]
adx_mins: List[int]
long_pos_maxs: List[int]
short_pos_mins: List[int]
cooldowns: List[int]
position_sizes: List[int]
tp1_multipliers: List[float]
tp2_multipliers: List[float]
sl_multipliers: List[float]
tp1_close_percents: List[int]
trailing_multipliers: List[float]
vol_mins: List[float]
max_bars_list: List[int]
def total_combinations(self) -> int:
"""Calculate total parameter space size"""
return (
len(self.flip_thresholds) * len(self.ma_gaps) * len(self.adx_mins) *
len(self.long_pos_maxs) * len(self.short_pos_mins) * len(self.cooldowns) *
len(self.position_sizes) * len(self.tp1_multipliers) * len(self.tp2_multipliers) *
len(self.sl_multipliers) * len(self.tp1_close_percents) *
len(self.trailing_multipliers) * len(self.vol_mins) * len(self.max_bars_list)
)
def to_dict(self) -> Dict[str, List]:
"""Convert to dict for JSON serialization"""
return {
'flip_thresholds': self.flip_thresholds,
'ma_gaps': self.ma_gaps,
'adx_mins': self.adx_mins,
'long_pos_maxs': self.long_pos_maxs,
'short_pos_mins': self.short_pos_mins,
'cooldowns': self.cooldowns,
'position_sizes': self.position_sizes,
'tp1_multipliers': self.tp1_multipliers,
'tp2_multipliers': self.tp2_multipliers,
'sl_multipliers': self.sl_multipliers,
'tp1_close_percents': self.tp1_close_percents,
'trailing_multipliers': self.trailing_multipliers,
'vol_mins': self.vol_mins,
'max_bars_list': self.max_bars_list,
}
class ExplorationDatabase:
"""Track all tested strategies and exploration progress"""
def __init__(self, db_path: Path):
self.db_path = db_path
self.init_db()
def init_db(self):
"""Create tables"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
# Strategies table - all tested configurations
c.execute('''
CREATE TABLE IF NOT EXISTS strategies (
id INTEGER PRIMARY KEY AUTOINCREMENT,
param_hash TEXT UNIQUE NOT NULL,
indicator_type TEXT NOT NULL,
params_json TEXT NOT NULL,
trades INTEGER,
win_rate REAL,
total_pnl REAL,
pnl_per_1k REAL,
profit_factor REAL,
max_drawdown REAL,
sharpe_ratio REAL,
tested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
worker_id TEXT,
chunk_id TEXT
)
''')
# Exploration chunks - work distribution tracking
c.execute('''
CREATE TABLE IF NOT EXISTS chunks (
id TEXT PRIMARY KEY,
indicator_type TEXT NOT NULL,
grid_json TEXT NOT NULL,
chunk_start INTEGER NOT NULL,
chunk_end INTEGER NOT NULL,
total_combos INTEGER NOT NULL,
assigned_worker TEXT,
status TEXT DEFAULT 'pending',
started_at TIMESTAMP,
completed_at TIMESTAMP,
best_pnl_in_chunk REAL,
results_csv_path TEXT
)
''')
# Exploration phases - high-level progress
c.execute('''
CREATE TABLE IF NOT EXISTS phases (
id INTEGER PRIMARY KEY AUTOINCREMENT,
phase_name TEXT NOT NULL,
indicator_type TEXT NOT NULL,
grid_json TEXT NOT NULL,
total_combos INTEGER NOT NULL,
completed_combos INTEGER DEFAULT 0,
best_pnl_overall REAL DEFAULT 0,
best_params_json TEXT,
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
estimated_completion TIMESTAMP,
actual_completion TIMESTAMP
)
''')
# Create indexes for fast queries
c.execute('CREATE INDEX IF NOT EXISTS idx_pnl_per_1k ON strategies(pnl_per_1k DESC)')
c.execute('CREATE INDEX IF NOT EXISTS idx_indicator_type ON strategies(indicator_type)')
c.execute('CREATE INDEX IF NOT EXISTS idx_chunk_status ON chunks(status)')
conn.commit()
conn.close()
def record_chunk(self, chunk_id: str, indicator_type: str, grid: ParameterGrid,
chunk_start: int, chunk_end: int, assigned_worker: str) -> None:
"""Record new chunk assigned to worker"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
INSERT INTO chunks (id, indicator_type, grid_json, chunk_start, chunk_end,
total_combos, assigned_worker, status, started_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 'running', ?)
''', (chunk_id, indicator_type, json.dumps(grid.to_dict()), chunk_start, chunk_end,
chunk_end - chunk_start, assigned_worker, datetime.now()))
conn.commit()
conn.close()
def complete_chunk(self, chunk_id: str, results_csv_path: str, best_pnl: float) -> None:
"""Mark chunk as completed with results"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
UPDATE chunks
SET status='completed', completed_at=?, results_csv_path=?, best_pnl_in_chunk=?
WHERE id=?
''', (datetime.now(), results_csv_path, best_pnl, chunk_id))
conn.commit()
conn.close()
def import_results_csv(self, csv_path: str, worker_id: str, chunk_id: str) -> int:
"""Import CSV results from comprehensive_sweep into strategies table"""
import csv
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
imported = 0
with open(csv_path, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
# Create parameter hash for deduplication
params = {k: v for k, v in row.items() if k not in [
'rank', 'trades', 'win_rate', 'total_pnl', 'pnl_per_1k',
'profit_factor', 'max_drawdown', 'sharpe_ratio'
]}
param_hash = hashlib.sha256(json.dumps(params, sort_keys=True).encode()).hexdigest()
try:
c.execute('''
INSERT INTO strategies (
param_hash, indicator_type, params_json,
trades, win_rate, total_pnl, pnl_per_1k,
profit_factor, max_drawdown, sharpe_ratio,
worker_id, chunk_id
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
param_hash, 'v9_moneyline', json.dumps(params),
int(row['trades']), float(row['win_rate']), float(row['total_pnl']),
float(row['pnl_per_1k']), float(row.get('profit_factor', 0)),
float(row.get('max_drawdown', 0)), float(row.get('sharpe_ratio', 0)),
worker_id, chunk_id
))
imported += 1
except sqlite3.IntegrityError:
# Duplicate param_hash - already tested this config
pass
conn.commit()
conn.close()
return imported
def get_top_strategies(self, limit: int = 100) -> List[Dict]:
"""Get top performing strategies across all tested"""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
SELECT indicator_type, params_json, trades, win_rate, total_pnl, pnl_per_1k,
profit_factor, max_drawdown, sharpe_ratio, tested_at
FROM strategies
WHERE trades >= 700 -- Statistical significance
AND win_rate >= 0.50 AND win_rate <= 0.70 -- Realistic
AND profit_factor >= 1.2 -- Minimum edge
ORDER BY pnl_per_1k DESC
LIMIT ?
''', (limit,))
rows = c.fetchall()
conn.close()
results = []
for row in rows:
results.append({
'indicator_type': row[0],
'params': json.loads(row[1]),
'trades': row[2],
'win_rate': row[3],
'total_pnl': row[4],
'pnl_per_1k': row[5],
'profit_factor': row[6],
'max_drawdown': row[7],
'sharpe_ratio': row[8],
'tested_at': row[9],
})
return results
class DistributedCoordinator:
"""Coordinates distributed parameter sweeps across EPYC servers"""
def __init__(self):
self.db = ExplorationDatabase(DB_PATH)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def ssh_command(self, worker_id: str, command: str) -> subprocess.CompletedProcess:
"""Execute command on worker via SSH"""
worker = WORKERS[worker_id]
if 'ssh_hop' in worker:
# Worker 2 requires hop through worker 1
# CRITICAL FIX (Nov 29, 2025): Use double-nested quotes for 2-hop SSH
# Single quotes don't pass command to inner SSH properly
ssh_cmd = f"ssh {worker['ssh_hop']} \"ssh {worker['host']} '{command}'\""
else:
ssh_cmd = f"ssh {worker['host']} '{command}'"
return subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True)
def deploy_worker_script(self, worker_id: str) -> bool:
"""Deploy distributed_worker.py to EPYC server"""
worker = WORKERS[worker_id]
script_path = CLUSTER_DIR / 'distributed_worker.py'
# Copy script to worker's comprehensive_sweep directory
target = f"{worker['workspace']}/backtester/scripts/distributed_worker.py"
if 'ssh_hop' in worker:
# Two-hop copy for worker2
print(f"📤 Copying worker script to {worker_id} via hop...")
# Copy to worker1 first
subprocess.run(f"scp {script_path} {WORKERS['worker1']['host']}:/tmp/", shell=True)
# Then copy from worker1 to worker2
self.ssh_command('worker1', f"scp /tmp/distributed_worker.py {worker['host']}:{target}")
else:
print(f"📤 Copying worker script to {worker_id}...")
subprocess.run(f"scp {script_path} {worker['host']}:{target}", shell=True)
print(f"✅ Worker script deployed to {worker_id}")
return True
def assign_chunk(self, worker_id: str, chunk_id: str, grid: ParameterGrid,
chunk_start: int, chunk_end: int) -> bool:
"""Assign parameter chunk to worker for processing"""
worker = WORKERS[worker_id]
# Record in database
self.db.record_chunk(chunk_id, 'v9_moneyline', grid, chunk_start, chunk_end, worker_id)
# Create chunk specification JSON
chunk_spec = {
'chunk_id': chunk_id,
'chunk_start': chunk_start,
'chunk_end': chunk_end,
'grid': grid.to_dict(),
'num_workers': worker['cores'],
}
chunk_json_path = RESULTS_DIR / f"{chunk_id}_spec.json"
with open(chunk_json_path, 'w') as f:
json.dump(chunk_spec, f, indent=2)
# Copy chunk spec to worker
target_json = f"{worker['workspace']}/chunk_{chunk_id}.json"
if 'ssh_hop' in worker:
# Two-hop copy
subprocess.run(f"scp {chunk_json_path} {WORKERS['worker1']['host']}:/tmp/", shell=True)
self.ssh_command('worker1', f"scp /tmp/{chunk_id}_spec.json {worker['host']}:{target_json}")
else:
subprocess.run(f"scp {chunk_json_path} {worker['host']}:{target_json}", shell=True)
# Execute distributed_worker.py on worker
# CRITICAL: Simplified SSH command without bash -c to avoid quoting issues
cmd = (f"cd {worker['workspace']} && "
f"source backtester/.venv/bin/activate && "
f"nohup python3 backtester/scripts/distributed_worker.py {target_json} "
f"> /tmp/{chunk_id}.log 2>&1 &")
print(f"🚀 Starting chunk {chunk_id} on {worker_id} ({chunk_end - chunk_start:,} combos)...")
result = self.ssh_command(worker_id, cmd)
if result.returncode == 0:
print(f"✅ Chunk {chunk_id} assigned to {worker_id}")
return True
else:
print(f"❌ Failed to assign chunk {chunk_id} to {worker_id}: {result.stderr}")
return False
def collect_results(self, worker_id: str, chunk_id: str) -> Optional[str]:
"""Collect CSV results from worker"""
worker = WORKERS[worker_id]
# Check if results file exists on worker
results_csv = f"{worker['workspace']}/chunk_{chunk_id}_results.csv"
        check_cmd = f"test -f {results_csv} && echo exists"  # no inner quotes, so the 2-hop SSH quoting stays intact
result = self.ssh_command(worker_id, check_cmd)
if 'exists' not in result.stdout:
return None # Results not ready yet
# Copy results back to master
local_csv = RESULTS_DIR / f"{chunk_id}_results.csv"
if 'ssh_hop' in worker:
# Two-hop copy back
self.ssh_command('worker1', f"scp {worker['host']}:{results_csv} /tmp/")
subprocess.run(f"scp {WORKERS['worker1']['host']}:/tmp/chunk_{chunk_id}_results.csv {local_csv}", shell=True)
else:
subprocess.run(f"scp {worker['host']}:{results_csv} {local_csv}", shell=True)
print(f"📥 Collected results from {worker_id} chunk {chunk_id}")
# Import into database
imported = self.db.import_results_csv(str(local_csv), worker_id, chunk_id)
print(f"📊 Imported {imported} unique strategies from {chunk_id}")
# Get best P&L from CSV for chunk tracking
import csv
with open(local_csv, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)
best_pnl = max(float(row['pnl_per_1k']) for row in rows) if rows else 0
self.db.complete_chunk(chunk_id, str(local_csv), best_pnl)
return str(local_csv)
def start_comprehensive_exploration(self, chunk_size: int = 10000):
"""Start massive comprehensive parameter sweep"""
print("=" * 80)
print("🚀 DISTRIBUTED COMPREHENSIVE EXPLORATION")
print("=" * 80)
print()
# Define full parameter grid (can be expanded)
grid = ParameterGrid(
flip_thresholds=[0.4, 0.5, 0.6, 0.7],
ma_gaps=[0.20, 0.30, 0.40, 0.50],
adx_mins=[18, 21, 24, 27],
long_pos_maxs=[60, 65, 70, 75],
short_pos_mins=[20, 25, 30, 35],
cooldowns=[1, 2, 3, 4],
position_sizes=[10000], # Fixed for fair comparison
tp1_multipliers=[1.5, 2.0, 2.5],
tp2_multipliers=[3.0, 4.0, 5.0],
sl_multipliers=[2.5, 3.0, 3.5],
tp1_close_percents=[50, 60, 70, 75],
trailing_multipliers=[1.0, 1.5, 2.0],
vol_mins=[0.8, 1.0, 1.2],
max_bars_list=[300, 500, 1000],
)
total_combos = grid.total_combinations()
print(f"📊 Total parameter space: {total_combos:,} combinations")
print(f"📦 Chunk size: {chunk_size:,} combinations per chunk")
print(f"🎯 Total chunks: {(total_combos + chunk_size - 1) // chunk_size:,}")
print(f"⏱️ Estimated time: {(total_combos * 1.6) / (64 * 3600):.1f} hours with 64 cores")
print()
# Deploy worker scripts
for worker_id in WORKERS.keys():
self.deploy_worker_script(worker_id)
print()
print("🔄 Distributing chunks to workers...")
print()
# Split work across workers
chunk_id_counter = 0
chunk_start = 0
active_chunks = {}
worker_list = list(WORKERS.keys()) # ['worker1', 'worker2']
while chunk_start < total_combos:
chunk_end = min(chunk_start + chunk_size, total_combos)
chunk_id = f"v9_chunk_{chunk_id_counter:06d}"
# Round-robin assignment across both workers for balanced load
worker_id = worker_list[chunk_id_counter % len(worker_list)]
if self.assign_chunk(worker_id, chunk_id, grid, chunk_start, chunk_end):
active_chunks[chunk_id] = worker_id
chunk_id_counter += 1
chunk_start = chunk_end
# Don't overwhelm workers - limit to 2 chunks per worker at a time
if len(active_chunks) >= len(WORKERS) * 2:
print(f"⏸️ Pausing chunk assignment - {len(active_chunks)} chunks active")
print(f"⏳ Waiting for chunks to complete...")
break
print()
print(f"✅ Assigned {len(active_chunks)} initial chunks")
print()
print("📊 Monitor progress with: python3 cluster/exploration_status.py")
print("🏆 View top strategies: sqlite3 cluster/exploration.db 'SELECT * FROM strategies ORDER BY pnl_per_1k DESC LIMIT 10'")
def main():
"""Main coordinator entry point"""
import argparse
parser = argparse.ArgumentParser(description='Distributed continuous optimization coordinator')
parser.add_argument('--chunk-size', type=int, default=10000,
help='Number of combinations per chunk (default: 10000)')
parser.add_argument('--continuous', action='store_true',
help='Run continuously (not implemented yet)')
args = parser.parse_args()
coordinator = DistributedCoordinator()
coordinator.start_comprehensive_exploration(chunk_size=args.chunk_size)
if __name__ == '__main__':
main()