feat: Continuous optimization cluster for 2 EPYC servers
- Master controller with job queue and result aggregation
- Worker scripts for parallel backtesting (22 workers per server)
- SQLite database for strategy ranking and performance tracking
- File-based job queue (simple, robust, survives crashes)
- Auto-setup script for both EPYC servers
- Status dashboard for monitoring progress
- Comprehensive deployment guide

Architecture:
- Master: Job generation, worker coordination, result collection
- Worker 1 (pve-nu-monitor01): AMD EPYC 7282, 22 parallel jobs
- Worker 2 (srv-bd-host01): AMD EPYC 7302, 22 parallel jobs
- Total capacity: ~49,000 backtests/day (44 cores @ 70%)

Initial focus: v9 parameter refinement (27 configurations)
Target: Find strategies >00/1k P&L (current baseline 92/1k)

Files:
- cluster/master.py: Main controller (570 lines)
- cluster/worker.py: Worker execution script (220 lines)
- cluster/setup_cluster.sh: Automated deployment
- cluster/status.py: Real-time status dashboard
- cluster/README.md: Operational documentation
- cluster/DEPLOYMENT.md: Step-by-step deployment guide
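For orientation, a minimal sketch of how the file-based queue and SQLite ranking described above might fit together. The cluster/queue/*.json layout and the strategies table with its pnl_per_1k column come from the monitoring commands in this commit; enqueue_job and top_strategies are illustrative helpers and are not code from master.py.

# Illustrative sketch only -- not taken from cluster/master.py.
# Assumes the cluster/queue/*.json layout and the strategies table
# (with a pnl_per_1k column) referenced in the "Next steps" commands.
import json
import sqlite3
from pathlib import Path

QUEUE_DIR = Path("cluster/queue")
DB_PATH = "cluster/strategies.db"

def enqueue_job(job_id: str, params: dict) -> Path:
    """Write one backtest job as its own JSON file; completed files
    survive a master crash, so the queue can simply be re-scanned."""
    QUEUE_DIR.mkdir(parents=True, exist_ok=True)
    path = QUEUE_DIR / f"{job_id}.json"
    path.write_text(json.dumps({"id": job_id, "params": params}))
    return path

def top_strategies(limit: int = 10):
    """Rank stored strategies by P&L per 1k, mirroring the sqlite3 query in the deployment notes."""
    with sqlite3.connect(DB_PATH) as conn:
        return conn.execute(
            "SELECT * FROM strategies ORDER BY pnl_per_1k DESC LIMIT ?",
            (limit,),
        ).fetchall()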
cluster/setup_cluster.sh (new executable file, +99 lines)
@@ -0,0 +1,99 @@
#!/bin/bash
# Setup optimization cluster on both EPYC servers

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "🚀 Setting up optimization cluster..."

# Configuration
WORKER1_HOST="root@10.10.254.106"
WORKER2_HOP="$WORKER1_HOST"
WORKER2_HOST="root@10.20.254.100"
WORKSPACE="/root/optimization-cluster"

# Colors
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

setup_worker() {
    local HOST=$1
    local NAME=$2
    local VIA_HOP=${3:-}

    echo -e "\n${BLUE}Setting up $NAME ($HOST)...${NC}"

    # Build SSH command
    SSH_CMD="ssh $HOST"
    if [ -n "$VIA_HOP" ]; then
        SSH_CMD="ssh $VIA_HOP ssh $HOST"
    fi

    # Create workspace
    $SSH_CMD "mkdir -p $WORKSPACE/{jobs,results,data,backtester,logs}"

    # Install Python dependencies
    echo "  📦 Installing Python packages..."
    $SSH_CMD "cd $WORKSPACE && python3 -m venv .venv"
    $SSH_CMD "cd $WORKSPACE && .venv/bin/pip install pandas numpy"

    # Copy backtester code
    echo "  📁 Copying backtester modules..."
    if [ -n "$VIA_HOP" ]; then
        # Two-hop transfer: local -> worker1 -> worker2
        # Stage on the hop host, then relay from the hop to the final worker
        scp -r "$SCRIPT_DIR/../backtester/"* "$VIA_HOP:$WORKSPACE/backtester/" > /dev/null 2>&1
        ssh "$VIA_HOP" "rsync -a $WORKSPACE/backtester/ $HOST:$WORKSPACE/backtester/"
    else
        scp -r "$SCRIPT_DIR/../backtester/"* "$HOST:$WORKSPACE/backtester/" > /dev/null 2>&1
    fi

    # Copy worker script
    echo "  📄 Installing worker script..."
    if [ -n "$VIA_HOP" ]; then
        # Relay via the hop host, as above
        scp "$SCRIPT_DIR/worker.py" "$VIA_HOP:$WORKSPACE/" > /dev/null 2>&1
        ssh "$VIA_HOP" "scp $WORKSPACE/worker.py $HOST:$WORKSPACE/"
    else
        scp "$SCRIPT_DIR/worker.py" "$HOST:$WORKSPACE/" > /dev/null 2>&1
    fi

    $SSH_CMD "chmod +x $WORKSPACE/worker.py"

    # Copy data file
    echo "  📊 Copying OHLCV data..."
    if [ -f "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" ]; then
        if [ -n "$VIA_HOP" ]; then
            scp "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" "$VIA_HOP:$WORKSPACE/data/" > /dev/null 2>&1
            ssh "$VIA_HOP" "scp $WORKSPACE/data/solusdt_5m.csv $HOST:$WORKSPACE/data/"
        else
            scp "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" "$HOST:$WORKSPACE/data/" > /dev/null 2>&1
        fi
    else
        echo "  ⚠️  Warning: solusdt_5m.csv not found, download manually"
    fi

    # Verify setup
    echo "  ✅ Verifying installation..."
    $SSH_CMD "cd $WORKSPACE && ls -lah"

    echo -e "${GREEN}✅ $NAME setup complete${NC}"
}

# Setup Worker 1 (direct connection)
setup_worker "$WORKER1_HOST" "Worker 1 (pve-nu-monitor01)"

# Setup Worker 2 (via Worker 1 hop)
setup_worker "$WORKER2_HOST" "Worker 2 (srv-bd-host01)" "$WORKER2_HOP"

echo -e "\n${GREEN}🎉 Cluster setup complete!${NC}"
echo ""
echo "Next steps:"
echo "  1. Start master controller:"
echo "     cd $SCRIPT_DIR && python3 master.py"
echo ""
echo "  2. Monitor cluster status:"
echo "     watch -n 5 'ls -1 cluster/queue/*.json 2>/dev/null | wc -l'"
echo ""
echo "  3. View results:"
echo "     sqlite3 cluster/strategies.db 'SELECT * FROM strategies ORDER BY pnl_per_1k DESC LIMIT 10'"
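The worker side (cluster/worker.py) is not shown in this diff. As a rough sketch of the claim-and-execute loop implied by the file-based queue, each of the 22 worker processes per server might atomically claim a job file from the jobs/ directory created above and drop a result JSON into results/. The directory layout matches setup_cluster.sh; the claim protocol and run_backtest stand-in below are assumptions, not the actual worker.py.

# Hypothetical worker loop -- a sketch of the file-based queue protocol,
# not the actual contents of cluster/worker.py.
import json
import os
import time
from pathlib import Path
from typing import Optional

WORKSPACE = Path("/root/optimization-cluster")   # matches setup_cluster.sh
JOBS, RESULTS = WORKSPACE / "jobs", WORKSPACE / "results"

def run_backtest(params: dict) -> dict:
    """Stand-in for the real backtester call (not part of this commit)."""
    return {"pnl_per_1k": 0.0, "params": params}

def claim(job: Path) -> Optional[Path]:
    """Atomically rename the job file so no two workers run the same job."""
    claimed = job.with_name(job.name + f".claimed{os.getpid()}")
    try:
        job.rename(claimed)
        return claimed
    except FileNotFoundError:
        return None  # another worker got there first

def main() -> None:
    while True:
        for job_file in sorted(JOBS.glob("*.json")):
            claimed = claim(job_file)
            if claimed is None:
                continue
            job = json.loads(claimed.read_text())
            result = run_backtest(job["params"])
            out = RESULTS / f"{job['id']}.json"
            out.write_text(json.dumps({"id": job["id"], "result": result}))
            claimed.unlink()
        time.sleep(5)  # idle poll interval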