- Master controller with job queue and result aggregation - Worker scripts for parallel backtesting (22 workers per server) - SQLite database for strategy ranking and performance tracking - File-based job queue (simple, robust, survives crashes) - Auto-setup script for both EPYC servers - Status dashboard for monitoring progress - Comprehensive deployment guide Architecture: - Master: Job generation, worker coordination, result collection - Worker 1 (pve-nu-monitor01): AMD EPYC 7282, 22 parallel jobs - Worker 2 (srv-bd-host01): AMD EPYC 7302, 22 parallel jobs - Total capacity: ~49,000 backtests/day (44 cores @ 70%) Initial focus: v9 parameter refinement (27 configurations) Target: Find strategies >$100/1k P&L (current baseline $92/1k) Files: - cluster/master.py: Main controller (570 lines) - cluster/worker.py: Worker execution script (220 lines) - cluster/setup_cluster.sh: Automated deployment - cluster/status.py: Real-time status dashboard - cluster/README.md: Operational documentation - cluster/DEPLOYMENT.md: Step-by-step deployment guide
100 lines
3.1 KiB
Bash
Executable File
100 lines
3.1 KiB
Bash
Executable File
#!/bin/bash
#
# setup_cluster.sh — deploy the optimization cluster to both EPYC workers.
#
# Worker 1 is reachable directly; Worker 2 is reached by hopping through
# Worker 1 (see setup_worker's VIA_HOP argument). Requires passwordless
# SSH to Worker 1, and from Worker 1 to Worker 2.

# Strict mode: abort on errors, unset variables, and mid-pipeline failures
# (the original plain `set -e` missed the latter two).
set -euo pipefail

# Absolute directory containing this script, so relative copies
# (../backtester, worker.py) work regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "🚀 Setting up optimization cluster..."

# Configuration — SSH targets and the remote workspace root.
readonly WORKER1_HOST="root@10.10.254.106"
readonly WORKER2_HOP="$WORKER1_HOST"   # Worker 2 is only reachable via Worker 1
readonly WORKER2_HOST="root@10.20.254.100"
readonly WORKSPACE="/root/optimization-cluster"

# ANSI colors for status output (NC = reset).
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
#######################################
# Provision one worker node: workspace dirs, Python venv + packages,
# backtester code, worker script, and OHLCV data. When a hop host is
# given, all transfers are staged through it (local -> hop -> worker).
# Globals:   SCRIPT_DIR, WORKSPACE (read)
# Arguments: $1 - SSH target (user@host)
#            $2 - human-readable worker name (for log messages)
#            $3 - optional hop host; when set, $1 is reached via $3
# Outputs:   progress messages to stdout
# Returns:   non-zero (aborting the script under set -e) on any failure
#######################################
setup_worker() {
    local HOST=$1
    local NAME=$2
    local VIA_HOP=${3:-}

    echo -e "\n${BLUE}Setting up $NAME ($HOST)...${NC}"

    # Build the SSH command as an array so every element is passed through
    # unmangled. (The original kept a global string and relied on unquoted
    # word-splitting — fragile, and it leaked SSH_CMD into the caller.)
    local -a SSH_CMD=(ssh "$HOST")
    if [ -n "$VIA_HOP" ]; then
        # Two-hop: run ssh on the hop host, which then reaches the worker.
        SSH_CMD=(ssh "$VIA_HOP" ssh "$HOST")
    fi

    # Create workspace directory tree (brace expansion runs remotely).
    "${SSH_CMD[@]}" "mkdir -p $WORKSPACE/{jobs,results,data,backtester,logs}"

    # Install Python dependencies into a dedicated venv on the worker.
    echo " 📦 Installing Python packages..."
    "${SSH_CMD[@]}" "cd $WORKSPACE && python3 -m venv .venv"
    "${SSH_CMD[@]}" "cd $WORKSPACE && .venv/bin/pip install pandas numpy"

    # Copy backtester code. stdout is silenced to keep the log tidy, but
    # stderr is now left visible so a failed transfer explains itself
    # before set -e aborts (the old `2>&1` hid the reason).
    echo " 📁 Copying backtester modules..."
    if [ -n "$VIA_HOP" ]; then
        # Two-hop transfer: local -> hop -> worker (rsync runs on the hop).
        scp -r "$SCRIPT_DIR/../backtester/"* "$VIA_HOP:$WORKSPACE/backtester/" > /dev/null
        ssh "$VIA_HOP" "rsync -a $WORKSPACE/backtester/ $HOST:$WORKSPACE/backtester/"
    else
        scp -r "$SCRIPT_DIR/../backtester/"* "$HOST:$WORKSPACE/backtester/" > /dev/null
    fi

    # Copy the worker execution script.
    echo " 📄 Installing worker script..."
    if [ -n "$VIA_HOP" ]; then
        scp "$SCRIPT_DIR/worker.py" "$VIA_HOP:$WORKSPACE/" > /dev/null
        ssh "$VIA_HOP" "scp $WORKSPACE/worker.py $HOST:$WORKSPACE/"
    else
        scp "$SCRIPT_DIR/worker.py" "$HOST:$WORKSPACE/" > /dev/null
    fi

    "${SSH_CMD[@]}" "chmod +x $WORKSPACE/worker.py"

    # Copy OHLCV data file if present locally; otherwise warn and continue.
    echo " 📊 Copying OHLCV data..."
    if [ -f "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" ]; then
        if [ -n "$VIA_HOP" ]; then
            scp "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" "$VIA_HOP:$WORKSPACE/data/" > /dev/null
            ssh "$VIA_HOP" "scp $WORKSPACE/data/solusdt_5m.csv $HOST:$WORKSPACE/data/"
        else
            scp "$SCRIPT_DIR/../backtester/data/solusdt_5m.csv" "$HOST:$WORKSPACE/data/" > /dev/null
        fi
    else
        echo " ⚠️ Warning: solusdt_5m.csv not found, download manually"
    fi

    # Verify setup by listing the workspace on the worker.
    echo " ✅ Verifying installation..."
    "${SSH_CMD[@]}" "cd $WORKSPACE && ls -lah"

    echo -e "${GREEN}✅ $NAME setup complete${NC}"
}
|
|
|
|
# Provision Worker 1 (directly reachable from this machine).
setup_worker "$WORKER1_HOST" "Worker 1 (pve-nu-monitor01)"

# Provision Worker 2, tunnelling through Worker 1. Use the WORKER2_HOP
# constant declared above rather than repeating WORKER1_HOST, so the hop
# route has a single source of truth.
setup_worker "$WORKER2_HOST" "Worker 2 (srv-bd-host01)" "$WORKER2_HOP"

# Final summary and operator next-steps.
echo -e "\n${GREEN}🎉 Cluster setup complete!${NC}"
echo ""
echo "Next steps:"
echo " 1. Start master controller:"
echo " cd $SCRIPT_DIR && python3 master.py"
echo ""
echo " 2. Monitor cluster status:"
echo " watch -n 5 'ls -1 cluster/queue/*.json 2>/dev/null | wc -l'"
echo ""
echo " 3. View results:"
echo " sqlite3 cluster/strategies.db 'SELECT * FROM strategies ORDER BY pnl_per_1k DESC LIMIT 10'"