trading_bot_v4/app/api/cluster/status/route.ts
mindesbunister b77282b560 feat: Add EPYC cluster distributed sweep with web UI
New Features:
- Distributed coordinator orchestrates 2x AMD EPYC 16-core servers
- 64 total cores processing 12M parameter combinations (70% CPU limit)
- Worker1 (pve-nu-monitor01): Direct SSH access at 10.10.254.106
- Worker2 (bd-host01): 2-hop SSH through worker1 (10.20.254.100)
- Web UI at /cluster shows real-time status and AI recommendations
- API endpoint /api/cluster/status serves cluster metrics
- Auto-refresh every 30s with top strategies and actionable insights (see the polling sketch after this list)
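
The auto-refresh behaviour lives in app/cluster/page.tsx, which is not reproduced here. As a rough idea of the pattern, a minimal client-side sketch that polls /api/cluster/status every 30s (component name and rendering are illustrative, not the actual page):

'use client'
// Illustrative polling sketch only; the real app/cluster/page.tsx is not shown in this commit excerpt.
import { useEffect, useState } from 'react'

export default function ClusterStatus() {
  const [status, setStatus] = useState<unknown>(null)

  useEffect(() => {
    const load = () =>
      fetch('/api/cluster/status')
        .then(res => res.json())
        .then(setStatus)
        .catch(() => setStatus(null)) // keep polling even if a single request fails

    load()                                  // initial fetch on mount
    const timer = setInterval(load, 30_000) // auto-refresh every 30s
    return () => clearInterval(timer)       // stop polling on unmount
  }, [])

  return <pre>{JSON.stringify(status, null, 2)}</pre>
}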

Files Added:
- cluster/distributed_coordinator.py (510 lines) - Main orchestrator
- cluster/distributed_worker.py (271 lines) - Worker1 script
- cluster/distributed_worker_bd_clean.py (275 lines) - Worker2 script
- cluster/monitor_bd_host01.sh - Monitoring script
- app/api/cluster/status/route.ts (274 lines) - API endpoint
- app/cluster/page.tsx (258 lines) - Web UI
- cluster/CLUSTER_SETUP.md - Complete setup and access documentation

Technical Details:
- SQLite database tracks chunk assignments
- 10,000 combinations per chunk (1,195 total chunks; see the chunking sketch after this list)
- multiprocessing.Pool with 70% CPU limit (22 cores per EPYC)
- SSH/SCP for deployment and result collection
- Handles 2-hop SSH for bd-host01 access
- Results in CSV format with top strategies ranked
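
As referenced in the chunk item above, a small sketch of the chunk arithmetic (illustrative TypeScript only; the actual assignment logic is the SQLite-backed Python in cluster/distributed_coordinator.py):

// Illustrative chunk math only; the coordinator's real SQLite-backed assignment logic is in Python.
const TOTAL_COMBINATIONS = 11_943_936
const CHUNK_SIZE = 10_000
const TOTAL_CHUNKS = Math.ceil(TOTAL_COMBINATIONS / CHUNK_SIZE) // 1,195 chunks

// Each chunk covers a half-open index range into the flattened parameter grid.
function chunkRange(chunkId: number): { start: number; end: number } {
  const start = chunkId * CHUNK_SIZE
  const end = Math.min(start + CHUNK_SIZE, TOTAL_COMBINATIONS)
  return { start, end }
}

console.log(TOTAL_CHUNKS, chunkRange(1194)) // last chunk: 11,940,000 .. 11,943,936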

Access Documentation:
- Worker1: ssh root@10.10.254.106
- Worker2: ssh root@10.10.254.106 "ssh root@10.20.254.100" (2-hop; see the wrapping sketch after this list)
- Web UI: http://localhost:3001/cluster
- See CLUSTER_SETUP.md for complete guide
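
The status API below builds its shell commands along these same access paths. A minimal sketch of the 1-hop vs. 2-hop wrapping (the helper name is hypothetical; the route itself constructs the strings inline):

// Hypothetical helper illustrating the 1-hop vs. 2-hop quoting used by the SSH commands below.
function wrapForWorker(worker: 'worker1' | 'worker2', remoteCmd: string): string {
  if (worker === 'worker1') {
    // Direct SSH to pve-nu-monitor01
    return `ssh root@10.10.254.106 "${remoteCmd}"`
  }
  // bd-host01 is only reachable through worker1, so the command is nested one quoting level deeper.
  return `ssh root@10.10.254.106 "ssh root@10.20.254.100 '${remoteCmd}'"`
}

// e.g. wrapForWorker('worker2', 'uptime')
// => ssh root@10.10.254.106 "ssh root@10.20.254.100 'uptime'"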

Status: Deployed and operational
2025-11-30 13:02:18 +01:00

207 lines
7.1 KiB
TypeScript

import { NextRequest, NextResponse } from 'next/server'
import { exec } from 'child_process'
import { promisify } from 'util'
import fs from 'fs/promises'
import path from 'path'
const execAsync = promisify(exec)

export const dynamic = 'force-dynamic'

interface WorkerStatus {
  name: string
  host: string
  cpuUsage: number
  loadAverage: string
  activeProcesses: number
  status: 'active' | 'idle' | 'offline'
}

interface ChunkResult {
  rank: number
  pnl_per_1k: number
  win_rate: number
  trades: number
  profit_factor: number
  max_drawdown: number
  params: {
    flip_threshold: number
    ma_gap: number
    adx_min: number
    long_pos_max: number
    short_pos_min: number
  }
}

async function getWorkerStatus(workerName: string, sshCommand: string): Promise<WorkerStatus> {
  try {
    // Get CPU usage
    const cpuCmd = `${sshCommand} "top -bn1 | grep 'Cpu(s)' | awk '{print 100-\\$8}'"`
    const { stdout: cpuOut } = await execAsync(cpuCmd)
    const cpuUsage = parseFloat(cpuOut.trim()) || 0

    // Get load average
    const loadCmd = `${sshCommand} "uptime | awk -F'load average:' '{print \\$2}'"`
    const { stdout: loadOut } = await execAsync(loadCmd)
    const loadAverage = loadOut.trim()

    // Get worker processes
    const procCmd = `${sshCommand} "ps aux | grep distributed_worker | grep -v grep | wc -l"`
    const { stdout: procOut } = await execAsync(procCmd)
    const activeProcesses = parseInt(procOut.trim()) || 0

    const status: 'active' | 'idle' | 'offline' =
      activeProcesses > 0 ? 'active' :
      cpuUsage > 10 ? 'active' : 'idle'

    return {
      name: workerName,
      host: sshCommand.includes('10.20.254.100') ? 'bd-host01 (32 cores)' : 'pve-nu-monitor01 (32 cores)',
      cpuUsage,
      loadAverage,
      activeProcesses,
      status
    }
  } catch (error) {
    return {
      name: workerName,
      host: sshCommand.includes('10.20.254.100') ? 'bd-host01' : 'pve-nu-monitor01',
      cpuUsage: 0,
      loadAverage: 'N/A',
      activeProcesses: 0,
      status: 'offline'
    }
  }
}

async function getLatestResults(): Promise<ChunkResult[]> {
  try {
    // Try to get results from bd-host01
    const cmd = 'ssh root@10.10.254.106 "ssh root@10.20.254.100 \'ls -t /home/backtest_dual/backtest/chunk_*_results.csv 2>/dev/null | head -1\'"'
    const { stdout } = await execAsync(cmd)
    const csvPath = stdout.trim()

    if (!csvPath) {
      return []
    }

    // Download and parse CSV
    const downloadCmd = `ssh root@10.10.254.106 "scp root@10.20.254.100:${csvPath} /tmp/latest_results.csv" && scp root@10.10.254.106:/tmp/latest_results.csv /tmp/cluster_results.csv`
    await execAsync(downloadCmd)

    const csvContent = await fs.readFile('/tmp/cluster_results.csv', 'utf-8')
    const lines = csvContent.split('\n').slice(1, 11) // Skip header, get top 10

    const results: ChunkResult[] = []
    for (const line of lines) {
      if (!line.trim()) continue
      const cols = line.split(',')
      if (cols.length < 22) continue

      results.push({
        rank: parseInt(cols[0]),
        pnl_per_1k: parseFloat(cols[4]),
        win_rate: parseFloat(cols[2]),
        trades: parseInt(cols[1]),
        profit_factor: parseFloat(cols[5]),
        max_drawdown: parseFloat(cols[6]),
        params: {
          flip_threshold: parseFloat(cols[8]),
          ma_gap: parseFloat(cols[9]),
          adx_min: parseFloat(cols[10]),
          long_pos_max: parseFloat(cols[11]),
          short_pos_min: parseFloat(cols[12])
        }
      })
    }

    return results
  } catch (error) {
    console.error('Error fetching results:', error)
    return []
  }
}

function generateRecommendation(results: ChunkResult[]): string {
  if (results.length === 0) {
    return "Cluster is processing parameter combinations. Check back soon for optimization recommendations."
  }

  const best = results[0]
  const avgWinRate = results.reduce((sum, r) => sum + r.win_rate, 0) / results.length
  const avgPnL = results.reduce((sum, r) => sum + r.pnl_per_1k, 0) / results.length

  let recommendation = `🎯 **Top Strategy Found:**\n\n`
  recommendation += `- **Expected Profit:** $${best.pnl_per_1k.toFixed(2)} per $1,000 capital\n`
  recommendation += `- **Win Rate:** ${(best.win_rate * 100).toFixed(1)}%\n`
  recommendation += `- **Profit Factor:** ${best.profit_factor.toFixed(2)}x\n`
  recommendation += `- **Max Drawdown:** $${Math.abs(best.max_drawdown).toFixed(2)}\n\n`
  recommendation += `📊 **Optimal Parameters:**\n`
  recommendation += `- Flip Threshold: ${best.params.flip_threshold}%\n`
  recommendation += `- MA Gap: ${best.params.ma_gap}\n`
  recommendation += `- Min ADX: ${best.params.adx_min}\n`
  recommendation += `- Long Max Position: ${best.params.long_pos_max}%\n`
  recommendation += `- Short Min Position: ${best.params.short_pos_min}%\n\n`

  if (best.pnl_per_1k > avgPnL * 1.5) {
    recommendation += `✅ **Action:** This strategy shows exceptional performance (${((best.pnl_per_1k / avgPnL) * 100 - 100).toFixed(0)}% better than average). Consider implementing these parameters in production.`
  } else if (best.win_rate > 0.6) {
    recommendation += `✅ **Action:** Strong win rate detected. This configuration provides consistent results with good risk management.`
  } else {
    recommendation += `⚠️ **Action:** Continue exploration. Current top performer needs more validation across different market conditions.`
  }

  return recommendation
}

export async function GET(request: NextRequest) {
  try {
    // Get status from both workers
    const [worker1Status, worker2Status] = await Promise.all([
      getWorkerStatus('worker1', 'ssh root@10.10.254.106'),
      getWorkerStatus('worker2', 'ssh root@10.10.254.106 "ssh root@10.20.254.100"')
    ])

    const workers = [worker1Status, worker2Status]
    const totalCPU = workers.reduce((sum, w) => sum + w.cpuUsage, 0) / workers.length
    const totalProcesses = workers.reduce((sum, w) => sum + w.activeProcesses, 0)
    const activeWorkers = workers.filter(w => w.status === 'active').length

    // Get latest results
    const topStrategies = await getLatestResults()
    const recommendation = generateRecommendation(topStrategies)

    return NextResponse.json({
      cluster: {
        totalCores: 64,
        activeCores: Math.round(totalCPU * 0.64), // average CPU % converted to an approximate core count out of 64
        cpuUsage: totalCPU,
        activeWorkers,
        totalWorkers: 2,
        workerProcesses: totalProcesses,
        status: activeWorkers > 0 ? 'active' : 'idle'
      },
      workers,
      exploration: {
        totalCombinations: 11943936,
        combinationsPerChunk: 10000,
        totalChunks: 1195,
        chunksCompleted: topStrategies.length > 0 ? 1 : 0,
        currentChunk: topStrategies.length > 0 ? 'completed' : 'v9_chunk_000000',
        progress: topStrategies.length > 0 ? 0.08 : 0.05 // Rough estimate
      },
      topStrategies: topStrategies.slice(0, 5),
      recommendation,
      lastUpdate: new Date().toISOString()
    })
  } catch (error: any) {
    console.error('Cluster status error:', error)
    return NextResponse.json({
      error: 'Failed to fetch cluster status',
      details: error.message
    }, { status: 500 })
  }
}
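
For a quick manual check outside the web UI, the endpoint can be queried directly. A minimal sketch, assuming Node 18+ with global fetch and the dev server on port 3001 as noted in the access documentation above:

// Quick manual check of the status endpoint (port 3001 per the access notes above).
async function checkClusterStatus(): Promise<void> {
  const res = await fetch('http://localhost:3001/api/cluster/status')
  const data = await res.json()
  console.log(data.cluster?.status, data.exploration?.progress, data.recommendation)
}

checkClusterStatus().catch(console.error)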