New Features:
- Distributed coordinator orchestrates 2x AMD EPYC 16-core servers
- 64 total cores processing 12M parameter combinations (70% CPU limit)
- Worker1 (pve-nu-monitor01): Direct SSH access at 10.10.254.106
- Worker2 (bd-host01): 2-hop SSH through worker1 (10.20.254.100)
- Web UI at /cluster shows real-time status and AI recommendations
- API endpoint /api/cluster/status serves cluster metrics
- Auto-refresh every 30s with top strategies and actionable insights

Files Added:
- cluster/distributed_coordinator.py (510 lines) - Main orchestrator
- cluster/distributed_worker.py (271 lines) - Worker1 script
- cluster/distributed_worker_bd_clean.py (275 lines) - Worker2 script
- cluster/monitor_bd_host01.sh - Monitoring script
- app/api/cluster/status/route.ts (274 lines) - API endpoint
- app/cluster/page.tsx (258 lines) - Web UI
- cluster/CLUSTER_SETUP.md - Complete setup and access documentation

Technical Details:
- SQLite database tracks chunk assignments
- 10,000 combinations per chunk (1,195 total chunks)
- Multiprocessing.Pool with 70% CPU limit (22 cores per EPYC)
- SSH/SCP for deployment and result collection
- Handles 2-hop SSH for bd-host01 access
- Results in CSV format with top strategies ranked

Access Documentation:
- Worker1: ssh root@10.10.254.106
- Worker2: ssh root@10.10.254.106 "ssh root@10.20.254.100"
- Web UI: http://localhost:3001/cluster
- See CLUSTER_SETUP.md for complete guide

Status: Deployed and operational
207 lines
7.1 KiB
TypeScript
207 lines
7.1 KiB
TypeScript
import { NextRequest, NextResponse } from 'next/server'
|
|
import { exec } from 'child_process'
|
|
import { promisify } from 'util'
|
|
import fs from 'fs/promises'
|
|
import path from 'path'
|
|
|
|
// Promise-returning wrapper around child_process.exec so shell commands can be awaited.
const execAsync = promisify(exec)
|
|
|
|
// Next.js route segment config: render this route on every request rather than caching it.
export const dynamic = 'force-dynamic'
|
|
|
|
/** The three states a worker can be reported in. */
type WorkerState = 'active' | 'idle' | 'offline'

/**
 * Point-in-time summary of a single cluster worker, as produced by
 * `getWorkerStatus` and returned in the `workers` array of the response.
 */
interface WorkerStatus {
  /** Caller-supplied worker identifier (e.g. 'worker1'). */
  name: string
  /** Human-readable host label. */
  host: string
  /** CPU utilisation as a number; 0 when it could not be read. */
  cpuUsage: number
  /** Load-average text taken from `uptime`, or 'N/A' when the worker is unreachable. */
  loadAverage: string
  /** Number of processes matching `distributed_worker` on the host. */
  activeProcesses: number
  /** 'offline' when probing the worker failed; otherwise 'active' or 'idle'. */
  status: WorkerState
}
|
|
|
|
/** Strategy parameter set attached to one result row. */
interface StrategyParams {
  flip_threshold: number
  ma_gap: number
  adx_min: number
  long_pos_max: number
  short_pos_min: number
}

/**
 * One ranked strategy row, as parsed from a chunk results CSV by
 * `getLatestResults`.
 */
interface ChunkResult {
  /** Position of the row in the ranking. */
  rank: number
  /** Profit figure; presented to users as dollars per $1,000 of capital. */
  pnl_per_1k: number
  /** Win rate as a fraction (it is multiplied by 100 for display). */
  win_rate: number
  /** Trade count for the strategy. */
  trades: number
  profit_factor: number
  /** Drawdown figure; its absolute value is shown to users. */
  max_drawdown: number
  /** Parameter combination that produced this row. */
  params: StrategyParams
}
|
|
|
|
/**
 * Probes one worker over SSH and summarises its current load.
 *
 * Three remote commands are run in sequence (CPU usage via `top`, load average
 * via `uptime`, and a count of `distributed_worker` processes via `ps`). If any
 * of them fails, the worker is reported as 'offline' with zeroed metrics.
 *
 * @param workerName - Identifier copied into the returned `name` field.
 * @param sshCommand - Shell prefix used to reach the host (e.g. `ssh root@host`);
 *   each probe command is appended to it as a double-quoted argument.
 * @returns The worker's status; this function never rejects.
 */
async function getWorkerStatus(workerName: string, sshCommand: string): Promise<WorkerStatus> {
  // The second-hop address identifies bd-host01; any other command is labelled pve-nu-monitor01.
  const hostName = sshCommand.includes('10.20.254.100') ? 'bd-host01' : 'pve-nu-monitor01'

  try {
    // CPU usage is computed as 100 minus field 8 of top's "Cpu(s)" line
    // (presumably the idle percentage - depends on the remote top's output format).
    const cpuCmd = `${sshCommand} "top -bn1 | grep 'Cpu(s)' | awk '{print 100-\\$8}'"`
    const { stdout: cpuOut } = await execAsync(cpuCmd)
    const cpuUsage = Number.parseFloat(cpuOut.trim()) || 0

    // Load average: everything after "load average:" in the uptime output.
    const loadCmd = `${sshCommand} "uptime | awk -F'load average:' '{print \\$2}'"`
    const { stdout: loadOut } = await execAsync(loadCmd)
    const loadAverage = loadOut.trim()

    // Number of running worker processes (the grep itself is excluded).
    const procCmd = `${sshCommand} "ps aux | grep distributed_worker | grep -v grep | wc -l"`
    const { stdout: procOut } = await execAsync(procCmd)
    const activeProcesses = Number.parseInt(procOut.trim(), 10) || 0

    // A host counts as active when worker processes exist or CPU usage exceeds 10.
    const status: 'active' | 'idle' | 'offline' =
      activeProcesses > 0 || cpuUsage > 10 ? 'active' : 'idle'

    return {
      name: workerName,
      host: `${hostName} (32 cores)`,
      cpuUsage,
      loadAverage,
      activeProcesses,
      status
    }
  } catch (error: unknown) {
    // Best-effort: an unreachable worker is reported as offline, but the cause is
    // logged so failures are diagnosable (matching getLatestResults).
    // NOTE(review): exec is called without a timeout, so a hanging SSH session
    // stalls the request - consider passing a `timeout` option.
    console.error(`Error fetching status for ${workerName}:`, error)
    return {
      name: workerName,
      host: hostName,
      cpuUsage: 0,
      loadAverage: 'N/A',
      activeProcesses: 0,
      status: 'offline'
    }
  }
}
|
|
|
|
// Only paths of this exact shape may be interpolated into the download command.
const RESULT_CSV_PATH = /^\/home\/backtest_dual\/backtest\/chunk_[\w.-]*_results\.csv$/

/** Parses one CSV data line into a ChunkResult; returns undefined for blank or short rows. */
function parseResultRow(line: string): ChunkResult | undefined {
  if (!line.trim()) return undefined

  const cols = line.split(',')
  // Rows with fewer than 22 columns are skipped as incomplete.
  if (cols.length < 22) return undefined

  return {
    rank: Number.parseInt(cols[0], 10),
    pnl_per_1k: Number.parseFloat(cols[4]),
    win_rate: Number.parseFloat(cols[2]),
    trades: Number.parseInt(cols[1], 10),
    profit_factor: Number.parseFloat(cols[5]),
    max_drawdown: Number.parseFloat(cols[6]),
    params: {
      flip_threshold: Number.parseFloat(cols[8]),
      ma_gap: Number.parseFloat(cols[9]),
      adx_min: Number.parseFloat(cols[10]),
      long_pos_max: Number.parseFloat(cols[11]),
      short_pos_min: Number.parseFloat(cols[12])
    }
  }
}

/**
 * Fetches the most recently modified chunk results CSV from bd-host01 (via the
 * two-hop SSH route) and parses up to the first ten data rows.
 *
 * The file is copied to /tmp/latest_results.csv on the first hop and then to
 * /tmp/cluster_results.csv locally before being read.
 *
 * @returns Parsed rows in file order; an empty array when no results file
 *   exists, the reported path looks unsafe, or any step fails. Never rejects.
 */
async function getLatestResults(): Promise<ChunkResult[]> {
  try {
    // Ask bd-host01 for the newest results file (ls -t sorts by modification time).
    const cmd = 'ssh root@10.10.254.106 "ssh root@10.20.254.100 \'ls -t /home/backtest_dual/backtest/chunk_*_results.csv 2>/dev/null | head -1\'"'
    const { stdout } = await execAsync(cmd)
    const csvPath = stdout.trim()

    if (!csvPath) {
      return []
    }

    // csvPath comes from remote command output and is spliced into a shell
    // command below, so refuse anything that is not a plain results path.
    if (!RESULT_CSV_PATH.test(csvPath)) {
      console.error('Error fetching results: unexpected results path', JSON.stringify(csvPath))
      return []
    }

    // Download via the first hop.
    // NOTE(review): both temp file names are fixed, so concurrent requests can
    // overwrite each other's copy.
    const downloadCmd = `ssh root@10.10.254.106 "scp root@10.20.254.100:${csvPath} /tmp/latest_results.csv" && scp root@10.10.254.106:/tmp/latest_results.csv /tmp/cluster_results.csv`
    await execAsync(downloadCmd)

    const csvContent = await fs.readFile('/tmp/cluster_results.csv', 'utf-8')
    const lines = csvContent.split('\n').slice(1, 11) // Skip header, get top 10

    const results: ChunkResult[] = []
    for (const line of lines) {
      const row = parseResultRow(line)
      if (row) results.push(row)
    }

    return results
  } catch (error: unknown) {
    console.error('Error fetching results:', error)
    return []
  }
}
|
|
|
|
/**
 * Builds the markdown recommendation text shown alongside the top strategies.
 *
 * The first entry of `results` is treated as the best strategy. Its metrics and
 * parameters are listed, followed by one action line:
 *  - "exceptional performance" when the average PnL of `results` is positive and
 *    the best PnL exceeds 1.5x that average;
 *  - "strong win rate" when the best win rate is above 0.6;
 *  - otherwise a suggestion to continue exploring.
 *
 * @param results - Ranked strategies, best first. May be empty.
 * @returns Markdown text; a "still processing" message when `results` is empty.
 */
function generateRecommendation(results: ChunkResult[]): string {
  if (results.length === 0) {
    return "Cluster is processing parameter combinations. Check back soon for optimization recommendations."
  }

  const best = results[0]
  const avgPnL = results.reduce((sum, r) => sum + r.pnl_per_1k, 0) / results.length

  let recommendation = `🎯 **Top Strategy Found:**\n\n`
  recommendation += `- **Expected Profit:** $${best.pnl_per_1k.toFixed(2)} per $1,000 capital\n`
  // win_rate is a fraction, so scale it to a percentage for display.
  recommendation += `- **Win Rate:** ${(best.win_rate * 100).toFixed(1)}%\n`
  recommendation += `- **Profit Factor:** ${best.profit_factor.toFixed(2)}x\n`
  recommendation += `- **Max Drawdown:** $${Math.abs(best.max_drawdown).toFixed(2)}\n\n`

  recommendation += `📊 **Optimal Parameters:**\n`
  recommendation += `- Flip Threshold: ${best.params.flip_threshold}%\n`
  recommendation += `- MA Gap: ${best.params.ma_gap}\n`
  recommendation += `- Min ADX: ${best.params.adx_min}\n`
  recommendation += `- Long Max Position: ${best.params.long_pos_max}%\n`
  recommendation += `- Short Min Position: ${best.params.short_pos_min}%\n\n`

  // The relative comparison is only meaningful against a positive average: with
  // a zero or negative average the ratio yields Infinity or a misleading sign,
  // and a losing strategy could be labelled "exceptional".
  const beatsAverage = avgPnL > 0 && best.pnl_per_1k > avgPnL * 1.5

  if (beatsAverage) {
    recommendation += `✅ **Action:** This strategy shows exceptional performance (${((best.pnl_per_1k / avgPnL) * 100 - 100).toFixed(0)}% better than average). Consider implementing these parameters in production.`
  } else if (best.win_rate > 0.6) {
    recommendation += `✅ **Action:** Strong win rate detected. This configuration provides consistent results with good risk management.`
  } else {
    recommendation += `⚠️ **Action:** Continue exploration. Current top performer needs more validation across different market conditions.`
  }

  return recommendation
}
|
|
|
|
/**
 * GET handler for the cluster status endpoint.
 *
 * Probes both workers over SSH, fetches the latest strategy results, and
 * returns a JSON document with cluster-wide metrics, per-worker status,
 * exploration progress, the top five strategies and a recommendation text.
 *
 * @param request - Incoming request (currently unused).
 * @returns A JSON response; on unexpected failure, a 500 response with
 *   `error` and `details` fields.
 */
export async function GET(request: NextRequest) {
  try {
    // The worker probes and the results download do not depend on each other,
    // so run them concurrently.
    const [worker1Status, worker2Status, topStrategies] = await Promise.all([
      getWorkerStatus('worker1', 'ssh root@10.10.254.106'),
      getWorkerStatus('worker2', 'ssh root@10.10.254.106 "ssh root@10.20.254.100"'),
      getLatestResults()
    ])

    const workers = [worker1Status, worker2Status]
    // Mean CPU usage across workers.
    const avgCpuUsage = workers.reduce((sum, w) => sum + w.cpuUsage, 0) / workers.length
    const totalProcesses = workers.reduce((sum, w) => sum + w.activeProcesses, 0)
    const activeWorkers = workers.filter(w => w.status === 'active').length

    const recommendation = generateRecommendation(topStrategies)
    const hasResults = topStrategies.length > 0

    return NextResponse.json({
      cluster: {
        totalCores: 64,
        // Average CPU percentage scaled onto 64 cores (percent / 100 * 64).
        activeCores: Math.round(avgCpuUsage * 0.64),
        cpuUsage: avgCpuUsage,
        activeWorkers,
        totalWorkers: 2,
        workerProcesses: totalProcesses,
        status: activeWorkers > 0 ? 'active' : 'idle'
      },
      workers,
      exploration: {
        totalCombinations: 11943936,
        combinationsPerChunk: 10000,
        totalChunks: 1195,
        // NOTE(review): the three fields below are hard-coded placeholders keyed
        // only on whether any results exist; they are not measured progress.
        chunksCompleted: hasResults ? 1 : 0,
        currentChunk: hasResults ? 'completed' : 'v9_chunk_000000',
        progress: hasResults ? 0.08 : 0.05 // Rough estimate
      },
      topStrategies: topStrategies.slice(0, 5),
      recommendation,
      lastUpdate: new Date().toISOString()
    })
  } catch (error: unknown) {
    console.error('Cluster status error:', error)
    return NextResponse.json({
      error: 'Failed to fetch cluster status',
      details: error instanceof Error ? error.message : String(error)
    }, { status: 500 })
  }
}
|