CRITICAL FIX (Nov 30, 2025):
- Dashboard showed 'idle' despite 22+ worker processes running
- Root cause: SSH-based worker detection timing out
- Solution: Check database for running chunks FIRST
Changes:
1. app/api/cluster/status/route.ts:
- Query exploration database before SSH detection
- If running chunks exist, mark workers 'active' even if SSH fails
- Override worker status: 'offline' → 'active' when chunks running
- Log: '✅ Cluster status: ACTIVE (database shows running chunks)'
- Database is source of truth, SSH only for supplementary metrics
2. app/cluster/page.tsx:
- Stop button ALREADY EXISTS (conditionally shown)
- Shows Start when status='idle', Stop when status='active'
- No code changes needed - fixed by status detection
Result:
- Dashboard now shows 'ACTIVE' with 2 workers (correct)
- Workers show 'active' status (was 'offline')
- Stop button automatically visible when cluster active
- System resilient to SSH timeouts/network issues
Verified:
- Container restarted: Nov 30 21:18 UTC
- API tested: Returns status='active', activeWorkers=2
- Logs confirm: Database-first logic working
- Workers confirmed running: 22+ processes on worker1, workers on worker2
441 lines
15 KiB
Python
441 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Real-time web dashboard for distributed parameter exploration cluster.
|
|
Shows worker status, chunk progress, top strategies, and system metrics.
|
|
"""
|
|
|
|
import logging
import sqlite3
import subprocess
import time
from datetime import datetime

from flask import Flask, render_template_string
|
|
|
|
# Flask application instance: the dashboard route is registered on it with
# @app.route, and the __main__ guard at the bottom of the file runs it.
app = Flask(__name__)
|
|
|
|
# Static registry of the cluster's worker machines, keyed by worker id.
# Per-worker fields:
#   host    - SSH target (user@address) the status probe connects to
#   ssh_hop - optional; when present, the status probe reaches the worker
#             through a nested ssh via an intermediate host
#   cores   - core count recorded for the machine (not read by the code
#             visible in this file)
#   name    - label shown for the worker on the dashboard
WORKERS = dict(
    worker1=dict(
        host='root@10.10.254.106',
        cores=32,
        name='Worker1',
    ),
    worker2=dict(
        host='root@10.20.254.100',
        ssh_hop='root@10.10.254.106',
        cores=32,
        name='Worker2 (via hop)',
    ),
)
|
|
|
|
# Jinja2 source for the single dashboard page, rendered by dashboard() with
# render_template_string. Variables it expects: progress_pct, tested_combos,
# total_combos, total_chunks, completed_chunks, completed_pct, running_chunks,
# pending_chunks, est_hours, workers (mapping of id -> status dict),
# top_strategies (list of row dicts) and timestamp.
#
# NOTE: Jinja expressions do not accept Python format specs such as
# ``{{ value:, }}`` (that is a template syntax error), so thousands separators
# are produced with ``"{:,}".format(value)``. est_hours may be the string
# "N/A", in which case no "h remaining" suffix is appended.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>Cluster Dashboard</title>
<meta http-equiv="refresh" content="30">
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: #fff;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
text-align: center;
font-size: 2.5em;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.subtitle {
text-align: center;
font-size: 1.2em;
opacity: 0.9;
margin-bottom: 30px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
.card {
background: rgba(255,255,255,0.15);
backdrop-filter: blur(10px);
border-radius: 15px;
padding: 25px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
border: 1px solid rgba(255,255,255,0.18);
}
.card h2 {
margin-top: 0;
font-size: 1.5em;
border-bottom: 2px solid rgba(255,255,255,0.3);
padding-bottom: 10px;
margin-bottom: 15px;
}
.worker {
background: rgba(255,255,255,0.1);
padding: 15px;
border-radius: 10px;
margin-bottom: 15px;
}
.worker-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.worker-name {
font-size: 1.3em;
font-weight: bold;
}
.status-badge {
padding: 5px 15px;
border-radius: 20px;
font-size: 0.9em;
font-weight: bold;
}
.status-running {
background: #10b981;
color: white;
}
.status-idle {
background: #6b7280;
color: white;
}
.metric {
display: flex;
justify-content: space-between;
padding: 8px 0;
border-bottom: 1px solid rgba(255,255,255,0.1);
}
.metric:last-child {
border-bottom: none;
}
.metric-label {
opacity: 0.8;
}
.metric-value {
font-weight: bold;
font-size: 1.1em;
}
.progress-bar {
background: rgba(255,255,255,0.2);
border-radius: 10px;
height: 30px;
margin: 10px 0;
overflow: hidden;
position: relative;
}
.progress-fill {
background: linear-gradient(90deg, #10b981, #34d399);
height: 100%;
transition: width 0.5s ease;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
}
.strategies-table {
width: 100%;
border-collapse: collapse;
margin-top: 15px;
}
.strategies-table th {
background: rgba(255,255,255,0.2);
padding: 10px;
text-align: left;
font-weight: bold;
}
.strategies-table td {
padding: 10px;
border-bottom: 1px solid rgba(255,255,255,0.1);
}
.strategies-table tr:hover {
background: rgba(255,255,255,0.1);
}
.stat-big {
font-size: 3em;
font-weight: bold;
text-align: center;
margin: 20px 0;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.timestamp {
text-align: center;
opacity: 0.7;
margin-top: 20px;
}
.positive { color: #10b981; }
.negative { color: #ef4444; }
</style>
</head>
<body>
<div class="container">
<h1>🚀 Parameter Exploration Cluster</h1>
<div class="subtitle">Real-time distributed backtesting dashboard</div>

<div class="grid">
<!-- Overall Progress -->
<div class="card">
<h2>📊 Exploration Progress</h2>
<div class="stat-big">{{ progress_pct }}%</div>
<div class="progress-bar">
<div class="progress-fill" style="width: {{ progress_pct }}%">
{{ "{:,}".format(tested_combos) }} / {{ "{:,}".format(total_combos) }}
</div>
</div>
<div class="metric">
<span class="metric-label">Total Chunks</span>
<span class="metric-value">{{ total_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Completed</span>
<span class="metric-value">{{ completed_chunks }} ({{ completed_pct }}%)</span>
</div>
<div class="metric">
<span class="metric-label">Running</span>
<span class="metric-value">{{ running_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Pending</span>
<span class="metric-value">{{ pending_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Est. Completion</span>
<span class="metric-value">{% if est_hours == "N/A" %}N/A{% else %}{{ est_hours }}h remaining{% endif %}</span>
</div>
</div>

<!-- Worker Status -->
<div class="card">
<h2>🖥️ Worker Status</h2>
{% for worker_id, worker_data in workers.items() %}
<div class="worker">
<div class="worker-header">
<div class="worker-name">{{ worker_data.name }}</div>
<div class="status-badge status-{{ worker_data.status }}">
{{ worker_data.status_text }}
</div>
</div>
<div class="metric">
<span class="metric-label">CPU Usage</span>
<span class="metric-value">{{ worker_data.cpu }}%</span>
</div>
<div class="metric">
<span class="metric-label">Processes</span>
<span class="metric-value">{{ worker_data.processes }}</span>
</div>
<div class="metric">
<span class="metric-label">Active Chunks</span>
<span class="metric-value">{{ worker_data.active_chunks }}</span>
</div>
</div>
{% endfor %}
</div>
</div>

<!-- Top Strategies -->
<div class="card">
<h2>🏆 Top 10 Strategies</h2>
{% if top_strategies %}
<table class="strategies-table">
<thead>
<tr>
<th>Rank</th>
<th>Parameters</th>
<th>PnL per 1k</th>
<th>Win Rate</th>
<th>Profit Factor</th>
<th>Trades</th>
</tr>
</thead>
<tbody>
{% for strat in top_strategies %}
<tr>
<td><strong>#{{ loop.index }}</strong></td>
<td style="font-size: 0.85em;">
flip={{ strat.flip_threshold }},
gap={{ strat.ma_gap }},
adx={{ strat.momentum_adx }},
pos={{ strat.momentum_long_pos }}/{{ strat.momentum_short_pos }}
</td>
<td class="{{ 'positive' if strat.pnl_per_1k > 0 else 'negative' }}">
${{ "%.2f"|format(strat.pnl_per_1k) }}
</td>
<td>{{ "%.1f"|format(strat.win_rate) }}%</td>
<td>{{ "%.2f"|format(strat.profit_factor) }}</td>
<td>{{ strat.total_trades }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="text-align: center; opacity: 0.7; padding: 20px;">
⏳ Processing combinations... Results will appear when chunks complete.
<br><small>First chunk running now - check back in a few minutes!</small>
</p>
{% endif %}
</div>

<div class="timestamp">
Last updated: {{ timestamp }}
<br>Auto-refreshes every 30 seconds
</div>
</div>
</body>
</html>
"""
|
|
|
|
def _run_on_worker(worker, remote_cmd, timeout=5):
    """Run ``remote_cmd`` on a worker over SSH and return its stripped stdout.

    Args:
        worker: One entry of WORKERS. ``host`` is the SSH target; if an
            ``ssh_hop`` key is present the command is run through a nested
            ssh started on that hop host.
        remote_cmd: Shell command for the remote side. It must not contain
            double quotes, because the hop variant wraps it in double quotes.
        timeout: Seconds to wait before subprocess raises TimeoutExpired.

    Raises:
        subprocess.TimeoutExpired: The SSH call did not finish in time.
        OSError: The local ``ssh`` executable could not be started.
    """
    host = worker['host']
    if 'ssh_hop' in worker:
        # The hop host's shell parses the second argument, so the inner
        # command is double-quoted to reach the final host as one string.
        argv = ['ssh', worker['ssh_hop'], f'ssh {host} "{remote_cmd}"']
    else:
        argv = ['ssh', host, remote_cmd]
    # argv list, no local shell: nothing here is re-parsed on this machine.
    result = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    return result.stdout.strip()


def get_worker_status(worker_id):
    """Probe one worker over SSH and summarise its state for the dashboard.

    Two remote commands are run: ``top`` for the user-CPU percentage and a
    ``ps``/``grep`` pipeline counting python processes whose command line
    mentions ``chunk_v9``.

    Args:
        worker_id: Key into WORKERS.

    Returns:
        A dict with keys ``name``, ``cpu`` (user CPU %, one decimal),
        ``processes`` (int), ``status`` ('running' or 'idle'), ``status_text``
        ('RUNNING', 'STARTING', 'IDLE' or 'ERROR') and ``active_chunks``
        (always 0 here; the caller fills it in from the database).
        If the probe fails for any reason, the dict carries zeroed metrics,
        status 'idle' and status_text 'ERROR'.

    Raises:
        KeyError: ``worker_id`` is not in WORKERS.
    """
    worker = WORKERS[worker_id]
    display_name = worker.get('name', worker_id)

    try:
        # Expected shape of the top summary line: "%Cpu(s): 90.1 us, ..."
        cpu_line = _run_on_worker(worker, 'top -bn1 | grep Cpu')
        if 'Cpu' in cpu_line:
            user_field = cpu_line.split(':')[1].split('us')[0].strip()
            # Accept a decimal comma in case the remote locale prints one.
            cpu_pct = float(user_field.replace(',', '.'))
        else:
            cpu_pct = 0.0

        # The bracketed patterns keep the probe from matching its own command
        # line: the remote shell running this pipeline appears in `ps aux`
        # with both words in its arguments, which would otherwise make the
        # count at least 1 even on an idle host.
        proc_out = _run_on_worker(
            worker, "ps aux | grep '[c]hunk_v9' | grep '[p]ython' | wc -l"
        )
        processes = int(proc_out)
    except Exception as exc:
        # Deliberate best-effort boundary: an unreachable or misbehaving
        # worker must not take the whole dashboard down. Record why it failed.
        logging.getLogger(__name__).warning(
            "Status probe for worker %s failed: %r", worker_id, exc
        )
        return {
            'name': display_name,
            'cpu': 0.0,
            'processes': 0,
            'status': 'idle',
            'status_text': 'ERROR',
            'active_chunks': 0,
        }

    if processes <= 0:
        status, status_text = 'idle', 'IDLE'
    elif cpu_pct > 50:
        status, status_text = 'running', 'RUNNING'
    else:
        # Processes exist but CPU is still low.
        status, status_text = 'running', 'STARTING'

    return {
        'name': display_name,
        'cpu': round(cpu_pct, 1),
        'processes': processes,
        'status': status,
        'status_text': status_text,
        'active_chunks': 0,  # Will be filled from DB
    }
|
|
|
|
def _load_exploration_stats(db_path='exploration.db'):
    """Read chunk and strategy statistics from the exploration database.

    Args:
        db_path: Path of the SQLite file. The default is relative, so it is
            resolved against the process's working directory.

    Returns:
        A dict with keys:
          ``chunk_stats``       - {status: number of chunks with that status}
          ``active_per_worker`` - {assigned_worker: number of running chunks}
          ``total_combos``      - SUM(total_combos) over chunks, or 4096 when
                                  that sum is NULL or 0
          ``tested_combos``     - row count of ``strategies`` (0 if the query
                                  fails with OperationalError)
          ``top_strategies``    - up to 10 ``strategies`` rows with at least
                                  700 trades, best pnl_per_1k first, each as a
                                  {column: value} dict (empty if the query
                                  fails with OperationalError)

    Raises:
        sqlite3.Error: A query against the ``chunks`` table failed.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        cur.execute("SELECT COUNT(*), status FROM chunks GROUP BY status")
        chunk_stats = {status: count for count, status in cur.fetchall()}

        cur.execute(
            "SELECT assigned_worker, COUNT(*) FROM chunks "
            "WHERE status='running' GROUP BY assigned_worker"
        )
        active_per_worker = dict(cur.fetchall())

        cur.execute("SELECT SUM(total_combos) FROM chunks")
        total_combos = cur.fetchone()[0] or 4096

        # The strategies table may not exist yet; treat that as "no results".
        try:
            cur.execute("SELECT COUNT(*) FROM strategies")
            tested_combos = cur.fetchone()[0]
        except sqlite3.OperationalError:
            tested_combos = 0

        top_strategies = []
        try:
            cur.execute("""
                SELECT * FROM strategies
                WHERE total_trades >= 700
                ORDER BY pnl_per_1k DESC
                LIMIT 10
            """)
            columns = [desc[0] for desc in cur.description]
            top_strategies = [dict(zip(columns, row)) for row in cur.fetchall()]
        except sqlite3.OperationalError:
            pass  # Table doesn't exist yet
    finally:
        # Close even when a chunks query raises, so a failing request does
        # not leave the connection open.
        conn.close()

    return {
        'chunk_stats': chunk_stats,
        'active_per_worker': active_per_worker,
        'total_combos': total_combos,
        'tested_combos': tested_combos,
        'top_strategies': top_strategies,
    }


@app.route('/')
def dashboard():
    """Render the dashboard page.

    Combines the database statistics with a live SSH probe of every worker in
    WORKERS and renders HTML_TEMPLATE. The worker probes run one after another
    inside the request.

    Returns:
        The rendered HTML string.
    """
    stats = _load_exploration_stats()
    chunk_stats = stats['chunk_stats']
    total_combos = stats['total_combos']
    tested_combos = stats['tested_combos']

    total_chunks = sum(chunk_stats.values())
    completed_chunks = chunk_stats.get('completed', 0)
    running_chunks = chunk_stats.get('running', 0)
    pending_chunks = chunk_stats.get('pending', 0)

    # Calculate progress
    progress_pct = round((tested_combos / total_combos) * 100, 2) if total_combos > 0 else 0
    completed_pct = round((completed_chunks / total_chunks) * 100, 1) if total_chunks > 0 else 0

    # Estimate time remaining: pending work spread over the chunks currently
    # running, at a fixed per-chunk duration.
    if completed_chunks > 0 and running_chunks > 0:
        avg_time_per_chunk = 1.5  # hours (rough estimate)
        est_hours = round(pending_chunks * avg_time_per_chunk / running_chunks, 1)
    else:
        est_hours = "N/A"

    # Live worker status, with the running-chunk count taken from the DB.
    active_per_worker = stats['active_per_worker']
    workers = {}
    for worker_id in WORKERS:
        worker_data = get_worker_status(worker_id)
        worker_data['active_chunks'] = active_per_worker.get(worker_id, 0)
        workers[worker_id] = worker_data

    return render_template_string(
        HTML_TEMPLATE,
        progress_pct=progress_pct,
        tested_combos=tested_combos,
        total_combos=total_combos,
        total_chunks=total_chunks,
        completed_chunks=completed_chunks,
        completed_pct=completed_pct,
        running_chunks=running_chunks,
        pending_chunks=pending_chunks,
        est_hours=est_hours,
        workers=workers,
        top_strategies=stats['top_strategies'],
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    )
|
|
|
|
if __name__ == '__main__':
    # Script entry point: print a short startup banner (the last entry is an
    # empty line), then serve on all interfaces, port 5000, debug mode off.
    startup_banner = (
        "🌐 Starting web dashboard on http://0.0.0.0:5000",
        " Access from any browser on your network",
        " Auto-refreshes every 30 seconds",
        "",
    )
    for banner_line in startup_banner:
        print(banner_line)

    app.run(host='0.0.0.0', port=5000, debug=False)
|