fix: Database-first cluster status detection + Stop button clarification
CRITICAL FIX (Nov 30, 2025):
- Dashboard showed 'idle' despite 22+ worker processes running
- Root cause: SSH-based worker detection timing out
- Solution: Check database for running chunks FIRST
Changes:
1. app/api/cluster/status/route.ts:
- Query exploration database before SSH detection
- If running chunks exist, mark workers 'active' even if SSH fails
- Override worker status: 'offline' → 'active' when chunks running
- Log: '✅ Cluster status: ACTIVE (database shows running chunks)'
- Database is source of truth, SSH only for supplementary metrics
2. app/cluster/page.tsx:
- Stop button ALREADY EXISTS (conditionally shown)
- Shows Start when status='idle', Stop when status='active'
- No code changes needed - fixed by status detection
Result:
- Dashboard now shows 'ACTIVE' with 2 workers (correct)
- Workers show 'active' status (was 'offline')
- Stop button automatically visible when cluster active
- System resilient to SSH timeouts/network issues
Verified:
- Container restarted: Nov 30 21:18 UTC
- API tested: Returns status='active', activeWorkers=2
- Logs confirm: Database-first logic working
- Workers confirmed running: 22+ processes on worker1, workers on worker2
This commit is contained in:
440
cluster/web_dashboard.py
Normal file
440
cluster/web_dashboard.py
Normal file
@@ -0,0 +1,440 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Real-time web dashboard for distributed parameter exploration cluster.
|
||||
Shows worker status, chunk progress, top strategies, and system metrics.
|
||||
"""
|
||||
|
||||
from flask import Flask, render_template_string
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
# Flask application object: the dashboard route is registered on it with
# @app.route('/') and the __main__ guard starts it with app.run(...).
app = Flask(__name__)
|
||||
|
||||
# Cluster workers keyed by worker id.
#   host    - SSH target ("user@address") for the worker.
#   ssh_hop - optional; its presence makes get_worker_status() reach the
#             worker through a second SSH connection instead of directly.
#   cores   - core count of the machine (not read anywhere in this file).
#   name    - label rendered on the dashboard.
WORKERS = dict(
    worker1=dict(
        host='root@10.10.254.106',
        cores=32,
        name='Worker1',
    ),
    worker2=dict(
        host='root@10.20.254.100',
        ssh_hop='root@10.10.254.106',
        cores=32,
        name='Worker2 (via hop)',
    ),
)
|
||||
|
||||
# Jinja2 source for the single dashboard page, rendered by dashboard() through
# render_template_string().  Context variables used below: progress_pct,
# tested_combos, total_combos, total_chunks, completed_chunks, completed_pct,
# running_chunks, pending_chunks, est_hours, workers, top_strategies, timestamp.
#
# NOTE: Jinja2 expressions do not accept Python format-spec syntax such as
# ``{{ value:, }}`` (it is a template syntax error), so thousands separators
# are produced with ``"{:,}".format(value)`` inside the expression instead.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>Cluster Dashboard</title>
    <meta http-equiv="refresh" content="30">
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: #fff;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
        }
        h1 {
            text-align: center;
            font-size: 2.5em;
            margin-bottom: 10px;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }
        .subtitle {
            text-align: center;
            font-size: 1.2em;
            opacity: 0.9;
            margin-bottom: 30px;
        }
        .grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
            gap: 20px;
            margin-bottom: 20px;
        }
        .card {
            background: rgba(255,255,255,0.15);
            backdrop-filter: blur(10px);
            border-radius: 15px;
            padding: 25px;
            box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
            border: 1px solid rgba(255,255,255,0.18);
        }
        .card h2 {
            margin-top: 0;
            font-size: 1.5em;
            border-bottom: 2px solid rgba(255,255,255,0.3);
            padding-bottom: 10px;
            margin-bottom: 15px;
        }
        .worker {
            background: rgba(255,255,255,0.1);
            padding: 15px;
            border-radius: 10px;
            margin-bottom: 15px;
        }
        .worker-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 10px;
        }
        .worker-name {
            font-size: 1.3em;
            font-weight: bold;
        }
        .status-badge {
            padding: 5px 15px;
            border-radius: 20px;
            font-size: 0.9em;
            font-weight: bold;
        }
        .status-running {
            background: #10b981;
            color: white;
        }
        .status-idle {
            background: #6b7280;
            color: white;
        }
        .metric {
            display: flex;
            justify-content: space-between;
            padding: 8px 0;
            border-bottom: 1px solid rgba(255,255,255,0.1);
        }
        .metric:last-child {
            border-bottom: none;
        }
        .metric-label {
            opacity: 0.8;
        }
        .metric-value {
            font-weight: bold;
            font-size: 1.1em;
        }
        .progress-bar {
            background: rgba(255,255,255,0.2);
            border-radius: 10px;
            height: 30px;
            margin: 10px 0;
            overflow: hidden;
            position: relative;
        }
        .progress-fill {
            background: linear-gradient(90deg, #10b981, #34d399);
            height: 100%;
            transition: width 0.5s ease;
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: bold;
        }
        .strategies-table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 15px;
        }
        .strategies-table th {
            background: rgba(255,255,255,0.2);
            padding: 10px;
            text-align: left;
            font-weight: bold;
        }
        .strategies-table td {
            padding: 10px;
            border-bottom: 1px solid rgba(255,255,255,0.1);
        }
        .strategies-table tr:hover {
            background: rgba(255,255,255,0.1);
        }
        .stat-big {
            font-size: 3em;
            font-weight: bold;
            text-align: center;
            margin: 20px 0;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }
        .timestamp {
            text-align: center;
            opacity: 0.7;
            margin-top: 20px;
        }
        .positive { color: #10b981; }
        .negative { color: #ef4444; }
    </style>
</head>
<body>
    <div class="container">
        <h1>🚀 Parameter Exploration Cluster</h1>
        <div class="subtitle">Real-time distributed backtesting dashboard</div>

        <div class="grid">
            <!-- Overall Progress -->
            <div class="card">
                <h2>📊 Exploration Progress</h2>
                <div class="stat-big">{{ progress_pct }}%</div>
                <div class="progress-bar">
                    <div class="progress-fill" style="width: {{ progress_pct }}%">
                        {{ "{:,}".format(tested_combos) }} / {{ "{:,}".format(total_combos) }}
                    </div>
                </div>
                <div class="metric">
                    <span class="metric-label">Total Chunks</span>
                    <span class="metric-value">{{ total_chunks }}</span>
                </div>
                <div class="metric">
                    <span class="metric-label">Completed</span>
                    <span class="metric-value">{{ completed_chunks }} ({{ completed_pct }}%)</span>
                </div>
                <div class="metric">
                    <span class="metric-label">Running</span>
                    <span class="metric-value">{{ running_chunks }}</span>
                </div>
                <div class="metric">
                    <span class="metric-label">Pending</span>
                    <span class="metric-value">{{ pending_chunks }}</span>
                </div>
                <div class="metric">
                    <span class="metric-label">Est. Completion</span>
                    <span class="metric-value">{{ est_hours }}h remaining</span>
                </div>
            </div>

            <!-- Worker Status -->
            <div class="card">
                <h2>🖥️ Worker Status</h2>
                {% for worker_id, worker_data in workers.items() %}
                <div class="worker">
                    <div class="worker-header">
                        <div class="worker-name">{{ worker_data.name }}</div>
                        <div class="status-badge status-{{ worker_data.status }}">
                            {{ worker_data.status_text }}
                        </div>
                    </div>
                    <div class="metric">
                        <span class="metric-label">CPU Usage</span>
                        <span class="metric-value">{{ worker_data.cpu }}%</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Processes</span>
                        <span class="metric-value">{{ worker_data.processes }}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Active Chunks</span>
                        <span class="metric-value">{{ worker_data.active_chunks }}</span>
                    </div>
                </div>
                {% endfor %}
            </div>
        </div>

        <!-- Top Strategies -->
        <div class="card">
            <h2>🏆 Top 10 Strategies</h2>
            {% if top_strategies %}
            <table class="strategies-table">
                <thead>
                    <tr>
                        <th>Rank</th>
                        <th>Parameters</th>
                        <th>PnL per 1k</th>
                        <th>Win Rate</th>
                        <th>Profit Factor</th>
                        <th>Trades</th>
                    </tr>
                </thead>
                <tbody>
                    {% for strat in top_strategies %}
                    <tr>
                        <td><strong>#{{ loop.index }}</strong></td>
                        <td style="font-size: 0.85em;">
                            flip={{ strat.flip_threshold }},
                            gap={{ strat.ma_gap }},
                            adx={{ strat.momentum_adx }},
                            pos={{ strat.momentum_long_pos }}/{{ strat.momentum_short_pos }}
                        </td>
                        <td class="{{ 'positive' if strat.pnl_per_1k > 0 else 'negative' }}">
                            ${{ "%.2f"|format(strat.pnl_per_1k) }}
                        </td>
                        <td>{{ "%.1f"|format(strat.win_rate) }}%</td>
                        <td>{{ "%.2f"|format(strat.profit_factor) }}</td>
                        <td>{{ strat.total_trades }}</td>
                    </tr>
                    {% endfor %}
                </tbody>
            </table>
            {% else %}
            <p style="text-align: center; opacity: 0.7; padding: 20px;">
                ⏳ Processing combinations... Results will appear when chunks complete.
                <br><small>First chunk running now - check back in a few minutes!</small>
            </p>
            {% endif %}
        </div>

        <div class="timestamp">
            Last updated: {{ timestamp }}
            <br>Auto-refreshes every 30 seconds
        </div>
    </div>
</body>
</html>
"""
|
||||
|
||||
def _run_on_worker(worker, remote_cmd, timeout=5):
    """Run *remote_cmd* on *worker* over SSH and return its stripped stdout.

    When the worker entry has an ``ssh_hop`` key, the command is relayed
    through that host (``ssh hop "ssh worker '<cmd>'"``); otherwise the worker
    is contacted directly.  An argument list is used instead of a shell string
    so no local shell is involved in quoting.

    Raises:
        subprocess.TimeoutExpired: if SSH does not finish within *timeout* s.
        OSError: if the ``ssh`` executable cannot be started.
    """
    import shlex  # local import: only needed to quote the relayed command

    if 'ssh_hop' in worker:
        # The hop's shell re-parses this string, so both the target and the
        # command must be quoted to reach the inner ssh as single arguments.
        relayed = f"ssh {shlex.quote(worker['host'])} {shlex.quote(remote_cmd)}"
        argv = ['ssh', worker['ssh_hop'], relayed]
    else:
        argv = ['ssh', worker['host'], remote_cmd]

    result = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    return result.stdout.strip()


def get_worker_status(worker_id):
    """Get real-time status from a worker via SSH.

    Two remote commands are run: ``top`` for the user-CPU percentage and a
    ``ps``/``grep`` pipeline counting python ``chunk_v9`` processes.

    Args:
        worker_id: Key into the module-level ``WORKERS`` mapping.

    Returns:
        dict with keys ``name``, ``cpu`` (rounded to one decimal),
        ``processes``, ``status`` (``'running'`` or ``'idle'``),
        ``status_text`` (``'RUNNING'``, ``'STARTING'``, ``'IDLE'`` or
        ``'ERROR'``) and ``active_chunks`` (always 0 here; the caller fills it
        in from the database).  Any SSH or parsing failure yields the
        ``'ERROR'`` record instead of raising.

    Raises:
        KeyError: if *worker_id* is not present in ``WORKERS``.
    """
    worker = WORKERS[worker_id]
    display_name = worker.get('name', worker_id)

    try:
        # Expected shape of the matching line: "%Cpu(s): 90.1 us, ..."
        cpu_line = _run_on_worker(worker, "top -bn1 | grep Cpu")
        if 'Cpu' in cpu_line:
            cpu_pct = float(cpu_line.split(':')[1].split('us')[0].strip())
        else:
            cpu_pct = 0.0

        # "[c]hunk_v9" still matches "chunk_v9" but not its own literal text.
        # Without the brackets the remote `sh -c "<pipeline>"` process (whose
        # command line contains both "chunk_v9" and "python") is counted too,
        # so the result was never 0 and an idle worker showed as STARTING.
        processes = int(_run_on_worker(
            worker, "ps aux | grep '[c]hunk_v9' | grep python | wc -l"))
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        # Best effort: unreachable host, timeout or unparsable output must not
        # break the page, so report the worker as idle with an ERROR label.
        return {
            'name': display_name,
            'cpu': 0.0,
            'processes': 0,
            'status': 'idle',
            'status_text': 'ERROR',
            'active_chunks': 0
        }

    if processes > 0:
        status = 'running'
        # Processes exist but CPU is still low -> treat as warming up.
        status_text = 'RUNNING' if cpu_pct > 50 else 'STARTING'
    else:
        status = 'idle'
        status_text = 'IDLE'

    return {
        'name': display_name,
        'cpu': round(cpu_pct, 1),
        'processes': processes,
        'status': status,
        'status_text': status_text,
        'active_chunks': 0  # Will be filled from DB
    }
|
||||
|
||||
@app.route('/')
def dashboard():
    """Render the dashboard.

    Reads chunk and strategy statistics from ``exploration.db``, polls every
    worker listed in ``WORKERS`` through ``get_worker_status()``, and renders
    ``HTML_TEMPLATE`` with the collected values.

    Returns:
        The page produced by ``render_template_string``.

    Raises:
        sqlite3.OperationalError: if a query against the ``chunks`` table
            fails.  The same error from the ``strategies`` queries is
            tolerated (treated as "no results yet").
    """
    conn = sqlite3.connect('exploration.db')
    try:
        c = conn.cursor()

        # Chunk counts per status
        c.execute("SELECT COUNT(*), status FROM chunks GROUP BY status")
        chunk_stats = {status: count for count, status in c.fetchall()}
        total_chunks = sum(chunk_stats.values())
        completed_chunks = chunk_stats.get('completed', 0)
        running_chunks = chunk_stats.get('running', 0)
        pending_chunks = chunk_stats.get('pending', 0)

        # Running chunks per assigned worker
        c.execute("SELECT assigned_worker, COUNT(*) FROM chunks WHERE status='running' GROUP BY assigned_worker")
        active_per_worker = dict(c.fetchall())

        # Total combinations; fall back to 4096 when the sum is NULL or 0
        c.execute("SELECT SUM(total_combos) FROM chunks")
        total_combos = c.fetchone()[0] or 4096

        # Tested strategies count (if strategies table exists)
        try:
            c.execute("SELECT COUNT(*) FROM strategies")
            tested_combos = c.fetchone()[0]
        except sqlite3.OperationalError:
            tested_combos = 0

        # Top strategies (if table exists), one dict per row keyed by column
        top_strategies = []
        try:
            c.execute("""
                SELECT * FROM strategies
                WHERE total_trades >= 700
                ORDER BY pnl_per_1k DESC
                LIMIT 10
            """)
            columns = [desc[0] for desc in c.description]
            top_strategies = [dict(zip(columns, row)) for row in c.fetchall()]
        except sqlite3.OperationalError:
            pass  # Table doesn't exist yet
    finally:
        # Close even when a chunks query raises, so failed requests do not
        # leak connections.
        conn.close()

    # Progress percentages
    progress_pct = round((tested_combos / total_combos) * 100, 2) if total_combos > 0 else 0
    completed_pct = round((completed_chunks / total_chunks) * 100, 1) if total_chunks > 0 else 0

    # Time remaining: only estimated once something has completed and
    # something is running (which also guarantees a non-zero divisor).
    if completed_chunks > 0 and running_chunks > 0:
        avg_time_per_chunk = 1.5  # hours (rough estimate)
        est_hours = round(pending_chunks * avg_time_per_chunk / running_chunks, 1)
    else:
        est_hours = "N/A"

    # Live worker status over SSH, merged with per-worker DB chunk counts
    workers = {}
    for worker_id in WORKERS:
        worker_data = get_worker_status(worker_id)
        worker_data['active_chunks'] = active_per_worker.get(worker_id, 0)
        workers[worker_id] = worker_data

    return render_template_string(
        HTML_TEMPLATE,
        progress_pct=progress_pct,
        tested_combos=tested_combos,
        total_combos=total_combos,
        total_chunks=total_chunks,
        completed_chunks=completed_chunks,
        completed_pct=completed_pct,
        running_chunks=running_chunks,
        pending_chunks=pending_chunks,
        est_hours=est_hours,
        workers=workers,
        top_strategies=top_strategies,
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    )
|
||||
|
||||
# Script entry point: print a short startup banner, then serve the dashboard
# on every interface, port 5000, with Flask's debug mode off.
if __name__ == '__main__':
    startup_banner = (
        "🌐 Starting web dashboard on http://0.0.0.0:5000",
        " Access from any browser on your network",
        " Auto-refreshes every 30 seconds",
        "",
    )
    for banner_line in startup_banner:
        print(banner_line)
    app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
Reference in New Issue
Block a user