Files
trading_bot_v4/cluster/web_dashboard.py
mindesbunister cc56b72df2 fix: Database-first cluster status detection + Stop button clarification
CRITICAL FIX (Nov 30, 2025):
- Dashboard showed 'idle' despite 22+ worker processes running
- Root cause: SSH-based worker detection timing out
- Solution: Check database for running chunks FIRST

Changes:
1. app/api/cluster/status/route.ts:
   - Query exploration database before SSH detection
   - If running chunks exist, mark workers 'active' even if SSH fails
   - Override worker status: 'offline' → 'active' when chunks running
   - Log: ' Cluster status: ACTIVE (database shows running chunks)'
   - Database is source of truth, SSH only for supplementary metrics

2. app/cluster/page.tsx:
   - Stop button ALREADY EXISTS (conditionally shown)
   - Shows Start when status='idle', Stop when status='active'
   - No code changes needed - fixed by status detection

Result:
- Dashboard now shows 'ACTIVE' with 2 workers (correct)
- Workers show 'active' status (was 'offline')
- Stop button automatically visible when cluster active
- System resilient to SSH timeouts/network issues

Verified:
- Container restarted: Nov 30 21:18 UTC
- API tested: Returns status='active', activeWorkers=2
- Logs confirm: Database-first logic working
- Workers confirmed running: 22+ processes on worker1; processes also running on worker2
2025-11-30 22:23:01 +01:00

441 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Real-time web dashboard for distributed parameter exploration cluster.
Shows worker status, chunk progress, top strategies, and system metrics.
"""
from flask import Flask, render_template_string
import sqlite3
import subprocess
import time
from datetime import datetime
# Flask application object; the dashboard route below is registered on it.
app = Flask(__name__)
# Static description of the cluster workers, keyed by worker id.
# 'host' is the SSH target; an optional 'ssh_hop' names the intermediate
# host a worker is reached through; 'name' is the label shown in the UI.
WORKERS = {
    # Reached directly over SSH.
    'worker1': {
        'host': 'root@10.10.254.106',
        'cores': 32,
        'name': 'Worker1',
    },
    # Reached by hopping through another host.
    'worker2': {
        'host': 'root@10.20.254.100',
        'ssh_hop': 'root@10.10.254.106',
        'cores': 32,
        'name': 'Worker2 (via hop)',
    },
}
# Jinja2 template for the single dashboard page, rendered by dashboard() via
# render_template_string. Expected context: progress_pct, tested_combos,
# total_combos, total_chunks, completed_chunks, completed_pct, running_chunks,
# pending_chunks, est_hours, workers (dict of per-worker status dicts),
# top_strategies (list of row dicts) and timestamp.
# NOTE: Jinja2 does not understand Python format specs such as "{{ x:, }}";
# thousands separators are produced with "{:,}".format(x) instead.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>Cluster Dashboard</title>
<meta http-equiv="refresh" content="30">
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: #fff;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
text-align: center;
font-size: 2.5em;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.subtitle {
text-align: center;
font-size: 1.2em;
opacity: 0.9;
margin-bottom: 30px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
.card {
background: rgba(255,255,255,0.15);
backdrop-filter: blur(10px);
border-radius: 15px;
padding: 25px;
box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
border: 1px solid rgba(255,255,255,0.18);
}
.card h2 {
margin-top: 0;
font-size: 1.5em;
border-bottom: 2px solid rgba(255,255,255,0.3);
padding-bottom: 10px;
margin-bottom: 15px;
}
.worker {
background: rgba(255,255,255,0.1);
padding: 15px;
border-radius: 10px;
margin-bottom: 15px;
}
.worker-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.worker-name {
font-size: 1.3em;
font-weight: bold;
}
.status-badge {
padding: 5px 15px;
border-radius: 20px;
font-size: 0.9em;
font-weight: bold;
}
.status-running {
background: #10b981;
color: white;
}
.status-idle {
background: #6b7280;
color: white;
}
.metric {
display: flex;
justify-content: space-between;
padding: 8px 0;
border-bottom: 1px solid rgba(255,255,255,0.1);
}
.metric:last-child {
border-bottom: none;
}
.metric-label {
opacity: 0.8;
}
.metric-value {
font-weight: bold;
font-size: 1.1em;
}
.progress-bar {
background: rgba(255,255,255,0.2);
border-radius: 10px;
height: 30px;
margin: 10px 0;
overflow: hidden;
position: relative;
}
.progress-fill {
background: linear-gradient(90deg, #10b981, #34d399);
height: 100%;
transition: width 0.5s ease;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
}
.strategies-table {
width: 100%;
border-collapse: collapse;
margin-top: 15px;
}
.strategies-table th {
background: rgba(255,255,255,0.2);
padding: 10px;
text-align: left;
font-weight: bold;
}
.strategies-table td {
padding: 10px;
border-bottom: 1px solid rgba(255,255,255,0.1);
}
.strategies-table tr:hover {
background: rgba(255,255,255,0.1);
}
.stat-big {
font-size: 3em;
font-weight: bold;
text-align: center;
margin: 20px 0;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.timestamp {
text-align: center;
opacity: 0.7;
margin-top: 20px;
}
.positive { color: #10b981; }
.negative { color: #ef4444; }
</style>
</head>
<body>
<div class="container">
<h1>🚀 Parameter Exploration Cluster</h1>
<div class="subtitle">Real-time distributed backtesting dashboard</div>
<div class="grid">
<!-- Overall Progress -->
<div class="card">
<h2>📊 Exploration Progress</h2>
<div class="stat-big">{{ progress_pct }}%</div>
<div class="progress-bar">
<div class="progress-fill" style="width: {{ progress_pct }}%">
{{ "{:,}".format(tested_combos) }} / {{ "{:,}".format(total_combos) }}
</div>
</div>
<div class="metric">
<span class="metric-label">Total Chunks</span>
<span class="metric-value">{{ total_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Completed</span>
<span class="metric-value">{{ completed_chunks }} ({{ completed_pct }}%)</span>
</div>
<div class="metric">
<span class="metric-label">Running</span>
<span class="metric-value">{{ running_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Pending</span>
<span class="metric-value">{{ pending_chunks }}</span>
</div>
<div class="metric">
<span class="metric-label">Est. Completion</span>
<span class="metric-value">{{ est_hours }}h remaining</span>
</div>
</div>
<!-- Worker Status -->
<div class="card">
<h2>🖥️ Worker Status</h2>
{% for worker_id, worker_data in workers.items() %}
<div class="worker">
<div class="worker-header">
<div class="worker-name">{{ worker_data.name }}</div>
<div class="status-badge status-{{ worker_data.status }}">
{{ worker_data.status_text }}
</div>
</div>
<div class="metric">
<span class="metric-label">CPU Usage</span>
<span class="metric-value">{{ worker_data.cpu }}%</span>
</div>
<div class="metric">
<span class="metric-label">Processes</span>
<span class="metric-value">{{ worker_data.processes }}</span>
</div>
<div class="metric">
<span class="metric-label">Active Chunks</span>
<span class="metric-value">{{ worker_data.active_chunks }}</span>
</div>
</div>
{% endfor %}
</div>
</div>
<!-- Top Strategies -->
<div class="card">
<h2>🏆 Top 10 Strategies</h2>
{% if top_strategies %}
<table class="strategies-table">
<thead>
<tr>
<th>Rank</th>
<th>Parameters</th>
<th>PnL per 1k</th>
<th>Win Rate</th>
<th>Profit Factor</th>
<th>Trades</th>
</tr>
</thead>
<tbody>
{% for strat in top_strategies %}
<tr>
<td><strong>#{{ loop.index }}</strong></td>
<td style="font-size: 0.85em;">
flip={{ strat.flip_threshold }},
gap={{ strat.ma_gap }},
adx={{ strat.momentum_adx }},
pos={{ strat.momentum_long_pos }}/{{ strat.momentum_short_pos }}
</td>
<td class="{{ 'positive' if strat.pnl_per_1k > 0 else 'negative' }}">
${{ "%.2f"|format(strat.pnl_per_1k) }}
</td>
<td>{{ "%.1f"|format(strat.win_rate) }}%</td>
<td>{{ "%.2f"|format(strat.profit_factor) }}</td>
<td>{{ strat.total_trades }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="text-align: center; opacity: 0.7; padding: 20px;">
⏳ Processing combinations... Results will appear when chunks complete.
<br><small>First chunk running now - check back in a few minutes!</small>
</p>
{% endif %}
</div>
<div class="timestamp">
Last updated: {{ timestamp }}
<br>Auto-refreshes every 30 seconds
</div>
</div>
</body>
</html>
"""
def _ssh_argv(worker, remote_cmd):
    """Build the ssh argv that runs remote_cmd on worker, via its hop host if one is configured.

    remote_cmd must not contain double quotes: on the hop path it is wrapped
    in double quotes so the hop host's shell passes it to the inner ssh as a
    single argument.
    """
    if 'ssh_hop' in worker:
        return ['ssh', worker['ssh_hop'], f"ssh {worker['host']} \"{remote_cmd}\""]
    return ['ssh', worker['host'], remote_cmd]


def _parse_cpu_pct(cpu_line):
    """Extract the user-space CPU percentage from a top summary line like '%Cpu(s): 90.1 us, ...'."""
    if 'Cpu' not in cpu_line:
        return 0.0
    user_field = cpu_line.split(':')[1].split('us')[0].strip()
    # Accept a comma decimal separator ("90,1") as well as a dot.
    return float(user_field.replace(',', '.'))


def get_worker_status(worker_id):
    """Get real-time status from a worker via SSH.

    Runs two remote commands (each with a 5 second timeout): one reading the
    CPU summary line from ``top`` and one counting python processes whose
    command line mentions ``chunk_v9``.

    Args:
        worker_id: Key into the module-level WORKERS mapping.

    Returns:
        A dict with keys 'name', 'cpu', 'processes', 'status'
        ('running' or 'idle'), 'status_text' ('RUNNING', 'STARTING', 'IDLE'
        or 'ERROR') and 'active_chunks' (always 0 here; the caller fills it
        in from the database). Any failure while querying the worker is
        logged and reported as an 'ERROR' entry rather than raised.

    Raises:
        KeyError: If worker_id is not present in WORKERS.
    """
    worker = WORKERS[worker_id]
    display_name = worker.get('name', worker_id)

    # The bracketed pattern '[c]hunk_v9' still matches "chunk_v9" but does not
    # match its own text, so the remote shell that ssh spawns to run this
    # pipeline (whose command line contains the pipeline) is not counted.
    process_count_cmd = "ps aux | grep '[c]hunk_v9' | grep python | wc -l"

    try:
        cpu_result = subprocess.run(
            _ssh_argv(worker, 'top -bn1 | grep Cpu'),
            capture_output=True, text=True, timeout=5,
        )
        cpu_pct = _parse_cpu_pct(cpu_result.stdout.strip())

        proc_result = subprocess.run(
            _ssh_argv(worker, process_count_cmd),
            capture_output=True, text=True, timeout=5,
        )
        processes = int(proc_result.stdout.strip())
    except Exception as exc:
        # Best effort: an unreachable or misbehaving worker must not break
        # the page, but the reason should be visible in the server log.
        app.logger.warning("Status check for worker %s failed: %r", worker_id, exc)
        return {
            'name': display_name,
            'cpu': 0.0,
            'processes': 0,
            'status': 'idle',
            'status_text': 'ERROR',
            'active_chunks': 0
        }

    # Determine status: busy CPU plus processes means running; processes with
    # low CPU are treated as still starting up.
    if processes > 0 and cpu_pct > 50:
        status, status_text = 'running', 'RUNNING'
    elif processes > 0:
        status, status_text = 'running', 'STARTING'
    else:
        status, status_text = 'idle', 'IDLE'

    return {
        'name': display_name,
        'cpu': round(cpu_pct, 1),
        'processes': processes,
        'status': status,
        'status_text': status_text,
        'active_chunks': 0  # Will be filled from DB
    }
@app.route('/')
def dashboard():
    """Render the dashboard.

    Reads chunk and strategy statistics from the SQLite file
    'exploration.db', queries every worker in WORKERS over SSH, and renders
    HTML_TEMPLATE with the combined data.

    The 'strategies' table is optional: if it does not exist yet, the tested
    count is 0 and the top-strategies list is empty. Errors from the 'chunks'
    queries propagate to Flask, but the database connection is always closed.

    Returns:
        The rendered HTML page as a string.
    """
    conn = sqlite3.connect('exploration.db')
    try:
        c = conn.cursor()

        # Chunk counts grouped by status.
        c.execute("SELECT COUNT(*), status FROM chunks GROUP BY status")
        chunk_stats = {row[1]: row[0] for row in c.fetchall()}
        total_chunks = sum(chunk_stats.values())
        completed_chunks = chunk_stats.get('completed', 0)
        running_chunks = chunk_stats.get('running', 0)
        pending_chunks = chunk_stats.get('pending', 0)

        # Running chunks per assigned worker.
        c.execute("SELECT assigned_worker, COUNT(*) FROM chunks WHERE status='running' GROUP BY assigned_worker")
        active_per_worker = {row[0]: row[1] for row in c.fetchall()}

        # Total combinations across all chunks; falls back to 4096 when the
        # sum is NULL or 0.
        c.execute("SELECT SUM(total_combos) FROM chunks")
        total_combos = c.fetchone()[0] or 4096

        # Tested strategies count (if strategies table exists).
        try:
            c.execute("SELECT COUNT(*) FROM strategies")
            tested_combos = c.fetchone()[0]
        except sqlite3.OperationalError:
            tested_combos = 0

        # Top strategies as dicts keyed by column name (if table exists).
        top_strategies = []
        try:
            c.execute("""
                SELECT * FROM strategies
                WHERE total_trades >= 700
                ORDER BY pnl_per_1k DESC
                LIMIT 10
            """)
            columns = [desc[0] for desc in c.description]
            top_strategies = [dict(zip(columns, row)) for row in c.fetchall()]
        except sqlite3.OperationalError:
            pass  # Table doesn't exist yet
    finally:
        # Close the connection even when a query above raises.
        conn.close()

    # Calculate progress
    progress_pct = round((tested_combos / total_combos) * 100, 2) if total_combos > 0 else 0
    completed_pct = round((completed_chunks / total_chunks) * 100, 1) if total_chunks > 0 else 0

    # Estimate time remaining: pending chunks spread over the chunks currently
    # running, at a fixed rough duration per chunk.
    if completed_chunks > 0 and running_chunks > 0:
        avg_time_per_chunk = 1.5  # hours (rough estimate)
        est_hours = round(pending_chunks * avg_time_per_chunk / running_chunks, 1)
    else:
        est_hours = "N/A"

    # Get worker status over SSH, then attach the per-worker running-chunk
    # count taken from the database.
    workers = {}
    for worker_id in WORKERS:
        worker_data = get_worker_status(worker_id)
        worker_data['active_chunks'] = active_per_worker.get(worker_id, 0)
        workers[worker_id] = worker_data

    # Render template
    return render_template_string(
        HTML_TEMPLATE,
        progress_pct=progress_pct,
        tested_combos=tested_combos,
        total_combos=total_combos,
        total_chunks=total_chunks,
        completed_chunks=completed_chunks,
        completed_pct=completed_pct,
        running_chunks=running_chunks,
        pending_chunks=pending_chunks,
        est_hours=est_hours,
        workers=workers,
        top_strategies=top_strategies,
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    )
# Script entry point: print a short startup banner, then serve the dashboard
# on all interfaces at port 5000 with Flask's debug mode disabled.
if __name__ == '__main__':
    startup_banner = (
        "🌐 Starting web dashboard on http://0.0.0.0:5000",
        " Access from any browser on your network",
        " Auto-refreshes every 30 seconds",
        "",  # trailing blank line before Flask's own output
    )
    for banner_line in startup_banner:
        print(banner_line)
    app.run(host='0.0.0.0', port=5000, debug=False)