fix: Database-first cluster status detection + Stop button clarification
CRITICAL FIX (Nov 30, 2025):
- Dashboard showed 'idle' despite 22+ worker processes running
- Root cause: SSH-based worker detection timing out
- Solution: Check database for running chunks FIRST
Changes:
1. app/api/cluster/status/route.ts:
- Query exploration database before SSH detection
- If running chunks exist, mark workers 'active' even if SSH fails
- Override worker status: 'offline' → 'active' when chunks running
- Log: '✅ Cluster status: ACTIVE (database shows running chunks)'
- Database is source of truth, SSH only for supplementary metrics
2. app/cluster/page.tsx:
- Stop button ALREADY EXISTS (conditionally shown)
- Shows Start when status='idle', Stop when status='active'
- No code changes needed - fixed by status detection
Result:
- Dashboard now shows 'ACTIVE' with 2 workers (correct)
- Workers show 'active' status (was 'offline')
- Stop button automatically visible when cluster active
- System resilient to SSH timeouts/network issues
Verified:
- Container restarted: Nov 30 21:18 UTC
- API tested: Returns status='active', activeWorkers=2
- Logs confirm: Database-first logic working
- Workers confirmed running: 22+ processes on worker1, workers on worker2
This commit is contained in:
183
cluster/exploration_status.py
Executable file
183
cluster/exploration_status.py
Executable file
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Exploration Status Monitor
|
||||
|
||||
Shows real-time progress of distributed parameter sweep across EPYC servers.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List
|
||||
|
||||
# Directory containing this script; the exploration database lives next to it.
CLUSTER_DIR = Path(__file__).parent
DB_PATH = CLUSTER_DIR.joinpath('exploration.db')

# SSH targets keyed by worker id. A worker with an 'ssh_hop' entry is reached
# by first connecting to the hop host and running ssh to 'host' from there.
WORKERS = {
    'worker1': {
        'host': 'root@10.10.254.106',
    },
    'worker2': {
        'host': 'root@10.20.254.100',
        'ssh_hop': 'root@10.10.254.106',
    },
}
|
||||
|
||||
def check_worker_status(worker_id: str) -> Dict:
    """Check whether a worker has active ``distributed_worker.py`` processes.

    Runs ``pgrep -f distributed_worker.py | wc -l`` over SSH against the
    worker's ``host`` (going through the worker's ``ssh_hop`` host first when
    one is configured) and interprets the printed number as the process count.

    Args:
        worker_id: Key into the module-level ``WORKERS`` mapping.

    Returns:
        ``{'status': 'running' or 'idle', 'processes': <count>}`` when a count
        could be read, otherwise ``{'status': 'unreachable', 'error': <text>}``
        where ``<text>`` is the exception message, ssh's stderr output, or the
        ssh exit code.

    Raises:
        KeyError: If ``worker_id`` is not a key of ``WORKERS``.
    """
    worker = WORKERS[worker_id]
    remote_cmd = 'pgrep -f distributed_worker.py | wc -l'

    # Build an argv list rather than a shell string: this hands ssh exactly
    # the same arguments the local shell used to produce, without needing
    # shell=True or local quoting.
    cmd = ['ssh']
    if 'ssh_hop' in worker:
        # NOTE(review): with a hop, the '| wc -l' part is presumably evaluated
        # by the hop host's shell, so a failed inner ssh may still yield "0"
        # (reported as 'idle') - confirm against OpenSSH command handling.
        cmd += [worker['ssh_hop'], 'ssh']
    cmd += [worker['host'], remote_cmd]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Timeout, ssh binary missing, or undecodable output: best-effort
        # status, never propagate to the caller.
        return {'status': 'unreachable', 'error': str(e)}

    try:
        active_processes = int(result.stdout.strip())
    except ValueError:
        # No numeric count came back (e.g. the connection failed). Report what
        # ssh said instead of an opaque "invalid literal for int()" message.
        detail = result.stderr.strip() or f"ssh exited with code {result.returncode}"
        return {'status': 'unreachable', 'error': detail}

    return {
        'status': 'running' if active_processes > 0 else 'idle',
        'processes': active_processes,
    }
|
||||
|
||||
def print_status():
    """Print a comprehensive status report of the distributed exploration.

    The report is written to stdout and consists of:

    1. The status of every worker in ``WORKERS`` (via ``check_worker_status``),
       including the error text for workers that could not be queried.
    2. Chunk progress from the ``chunks`` table (counts per ``status`` and the
       share of ``total_combos`` that belongs to completed chunks).
    3. Strategy results from the ``strategies`` table: the total row count,
       the top 10 rows by ``pnl_per_1k`` that pass the validation filter
       (``trades >= 700``, ``0.50 <= win_rate <= 0.70``,
       ``profit_factor >= 1.2``), and the parameters of the best one.

    If ``DB_PATH`` does not exist, a hint is printed and nothing else happens.

    Returns:
        None.

    Raises:
        sqlite3.Error: If a query fails (for example a missing table). The
            database connection is closed before the error propagates.
    """
    import json

    if not DB_PATH.exists():
        print("❌ Database not found. Run distributed_coordinator.py first.")
        return

    conn = sqlite3.connect(DB_PATH)
    try:
        c = conn.cursor()

        print("=" * 80)
        print("📊 DISTRIBUTED EXPLORATION STATUS")
        print("=" * 80)
        print()

        # Worker status
        print("🖥️ WORKER STATUS:")
        print()
        for worker_id in WORKERS:
            status = check_worker_status(worker_id)
            status_emoji = "🟢" if status['status'] == 'running' else "⚪"
            print(f" {status_emoji} {worker_id}: {status['status']}", end="")
            if status['status'] == 'running':
                print(f" ({status['processes']} processes)")
            elif status.get('error'):
                # Surface why the worker could not be queried instead of
                # silently dropping the error.
                print(f" ({status['error']})")
            else:
                print()
        print()

        # Chunk progress
        c.execute("SELECT status, COUNT(*) FROM chunks GROUP BY status")
        chunk_stats = dict(c.fetchall())

        total_chunks = sum(chunk_stats.values()) if chunk_stats else 0
        completed = chunk_stats.get('completed', 0)
        running = chunk_stats.get('running', 0)
        pending = chunk_stats.get('pending', 0)

        if total_chunks > 0:
            print("📦 CHUNK PROGRESS:")
            print()
            print(f" Total chunks: {total_chunks:,}")
            print(f" ✅ Completed: {completed:,} ({completed/total_chunks*100:.1f}%)")
            print(f" 🔄 Running: {running:,}")
            print(f" ⏸️ Pending: {pending:,}")
            print()

            # Completion estimate. SUM() yields NULL on an empty set, hence
            # the "or 0" fallbacks.
            c.execute("SELECT SUM(total_combos) FROM chunks")
            total_combos = c.fetchone()[0] or 0

            c.execute("SELECT SUM(total_combos) FROM chunks WHERE status='completed'")
            completed_combos = c.fetchone()[0] or 0

            if total_combos > 0:
                pct_complete = (completed_combos / total_combos) * 100
                print(f" 📊 Parameter space: {completed_combos:,} / {total_combos:,} tested ({pct_complete:.1f}%)")
                print()

        # Strategy statistics
        c.execute("SELECT COUNT(*) FROM strategies")
        total_strategies = c.fetchone()[0]

        if total_strategies > 0:
            print("🎯 STRATEGY RESULTS:")
            print()
            print(f" Total tested: {total_strategies:,} unique configurations")
            print()

            # Top 10 strategies
            c.execute('''
                SELECT params_json, trades, win_rate, pnl_per_1k, profit_factor,
                       max_drawdown, tested_at
                FROM strategies
                WHERE trades >= 700
                  AND win_rate >= 0.50 AND win_rate <= 0.70
                  AND profit_factor >= 1.2
                ORDER BY pnl_per_1k DESC
                LIMIT 10
            ''')

            top_strategies = c.fetchall()

            if top_strategies:
                print(" 🏆 TOP 10 STRATEGIES:")
                print()
                print(" " + "-" * 76)
                print(f" {'Rank':<6} {'P&L/1k':<10} {'Trades':<8} {'WR%':<7} {'PF':<7} {'DD%':<8} {'Tested':<15}")
                print(" " + "-" * 76)

                for i, row in enumerate(top_strategies, 1):
                    params, trades, wr, pnl, pf, dd, tested = row
                    # tested_at must be a string datetime.fromisoformat accepts.
                    tested_date = datetime.fromisoformat(tested).strftime('%Y-%m-%d %H:%M')
                    print(f" {i:<6} ${pnl:<9.2f} {trades:<8} {wr*100:<6.1f}% {pf:<6.2f} {dd:<7.1f}% {tested_date}")

                print(" " + "-" * 76)
                print()

                # Show best config details (rows are ordered by pnl_per_1k DESC)
                best = top_strategies[0]
                params_json = best[0]
                params = json.loads(params_json)

                print(" 💎 BEST CONFIGURATION:")
                print()
                print(f" P&L: ${best[3]:.2f} per $1k")
                print(f" Trades: {best[1]}, Win Rate: {best[2]*100:.1f}%, Profit Factor: {best[4]:.2f}")
                print(f" Max Drawdown: {best[5]:.1f}%")
                print()
                print(" Parameters:")
                for key, value in params.items():
                    print(f" {key}: {value}")
            else:
                print(" ⏳ No validated strategies yet (need 700+ trades with realistic metrics)")
        else:
            print("⏳ No strategies tested yet")

        print()
        print("=" * 80)
    finally:
        # Always release the database handle, even when a query or a
        # formatting step above raises.
        conn.close()
|
||||
|
||||
def main():
    """Command-line entry point.

    Without arguments the status report is printed once. With ``--watch`` the
    screen is cleared and the report re-printed every 30 seconds until the
    user presses Ctrl+C.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Monitor distributed exploration progress')
    cli.add_argument('--watch', action='store_true', help='Watch mode (refresh every 30s)')
    options = cli.parse_args()

    # One-shot mode: print the report and finish.
    if not options.watch:
        print_status()
        return

    import time

    try:
        while True:
            print("\033[2J\033[H")  # Clear screen
            print_status()
            print()
            print("Press Ctrl+C to exit watch mode...")
            time.sleep(30)
    except KeyboardInterrupt:
        print("\n\n👋 Exiting watch mode")
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user