fix: Database-first cluster status detection + Stop button clarification
CRITICAL FIX (Nov 30, 2025):
- Dashboard showed 'idle' despite 22+ worker processes running
- Root cause: SSH-based worker detection timing out
- Solution: Check database for running chunks FIRST
Changes:
1. app/api/cluster/status/route.ts:
- Query exploration database before SSH detection
- If running chunks exist, mark workers 'active' even if SSH fails
- Override worker status: 'offline' → 'active' when chunks running
- Log: '✅ Cluster status: ACTIVE (database shows running chunks)'
- Database is source of truth, SSH only for supplementary metrics
2. app/cluster/page.tsx:
- Stop button ALREADY EXISTS (conditionally shown)
- Shows Start when status='idle', Stop when status='active'
- No code changes needed - fixed by status detection
Result:
- Dashboard now shows 'ACTIVE' with 2 workers (correct)
- Workers show 'active' status (was 'offline')
- Stop button automatically visible when cluster active
- System resilient to SSH timeouts/network issues
Verified:
- Container restarted: Nov 30 21:18 UTC
- API tested: Returns status='active', activeWorkers=2
- Logs confirm: Database-first logic working
- Workers confirmed running: 22+ processes on worker1, workers on worker2
This commit is contained in:
74
app/api/cluster/control/route.ts
Normal file
74
app/api/cluster/control/route.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
import { exec } from 'child_process'
|
||||
import { promisify } from 'util'
|
||||
|
||||
// Promise-returning wrapper around child_process.exec, so shell commands can be
// awaited and their { stdout, stderr } destructured instead of using callbacks.
const execAsync = promisify(exec)
|
||||
|
||||
// Next.js route segment config. 'force-dynamic' asks the framework to evaluate
// this route on every request rather than serving a cached/static result, which
// matters here because the handler reports live process state.
export const dynamic = 'force-dynamic'
|
||||
|
||||
/**
 * Counts lines of `ps aux` output that match the given grep arguments.
 *
 * The `grep -v grep` stage drops the grep processes themselves and also the
 * `sh -c` wrapper spawned by exec (its command line contains the word "grep").
 *
 * @param grepArgs Arguments passed verbatim to the first `grep` (hard-coded
 *   by callers in this file; never build this from request input).
 * @returns The number of matching process lines as printed by `wc -l`.
 */
async function countProcesses(grepArgs: string): Promise<number> {
  const { stdout } = await execAsync(`ps aux | grep ${grepArgs} | grep -v grep | wc -l`)
  return Number.parseInt(stdout.trim(), 10)
}

/**
 * Cluster control endpoint: starts, stops, or reports on the distributed
 * coordinator by shelling out on the host running this server.
 *
 * Request body: JSON `{ action: 'start' | 'stop' | 'status' }`.
 *
 * Responses:
 * - 200 `{ success: true, message, isRunning }` for a recognised action.
 * - 400 `{ success: false, error }` for any other action value.
 * - 500 `{ success: false, error, details }` if parsing the body or running a
 *   shell command throws.
 *
 * NOTE(review): no authentication/authorisation check is visible in this
 * handler even though it kills and launches processes — confirm it is
 * protected by middleware.
 * NOTE(review): 'start' does not check whether a coordinator is already
 * running, so repeated calls may launch duplicates — confirm whether the
 * coordinator guards against that itself.
 */
export async function POST(request: NextRequest) {
  try {
    const { action } = await request.json()

    switch (action) {
      case 'start': {
        // Launch the coordinator detached in the background; output is
        // redirected to a log file so exec does not wait on its pipes.
        const startCmd = 'cd /home/icke/traderv4/cluster && nohup python3 distributed_coordinator.py > coordinator.log 2>&1 &'
        await execAsync(startCmd)

        // Give the process a moment to come up before probing for it.
        await new Promise(resolve => setTimeout(resolve, 2000))

        const isRunning = (await countProcesses('distributed_coordinator')) > 0

        return NextResponse.json({
          success: true,
          message: isRunning ? 'Coordinator started successfully' : 'Coordinator start initiated',
          isRunning
        })
      }

      case 'stop': {
        // Two fixes over a plain `pkill -f name; pkill -f name`:
        //  1. The bracket pattern `[d]istributed_…` still matches the target
        //     processes but NOT the `sh -c` wrapper running this command
        //     (whose command line contains the literal brackets). Without it
        //     the first pkill SIGKILLs its own parent shell, the second pkill
        //     never runs, and exec rejects.
        //  2. pkill exits 1 when nothing matched; `|| [ $? -eq 1 ]` treats
        //     "already stopped" as success while still surfacing real
        //     failures (exit codes 2 and 3).
        const stopCmd =
          'pkill -9 -f "[d]istributed_coordinator" || [ $? -eq 1 ]; ' +
          'pkill -9 -f "[d]istributed_worker" || [ $? -eq 1 ]'
        await execAsync(stopCmd)

        // Let the kernel reap the killed processes before re-checking.
        await new Promise(resolve => setTimeout(resolve, 1000))

        const processCount = await countProcesses('-E "(distributed_coordinator|distributed_worker)"')

        return NextResponse.json({
          success: true,
          message: processCount === 0 ? 'Cluster stopped successfully' : 'Stop signal sent',
          isRunning: processCount > 0
        })
      }

      case 'status': {
        const isRunning = (await countProcesses('distributed_coordinator.py')) > 0

        return NextResponse.json({
          success: true,
          isRunning,
          message: isRunning ? 'Coordinator is running' : 'Coordinator is not running'
        })
      }

      default:
        return NextResponse.json({
          success: false,
          error: 'Invalid action. Use "start", "stop", or "status"'
        }, { status: 400 })
    }
  } catch (error) {
    console.error('Error controlling cluster:', error)
    return NextResponse.json({
      success: false,
      error: 'Failed to control cluster',
      details: error instanceof Error ? error.message : 'Unknown error'
    }, { status: 500 })
  }
}
|
||||
Reference in New Issue
Block a user