import { NextRequest, NextResponse } from 'next/server'
import { exec } from 'child_process'
import { promisify } from 'util'
import path from 'path'
import sqlite3 from 'sqlite3'
import { open } from 'sqlite'

const execAsync = promisify(exec)

export const dynamic = 'force-dynamic'

// Shell pipeline that counts live coordinator processes (`grep -v grep` drops the grep itself).
const COORDINATOR_COUNT_CMD = 'ps aux | grep distributed_coordinator.py | grep -v grep | wc -l'

// Shell pipeline that counts live coordinator AND worker processes.
const CLUSTER_COUNT_CMD =
  'ps aux | grep -E "(distributed_coordinator|distributed_worker)" | grep -v grep | wc -l'

// NOTE(review): the database path below is derived from process.cwd(), while the
// coordinator directory is hard-coded here. Presumably both point at the same
// checkout - confirm, and consider deriving one from the other.
const START_COORDINATOR_CMD =
  'cd /home/icke/traderv4/cluster && nohup python3 distributed_coordinator.py > coordinator.log 2>&1 &'

const TAIL_COORDINATOR_LOG_CMD =
  'tail -20 /home/icke/traderv4/cluster/coordinator.log 2>/dev/null || echo "No log file"'

/** Resolves the exploration database path relative to the server's working directory. */
function getDbPath(): string {
  return path.join(process.cwd(), 'cluster', 'exploration.db')
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms))
}

/**
 * Pulls the `action` field out of a parsed JSON body without assuming its shape.
 *
 * @returns the raw `action` value, or `undefined` when the body is not an object
 *          (e.g. `null`, a string, a number) so the caller can answer with 400.
 */
function extractAction(body: unknown): unknown {
  if (typeof body !== 'object' || body === null) {
    return undefined
  }
  return (body as Record<string, unknown>).action
}

/**
 * Runs a `... | wc -l` pipeline and parses its output as a base-10 integer.
 *
 * @returns the parsed count; `NaN` if the output is not numeric (callers compare
 *          with `> 0` / `=== 0`, both of which are false for `NaN`).
 */
async function countProcesses(countCmd: string): Promise<number> {
  const { stdout } = await execAsync(countCmd)
  return Number.parseInt(stdout.trim(), 10)
}

/**
 * Resets every chunk with status 'running' back to 'pending' and clears its
 * worker assignment and start time.
 *
 * The connection is closed in `finally` so a failing query does not leak the handle.
 *
 * @returns `reset`: rows changed by the UPDATE; `pending`: total rows now pending.
 * @throws whatever `open`, `run`, `get` or `close` reject with.
 */
async function resetRunningChunks(dbPath: string): Promise<{ reset: number; pending: number }> {
  const db = await open({ filename: dbPath, driver: sqlite3.Database })
  try {
    // The change count must be read from the UPDATE's own result; running a
    // separate `SELECT changes()` through run() does not expose the row value.
    const updateResult = await db.run(
      `UPDATE chunks SET status='pending', assigned_worker=NULL, started_at=NULL WHERE status='running'`
    )
    const pendingRow = await db.get<{ count: number }>(
      `SELECT COUNT(*) as count FROM chunks WHERE status='pending'`
    )
    return { reset: updateResult.changes ?? 0, pending: pendingRow?.count ?? 0 }
  } finally {
    await db.close()
  }
}

/**
 * Sends SIGKILL to every process whose full command line matches `pattern`.
 *
 * `pattern` should use the `[x]yz` bracket form: exec() runs the command via
 * `sh -c "<command>"`, so a plain pattern would also match (and kill) that
 * shell, because pkill only excludes itself from the match.
 *
 * @returns true when pkill exited 0; false on any failure. The original code
 *          treated every pkill failure as "nothing to kill", so this does too.
 */
async function killByPattern(pattern: string): Promise<boolean> {
  try {
    await execAsync(`pkill -9 -f '${pattern}'`)
    return true
  } catch {
    return false
  }
}

/**
 * Handles `action: "start"`.
 *
 * CRITICAL FIX (Dec 1, 2025): the database can show chunks as "running" while no
 * coordinator process exists, so stale chunks are reset to "pending" before a
 * new exploration is started.
 */
async function handleStart(): Promise<NextResponse> {
  // Refuse to start a second coordinator.
  const alreadyRunning = (await countProcesses(COORDINATOR_COUNT_CMD)) > 0
  if (alreadyRunning) {
    return NextResponse.json({
      success: false,
      error: 'Coordinator is already running',
      message: 'Use Stop button first if you want to restart'
    }, { status: 400 })
  }

  // Reset any stale "running" chunks (orphaned from a crashed coordinator).
  console.log('🔧 Checking for stale database chunks...')
  try {
    const { reset } = await resetRunningChunks(getDbPath())
    console.log(`✅ Database cleanup complete - ${reset} chunks reset`)
  } catch (dbErr) {
    // Best effort: a database problem must not block the start.
    console.error('⚠️ Database cleanup failed:', dbErr)
  }

  await execAsync(START_COORDINATOR_CMD)
  console.log('🚀 Coordinator start command issued')

  // Give the process a moment to come up before verifying it.
  await sleep(2000)

  const isRunning = (await countProcesses(COORDINATOR_COUNT_CMD)) > 0
  if (!isRunning) {
    // Surface the tail of the coordinator log so the caller can see why it died.
    const { stdout: logOutput } = await execAsync(TAIL_COORDINATOR_LOG_CMD)
    console.error('❌ Coordinator failed to start. Log:\n', logOutput)
    return NextResponse.json({
      success: false,
      error: 'Coordinator failed to start',
      details: logOutput,
      message: 'Check coordinator.log for details'
    }, { status: 500 })
  }

  return NextResponse.json({
    success: true,
    message: 'Coordinator started successfully',
    isRunning: true
  })
}

/**
 * Handles `action: "stop"`.
 *
 * CRITICAL FIX (Dec 1, 2025): the coordinator may already have exited while
 * leaving chunks in "running" state, so the database is ALWAYS reset first and
 * only then are any remaining processes killed.
 */
async function handleStop(): Promise<NextResponse> {
  console.log('🛑 Stopping cluster...')

  // Reset database state FIRST (even if the coordinator is already gone).
  console.log('🔧 Resetting database chunks to pending...')
  try {
    const { reset, pending } = await resetRunningChunks(getDbPath())
    console.log(`✅ Database cleanup complete - ${reset} chunks reset to pending (total pending: ${pending})`)
  } catch (dbErr) {
    console.error('❌ Database reset failed:', dbErr)
    return NextResponse.json({
      success: false,
      error: 'Failed to reset database state',
      details: dbErr instanceof Error ? dbErr.message : 'Unknown error'
    }, { status: 500 })
  }

  // THEN stop any running processes (they may already be stopped). The two kills
  // are separate commands so a failure of the first cannot skip the second.
  const killedCoordinator = await killByPattern('[d]istributed_coordinator')
  const killedWorkers = await killByPattern('[d]istributed_worker')
  if (killedCoordinator || killedWorkers) {
    console.log('✅ Killed coordinator and worker processes')
  } else {
    console.log('📝 No processes to kill (already stopped)')
  }

  // Wait a moment for cleanup, then verify everything is stopped.
  await sleep(1000)
  const processCount = await countProcesses(CLUSTER_COUNT_CMD)

  return NextResponse.json({
    success: true,
    message: 'Cluster stopped and database reset to pending',
    isRunning: processCount > 0,
    note: processCount === 0
      ? 'All processes stopped, chunks reset'
      : 'Some processes may still be cleaning up'
  })
}

/** Handles `action: "status"`: reports whether a coordinator process is running. */
async function handleStatus(): Promise<NextResponse> {
  const isRunning = (await countProcesses(COORDINATOR_COUNT_CMD)) > 0
  return NextResponse.json({
    success: true,
    isRunning,
    message: isRunning ? 'Coordinator is running' : 'Coordinator is not running'
  })
}

/**
 * Cluster control endpoint.
 *
 * Expects a JSON body of the form `{ "action": "start" | "stop" | "status" }`.
 *
 * @returns a JSON response with `success` plus action-specific fields; status 400
 *          for an unknown or missing action (or a start while the coordinator is
 *          already running), status 500 when an operation fails or throws.
 */
export async function POST(request: NextRequest) {
  try {
    const action = extractAction(await request.json())

    // `return await` keeps rejections inside this try so they reach the catch below.
    if (action === 'start') {
      return await handleStart()
    }
    if (action === 'stop') {
      return await handleStop()
    }
    if (action === 'status') {
      return await handleStatus()
    }

    return NextResponse.json({
      success: false,
      error: 'Invalid action. Use "start", "stop", or "status"'
    }, { status: 400 })
  } catch (error) {
    console.error('Error controlling cluster:', error)
    return NextResponse.json({
      success: false,
      error: 'Failed to control cluster',
      details: error instanceof Error ? error.message : 'Unknown error'
    }, { status: 500 })
  }
}