CRITICAL FIXES:
1. Stop button now resets database FIRST (before pkill)
   - Database cleanup happens even if coordinator crashed
   - Prevents stale 'running' chunks blocking restart
   - Uses Node.js sqlite library (not CLI - Docker compatible)
2. UI enhancement - 4-state display
   - ⚡ Processing (running > 0)
   - ⏳ Pending (pending > 0, running = 0)
   - ✅ Complete (all completed)
   - ⏸️ Idle (no work queued) [NEW]
   - Shows pending chunk count when present

TECHNICAL DETAILS:
- Replaced sqlite3 CLI calls with proper Node.js API
- Fixed permissions: chown 1001:1001 cluster/ for container write
- Database-first logic: reset → pkill → verify
- Detailed logging for each operation step

FILES CHANGED:
- app/api/cluster/control/route.ts (database operations refactored)
- app/cluster/page.tsx (4-state UI display)

VERIFIED:
- Stop button successfully reset 3 'running' chunks → 'pending'
- UI correctly shows Idle state after Stop
- Container logs show detailed operation flow
- Database operations work in Docker environment

DEPLOYMENT:
- Container rebuilt with fixed code
- Tested with real stale database (3 running chunks)
- All operations working correctly
161 lines
6.5 KiB
TypeScript
161 lines
6.5 KiB
TypeScript
import { NextRequest, NextResponse } from 'next/server'
|
|
import { exec } from 'child_process'
|
|
import { promisify } from 'util'
|
|
import path from 'path'
|
|
import sqlite3 from 'sqlite3'
|
|
import { open } from 'sqlite'
|
|
|
|
// Promise-returning wrapper around child_process.exec, so shell commands can be
// awaited and their `stdout` destructured from the resolved value.
const execAsync = promisify(exec)
|
|
|
|
// Next.js route segment config: opt this route out of static rendering/caching
// so every request is handled at request time.
export const dynamic = 'force-dynamic'
|
|
|
|
/**
 * Runs a shell pipeline that prints a process count (e.g. `ps ... | wc -l`)
 * and returns that count as a number.
 */
async function countMatchingProcesses(countCmd: string): Promise<number> {
  const { stdout } = await execAsync(countCmd)
  return Number.parseInt(stdout.trim(), 10)
}

/**
 * Moves every chunk still marked 'running' back to 'pending' and reports how
 * many rows were changed plus the total number of pending chunks afterwards.
 *
 * The connection is closed in `finally` so a failing statement cannot leak
 * the database handle.
 */
async function resetRunningChunks(
  dbPath: string
): Promise<{ resetCount: number; pendingCount: number }> {
  const db = await open({
    filename: dbPath,
    driver: sqlite3.Database
  })
  try {
    // The row count must come from the UPDATE's own result; running a
    // separate `SELECT changes()` through `run()` does not yield it.
    const updateResult = await db.run(
      `UPDATE chunks SET status='pending', assigned_worker=NULL, started_at=NULL WHERE status='running'`
    )
    const pendingRow = await db.get<{ count: number }>(
      `SELECT COUNT(*) as count FROM chunks WHERE status='pending'`
    )
    return {
      resetCount: updateResult.changes ?? 0,
      pendingCount: pendingRow?.count ?? 0
    }
  } finally {
    await db.close()
  }
}

/**
 * Cluster control endpoint. Expects a JSON body `{ action }` where `action` is:
 *
 * - `"start"`  – refuses with 400 if a coordinator process is already running;
 *                otherwise resets stale 'running' chunks (best effort), launches
 *                the coordinator in the background and verifies it came up
 *                (500 with the log tail if it did not).
 * - `"stop"`   – resets 'running' chunks to 'pending' FIRST (500 if that fails),
 *                then kills coordinator/worker processes and reports whether any
 *                are still alive.
 * - `"status"` – reports whether a coordinator process is running.
 *
 * Any other action yields 400; any unexpected error yields 500.
 */
export async function POST(request: NextRequest) {
  try {
    const { action } = await request.json()
    const dbPath = path.join(process.cwd(), 'cluster', 'exploration.db')

    if (action === 'start') {
      // Chunks can be left in "running" state by a coordinator that crashed;
      // they are reset to "pending" before a new exploration is started.

      // First check if coordinator is already running
      const checkCmd = 'ps aux | grep distributed_coordinator.py | grep -v grep | wc -l'
      const alreadyRunning = (await countMatchingProcesses(checkCmd)) > 0

      if (alreadyRunning) {
        return NextResponse.json({
          success: false,
          error: 'Coordinator is already running',
          message: 'Use Stop button first if you want to restart'
        }, { status: 400 })
      }

      // Reset any stale "running" chunks to "pending" (orphaned from crashed coordinator)
      console.log('🔧 Checking for stale database chunks...')
      try {
        const { resetCount } = await resetRunningChunks(dbPath)
        console.log(`✅ Database cleanup complete - ${resetCount} chunks reset`)
      } catch (dbErr) {
        console.error('⚠️ Database cleanup failed:', dbErr)
        // Continue anyway - don't block start if database issue
      }

      // Start the coordinator
      // NOTE(review): this is an absolute host path while the database path is
      // derived from process.cwd(); confirm both point at the same cluster/
      // directory in the deployed container.
      const startCmd = 'cd /home/icke/traderv4/cluster && nohup python3 distributed_coordinator.py > coordinator.log 2>&1 &'
      await execAsync(startCmd)
      console.log('🚀 Coordinator start command issued')

      // Wait a moment for it to start
      await new Promise(resolve => setTimeout(resolve, 2000))

      // Verify it's running
      const isRunning = (await countMatchingProcesses(checkCmd)) > 0

      if (!isRunning) {
        // Check coordinator log for errors
        const logCmd = 'tail -20 /home/icke/traderv4/cluster/coordinator.log 2>/dev/null || echo "No log file"'
        const { stdout: logOutput } = await execAsync(logCmd)
        console.error('❌ Coordinator failed to start. Log:\n', logOutput)

        return NextResponse.json({
          success: false,
          error: 'Coordinator failed to start',
          details: logOutput,
          message: 'Check coordinator.log for details'
        }, { status: 500 })
      }

      return NextResponse.json({
        success: true,
        message: 'Coordinator started successfully',
        isRunning: true
      })
    }

    if (action === 'stop') {
      // The coordinator may already have exited while leaving chunks in
      // "running" state, so the database is reset FIRST and only then are
      // any remaining processes killed.
      console.log('🛑 Stopping cluster...')

      console.log('🔧 Resetting database chunks to pending...')
      try {
        const { resetCount, pendingCount } = await resetRunningChunks(dbPath)
        console.log(`✅ Database cleanup complete - ${resetCount} chunks reset to pending (total pending: ${pendingCount})`)
      } catch (dbErr) {
        console.error('❌ Database reset failed:', dbErr)
        return NextResponse.json({
          success: false,
          error: 'Failed to reset database state',
          details: dbErr instanceof Error ? dbErr.message : 'Unknown error'
        }, { status: 500 })
      }

      // THEN try to stop any running processes (may already be stopped).
      // The "[d]" bracket form keeps `pkill -f` from matching the `sh -c`
      // process that carries this very command line; with the plain pattern the
      // shell can be SIGKILLed before the worker pkill ever runs.
      const stopCmd = 'pkill -9 -f "[d]istributed_coordinator"; pkill -9 -f "[d]istributed_worker"'
      try {
        await execAsync(stopCmd)
        console.log('✅ Killed coordinator and worker processes')
      } catch {
        // pkill returns error code if no processes found - this is OK
        console.log('📝 No processes to kill (already stopped)')
      }

      // Wait a moment for cleanup
      await new Promise(resolve => setTimeout(resolve, 1000))

      // Verify everything is stopped
      const checkCmd = 'ps aux | grep -E "(distributed_coordinator|distributed_worker)" | grep -v grep | wc -l'
      const processCount = await countMatchingProcesses(checkCmd)

      return NextResponse.json({
        success: true,
        message: 'Cluster stopped and database reset to pending',
        isRunning: processCount > 0,
        note: processCount === 0 ? 'All processes stopped, chunks reset' : 'Some processes may still be cleaning up'
      })
    }

    if (action === 'status') {
      // Check if coordinator is running
      const checkCmd = 'ps aux | grep distributed_coordinator.py | grep -v grep | wc -l'
      const isRunning = (await countMatchingProcesses(checkCmd)) > 0

      return NextResponse.json({
        success: true,
        isRunning,
        message: isRunning ? 'Coordinator is running' : 'Coordinator is not running'
      })
    }

    return NextResponse.json({
      success: false,
      error: 'Invalid action. Use "start", "stop", or "status"'
    }, { status: 400 })
  } catch (error) {
    console.error('Error controlling cluster:', error)
    return NextResponse.json({
      success: false,
      error: 'Failed to control cluster',
      details: error instanceof Error ? error.message : 'Unknown error'
    }, { status: 500 })
  }
}
|