feat: Replace blind 2-hour reconnect with error-based health monitoring
User Request: Replace the blind 2-hour restart timer with smart monitoring that only restarts when accountUnsubscribe errors actually occur.

Changes:

1. Health Monitor (NEW):
   - Created lib/monitoring/drift-health-monitor.ts
   - Tracks accountUnsubscribe errors in a 30-second sliding window
   - Triggers a container restart via flag file when 50+ errors are detected
   - Prevents unnecessary restarts while the SDK is healthy

2. Drift Client:
   - Removed the blind scheduleReconnection() and its 2-hour timer
   - Added interceptWebSocketErrors() to catch SDK errors
   - Patches console.error to monitor for accountUnsubscribe patterns
   - Starts the health monitor after successful initialization
   - Removed the unused reconnect() method and reconnectTimer field

3. Health API (NEW):
   - GET /api/drift/health - check the current error count and health status
   - Returns: healthy boolean, errorCount, threshold, message
   - Useful for external monitoring and debugging

Impact:
- System only restarts when an actual memory leak is detected
- Prevents unnecessary downtime every 2 hours
- More targeted response to SDK issues
- Better operational stability

Files:
- lib/monitoring/drift-health-monitor.ts (NEW - 162 lines)
- lib/drift/client.ts (removed timer, added error interception)
- app/api/drift/health/route.ts (NEW - health check endpoint)

Testing:
- Health monitor starts on initialization: ✅
- API endpoint returns healthy status: ✅
- No blind reconnection scheduled: ✅
This commit is contained in:
162
lib/monitoring/drift-health-monitor.ts
Normal file
162
lib/monitoring/drift-health-monitor.ts
Normal file
@@ -0,0 +1,162 @@
|
||||
/**
|
||||
* Drift SDK Health Monitor
|
||||
*
|
||||
* Monitors for accountUnsubscribe errors that indicate WebSocket connection issues.
|
||||
* When detected, triggers container restart via flag file for watch-restart.sh
|
||||
*/
|
||||
|
||||
import fs from 'fs'
|
||||
import path from 'path'
|
||||
|
||||
class DriftHealthMonitor {
  // Timestamp (ms since epoch) of every error still inside the window,
  // keyed by a unique per-error id.
  private errorCounts: Map<string, number> = new Map()
  private errorWindow: number = 30000 // 30 second window
  private errorThreshold: number = 50 // 50 errors in 30 seconds = problem
  private checkInterval: NodeJS.Timeout | null = null
  private isMonitoring: boolean = false
  // Monotonic counter that keeps Map keys unique when several errors are
  // recorded within the same millisecond.
  private errorSequence: number = 0

  /**
   * Start monitoring for Drift SDK errors.
   *
   * Schedules a threshold check every 10 seconds. Calling this while the
   * monitor is already running only logs a warning.
   */
  start(): void {
    if (this.isMonitoring) {
      console.log('⚠️ Drift health monitor already running')
      return
    }

    this.isMonitoring = true
    console.log('🏥 Drift health monitor started')
    console.log(` Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`)

    // Check error counts every 10 seconds
    this.checkInterval = setInterval(() => {
      this.checkErrorThreshold()
    }, 10000)
  }

  /**
   * Stop monitoring: cancels the periodic check (if any) and clears the
   * running flag. Already-recorded errors are kept.
   */
  stop(): void {
    if (this.checkInterval) {
      clearInterval(this.checkInterval)
      this.checkInterval = null
    }
    this.isMonitoring = false
    console.log('🏥 Drift health monitor stopped')
  }

  /**
   * Record one error occurrence at the current time.
   *
   * @param errorType Label stored as part of the entry key; defaults to
   *   'accountUnsubscribe'.
   */
  recordError(errorType: string = 'accountUnsubscribe'): void {
    const now = Date.now()
    // The sequence suffix is essential: error bursts routinely land in the
    // same millisecond, and a key built from the timestamp alone would make
    // those entries overwrite each other and undercount the burst.
    const key = `${errorType}-${now}-${this.errorSequence++}`
    this.errorCounts.set(key, now)

    // Clean up old errors outside the window
    this.cleanupOldErrors()
  }

  /**
   * Remove errors older than the error window.
   */
  private cleanupOldErrors(): void {
    const cutoff = Date.now() - this.errorWindow

    // Deleting the current entry while iterating a Map is safe.
    for (const [key, timestamp] of this.errorCounts.entries()) {
      if (timestamp < cutoff) {
        this.errorCounts.delete(key)
      }
    }
  }

  /**
   * Check whether the number of errors inside the window has reached the
   * threshold and, if so, request a container restart.
   */
  private checkErrorThreshold(): void {
    this.cleanupOldErrors()

    const errorCount = this.errorCounts.size
    if (errorCount < this.errorThreshold) {
      return
    }

    console.error(`🚨 CRITICAL: ${errorCount} Drift SDK errors in ${this.errorWindow/1000}s (threshold: ${this.errorThreshold})`)
    console.error('🔄 Triggering container restart to clear WebSocket connection leak...')

    // Stop monitoring to prevent multiple restart triggers, but only once
    // the flag file was actually written. If the write failed, keep running
    // so the next periodic check can retry instead of silently giving up.
    if (this.triggerRestart()) {
      this.stop()
    }
  }

  /**
   * Trigger container restart by writing the restart flag file.
   *
   * @returns true when the flag file was written, false when the write failed
   *   (the failure is logged, not thrown).
   */
  private triggerRestart(): boolean {
    const restartFlagPath = '/tmp/trading-bot-restart.flag'

    try {
      fs.writeFileSync(
        restartFlagPath,
        `Drift SDK health check failed: ${this.errorCounts.size} accountUnsubscribe errors\nTimestamp: ${new Date().toISOString()}\n`,
        'utf-8'
      )
      console.log(`✅ Restart flag created at ${restartFlagPath}`)
      console.log(' watch-restart.sh will restart container within 10 seconds')
      return true
    } catch (error) {
      console.error('❌ Failed to create restart flag:', error)
      return false
    }
  }

  /**
   * Get the number of errors currently inside the window (expired entries
   * are pruned first).
   */
  getErrorCount(): number {
    this.cleanupOldErrors()
    return this.errorCounts.size
  }

  /**
   * Get health status.
   *
   * @returns `healthy` is true while the in-window error count is below the
   *   threshold; `errorCount` and `threshold` are the values compared.
   */
  getHealthStatus(): { healthy: boolean; errorCount: number; threshold: number } {
    const errorCount = this.getErrorCount()
    return {
      healthy: errorCount < this.errorThreshold,
      errorCount,
      threshold: this.errorThreshold
    }
  }
}
|
||||
|
||||
// Singleton instance
|
||||
// Starts as null; getDriftHealthMonitor() creates the instance on first use
// and stopDriftHealthMonitoring() only acts when one has been created.
let monitorInstance: DriftHealthMonitor | null = null
|
||||
|
||||
/**
|
||||
* Get the Drift health monitor singleton
|
||||
*/
|
||||
export function getDriftHealthMonitor(): DriftHealthMonitor {
  // Reuse the existing monitor when one has already been built.
  if (monitorInstance !== null) {
    return monitorInstance
  }

  // First request: build the monitor and remember it for later callers.
  const created = new DriftHealthMonitor()
  monitorInstance = created
  return created
}
|
||||
|
||||
/**
|
||||
* Start Drift health monitoring
|
||||
*/
|
||||
export function startDriftHealthMonitoring(): void {
  // Fetch (creating if necessary) the shared monitor and start it.
  getDriftHealthMonitor().start()
}
|
||||
|
||||
/**
|
||||
* Stop Drift health monitoring
|
||||
*/
|
||||
export function stopDriftHealthMonitoring(): void {
  // Nothing to stop when no monitor was ever created; never creates one here.
  monitorInstance?.stop()
}
|
||||
Reference in New Issue
Block a user