trading_bot_v4/lib/monitoring/drift-health-monitor.ts

/**
 * Drift SDK Health Monitor
 *
 * Monitors for accountUnsubscribe errors that indicate WebSocket connection issues.
 * When detected, triggers container restart via flag file for watch-restart.sh
 */

import fs from 'fs'
import path from 'path'

class DriftHealthMonitor {
  private errorCounts: Map<string, number> = new Map()
  private errorWindow: number = 10000 // 10 second window (was 30s - too slow for rapid memory leak)
  private errorThreshold: number = 20 // 20 errors in 10 seconds = problem (was 50 in 30s)
  private checkInterval: NodeJS.Timeout | null = null
  private isMonitoring: boolean = false

  /**
   * Start monitoring for Drift SDK errors
   */
  start(): void {
    if (this.isMonitoring) {
      console.log('⚠️ Drift health monitor already running')
      return
    }

    this.isMonitoring = true
    console.log('🏥 Drift health monitor started')
    console.log(`   Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`)

    // Check error counts every 3 seconds (was 10s - faster response to memory leak)
    this.checkInterval = setInterval(() => {
      this.checkErrorThreshold()
    }, 3000)
  }

  /**
   * Stop monitoring
   */
  stop(): void {
    if (this.checkInterval) {
      clearInterval(this.checkInterval)
      this.checkInterval = null
    }
    this.isMonitoring = false
    console.log('🏥 Drift health monitor stopped')
  }

  /**
   * Record an accountUnsubscribe error
   */
  recordError(errorType: string = 'accountUnsubscribe'): void {
    const now = Date.now()
    const key = `${errorType}-${now}`
    this.errorCounts.set(key, now)

    // Clean up old errors outside the window
    this.cleanupOldErrors()
  }

  /**
   * Remove errors older than the error window
   */
  private cleanupOldErrors(): void {
    const now = Date.now()
    const cutoff = now - this.errorWindow

    for (const [key, timestamp] of this.errorCounts.entries()) {
      if (timestamp < cutoff) {
        this.errorCounts.delete(key)
      }
    }
  }

  /**
   * Check if error threshold exceeded
   */
  private checkErrorThreshold(): void {
    this.cleanupOldErrors()

    const errorCount = this.errorCounts.size

    if (errorCount >= this.errorThreshold) {
      console.error(`🚨 CRITICAL: ${errorCount} Drift SDK errors in ${this.errorWindow/1000}s (threshold: ${this.errorThreshold})`)
      console.error('🔄 Triggering container restart to clear WebSocket connection leak...')

      this.triggerRestart()

      // Stop monitoring to prevent multiple restart triggers
      this.stop()
    }
  }

  /**
   * Trigger container restart via flag file
   */
  private triggerRestart(): void {
    const restartFlagPath = '/tmp/trading-bot-restart.flag'

    try {
      fs.writeFileSync(
        restartFlagPath,
        `Drift SDK health check failed: ${this.errorCounts.size} accountUnsubscribe errors\nTimestamp: ${new Date().toISOString()}\n`,
        'utf-8'
      )
      console.log(`✅ Restart flag created at ${restartFlagPath}`)
      console.log('   watch-restart.sh will restart container within 10 seconds')
    } catch (error) {
      console.error('❌ Failed to create restart flag:', error)
    }
  }

  /**
   * Get current error count
   */
  getErrorCount(): number {
    this.cleanupOldErrors()
    return this.errorCounts.size
  }

  /**
   * Get health status
   */
  getHealthStatus(): { healthy: boolean; errorCount: number; threshold: number } {
    const errorCount = this.getErrorCount()
    return {
      healthy: errorCount < this.errorThreshold,
      errorCount,
      threshold: this.errorThreshold
    }
  }
}

// Singleton instance
let monitorInstance: DriftHealthMonitor | null = null

/**
 * Get the Drift health monitor singleton
 */
export function getDriftHealthMonitor(): DriftHealthMonitor {
  if (!monitorInstance) {
    monitorInstance = new DriftHealthMonitor()
  }
  return monitorInstance
}

/**
 * Start Drift health monitoring
 */
export function startDriftHealthMonitoring(): void {
  const monitor = getDriftHealthMonitor()
  monitor.start()
}

/**
 * Stop Drift health monitoring
 */
export function stopDriftHealthMonitoring(): void {
  if (monitorInstance) {
    monitorInstance.stop()
  }
}