- Created logger utility with environment-based gating (lib/utils/logger.ts) - Replaced 517 console.log statements with logger.log (71% reduction) - Fixed import paths in 15 files (resolved comment-trapped imports) - Added DEBUG_LOGS=false to .env - Achieves 71% immediate log reduction (517/731 statements) - Expected 90% reduction in production when deployed Impact: Reduced I/O blocking, lower log volume in production Risk: LOW (easy rollback, non-invasive) Phase: Phase 1, Task 1.1 (Quick Wins - Console.log Production Gating) Files changed: - NEW: lib/utils/logger.ts (production-safe logging) - NEW: scripts/replace-console-logs.js (automation tool) - Modified: 15 lib/*.ts files (console.log → logger.log) - Modified: .env (DEBUG_LOGS=false) Next: Task 1.2 (Image Size Optimization)
164 lines
4.2 KiB
TypeScript
164 lines
4.2 KiB
TypeScript
/**
|
|
* Drift SDK Health Monitor
|
|
*
|
|
* Monitors for accountUnsubscribe errors that indicate WebSocket connection issues.
|
|
* When detected, triggers container restart via flag file for watch-restart.sh
|
|
*/
|
|
|
|
import fs from 'fs'
|
|
import { logger } from '../utils/logger'
|
|
import path from 'path'
|
|
|
|
/**
 * Sliding-window counter for Drift SDK errors.
 *
 * Callers report errors through `recordError()`. While monitoring is active,
 * a timer re-evaluates the number of errors recorded inside the window and,
 * once the threshold is reached, writes a restart flag file and stops itself.
 */
class DriftHealthMonitor {
  /**
   * Timestamp (ms since epoch) of every error still inside the window.
   * Keys have the form `${errorType}-${timestamp}-${sequence}`; only the
   * number of entries and their timestamps are ever read back.
   */
  private errorCounts: Map<string, number> = new Map()
  /** Per-instance counter that keeps map keys unique for errors sharing a millisecond. */
  private errorSequence = 0
  private errorWindow: number = 10000 // 10 second window (was 30s - too slow for rapid memory leak)
  private errorThreshold: number = 20 // 20 errors in 10 seconds = problem (was 50 in 30s)
  private checkInterval: NodeJS.Timeout | null = null
  private isMonitoring: boolean = false

  /**
   * Start monitoring for Drift SDK errors.
   *
   * Idempotent: a second call while already running only logs a warning and
   * does not create a second timer.
   */
  start(): void {
    if (this.isMonitoring) {
      logger.log('⚠️ Drift health monitor already running')
      return
    }

    this.isMonitoring = true
    logger.log('🏥 Drift health monitor started')
    logger.log(` Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`)

    // Check error counts every 3 seconds (was 10s - faster response to memory leak)
    this.checkInterval = setInterval(() => {
      this.checkErrorThreshold()
    }, 3000)
  }

  /**
   * Stop monitoring: cancels the periodic check (if any) and clears the
   * running flag. Already-recorded errors are kept.
   */
  stop(): void {
    if (this.checkInterval) {
      clearInterval(this.checkInterval)
      this.checkInterval = null
    }
    this.isMonitoring = false
    logger.log('🏥 Drift health monitor stopped')
  }

  /**
   * Record one error occurrence at the current time.
   *
   * @param errorType Label stored as part of the entry key; defaults to
   *   'accountUnsubscribe'. All types count towards the same threshold.
   */
  recordError(errorType: string = 'accountUnsubscribe'): void {
    const now = Date.now()
    // The sequence suffix is essential: without it, errors recorded within the
    // same millisecond share a key and overwrite each other, so a rapid burst
    // would be counted as a single error and never reach the threshold.
    const key = `${errorType}-${now}-${this.errorSequence++}`
    this.errorCounts.set(key, now)

    // Clean up old errors outside the window
    this.cleanupOldErrors()
  }

  /**
   * Remove errors older than the error window.
   */
  private cleanupOldErrors(): void {
    const now = Date.now()
    const cutoff = now - this.errorWindow

    for (const [key, timestamp] of this.errorCounts.entries()) {
      if (timestamp < cutoff) {
        this.errorCounts.delete(key)
      }
    }
  }

  /**
   * Check whether the number of errors inside the window has reached the
   * threshold and, if so, request a restart and stop monitoring.
   */
  private checkErrorThreshold(): void {
    this.cleanupOldErrors()

    const errorCount = this.errorCounts.size

    if (errorCount >= this.errorThreshold) {
      // console.error rather than logger.log: the logger's output can be
      // gated off, so critical messages bypass it.
      console.error(`🚨 CRITICAL: ${errorCount} Drift SDK errors in ${this.errorWindow/1000}s (threshold: ${this.errorThreshold})`)
      console.error('🔄 Triggering container restart to clear WebSocket connection leak...')

      this.triggerRestart()

      // Stop monitoring to prevent multiple restart triggers
      this.stop()
    }
  }

  /**
   * Trigger container restart via flag file.
   *
   * Writes a short diagnostic message to a fixed path under /tmp. A failure
   * to write is logged and swallowed so the periodic check never throws.
   */
  private triggerRestart(): void {
    const restartFlagPath = '/tmp/trading-bot-restart.flag'

    try {
      fs.writeFileSync(
        restartFlagPath,
        `Drift SDK health check failed: ${this.errorCounts.size} accountUnsubscribe errors\nTimestamp: ${new Date().toISOString()}\n`,
        'utf-8'
      )
      logger.log(`✅ Restart flag created at ${restartFlagPath}`)
      logger.log(' watch-restart.sh will restart container within 10 seconds')
    } catch (error) {
      console.error('❌ Failed to create restart flag:', error)
    }
  }

  /**
   * Get the number of errors currently inside the window.
   *
   * @returns Count of recorded errors after expired entries are purged.
   */
  getErrorCount(): number {
    this.cleanupOldErrors()
    return this.errorCounts.size
  }

  /**
   * Get health status.
   *
   * @returns `healthy` is true while the in-window error count is strictly
   *   below the threshold; `errorCount` and `threshold` are the values compared.
   */
  getHealthStatus(): { healthy: boolean; errorCount: number; threshold: number } {
    const errorCount = this.getErrorCount()
    return {
      healthy: errorCount < this.errorThreshold,
      errorCount,
      threshold: this.errorThreshold
    }
  }
}
|
|
|
|
// Lazily created, module-wide monitor; stays null until first requested
let monitorInstance: DriftHealthMonitor | null = null

/**
 * Return the shared Drift health monitor, constructing it on first use.
 *
 * Every call after the first returns the same object.
 */
export function getDriftHealthMonitor(): DriftHealthMonitor {
  if (monitorInstance === null) {
    monitorInstance = new DriftHealthMonitor()
  }

  return monitorInstance
}
|
|
|
|
/**
 * Convenience entry point: obtain the shared monitor (creating it if
 * necessary) and start it.
 */
export function startDriftHealthMonitoring(): void {
  getDriftHealthMonitor().start()
}
|
|
|
|
/**
 * Stop the shared monitor if one has been created; otherwise do nothing.
 * Never instantiates a monitor just to stop it.
 */
export function stopDriftHealthMonitoring(): void {
  if (!monitorInstance) {
    return
  }

  monitorInstance.stop()
}
|