critical: Make health monitor 3-4x more aggressive to prevent heap crashes
PROBLEM (Nov 27, 2025 - 11:53 UTC): - accountUnsubscribe errors accumulated 200+ times in 2 seconds - JavaScript heap out of memory crash BEFORE health monitor could trigger - Old settings: 50 errors / 30s window / check every 10s = too slow - Container crashed from memory exhaustion, not clean restart SOLUTION - 3-4x FASTER RESPONSE: - Error window: 30s → 10s (3× faster detection) - Error threshold: 50 → 20 errors (2.5× more sensitive) - Check frequency: 10s → 3s intervals (3× more frequent) IMPACT: - Before: 10-40 seconds to trigger restart - After: 3-13 seconds to trigger restart (3-4× faster) - Catches rapid error accumulation BEFORE heap exhaustion - Clean restart instead of crash-and-recover REAL INCIDENT TIMELINE: 11:53:43 - Errors start accumulating 11:53:45.606 - FATAL: heap out of memory (2.2 seconds) 11:53:47.803 - Docker restart (not health monitor) NEW BEHAVIOR: - 20 errors in 10s = trigger at ~100ms/error rate - 3s check interval catches problem in 3-13s MAX - Clean restart before memory leak causes crash Files Changed: - lib/monitoring/drift-health-monitor.ts (lines 13-14, 32)
This commit is contained in:
@@ -10,8 +10,8 @@ import path from 'path'
|
|||||||
|
|
||||||
class DriftHealthMonitor {
|
class DriftHealthMonitor {
|
||||||
private errorCounts: Map<string, number> = new Map()
|
private errorCounts: Map<string, number> = new Map()
|
||||||
private errorWindow: number = 30000 // 30 second window
|
private errorWindow: number = 10000 // 10 second window (was 30s - too slow for rapid memory leak)
|
||||||
private errorThreshold: number = 50 // 50 errors in 30 seconds = problem
|
private errorThreshold: number = 20 // 20 errors in 10 seconds = problem (was 50 in 30s)
|
||||||
private checkInterval: NodeJS.Timeout | null = null
|
private checkInterval: NodeJS.Timeout | null = null
|
||||||
private isMonitoring: boolean = false
|
private isMonitoring: boolean = false
|
||||||
|
|
||||||
@@ -28,10 +28,10 @@ class DriftHealthMonitor {
|
|||||||
console.log('🏥 Drift health monitor started')
|
console.log('🏥 Drift health monitor started')
|
||||||
console.log(` Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`)
|
console.log(` Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`)
|
||||||
|
|
||||||
// Check error counts every 10 seconds
|
// Check error counts every 3 seconds (was 10s - faster response to memory leak)
|
||||||
this.checkInterval = setInterval(() => {
|
this.checkInterval = setInterval(() => {
|
||||||
this.checkErrorThreshold()
|
this.checkErrorThreshold()
|
||||||
}, 10000)
|
}, 3000)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user