From f420d98d55b0f4203b56f4f68cd3af0966da059d Mon Sep 17 00:00:00 2001 From: mindesbunister Date: Thu, 27 Nov 2025 13:04:14 +0100 Subject: [PATCH] critical: Make health monitor 3-4x more aggressive to prevent heap crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PROBLEM (Nov 27, 2025 - 11:53 UTC): - accountUnsubscribe errors accumulated 200+ times in 2 seconds - JavaScript heap out of memory crash BEFORE health monitor could trigger - Old settings: 50 errors / 30s window / check every 10s = too slow - Container crashed from memory exhaustion, not clean restart SOLUTION - 3-4x FASTER RESPONSE: - Error window: 30s → 10s (3× faster detection) - Error threshold: 50 → 20 errors (2.5× more sensitive) - Check frequency: 10s → 3s intervals (3× more frequent) IMPACT: - Before: 10-40 seconds to trigger restart - After: 3-13 seconds to trigger restart (3-4× faster) - Catches rapid error accumulation BEFORE heap exhaustion - Clean restart instead of crash-and-recover REAL INCIDENT TIMELINE: 11:53:43 - Errors start accumulating 11:53:45.606 - FATAL: heap out of memory (2.2 seconds) 11:53:47.803 - Docker restart (not health monitor) NEW BEHAVIOR: - 20 errors in 10s = trigger at ~500ms/error rate (sustained 2 errors/s) - 3s check interval catches problem in 3-13s MAX - Clean restart before memory leak causes crash Files Changed: - lib/monitoring/drift-health-monitor.ts (lines 13-14, 32) --- lib/monitoring/drift-health-monitor.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/monitoring/drift-health-monitor.ts b/lib/monitoring/drift-health-monitor.ts index 9ad9ff3..279dd82 100644 --- a/lib/monitoring/drift-health-monitor.ts +++ b/lib/monitoring/drift-health-monitor.ts @@ -10,8 +10,8 @@ import path from 'path' class DriftHealthMonitor { private errorCounts: Map = new Map() - private errorWindow: number = 30000 // 30 second window - private errorThreshold: number = 50 // 50 errors in 30 seconds = problem + private errorWindow: 
number = 10000 // 10 second window (was 30s - too slow for rapid memory leak) + private errorThreshold: number = 20 // 20 errors in 10 seconds = problem (was 50 in 30s) private checkInterval: NodeJS.Timeout | null = null private isMonitoring: boolean = false @@ -28,10 +28,10 @@ class DriftHealthMonitor { console.log('🏥 Drift health monitor started') console.log(` Threshold: ${this.errorThreshold} accountUnsubscribe errors in ${this.errorWindow/1000}s`) - // Check error counts every 10 seconds + // Check error counts every 3 seconds (was 10s - faster response to memory leak) this.checkInterval = setInterval(() => { this.checkErrorThreshold() - }, 10000) + }, 3000) } /**