feat: Add High Availability setup roadmap and scripts

Created comprehensive HA roadmap with 6 phases: - Phase 1: Warm standby (CURRENT - manual failover) - Phase 2: Database replication - Phase 3: Health monitoring - Phase 4: Reverse proxy + floating IP - Phase 5: Automated failover - Phase 6: Geographic redundancy Includes: - Decision gates based on capital and stability - Cost-benefit analysis - Scripts for healthcheck, failover, DB sync - Recommendation to defer full HA until capital > $5k Secondary server ready at 72.62.39.24 for emergency manual failover. Related: User concern about system uptime, but full HA complexity not justified at current scale (~$600 capital). Revisit in Q1 2026.
2025-11-19 20:52:12 +01:00
parent d28da02089
commit 880aae9a77
7 changed files with 936 additions and 0 deletions
--- a/ha-setup/failover-controller.sh
+++ b/ha-setup/failover-controller.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+#
+# HA Failover Controller
+# Monitors primary server and activates secondary on failure
+#
+# IMPORTANT: Run this ONLY on SECONDARY server
+# Primary should always be active unless failed
+#
+
+# Abort on any unhandled command failure (-e) and on use of unset variables (-u).
+# Functions invoked as bare commands therefore terminate the script when they
+# return non-zero.
+set -eu
+
+# SSH target used by check_primary_health to reach the primary.
+PRIMARY_HOST="root@192.168.1.100"  # Update with primary IP
+# NOTE(review): not referenced anywhere else in this script as shown.
+SECONDARY_HOST="root@72.62.39.24"
+CHECK_INTERVAL=15  # seconds between checks
+MAX_FAILURES=3     # failures before failover
+
+# Directory containing this script.
+# NOTE(review): not referenced anywhere else in this script as shown.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Project checkout path; used locally for docker compose and .env, and
+# interpolated into the remote healthcheck command run on the primary.
+PROJECT_DIR="/home/icke/traderv4"
+# Running count of consecutive failed primary health checks (mutated by main_loop).
+FAILURE_COUNT=0
+
+# Print a timestamped message to stdout and append the same line to
+# /var/log/trading-bot-ha.log.
+# Arguments: the message words (joined via "$*").
+log() {
+  printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" \
+    | tee -a /var/log/trading-bot-ha.log
+}
+
+# Send a best-effort Telegram alert about an HA event.
+#
+# Globals:   PROJECT_DIR (read) - location of the .env file with credentials
+# Arguments: $1 - message text (prefixed with "🚨 HA FAILOVER: ")
+# Returns:   always 0; a failed notification is logged, never fatal, so that
+#            `set -e` cannot kill the controller in the middle of a failover.
+telegram_notify() {
+  local message="$1"
+  local env_file="${PROJECT_DIR}/.env"
+
+  # No .env means no credentials: silently skip, as before.
+  [ -f "${env_file}" ] || return 0
+
+  # Run in a subshell so variables defined in .env cannot overwrite the
+  # controller's own globals, and relax -e/-u so a sloppy .env cannot abort us.
+  (
+    set +eu
+    # shellcheck source=/dev/null
+    source "${env_file}"
+
+    if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
+      exit 1
+    fi
+
+    # -f: treat HTTP errors as failures; --max-time: never stall the monitor
+    # loop on a dead network; --data-urlencode: message text is sent safely
+    # even if it contains '&', '+' or '%'.
+    curl -sf --max-time 10 -X POST \
+      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
+      --data-urlencode "text=🚨 HA FAILOVER: ${message}" \
+      -d parse_mode="HTML" > /dev/null
+  ) || log "⚠️  Telegram notification could not be sent: ${message}"
+
+  return 0
+}
+
+# Probe the primary by running its healthcheck script over SSH.
+#
+# Globals: PRIMARY_HOST, PROJECT_DIR (read)
+# Returns: exit status of ssh (i.e. of the remote command, or ssh's own
+#          error status if the connection fails). All output is discarded.
+# NOTE(review): ConnectTimeout only bounds connection setup; a remote
+# healthcheck that hangs after connecting is not bounded here.
+check_primary_health() {
+  local remote_cmd="cd ${PROJECT_DIR} && bash ha-setup/healthcheck.sh"
+
+  # BatchMode=yes: never prompt for a password; fail instead.
+  ssh -o ConnectTimeout=5 -o BatchMode=yes "${PRIMARY_HOST}" "${remote_cmd}" \
+    > /dev/null 2>&1
+}
+
+# Report whether a running container matching "trading-bot-v4" exists locally.
+# Returns: 0 if `docker ps` lists such a container, non-zero otherwise
+#          (the function's status is that of the final grep).
+is_secondary_active() {
+  docker ps --filter "status=running" --filter "name=trading-bot-v4" \
+    | grep -q "trading-bot-v4"
+}
+
+# Bring up the trading bot on this (secondary) host and verify it is running.
+#
+# Globals: PROJECT_DIR, MAX_FAILURES (read); changes the working directory
+#          to PROJECT_DIR.
+# Returns: 0 if the container is running after startup, 1 otherwise.
+#
+# Every step is checked explicitly: under `set -e` a failing `cd` or
+# `docker compose up` would otherwise terminate the controller before the
+# failure could be logged or alerted on.
+start_secondary() {
+  log "🚀 Starting secondary (failover activation)..."
+
+  if cd "${PROJECT_DIR}" && docker compose up -d trading-bot; then
+    # Give the container a moment to either settle or crash before checking.
+    sleep 10
+
+    if is_secondary_active; then
+      log "✅ Secondary activated successfully"
+      # Notifications are best-effort and must never abort the failover.
+      telegram_notify "Secondary server activated (primary failed ${MAX_FAILURES} health checks)" || true
+      return 0
+    fi
+  fi
+
+  log "❌ Failed to start secondary"
+  telegram_notify "⚠️ CRITICAL: Secondary failed to start after primary failure!" || true
+  return 1
+}
+
+# Stop the trading bot on this (secondary) host once the primary is healthy.
+#
+# Globals: PROJECT_DIR (read); changes the working directory to PROJECT_DIR.
+# Returns: 0 if no matching container is running afterwards, 1 otherwise.
+#
+# A failing `cd` or `docker compose stop` is logged instead of being left to
+# `set -e`, which would terminate the controller; the final verdict is based
+# on the actual container state.
+stop_secondary() {
+  log "🛑 Stopping secondary (primary recovered)..."
+
+  if ! { cd "${PROJECT_DIR}" && docker compose stop trading-bot; }; then
+    log "⚠️  docker compose stop reported an error - verifying container state"
+  fi
+
+  if ! is_secondary_active; then
+    log "✅ Secondary stopped successfully"
+    # Notifications are best-effort and must never abort the controller.
+    telegram_notify "Primary server recovered, secondary deactivated" || true
+    return 0
+  fi
+
+  log "❌ Failed to stop secondary"
+  # Primary and secondary may now both be running; make that visible.
+  telegram_notify "⚠️ CRITICAL: Failed to stop secondary after primary recovery - both servers may be active!" || true
+  return 1
+}
+
+# Monitor the primary forever and switch the secondary on/off accordingly.
+#
+# Globals: PRIMARY_HOST, CHECK_INTERVAL, MAX_FAILURES (read),
+#          FAILURE_COUNT (read/written)
+# Returns: never returns under normal operation.
+#
+# Behaviour per iteration:
+#   - primary healthy   -> reset the failure counter; stop the secondary if
+#                          it is running.
+#   - primary unhealthy -> increment the counter; once it reaches
+#                          MAX_FAILURES, start the secondary unless it is
+#                          already running.
+#
+# start_secondary / stop_secondary / telegram_notify are invoked with an
+# explicit `||` handler: as bare commands, a non-zero return would trip
+# `set -e` and terminate the controller, ending all monitoring and retries.
+main_loop() {
+  log "🎯 HA Failover Controller started (Secondary mode)"
+  log "Monitoring primary: ${PRIMARY_HOST}"
+  log "Check interval: ${CHECK_INTERVAL}s, Max failures: ${MAX_FAILURES}"
+
+  while true; do
+    if check_primary_health; then
+      # Primary is healthy
+      if (( FAILURE_COUNT > 0 )); then
+        log "✅ Primary recovered (was at ${FAILURE_COUNT} failures)"
+        FAILURE_COUNT=0
+      fi
+
+      # If secondary is running, stop it (primary should be active)
+      if is_secondary_active; then
+        log "⚠️  Secondary is active but primary is healthy - stopping secondary"
+        stop_secondary \
+          || log "⚠️  Could not stop secondary - will retry on next check"
+      fi
+
+    else
+      # Primary is unhealthy
+      FAILURE_COUNT=$((FAILURE_COUNT + 1))
+      log "❌ Primary health check failed (${FAILURE_COUNT}/${MAX_FAILURES})"
+
+      if (( FAILURE_COUNT >= MAX_FAILURES )); then
+        if ! is_secondary_active; then
+          log "🚨 PRIMARY FAILED - Activating secondary..."
+          telegram_notify "Primary server failed ${MAX_FAILURES} consecutive health checks. Activating secondary..." || true
+          start_secondary \
+            || log "⚠️  Could not start secondary - will retry on next check"
+        else
+          log "ℹ️  Secondary already active (primary still failing)"
+        fi
+      fi
+    fi
+
+    sleep "${CHECK_INTERVAL}"
+  done
+}
+
+# Entry point: refuse to run unprivileged (the script header states docker
+# and SSH access are needed), then hand over to the monitoring loop.
+if (( EUID != 0 )); then
+  log "❌ Must run as root (needs docker and SSH access)"
+  exit 1
+fi
+
+main_loop