trading_bot_v4/ha-setup/failover-controller.sh

#!/bin/bash
#
# HA Failover Controller
# Monitors primary server and activates secondary on failure
#
# IMPORTANT: Run this ONLY on SECONDARY server
# Primary should always be active unless failed
#

# Strict mode: abort on unhandled command failure and on use of unset variables.
# pipefail is not enabled, so a pipeline reports the status of its last command.
set -o errexit -o nounset

# --- Hosts (SSH targets) ---
PRIMARY_HOST="root@192.168.1.100"  # Update with primary IP
SECONDARY_HOST="root@72.62.39.24"

# --- Monitoring cadence ---
CHECK_INTERVAL=15  # seconds to sleep between two health checks
MAX_FAILURES=3     # consecutive failed checks required before failover

# --- Paths ---
PROJECT_DIR="/home/icke/traderv4"
# Absolute directory that contains this script.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"

# --- Mutable state: consecutive failed health checks seen so far ---
FAILURE_COUNT=0

# Print a timestamped message to stdout and append it to the HA log file.
#
# Arguments: $* - message text (all arguments are joined with spaces)
# Globals:   HA_LOG_FILE (read, optional) - log file path;
#            defaults to /var/log/trading-bot-ha.log
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in the log file
# Returns:   always 0
log() {
  local log_file="${HA_LOG_FILE:-/var/log/trading-bot-ha.log}"
  local line
  line="[$(date +'%Y-%m-%d %H:%M:%S')] $*"

  # Logging is best-effort: tee still writes to stdout when the file cannot be
  # opened, but exits non-zero. Under `set -e` that status would terminate the
  # whole controller, so it is deliberately discarded here (tee's own error
  # message remains visible on stderr).
  printf '%s\n' "${line}" | tee -a "${log_file}" || true
}

# Send a best-effort Telegram alert using credentials from ${PROJECT_DIR}/.env.
#
# Arguments: $1 - message text (sent after the "🚨 HA FAILOVER: " prefix)
# Globals:   PROJECT_DIR (read)
# Returns:   always 0. A missing .env silently skips the notification; any
#            other failure (missing credentials, curl/HTTP error, timeout) is
#            logged. Alerting must never be able to stop the failover
#            controller, which runs under `set -e`.
telegram_notify() {
  local message="${1:-}"
  local env_file="${PROJECT_DIR}/.env"

  if [ ! -f "${env_file}" ]; then
    return 0
  fi

  # The subshell keeps everything defined in .env out of the controller's own
  # variables and contains any error raised while sourcing it.
  (
    # .env files may reference variables that are not set; do not abort on that.
    set +u
    # shellcheck source=/dev/null
    source "${env_file}"

    if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
      exit 1
    fi

    # --max-time: the network may be degraded during a failover; never hang.
    # --fail: treat HTTP errors (e.g. bad token) as failures.
    # --data-urlencode: keep '&', '+', '%' etc. in the message intact.
    curl --silent --fail --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=🚨 HA FAILOVER: ${message}" \
      --data-urlencode "parse_mode=HTML" > /dev/null
  ) || log "⚠️  Telegram notification could not be sent" || true

  return 0
}

# Run ha-setup/healthcheck.sh on the primary over SSH.
#
# Globals: PRIMARY_HOST (read), PROJECT_DIR (read)
# Outputs: none (all ssh output is discarded)
# Returns: exit status of ssh - 0 when the remote health check succeeded,
#          non-zero when the connection or the remote command failed.
check_primary_health() {
  # Quote the directory for the remote shell so unusual paths stay one word.
  local remote_dir
  printf -v remote_dir '%q' "${PROJECT_DIR}"

  # ConnectTimeout only bounds connection setup. ServerAliveInterval and
  # ServerAliveCountMax make ssh give up (after ~15s of silence) when the
  # primary stops responding mid-session; without them a frozen primary would
  # block this call and the failure counter would never advance.
  ssh -o ConnectTimeout=5 -o BatchMode=yes \
    -o ServerAliveInterval=5 -o ServerAliveCountMax=3 \
    "${PRIMARY_HOST}" \
    "cd ${remote_dir} && bash ha-setup/healthcheck.sh" &>/dev/null
}

# Report whether the local trading bot container is currently running.
#
# Returns: 0 when `docker ps` (filtered by name and running status) lists the
#          container name, non-zero otherwise. The function's status is that
#          of the final grep in the pipeline.
is_secondary_active() {
  local container="trading-bot-v4"

  docker ps --filter "name=${container}" --filter "status=running" \
    | grep -q "${container}"
}

# Bring up the local trading-bot service and verify that it is running.
#
# Globals: PROJECT_DIR (read), MAX_FAILURES (read, used in the alert text)
# Returns: 0 when the container is running after start-up, 1 otherwise.
#          Every outcome is logged and reported via Telegram.
start_secondary() {
  log "🚀 Starting secondary (failover activation)..."

  # Run compose in a subshell so the controller's working directory is not
  # changed, and test the result explicitly: under `set -e` an unchecked
  # failure of cd/docker would end the controller without any log or alert.
  if (cd "${PROJECT_DIR}" && docker compose up -d trading-bot); then
    # Give the container a moment to come up (or crash) before verifying.
    sleep 10
  else
    log "❌ 'docker compose up' failed in ${PROJECT_DIR}"
  fi

  # The container state is the source of truth for success.
  if is_secondary_active; then
    log "✅ Secondary activated successfully"
    # Notifications are best-effort and must not alter the result.
    telegram_notify "Secondary server activated (primary failed ${MAX_FAILURES} health checks)" || true
    return 0
  fi

  log "❌ Failed to start secondary"
  telegram_notify "⚠️ CRITICAL: Secondary failed to start after primary failure!" || true
  return 1
}

# Stop the local trading-bot service and verify that it is no longer running.
#
# Globals: PROJECT_DIR (read)
# Returns: 0 when the container is not running afterwards, 1 otherwise.
stop_secondary() {
  log "🛑 Stopping secondary (primary recovered)..."

  # Subshell: leave the controller's working directory untouched. The result
  # is checked explicitly because, under `set -e`, an unchecked cd/docker
  # failure would terminate the controller instead of being reported.
  if ! (cd "${PROJECT_DIR}" && docker compose stop trading-bot); then
    log "❌ 'docker compose stop' failed in ${PROJECT_DIR}"
  fi

  # The container state is the source of truth for success.
  if ! is_secondary_active; then
    log "✅ Secondary stopped successfully"
    # Notifications are best-effort and must not alter the result.
    telegram_notify "Primary server recovered, secondary deactivated" || true
    return 0
  fi

  log "❌ Failed to stop secondary"
  return 1
}

# Monitor the primary forever and switch the local (secondary) bot on or off.
#
# Each cycle:
#   - primary healthy:   reset the failure counter and stop the secondary if
#                        it is running;
#   - primary unhealthy: increment the counter and, once it reaches
#                        MAX_FAILURES, start the secondary unless it is
#                        already running.
# A failed start/stop is retried on the next cycle because the same
# conditions are evaluated again.
#
# Globals: PRIMARY_HOST, CHECK_INTERVAL, MAX_FAILURES (read),
#          FAILURE_COUNT (read/written)
# Returns: never returns under normal operation.
main_loop() {
  log "🎯 HA Failover Controller started (Secondary mode)"
  log "Monitoring primary: ${PRIMARY_HOST}"
  log "Check interval: ${CHECK_INTERVAL}s, Max failures: ${MAX_FAILURES}"

  while true; do
    if check_primary_health; then
      # Primary is healthy
      if (( FAILURE_COUNT > 0 )); then
        log "✅ Primary recovered (was at ${FAILURE_COUNT} failures)"
        FAILURE_COUNT=0
      fi

      # If secondary is running, stop it (primary should be active)
      if is_secondary_active; then
        log "⚠️  Secondary is active but primary is healthy - stopping secondary"
        # A non-zero status must not reach `set -e`: that would kill the
        # monitor. Keep looping so the stop is attempted again.
        stop_secondary \
          || log "⚠️  Stopping secondary failed - retrying in ${CHECK_INTERVAL}s"
      fi

    else
      # Primary is unhealthy
      FAILURE_COUNT=$((FAILURE_COUNT + 1))
      log "❌ Primary health check failed (${FAILURE_COUNT}/${MAX_FAILURES})"

      if (( FAILURE_COUNT >= MAX_FAILURES )); then
        if ! is_secondary_active; then
          log "🚨 PRIMARY FAILED - Activating secondary..."
          # Alerting is best-effort; it must never prevent the activation.
          telegram_notify "Primary server failed ${MAX_FAILURES} consecutive health checks. Activating secondary..." || true
          # Same reasoning as for stop_secondary: survive a failed start so
          # activation is retried on the next cycle.
          start_secondary \
            || log "⚠️  Secondary activation failed - retrying in ${CHECK_INTERVAL}s"
        else
          log "ℹ️  Secondary already active (primary still failing)"
        fi
      fi
    fi

    sleep "${CHECK_INTERVAL}"
  done
}

# Entry point: refuse to run unless the effective user is root (docker and
# SSH access are needed), then hand control to the monitoring loop.
[[ "${EUID}" -eq 0 ]] || {
  log "❌ Must run as root (needs docker and SSH access)"
  exit 1
}

main_loop