Files
trading_bot_v4/ha-setup/failover-controller.sh
mindesbunister 880aae9a77 feat: Add High Availability setup roadmap and scripts
Created comprehensive HA roadmap with 6 phases:
- Phase 1: Warm standby (CURRENT - manual failover)
- Phase 2: Database replication
- Phase 3: Health monitoring
- Phase 4: Reverse proxy + floating IP
- Phase 5: Automated failover
- Phase 6: Geographic redundancy

Includes:
- Decision gates based on capital and stability
- Cost-benefit analysis
- Scripts for healthcheck, failover, DB sync
- Recommendation to defer full HA until capital > $5k

Secondary server ready at 72.62.39.24 for emergency manual failover.

Related: User concern about system uptime, but full HA complexity
not justified at current scale (~$600 capital). Revisit in Q1 2026.
2025-11-19 20:52:12 +01:00

127 lines
3.6 KiB
Bash
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
#
# HA Failover Controller
# Monitors primary server and activates secondary on failure
#
# IMPORTANT: Run this ONLY on SECONDARY server
# Primary should always be active unless failed
#
# Strict mode: abort on any unhandled command failure (-e) and on expansion
# of an unset variable (-u). pipefail is not enabled.
set -eu
# SSH target (user@host) that the health check connects to.
PRIMARY_HOST="root@192.168.1.100" # Update with primary IP
# NOTE(review): not referenced anywhere else in this script as shown.
SECONDARY_HOST="root@72.62.39.24"
# Pause between two consecutive health checks of the primary.
CHECK_INTERVAL=15 # seconds between checks
# Number of consecutive failed checks that triggers activation of the secondary.
MAX_FAILURES=3 # failures before failover
# Absolute directory containing this script.
# NOTE(review): not referenced anywhere else in this script as shown.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project checkout: used locally for `docker compose` and the .env file, and
# as the directory the remote health check changes into on the primary.
PROJECT_DIR="/home/icke/traderv4"
# Running count of consecutive failed health checks; reset when the primary
# passes a check again.
FAILURE_COUNT=0
#######################################
# Print a timestamped message to stdout and append it to the HA log file.
# Arguments: $* - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in
#            /var/log/trading-bot-ha.log
# Returns:   always 0
#######################################
log() {
  # The function's status would otherwise be tee's status; under `set -e` a
  # log-file write error (full disk, permissions) would then terminate the
  # whole controller. Logging is best-effort, so the failure is swallowed.
  printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a /var/log/trading-bot-ha.log || true
}
#######################################
# Send a best-effort Telegram alert prefixed with "🚨 HA FAILOVER: ".
# Globals:   PROJECT_DIR (read) - location of the .env file that provides
#            TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
# Arguments: $1 - message text
# Returns:   always 0; a notification problem must never stop the controller
#######################################
telegram_notify() {
  local message="${1-}"
  local env_file="${PROJECT_DIR}/.env"

  # Without a .env file there are no credentials: skip silently.
  [ -f "${env_file}" ] || return 0

  # Run in a subshell so that variables defined in .env cannot overwrite the
  # controller's own settings (e.g. PROJECT_DIR), and so that an unset
  # variable under `set -u` only ends the subshell.
  (
    # shellcheck disable=SC1090
    source "${env_file}"
    if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
      exit 1
    fi
    # --data-urlencode keeps '&', '+', '%' etc. in the message intact;
    # --max-time stops a stuck API call from blocking the monitoring loop;
    # --fail turns HTTP error responses into a non-zero exit status.
    curl -s --fail --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode chat_id="${TELEGRAM_CHAT_ID}" \
      --data-urlencode text="🚨 HA FAILOVER: ${message}" \
      -d parse_mode="HTML" > /dev/null
  ) || log "⚠️ Telegram notification could not be sent" || true

  return 0
}
#######################################
# Run the health check script on the primary over SSH.
# Globals:   PRIMARY_HOST (read), PROJECT_DIR (read)
# Outputs:   none (all ssh output is discarded)
# Returns:   ssh's exit status: 0 when the remote health check succeeded,
#            non-zero when it failed or the connection could not be
#            established or kept alive
#######################################
check_primary_health() {
  local remote_dir
  # The command string is re-parsed by the remote shell, so escape the path.
  printf -v remote_dir '%q' "${PROJECT_DIR}"

  # ConnectTimeout only bounds connection setup. The ServerAlive* options
  # make ssh give up after roughly 10s of silence from an established
  # session, so a primary that freezes mid-check is counted as a failure
  # instead of stalling the monitoring loop.
  # NOTE(review): a healthcheck.sh that hangs while sshd stays responsive is
  # still unbounded.
  ssh -o ConnectTimeout=5 \
    -o BatchMode=yes \
    -o ServerAliveInterval=5 \
    -o ServerAliveCountMax=2 \
    "${PRIMARY_HOST}" \
    "cd ${remote_dir} && bash ha-setup/healthcheck.sh" &>/dev/null
}
#######################################
# Report whether a running container matching "trading-bot-v4" is listed by
# docker on this host.
# Returns: 0 when such a container is listed, non-zero otherwise
#######################################
is_secondary_active() {
  # The pipeline's status (grep's) is the function's return value.
  docker ps \
    --filter "name=trading-bot-v4" \
    --filter "status=running" \
    | grep -q "trading-bot-v4"
}
#######################################
# Bring up the trading-bot service on this (secondary) host and verify it.
# Globals:   PROJECT_DIR (read), MAX_FAILURES (read, for the message text)
# Returns:   0 when the container is running after start-up,
#            1 otherwise (a CRITICAL notification is sent in that case)
#######################################
start_secondary() {
  log "🚀 Starting secondary (failover activation)..."

  # Run compose in a subshell so the controller's working directory is left
  # untouched, and inside `if` so that a cd/compose failure under `set -e`
  # cannot kill the controller before the outcome is reported below.
  if ! ( cd "${PROJECT_DIR}" && docker compose up -d trading-bot ); then
    log "⚠️ docker compose up reported an error"
  fi

  # Give the container time to come up before verifying.
  sleep 10

  if is_secondary_active; then
    log "✅ Secondary activated successfully"
    telegram_notify "Secondary server activated (primary failed ${MAX_FAILURES} health checks)" || true
    return 0
  fi

  log "❌ Failed to start secondary"
  telegram_notify "⚠️ CRITICAL: Secondary failed to start after primary failure!" || true
  return 1
}
#######################################
# Stop the trading-bot service on this (secondary) host and verify it.
# Globals:   PROJECT_DIR (read)
# Returns:   0 when no matching container is running afterwards,
#            1 when the secondary is still active
#######################################
stop_secondary() {
  log "🛑 Stopping secondary (primary recovered)..."

  # Subshell keeps the controller's working directory unchanged; running it
  # inside `if` prevents a cd/compose failure from aborting the controller
  # under `set -e` before the result is checked and logged.
  if ! ( cd "${PROJECT_DIR}" && docker compose stop trading-bot ); then
    log "⚠️ docker compose stop reported an error"
  fi

  if is_secondary_active; then
    log "❌ Failed to stop secondary"
    return 1
  fi

  log "✅ Secondary stopped successfully"
  telegram_notify "Primary server recovered, secondary deactivated" || true
  return 0
}
#######################################
# Monitor the primary forever and switch the secondary on/off accordingly.
# Each iteration runs one health check, then sleeps CHECK_INTERVAL seconds.
#   - healthy primary: reset the failure counter and stop the secondary if
#     it is running
#   - unhealthy primary: increment the counter; once it reaches MAX_FAILURES
#     start the secondary unless it is already running
# Globals:   PRIMARY_HOST, CHECK_INTERVAL, MAX_FAILURES (read),
#            FAILURE_COUNT (read/written)
# Returns:   never returns
#######################################
main_loop() {
  log "🎯 HA Failover Controller started (Secondary mode)"
  log "Monitoring primary: ${PRIMARY_HOST}"
  log "Check interval: ${CHECK_INTERVAL}s, Max failures: ${MAX_FAILURES}"

  while true; do
    if check_primary_health; then
      # Primary is healthy
      if (( FAILURE_COUNT > 0 )); then
        log "✅ Primary recovered (was at ${FAILURE_COUNT} failures)"
        FAILURE_COUNT=0
      fi

      # If secondary is running, stop it (primary should be active)
      if is_secondary_active; then
        log "⚠️ Secondary is active but primary is healthy - stopping secondary"
        # A non-zero return here would end the controller under `set -e`;
        # keep monitoring so the stop is retried on the next iteration.
        stop_secondary || log "⚠️ Could not stop secondary; will retry on next check"
      fi
    else
      # Primary is unhealthy
      FAILURE_COUNT=$((FAILURE_COUNT + 1))
      log "❌ Primary health check failed (${FAILURE_COUNT}/${MAX_FAILURES})"

      if (( FAILURE_COUNT >= MAX_FAILURES )); then
        if ! is_secondary_active; then
          log "🚨 PRIMARY FAILED - Activating secondary..."
          # Notification is best-effort; never let it block the failover.
          telegram_notify "Primary server failed ${MAX_FAILURES} consecutive health checks. Activating secondary..." || true
          # Same reasoning as for stop_secondary: a failed activation must
          # not terminate the monitor, it is retried on the next iteration.
          start_secondary || log "⚠️ Could not activate secondary; will retry on next check"
        else
          log " Secondary already active (primary still failing)"
        fi
      fi
    fi

    sleep "${CHECK_INTERVAL}"
  done
}
# Refuse to start unless the effective user is root (needs docker access);
# otherwise hand control to the monitoring loop.
[[ "${EUID}" -eq 0 ]] || {
  log "❌ Must run as root (needs docker and SSH access)"
  exit 1
}

main_loop