feat: Add High Availability setup roadmap and scripts
Created comprehensive HA roadmap with 6 phases:
- Phase 1: Warm standby (CURRENT - manual failover)
- Phase 2: Database replication
- Phase 3: Health monitoring
- Phase 4: Reverse proxy + floating IP
- Phase 5: Automated failover
- Phase 6: Geographic redundancy

Includes:
- Decision gates based on capital and stability
- Cost-benefit analysis
- Scripts for healthcheck, failover, DB sync
- Recommendation to defer full HA until capital > $5k

Secondary server ready at 72.62.39.24 for emergency manual failover.

Related: User concern about system uptime, but full HA complexity not justified at current scale (~$600 capital). Revisit in Q1 2026.
This commit is contained in:
126
ha-setup/failover-controller.sh
Normal file
126
ha-setup/failover-controller.sh
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# HA Failover Controller
|
||||
# Monitors primary server and activates secondary on failure
|
||||
#
|
||||
# IMPORTANT: Run this ONLY on SECONDARY server
|
||||
# Primary should always be active unless failed
|
||||
#
|
||||
|
||||
set -eu

# Runtime configuration.
# Every setting keeps its previous value as the default but can now be
# overridden from the environment, so the script does not have to be edited
# per deployment, e.g.:
#   PRIMARY_HOST=root@10.0.0.5 CHECK_INTERVAL=30 ./failover-controller.sh
PRIMARY_HOST="${PRIMARY_HOST:-root@192.168.1.100}"   # Update with primary IP
SECONDARY_HOST="${SECONDARY_HOST:-root@72.62.39.24}"
CHECK_INTERVAL="${CHECK_INTERVAL:-15}"               # seconds between checks
MAX_FAILURES="${MAX_FAILURES:-3}"                    # failures before failover

# Both values are used in integer comparisons / as a sleep duration later on,
# so reject anything that is not a plain non-negative integer up front
# instead of failing in the middle of the monitoring loop.
for _ha_setting in "${CHECK_INTERVAL}" "${MAX_FAILURES}"; do
  case "${_ha_setting}" in
    '' | *[!0-9]*)
      printf 'CHECK_INTERVAL and MAX_FAILURES must be non-negative integers (got "%s")\n' \
        "${_ha_setting}" >&2
      exit 1
      ;;
  esac
done
unset _ha_setting

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="${PROJECT_DIR:-/home/icke/traderv4}"

# Consecutive failed health checks; mutable state, deliberately not overridable.
FAILURE_COUNT=0
|
||||
|
||||
# Print a timestamped message to stdout and append it to the HA log file.
#
# Globals:   HA_LOG_FILE (read, optional) - log path; defaults to
#            /var/log/trading-bot-ha.log
# Arguments: $* - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout
# Returns:   always 0
log() {
  local log_file="${HA_LOG_FILE:-/var/log/trading-bot-ha.log}"

  # Logging is best-effort: the script runs under `set -e`, so a failing tee
  # (log file unwritable, disk full) would otherwise terminate the whole
  # controller. tee still reports its own error on stderr.
  printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "${log_file}" || true
}
|
||||
|
||||
# Send a failover notification through the Telegram Bot API (best-effort).
#
# Globals:   PROJECT_DIR (read) - directory containing the .env file that
#            defines TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID
# Arguments: $1 - message text
# Returns:   always 0. A notification problem must never abort the controller:
#            the main loop notifies *before* it activates the secondary, so a
#            non-zero status here would prevent the failover under `set -e`.
telegram_notify() {
  local message="$1"
  local env_file="${PROJECT_DIR}/.env"

  # No credentials file: nothing to do.
  [ -f "${env_file}" ] || return 0

  # Run in a subshell so the variables defined in .env do not leak into (or
  # overwrite) the controller's own globals, and so strict mode can be relaxed
  # while sourcing a file this script does not control.
  if ! (
    set +eu
    # shellcheck disable=SC1090
    source "${env_file}"

    if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
      exit 1
    fi

    # --data-urlencode keeps '&', '+', '%' etc. in the message from corrupting
    # the form body; --max-time keeps a dead network from stalling failover.
    curl -s --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=🚨 HA FAILOVER: ${message}" \
      -d parse_mode="HTML" > /dev/null
  ); then
    log "⚠️ Telegram notification could not be sent (continuing)" || true
  fi

  return 0
}
|
||||
|
||||
# Run the health check on the primary over SSH.
#
# Globals:   PRIMARY_HOST (read) - ssh destination of the primary
#            PROJECT_DIR  (read) - expanded locally and used as the remote
#            working directory, so the primary must use the same path
# Returns:   exit status of ssh: the remote command's status, or ssh's own
#            non-zero status when the connection fails. All output is discarded.
check_primary_health() {
  # ConnectTimeout only bounds connection setup. The ServerAlive* options make
  # ssh give up on a session whose peer stops answering (primary freezes or the
  # network drops mid-check); without them the probe could block forever and
  # the controller would never count the failure.
  ssh -o ConnectTimeout=5 -o BatchMode=yes \
    -o ServerAliveInterval=5 -o ServerAliveCountMax=3 \
    "${PRIMARY_HOST}" \
    "cd ${PROJECT_DIR} && bash ha-setup/healthcheck.sh" &>/dev/null
}
|
||||
|
||||
# Report whether the trading bot container is currently running on this host.
#
# Returns: status of the final grep - 0 when `docker ps` lists a running
#          container matching the name, non-zero otherwise.
is_secondary_active() {
  local container="trading-bot-v4"

  docker ps --filter "name=${container}" --filter "status=running" \
    | grep -q "${container}"
}
|
||||
|
||||
# Activate the secondary by starting the trading-bot compose service.
#
# Globals:   PROJECT_DIR  (read) - compose project directory
#            MAX_FAILURES (read) - only used in the notification text
# Returns:   0 when the container is running afterwards, 1 otherwise
#            (a CRITICAL Telegram notification is sent in that case).
start_secondary() {
  log "🚀 Starting secondary (failover activation)..."

  # The subshell keeps the controller's working directory unchanged, and `&&`
  # guarantees compose never runs from the wrong directory when cd fails.
  # Testing the result explicitly (instead of relying on `set -e`) means every
  # failure reaches the CRITICAL notification below rather than silently
  # terminating the controller.
  if (cd "${PROJECT_DIR}" && docker compose up -d trading-bot); then
    # Give the container a moment to come up before verifying it.
    sleep 10

    if is_secondary_active; then
      log "✅ Secondary activated successfully"
      telegram_notify "Secondary server activated (primary failed ${MAX_FAILURES} health checks)"
      return 0
    fi
  fi

  log "❌ Failed to start secondary"
  telegram_notify "⚠️ CRITICAL: Secondary failed to start after primary failure!"
  return 1
}
|
||||
|
||||
# Deactivate the secondary by stopping the trading-bot compose service.
#
# Globals:   PROJECT_DIR (read) - compose project directory
# Returns:   0 when the container is no longer running, 1 when it still is.
stop_secondary() {
  log "🛑 Stopping secondary (primary recovered)..."

  # The subshell keeps the controller's working directory unchanged, and `&&`
  # guarantees compose never runs from the wrong directory when cd fails.
  # A failure here is logged but not fatal: the container state checked below
  # is what decides the result.
  if ! (cd "${PROJECT_DIR}" && docker compose stop trading-bot); then
    log "⚠️ docker compose stop did not complete cleanly"
  fi

  if ! is_secondary_active; then
    log "✅ Secondary stopped successfully"
    telegram_notify "Primary server recovered, secondary deactivated"
    return 0
  fi

  log "❌ Failed to stop secondary"
  return 1
}
|
||||
|
||||
# Monitor the primary forever, failing over to and back from this secondary.
#
# Each cycle runs one health check against the primary:
#   - healthy:   reset the failure counter and, if the secondary is running,
#                stop it (only the primary should be active);
#   - unhealthy: increment the counter and, once it reaches MAX_FAILURES,
#                start the secondary unless it is already running.
#
# Globals:   PRIMARY_HOST, CHECK_INTERVAL, MAX_FAILURES (read)
#            FAILURE_COUNT (read/written)
# Returns:   never returns under normal operation.
main_loop() {
  log "🎯 HA Failover Controller started (Secondary mode)"
  log "Monitoring primary: ${PRIMARY_HOST}"
  log "Check interval: ${CHECK_INTERVAL}s, Max failures: ${MAX_FAILURES}"

  while true; do
    if check_primary_health; then
      # Primary is healthy
      if [ "${FAILURE_COUNT}" -gt 0 ]; then
        log "✅ Primary recovered (was at ${FAILURE_COUNT} failures)"
        FAILURE_COUNT=0
      fi

      # If secondary is running, stop it (primary should be active)
      if is_secondary_active; then
        log "⚠️ Secondary is active but primary is healthy - stopping secondary"
        # The script runs under `set -e`: an unguarded non-zero return would
        # terminate the controller. Keep running so the stop is retried on the
        # next cycle while the secondary is still active.
        stop_secondary \
          || log "⚠️ Secondary could not be stopped - will retry on next check"
      fi
    else
      # Primary is unhealthy
      FAILURE_COUNT=$((FAILURE_COUNT + 1))
      log "❌ Primary health check failed (${FAILURE_COUNT}/${MAX_FAILURES})"

      if [ "${FAILURE_COUNT}" -ge "${MAX_FAILURES}" ]; then
        if ! is_secondary_active; then
          log "🚨 PRIMARY FAILED - Activating secondary..."
          # A notification failure must never block the actual failover.
          telegram_notify "Primary server failed ${MAX_FAILURES} consecutive health checks. Activating secondary..." \
            || true
          # Same reasoning as for stop_secondary: stay alive so activation is
          # retried on the next cycle instead of leaving nothing running.
          start_secondary \
            || log "⚠️ Secondary activation failed - will retry on next check"
        else
          log "ℹ️ Secondary already active (primary still failing)"
        fi
      fi
    fi

    sleep "${CHECK_INTERVAL}"
  done
}
|
||||
|
||||
# Entry point: refuse to run unless the effective user is root (the
# controller needs docker and SSH access), then hand over to the monitoring
# loop.
if [[ "${EUID}" -ne 0 ]]; then
  log "❌ Must run as root (needs docker and SSH access)"
  exit 1
fi

main_loop
|
||||
Reference in New Issue
Block a user