feat: Add High Availability setup roadmap and scripts

Created comprehensive HA roadmap with 6 phases:
- Phase 1: Warm standby (CURRENT - manual failover)
- Phase 2: Database replication
- Phase 3: Health monitoring
- Phase 4: Reverse proxy + floating IP
- Phase 5: Automated failover
- Phase 6: Geographic redundancy

Includes:
- Decision gates based on capital and stability
- Cost-benefit analysis
- Scripts for healthcheck, failover, DB sync
- Recommendation to defer full HA until capital > $5k

Secondary server ready at 72.62.39.24 for emergency manual failover.

Related: The user raised a concern about system uptime, but the complexity of
full HA is not justified at the current scale (~$600 capital). Revisit in Q1 2026.
This commit is contained in:
mindesbunister
2025-11-19 20:52:12 +01:00
parent d28da02089
commit 880aae9a77
7 changed files with 936 additions and 0 deletions

View File

@@ -0,0 +1,126 @@
#!/bin/bash
#
# HA Failover Controller
# Monitors primary server and activates secondary on failure
#
# IMPORTANT: Run this ONLY on SECONDARY server
# Primary should always be active unless failed
#
# Strict mode: abort on unhandled command failures and on unset variables.
set -o errexit -o nounset

# --- Hosts (user@address, passed to ssh) -----------------------------------
PRIMARY_HOST="root@192.168.1.100"   # Update with primary IP
SECONDARY_HOST="root@72.62.39.24"

# --- Paths -----------------------------------------------------------------
# Absolute directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project checkout; used for the .env file, docker compose and the remote
# healthcheck path.
PROJECT_DIR="/home/icke/traderv4"

# --- Failover tuning -------------------------------------------------------
CHECK_INTERVAL=15   # seconds between checks
MAX_FAILURES=3      # failures before failover

# --- Runtime state ---------------------------------------------------------
# Consecutive failed health checks; incremented and reset by main_loop.
FAILURE_COUNT=0
# Print a timestamped message to stdout and append it to the HA log file.
#
# Globals:   HA_LOG_FILE (read, optional) - log file path; defaults to
#            /var/log/trading-bot-ha.log when unset or empty.
# Arguments: $* - message words, joined with single spaces.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in the log file.
# Returns:   always 0.
log() {
  local log_file="${HA_LOG_FILE:-/var/log/trading-bot-ha.log}"
  local stamp
  stamp="$(date +'%Y-%m-%d %H:%M:%S')" || stamp="unknown-time"

  # Logging is best-effort on purpose: the pipeline's status is tee's, and an
  # unwritable or full log file must not terminate the failover controller
  # under `set -e`. tee still copies the line to stdout (and reports the
  # problem on stderr) when it cannot open the file.
  printf '[%s] %s\n' "${stamp}" "$*" | tee -a "${log_file}" || true
}
# Send a failover notification through the Telegram Bot API (best-effort).
#
# Globals:   PROJECT_DIR (read) - directory holding the .env file that
#            defines TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID.
# Arguments: $1 - message text; sent prefixed with "🚨 HA FAILOVER: ".
# Returns:   always 0. A notification problem is logged but must never stop
#            the failover itself (the script runs under `set -e`).
telegram_notify() {
  local message="$1"
  local env_file="${PROJECT_DIR}/.env"

  # Without a .env file there are no credentials: skip silently.
  [ -f "${env_file}" ] || return 0

  # Run in a subshell so that whatever the .env file defines cannot overwrite
  # the controller's own variables. `set -e` is inactive inside an `if`
  # condition, hence the explicit `|| exit 1` guards.
  if ! (
    # shellcheck disable=SC1090
    source "${env_file}" || exit 1
    [ -n "${TELEGRAM_BOT_TOKEN:-}" ] || exit 1
    [ -n "${TELEGRAM_CHAT_ID:-}" ] || exit 1
    # --max-time keeps a hung connection from stalling the monitor loop,
    # -f turns HTTP errors into a non-zero status, and --data-urlencode
    # protects spaces and punctuation in the message text.
    curl -s -f --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
      -d chat_id="${TELEGRAM_CHAT_ID}" \
      --data-urlencode "text=🚨 HA FAILOVER: ${message}" \
      -d parse_mode="HTML" > /dev/null
  ); then
    log "⚠️ Telegram notification could not be sent: ${message}" || true
  fi

  return 0
}
# Run the healthcheck script on the primary server over SSH.
#
# Globals:   PRIMARY_HOST (read) - ssh destination of the primary.
#            PROJECT_DIR  (read) - project path; assumed to be the same on
#            the primary, where ha-setup/healthcheck.sh is executed.
# Outputs:   none (all ssh output is discarded).
# Returns:   ssh's exit status: 0 when the remote healthcheck succeeds,
#            non-zero when it fails or the SSH session cannot be
#            established or kept alive.
check_primary_health() {
  local remote_cmd
  # %q quotes the path so that spaces survive the remote shell's parsing;
  # for plain paths the command is unchanged.
  printf -v remote_cmd 'cd %q && bash ha-setup/healthcheck.sh' "${PROJECT_DIR}"

  # ConnectTimeout only bounds the connection phase. The ServerAlive options
  # make ssh give up on a primary that stops responding after the handshake,
  # instead of blocking the monitor loop indefinitely. -n keeps ssh from
  # reading the controller's stdin.
  ssh -n \
    -o ConnectTimeout=5 \
    -o BatchMode=yes \
    -o ServerAliveInterval=5 \
    -o ServerAliveCountMax=3 \
    "${PRIMARY_HOST}" "${remote_cmd}" &>/dev/null
}
# Report whether the local trading-bot container is currently running.
#
# Returns: the status of the final grep - 0 when `docker ps` (filtered by
#          name and running status) lists "trading-bot-v4", non-zero
#          otherwise.
is_secondary_active() {
  local container="trading-bot-v4"

  docker ps \
    --filter "name=${container}" \
    --filter "status=running" \
    | grep -q "${container}"
}
# Activate the secondary: start the trading-bot service and verify it runs.
#
# Globals:   PROJECT_DIR (read)  - directory containing the compose project.
#            MAX_FAILURES (read) - only used in the notification text.
# Outputs:   progress via log(); Telegram messages via telegram_notify().
# Returns:   0 when the container is running after start-up, 1 otherwise.
# Note:      changes the shell's working directory to PROJECT_DIR.
start_secondary() {
  log "🚀 Starting secondary (failover activation)..."

  # Test the commands explicitly: under `set -e` an unchecked failure here
  # would end the whole controller before the failure is logged or the
  # CRITICAL alert below is sent.
  if ! { cd "${PROJECT_DIR}" && docker compose up -d trading-bot; }; then
    log "⚠️ docker compose up reported an error - verifying container state anyway"
  fi

  # Give the container time to come up before checking its state.
  sleep 10

  # Notifications are best-effort (`|| true`): a Telegram problem must not
  # change the outcome reported to the caller.
  if is_secondary_active; then
    log "✅ Secondary activated successfully"
    telegram_notify "Secondary server activated (primary failed ${MAX_FAILURES} health checks)" || true
    return 0
  fi

  log "❌ Failed to start secondary"
  telegram_notify "⚠️ CRITICAL: Secondary failed to start after primary failure!" || true
  return 1
}
# Deactivate the secondary: stop the trading-bot service and verify it is
# no longer running.
#
# Globals:   PROJECT_DIR (read) - directory containing the compose project.
# Outputs:   progress via log(); a Telegram message on success.
# Returns:   0 when the container is no longer running, 1 otherwise.
# Note:      changes the shell's working directory to PROJECT_DIR.
stop_secondary() {
  log "🛑 Stopping secondary (primary recovered)..."

  # Test the commands explicitly: under `set -e` an unchecked failure here
  # would end the whole controller instead of reporting the problem.
  if ! { cd "${PROJECT_DIR}" && docker compose stop trading-bot; }; then
    log "⚠️ docker compose stop reported an error - verifying container state anyway"
  fi

  if is_secondary_active; then
    log "❌ Failed to stop secondary"
    return 1
  fi

  log "✅ Secondary stopped successfully"
  # Best-effort: a Telegram problem must not turn a successful stop into a
  # failure.
  telegram_notify "Primary server recovered, secondary deactivated" || true
  return 0
}
# Monitor the primary forever and switch the secondary on or off as needed.
#
# Each cycle runs one health check against the primary:
#   - healthy:   reset the failure counter and stop the secondary if it is
#                still running;
#   - unhealthy: count the failure and, once MAX_FAILURES consecutive checks
#                have failed, start the secondary unless it is already up.
#
# Globals:   PRIMARY_HOST, CHECK_INTERVAL, MAX_FAILURES (read);
#            FAILURE_COUNT (read and written).
# Returns:   never returns (infinite loop).
main_loop() {
  log "🎯 HA Failover Controller started (Secondary mode)"
  log "Monitoring primary: ${PRIMARY_HOST}"
  log "Check interval: ${CHECK_INTERVAL}s, Max failures: ${MAX_FAILURES}"

  while true; do
    if check_primary_health; then
      # Primary is healthy
      if (( FAILURE_COUNT > 0 )); then
        log "✅ Primary recovered (was at ${FAILURE_COUNT} failures)"
        FAILURE_COUNT=0
      fi

      # If secondary is running, stop it (primary should be active)
      if is_secondary_active; then
        log "⚠️ Secondary is active but primary is healthy - stopping secondary"
        # The script runs under `set -e`: a bare call returning non-zero
        # would terminate the controller. Keep looping and retry instead.
        stop_secondary || log "⚠️ Could not stop secondary - will retry on next check"
      fi
    else
      # Primary is unhealthy
      FAILURE_COUNT=$((FAILURE_COUNT + 1))
      log "❌ Primary health check failed (${FAILURE_COUNT}/${MAX_FAILURES})"

      if (( FAILURE_COUNT >= MAX_FAILURES )); then
        if is_secondary_active; then
          log " Secondary already active (primary still failing)"
        else
          log "🚨 PRIMARY FAILED - Activating secondary..."
          # A failed notification must never prevent the activation below.
          telegram_notify "Primary server failed ${MAX_FAILURES} consecutive health checks. Activating secondary..." || true
          # A failed activation is retried on the next cycle, because the
          # failure counter stays at or above the threshold.
          start_secondary || log "⚠️ Secondary activation failed - will retry on next check"
        fi
      fi
    fi

    sleep "${CHECK_INTERVAL}"
  done
}
# Refuse to run for any effective user other than root (uid 0); the
# controller needs docker and SSH access.
if (( EUID != 0 )); then
  log "❌ Must run as root (needs docker and SSH access)"
  exit 1
fi

# Hand control to the monitoring loop.
main_loop