feat: Add High Availability setup roadmap and scripts

Created comprehensive HA roadmap with 6 phases:
- Phase 1: Warm standby (CURRENT - manual failover)
- Phase 2: Database replication
- Phase 3: Health monitoring
- Phase 4: Reverse proxy + floating IP
- Phase 5: Automated failover
- Phase 6: Geographic redundancy

Includes:
- Decision gates based on capital and stability
- Cost-benefit analysis
- Scripts for healthcheck, failover, DB sync
- Recommendation to defer full HA until capital > $5k

Secondary server ready at 72.62.39.24 for emergency manual failover.

Related: User concern about system uptime, but full HA complexity
not justified at current scale (~$600 capital). Revisit in Q1 2026.
This commit is contained in:
mindesbunister
2025-11-19 20:52:12 +01:00
parent d28da02089
commit 880aae9a77
7 changed files with 936 additions and 0 deletions

90
ha-setup/healthcheck.sh Normal file
View File

@@ -0,0 +1,90 @@
#!/bin/bash
#
# Trading Bot Health Check Script
# Checks if trading bot is healthy and responding
#
# Usage: ./healthcheck.sh
# Exit codes: 0 = healthy, 1 = unhealthy
set -eu

# Runtime configuration; every value may be overridden from the environment.
TRADING_BOT_HOST="${TRADING_BOT_HOST:-localhost:3001}"
# NOTE(review): MAX_FAILURES and CHECK_INTERVAL are not read anywhere in the
# visible part of this script - presumably consumed by a wrapper; confirm.
MAX_FAILURES="${MAX_FAILURES:-3}"
CHECK_INTERVAL="${CHECK_INTERVAL:-10}"
# Path of the dotenv file holding API_SECRET_KEY (overridable for other hosts).
ENV_FILE="${ENV_FILE:-/home/icke/traderv4/.env}"

#######################################
# Load API_SECRET_KEY from a dotenv-style file and export it.
# Only the API_SECRET_KEY= line is read (the last one wins if repeated); one
# pair of surrounding single or double quotes is stripped from the value.
# The value is never word-split, so keys containing spaces or glob characters
# are preserved as written.
# Globals:   API_SECRET_KEY (written and exported when the file defines it)
# Arguments: $1 - path to the dotenv file
# Returns:   0 always; a missing file or missing key leaves the variable as is
#######################################
load_api_secret_key() {
  local env_file=$1
  local line value

  [[ -f "$env_file" ]] || return 0

  # tail keeps the last definition; the pipeline status is tail's, so a
  # non-matching grep does not trip `set -e`.
  line=$(grep '^API_SECRET_KEY=' -- "$env_file" | tail -n 1)
  [[ -n "$line" ]] || return 0

  value=${line#API_SECRET_KEY=}
  case "$value" in
    \"*\" | \'*\')
      value=${value#?}
      value=${value%?}
      ;;
  esac

  export API_SECRET_KEY="$value"
}

# Source API key from .env
load_api_secret_key "$ENV_FILE"

# Always define the variable so later expansions cannot abort under `set -u`.
export API_SECRET_KEY="${API_SECRET_KEY:-}"
if [[ -z "$API_SECRET_KEY" ]]; then
  printf 'WARNING: API_SECRET_KEY is empty; API requests will carry no key\n' >&2
fi
# Write one timestamped line to stdout.
# Arguments: $* - message words, joined into a single string
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" using the local clock
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Check if container is running
#
# Asks docker for running containers whose name matches "trading-bot-v4" and
# prints only their names, then requires an exact whole-line match. Matching
# the full `docker ps` table would also accept the text as a substring of a
# different container name or of another column.
# Returns: 0 when a running container is named exactly trading-bot-v4,
#          non-zero otherwise (including when docker itself fails).
check_container() {
  docker ps \
    --filter "name=trading-bot-v4" \
    --filter "status=running" \
    --format '{{.Names}}' \
    | grep -Fxq "trading-bot-v4"
}
# Check if API is responding
#
# Sends an authenticated GET to the account-summary endpoint and inspects the
# response body.
# Globals: TRADING_BOT_HOST (read) - host:port of the bot's HTTP API
#          API_SECRET_KEY   (read) - bearer token; treated as empty when unset
#                                    so `set -u` cannot abort the check
# Returns: 0 when curl succeeds and the body contains "success": true
#          (whitespace around the colon is tolerated), 1 otherwise.
check_api() {
  local response

  # -s: silent, -f: non-zero exit on HTTP errors, -m 5: five-second time limit.
  # The call sits inside the `if` so a failing curl is handled here even when
  # the caller runs under `set -e`. stderr is discarded so that only the
  # response body is searched for the success marker.
  if ! response=$(curl -s -f -m 5 \
    -H "Authorization: Bearer ${API_SECRET_KEY:-}" \
    "http://${TRADING_BOT_HOST}/api/drift/account-summary" 2>/dev/null); then
    return 1
  fi

  if grep -Eq '"success"[[:space:]]*:[[:space:]]*true' <<<"$response"; then
    return 0
  fi
  return 1
}
# Check if Position Manager is monitoring (if positions exist)
#
# Looks at the last 50 log lines of the trading-bot-v4 container. Note this is
# a line-count window, not a time window: an old "Monitoring" line still
# counts as long as it is among the last 50 lines.
# Returns: 0 when the logs show monitoring activity, or report that there are
#          no positions to monitor, or the container started less than 60
#          seconds ago; 1 otherwise (including when the start time cannot be
#          read or parsed).
check_position_manager() {
  local logs started_at started_epoch now_epoch

  # A failing `docker logs` simply yields text without the markers below.
  logs=$(docker logs --tail=50 trading-bot-v4 2>&1) || true

  # Check for monitoring activity in the recent log tail
  if grep -q "🔍 Monitoring" <<<"$logs"; then
    return 0
  fi

  # If no monitoring logs but no positions open, that's OK
  if grep -q "No positions to monitor" <<<"$logs"; then
    return 0
  fi

  # If container just started (less than 1 min), give it time.
  # Compare real elapsed seconds; matching only the date-and-hour prefix would
  # treat any container started within the current UTC hour as "just started".
  started_at=$(docker inspect trading-bot-v4 --format='{{.State.StartedAt}}') \
    || return 1
  # Relies on GNU date's -d to parse the timestamp docker prints.
  started_epoch=$(date -u -d "$started_at" +%s 2>/dev/null) || return 1
  now_epoch=$(date -u +%s)
  if ((now_epoch - started_epoch < 60)); then
    return 0
  fi

  return 1
}
# Main health check
#
# Runs the checks in order: container, API, Position Manager. A failure of
# either of the first two logs the reason and terminates the script with
# status 1. A Position Manager failure is logged as a warning only. When the
# hard checks pass the script terminates with status 0.
main() {
  log "Starting health check..."

  check_container || {
    log "❌ UNHEALTHY: Container not running"
    exit 1
  }

  check_api || {
    log "❌ UNHEALTHY: API not responding"
    exit 1
  }

  # Don't fail on this - API working is primary health indicator
  check_position_manager \
    || log "⚠️ WARNING: Position Manager may not be monitoring (check logs)"

  log "✅ HEALTHY: All checks passed"
  exit 0
}
# Entry point: run the health check, forwarding the script's arguments to main.
main "$@"