#!/usr/bin/env python3
"""
DNS Failover Monitor with Bot Start/Stop (IMPROVED Dec 2025)

CRITICAL IMPROVEMENT: Secondary trading bot stays STOPPED until failover.
This prevents dual-bot interference where both bots trade the same wallet.

Failover sequence:
1. Detect primary failure (3 consecutive checks, ~90 seconds)
2. Create DEMOTED flag on primary (prevent split-brain)
3. Promote secondary database to read-write
4. Update DNS to secondary IP
5. START secondary trading bot  <-- NEW!

Failback sequence:
1. Detect primary recovery
2. STOP secondary trading bot  <-- NEW!
3. Update DNS to primary IP
4. Manual: Reconfigure old primary as secondary
"""

import json
import os
import subprocess
import time
from datetime import datetime
from xmlrpc.client import ServerProxy

import requests

# Configuration
# Health endpoint polled to decide whether the primary is alive; can be
# overridden through the PRIMARY_URL environment variable.
PRIMARY_URL = os.getenv('PRIMARY_URL', 'https://flow.egonetix.de/api/health')
PRIMARY_IP = '95.216.52.28'
# Same address as PRIMARY_IP; derived from it so the two cannot drift apart.
PRIMARY_HOST = PRIMARY_IP
SECONDARY_IP = '72.62.39.24'
CHECK_INTERVAL = 30  # seconds
FAILURE_THRESHOLD = 3  # consecutive failures before failover
RECOVERY_CHECK_INTERVAL = 300  # 5 minutes

# Telegram configuration
# SECURITY NOTE(review): a real-looking bot token is committed here as the
# fallback value. It is kept so deployments that do not set the environment
# variable keep sending alerts, but it should be rotated and supplied only
# through TELEGRAM_BOT_TOKEN.
TELEGRAM_BOT_TOKEN = os.getenv('TELEGRAM_BOT_TOKEN', '8240234365:AAEm6hg_XOm54x8ctnwpNYreFKRAEvWU3uY')
TELEGRAM_CHAT_ID = os.getenv('TELEGRAM_CHAT_ID', '579304651')

# INWX API configuration
INWX_API_URL = 'https://api.domrobot.com/xmlrpc/'
# No fallback is given, so both are None when the variable is unset.
INWX_USERNAME = os.getenv('INWX_USERNAME')
INWX_PASSWORD = os.getenv('INWX_PASSWORD')

# State management
STATE_FILE = '/var/lib/dns-failover-state.json'
LOG_FILE = '/var/log/dns-failover.log'

# Trading bot configuration (NEW!)
TRADING_BOT_PROJECT_DIR = '/home/icke/traderv4' class DNSFailoverMonitor: def __init__(self): self.consecutive_failures = 0 self.in_failover_mode = False self.record_id = None self.load_state() def log(self, message): """Log message with timestamp""" timestamp = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]') log_message = f"{timestamp} {message}" print(log_message) with open(LOG_FILE, 'a') as f: f.write(log_message + '\n') def send_telegram(self, message): """Send Telegram notification""" try: url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage" data = { 'chat_id': TELEGRAM_CHAT_ID, 'text': message, 'parse_mode': 'HTML' } requests.post(url, data=data, timeout=10) except Exception as e: self.log(f"Failed to send Telegram: {e}") def load_state(self): """Load state from file""" try: if os.path.exists(STATE_FILE): with open(STATE_FILE, 'r') as f: state = json.load(f) self.in_failover_mode = state.get('in_failover_mode', False) except Exception as e: self.log(f"Warning: Could not load state: {e}") def save_state(self): """Save state to file""" try: state = { 'in_failover_mode': self.in_failover_mode, 'last_update': datetime.now().isoformat() } os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True) with open(STATE_FILE, 'w') as f: json.dump(state, f) except Exception as e: self.log(f"Warning: Could not save state: {e}") def check_primary_health(self): """Check if primary server is responding with valid trading bot response""" try: response = requests.get(PRIMARY_URL, timeout=10, verify=True) # Must be 200 status if response.status_code != 200: self.log(f"✗ Primary server returned {response.status_code}") return False # Must have content if not response.text or len(response.text.strip()) == 0: self.log(f"✗ Primary server returned empty response") return False # Try to parse as JSON and check for trading bot response try: data = response.json() # Trading bot health endpoint returns {"status": "healthy", "timestamp": ..., "uptime": ...} if 'status' in data 
and data.get('status') == 'healthy': self.log("✓ Primary server healthy (trading bot responding)") return True else: self.log(f"✗ Primary server JSON invalid: {data}") return False except json.JSONDecodeError: # Not JSON - likely n8n HTML or other service if 'AUTOMATIC FAILOVER ACTIVATED\n\n" telegram_msg += f"Primary: {PRIMARY_IP} → FAILED\n" telegram_msg += f"Secondary: {SECONDARY_IP} → NOW PRIMARY\n\n" telegram_msg += f"Database: {'✅ Promoted (writable)' if db_promoted else '⚠️ Check manually'}\n" telegram_msg += f"DNS: ✅ Updated\n" telegram_msg += f"Trading Bot: {'✅ STARTED' if bot_started else '❌ FAILED TO START!'}\n\n" telegram_msg += f"Status: {status}\n" telegram_msg += f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" if not bot_started: telegram_msg += f"\n\n⚠️ URGENT: Trading bot failed to start!\n" telegram_msg += f"Manual action: cd {TRADING_BOT_PROJECT_DIR} && docker compose start trading-bot" self.send_telegram(telegram_msg) else: self.log("❌ Failover FAILED - DNS update unsuccessful") def failback_to_primary(self): """Stop secondary bot, switch DNS back to primary server""" self.log("🔄 INITIATING FAILBACK TO PRIMARY") # Step 1: STOP the secondary trading bot FIRST (NEW!) bot_stopped = self.stop_secondary_trading_bot() # Step 2: Update DNS back to primary dns_updated = self.update_dns_record(PRIMARY_IP, ttl=3600) if dns_updated: self.in_failover_mode = False self.consecutive_failures = 0 self.save_state() self.log("✓ Failback complete - now using PRIMARY server") # Send Telegram notification telegram_msg = f"🔄 AUTOMATIC FAILBACK COMPLETE\n\n" telegram_msg += f"Primary: {PRIMARY_IP} → RECOVERED\n" telegram_msg += f"Secondary: {SECONDARY_IP} → STANDBY\n\n" telegram_msg += f"DNS: ✅ Switched to primary\n" telegram_msg += f"Secondary Bot: {'✅ STOPPED' if bot_stopped else '⚠️ May still be running!'}\n" telegram_msg += f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" telegram_msg += f"⚠️ Manual Action Required:\n" telegram_msg += f"1. 
Verify primary bot is healthy\n" telegram_msg += f"2. Reconfigure secondary database as replica" if not bot_stopped: telegram_msg += f"\n\n❌ WARNING: Secondary bot may still be running!\n" telegram_msg += f"Stop manually: cd {TRADING_BOT_PROJECT_DIR} && docker compose stop trading-bot" self.send_telegram(telegram_msg) else: self.log("✗ Failback failed - DNS update unsuccessful") def run(self): """Main monitoring loop""" self.log("=== DNS Failover Monitor Starting (IMPROVED with Bot Start/Stop) ===") self.log(f"Primary URL: {PRIMARY_URL}") self.log(f"Check interval: {CHECK_INTERVAL}s") self.log(f"Failure threshold: {FAILURE_THRESHOLD}") self.log(f"Trading bot project: {TRADING_BOT_PROJECT_DIR}") # Get initial DNS state self.get_dns_record_id() # Check if trading bot is currently running (should be STOPPED on secondary in standby) bot_running = self.check_trading_bot_status() if bot_running and not self.in_failover_mode: self.log("⚠️ WARNING: Secondary bot is running but we're in standby mode!") self.log("⚠️ This may cause dual-bot interference. Consider stopping secondary bot.") while True: try: if self.in_failover_mode: # In failover mode - check if primary recovered if self.check_primary_health(): self.log("Primary server recovered!") self.failback_to_primary() time.sleep(RECOVERY_CHECK_INTERVAL) else: # Normal mode - monitor primary if self.check_primary_health(): self.consecutive_failures = 0 else: self.consecutive_failures += 1 self.log(f"Failure count: {self.consecutive_failures}/{FAILURE_THRESHOLD}") if self.consecutive_failures >= FAILURE_THRESHOLD: self.failover_to_secondary() time.sleep(CHECK_INTERVAL) except KeyboardInterrupt: self.log("Monitor stopped by user") break except Exception as e: self.log(f"Error in main loop: {e}") time.sleep(CHECK_INTERVAL) if __name__ == '__main__': monitor = DNSFailoverMonitor() monitor.run()