DevOps Automation with Python: Intelligent System Monitoring with Auto Recovery

#python #devops #monitoring #automation

DevOps Automation with Python: Intelligent System Monitoring with Auto Recovery

Automation is at the core of modern DevOps culture. While powerful tools like Kubernetes, Docker, and CI/CD platforms exist, scripts remain a critical foundation for efficient infrastructure management.

In this article, we will build a Python system monitoring script that not only tracks system resources but also performs automatic recovery when issues are detected.

Why Use Scripts in DevOps?

Scripts bring several key advantages to DevOps practices:

Fast Response – A script can detect and fix issues in seconds.
Consistency – Tasks are always executed the same way, reducing human error.
Scalability – One script can manage hundreds of servers.
Documentation – A well-written script serves as executable documentation.

Full Python Monitoring Script

#!/usr/bin/env python3
"""
Intelligent System Monitoring and Auto Recovery
Author: DevOps Bulgaria Team
Version: 2.0
"""

import psutil
import subprocess
import logging
import smtplib
import json
import time
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import Dict, List, Optional
import argparse
import sys
import os

class SystemMonitor:
    """Poll host metrics and systemd services, restart failed services, send e-mail alerts.

    The JSON configuration is read once at construction. Sections used:

    * ``thresholds``  - metric name -> upper limit (metrics without a limit are not checked)
    * ``services``    - list of systemd unit names to supervise
    * ``email``       - SMTP settings; alerts are sent only when ``enabled`` is true
    * ``monitoring``  - ``interval``, ``recovery_attempts``, ``cooldown_period`` and the
      optional ``recovery_delay`` (seconds to wait after a restart, default 5)
    * ``log_file``    - optional log path (default ``/var/log/system_monitor.log``)
    """

    DEFAULT_LOG_FILE = "/var/log/system_monitor.log"
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"

    def __init__(self, config_file: str):
        """Load *config_file*, configure logging and start with no alert history."""
        self.load_config(config_file)
        self.setup_logging()
        # Cooldown key -> time.time() of the last alert successfully sent for that key.
        self.cooldowns = {}

    def load_config(self, config_file: str):
        """Parse *config_file* as JSON into ``self.config``; exit with status 1 on failure."""
        try:
            with open(config_file, "r", encoding="utf-8") as f:
                self.config = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            print(f"Error loading config: {e}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(self.config, dict):
            # Every later lookup uses dict access, so reject lists/scalars up front.
            print("Error loading config: top-level JSON value must be an object",
                  file=sys.stderr)
            sys.exit(1)

    def setup_logging(self):
        """Log to stdout and, when it can be opened, to the configured log file.

        An unwritable log file (e.g. running without root) no longer aborts
        start-up; the monitor keeps logging to stdout and reports the problem.
        """
        log_file = self.config.get("log_file", self.DEFAULT_LOG_FILE)
        handlers = [logging.StreamHandler(sys.stdout)]
        file_error = None
        try:
            handlers.append(logging.FileHandler(log_file))
        except OSError as e:
            file_error = e

        # Passing the handlers to basicConfig applies LOG_FORMAT to both of them.
        logging.basicConfig(level=logging.INFO, format=self.LOG_FORMAT, handlers=handlers)
        if file_error is not None:
            logging.warning(f"Cannot write log file {log_file}: {file_error}")

    def get_metrics(self) -> Dict[str, float]:
        """Return the current CPU, memory, root-disk usage and first load-average value."""
        return {
            "cpu_percent": psutil.cpu_percent(interval=1),
            "memory_percent": psutil.virtual_memory().percent,
            "disk_percent": psutil.disk_usage("/").percent,
            "load_average": psutil.getloadavg()[0]
        }

    def _monitoring_setting(self, name: str, default):
        """Return ``config["monitoring"][name]``, or *default* when it is not configured."""
        return self.config.get("monitoring", {}).get(name, default)

    def _is_active(self, service: str) -> bool:
        """Return True when ``systemctl is-active --quiet`` exits 0 for *service*."""
        result = subprocess.run(
            ["systemctl", "is-active", "--quiet", service],
            check=False
        )
        return result.returncode == 0

    def check_services(self) -> Dict[str, bool]:
        """Return ``{service: is_active}`` for every name in ``config["services"]``.

        A service whose status cannot be queried is reported as inactive and
        the reason is logged.
        """
        results = {}
        for service in self.config.get("services", []):
            try:
                results[service] = self._is_active(service)
            except Exception as e:
                # Deliberately broad: one bad entry must not stop the other checks.
                logging.error(f"Could not check service {service}: {e}")
                results[service] = False
        return results

    def attempt_recovery(self, service: str) -> bool:
        """Restart *service* and return True if it is active after the settle delay."""
        logging.warning(f"Attempting to restart service: {service}")
        try:
            subprocess.run(
                ["systemctl", "restart", service],
                check=True
            )
            # Give the unit time to come up before probing it again.
            time.sleep(self._monitoring_setting("recovery_delay", 5))
            return self._is_active(service)
        except Exception as e:
            logging.error(f"Recovery failed for {service}: {e}")
            return False

    def send_alert(self, subject: str, body: str, cooldown_key: Optional[str] = None):
        """E-mail an alert unless e-mail is disabled or the alert is in cooldown.

        Args:
            subject: Subject line of the message.
            body: Plain-text message body.
            cooldown_key: Key used for rate limiting. Defaults to *subject*, so
                existing callers keep per-subject cooldowns; pass a more specific
                key to rate-limit independent problems separately.

        The cooldown timestamp is stored only after the message was sent, so a
        failed delivery is retried on the next call instead of being silenced.
        """
        email_conf = self.config.get("email", {})
        if not email_conf.get("enabled", False):
            return

        key = subject if cooldown_key is None else cooldown_key
        now = time.time()
        cooldown = self._monitoring_setting("cooldown_period", 300)
        if now - self.cooldowns.get(key, 0) < cooldown:
            return

        try:
            msg = MIMEMultipart()
            msg["From"] = email_conf["username"]
            msg["To"] = ", ".join(email_conf["to_addresses"])
            msg["Subject"] = subject
            msg.attach(MIMEText(body, "plain"))

            with smtplib.SMTP(email_conf["smtp_server"], email_conf["smtp_port"]) as server:
                server.starttls()
                server.login(email_conf["username"], email_conf["password"])
                server.send_message(msg)
        except Exception as e:
            # Alerting is best effort; a mail problem must not stop monitoring.
            logging.error(f"Failed to send alert: {e}")
            return

        self.cooldowns[key] = now
        logging.info(f"Alert sent: {subject}")

    def _threshold_breaches(self, metrics: Dict[str, float]) -> Dict[str, str]:
        """Return ``{metric: message}`` for every metric above its configured threshold."""
        thresholds = self.config.get("thresholds", {})
        breaches = {}
        for key, value in metrics.items():
            limit = thresholds.get(key)
            if limit is None or value <= limit:
                continue
            # Only *_percent metrics are percentages; load_average is a plain number.
            unit = "%" if key.endswith("_percent") else ""
            breaches[key] = f"{key} at {value}{unit} exceeds threshold {limit}{unit}"
        return breaches

    def monitor_once(self):
        """Run one cycle: log metrics, alert on breaches, restart inactive services."""
        metrics = self.get_metrics()
        services = self.check_services()

        logging.info("System Metrics: " + str(metrics))

        for key, msg in self._threshold_breaches(metrics).items():
            logging.warning(msg)
            # Per-metric key: a CPU alert must not suppress a disk alert.
            self.send_alert("System Threshold Exceeded", msg,
                            cooldown_key=f"threshold:{key}")

        for service, is_active in services.items():
            if is_active:
                continue
            logging.error(f"Service down: {service}")
            for _ in range(self._monitoring_setting("recovery_attempts", 3)):
                if self.attempt_recovery(service):
                    logging.info(f"Service {service} recovered successfully")
                    break
            else:
                # Reached only when no attempt succeeded.
                self.send_alert("Service Down", f"Failed to recover {service}",
                                cooldown_key=f"service:{service}")

    def run(self):
        """Call ``monitor_once`` forever, sleeping ``monitoring.interval`` seconds between cycles."""
        interval = self._monitoring_setting("interval", 60)
        while True:
            try:
                self.monitor_once()
            except Exception:
                # Keep the loop alive; the traceback is written to the log.
                logging.exception("Monitoring cycle failed")
            time.sleep(interval)

def main():
    """Parse the command line and run the monitor once or as an endless loop.

    Options:
        --config   path to the JSON configuration (default ``monitor_config.json``)
        --once     run a single monitoring cycle and exit
        --verbose  lower the root logger level to DEBUG
    """
    parser = argparse.ArgumentParser(description="System Monitor")
    parser.add_argument("--config", default="monitor_config.json", help="Path to config file")
    parser.add_argument("--once", action="store_true", help="Run once and exit")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    monitor = SystemMonitor(args.config)
    if args.verbose:
        # Applied after construction because SystemMonitor configures logging itself.
        logging.getLogger().setLevel(logging.DEBUG)

    if args.once:
        monitor.monitor_once()
    else:
        monitor.run()


if __name__ == "__main__":
    main()

Configuration File (monitor_config.json)

{
    "thresholds": {
        "cpu_percent": 85.0,
        "memory_percent": 90.0,
        "disk_percent": 85.0,
        "load_average": 8.0
    },
    "services": [
        "nginx",
        "mysql",
        "docker",
        "redis-server"
    ],
    "email": {
        "enabled": true,
        "smtp_server": "smtp.gmail.com",
        "smtp_port": 587,
        "username": "your-email@gmail.com",
        "password": "your-app-password",
        "to_addresses": ["admin@yourcompany.com"]
    },
    "monitoring": {
        "interval": 60,
        "recovery_attempts": 3,
        "cooldown_period": 300
    }
}

Systemd Service Example

[Unit]
Description=Intelligent System Monitor
After=network.target

[Service]
Type=simple
User=root
# The script's default config path is relative, so pin both the working
# directory and the config location to the install directory.
WorkingDirectory=/opt/system-monitor
ExecStart=/usr/bin/python3 /opt/system-monitor/system_monitor.py --config /opt/system-monitor/monitor_config.json
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target

Conclusion

This script provides:

Periodic monitoring of CPU, memory, disk, and load average (one check every configured interval, 60 seconds in the sample config)
Service supervision with automatic restart
Configurable thresholds and alerts through JSON
Email notifications with cooldown to avoid spam
Systemd integration for reliable background execution

Automation scripts like this are a cornerstone of DevOps practices. They ensure system resilience, reduce downtime, and free engineers from repetitive tasks.

Pro tip: Always test in a staging environment before production deployment.

Originally written in Bulgarian and translated to English for the dev.to community.

Read the original Bulgarian version: DevOps автоматизация със скриптове