DevOps Automation with Python: Intelligent System Monitoring with Auto Recovery
Automation is at the core of modern DevOps culture. While powerful tools like Kubernetes, Docker, and CI/CD platforms exist, scripts remain a critical foundation for efficient infrastructure management.
In this article, we will build a Python system monitoring script that not only tracks system resources but also performs automatic recovery when issues are detected.
Why Use Scripts in DevOps?
Scripts bring several key advantages to DevOps practices:
- Fast Response – A script can detect and fix issues in seconds.
- Consistency – Tasks are always executed the same way, reducing human error.
- Scalability – One script can manage hundreds of servers.
- Documentation – A well-written script serves as executable documentation.
Full Python Monitoring Script (requires the third-party psutil package: pip install psutil)
#!/usr/bin/env python3
"""
Intelligent System Monitoring and Auto Recovery
Author: DevOps Bulgaria Team
Version: 2.0
"""
import psutil
import subprocess
import logging
import smtplib
import json
import time
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import Dict, List, Optional
import argparse
import sys
import os
class SystemMonitor:
    """Watch host resource usage and systemd services, restarting failed ones.

    Behaviour is driven by a JSON config file with these optional sections:

    * ``thresholds`` - metric name -> upper limit; metrics without an entry
      are measured and logged but never alerted on.
    * ``services`` - list of systemd unit names to supervise.
    * ``email`` - SMTP settings; alerts are sent only when ``enabled`` is true.
    * ``monitoring`` - ``interval``, ``recovery_attempts``, ``cooldown_period``
      and ``recovery_wait`` (all in seconds except ``recovery_attempts``).
    * ``log_file`` - path of the log file (defaults to ``DEFAULT_LOG_FILE``).
    """

    DEFAULT_LOG_FILE = "/var/log/system_monitor.log"
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
    # Used when the "monitoring" section (or one of its keys) is missing, so a
    # partial config does not crash the monitor with a KeyError.
    MONITORING_DEFAULTS = {
        "interval": 60,
        "recovery_attempts": 3,
        "cooldown_period": 300,
        "recovery_wait": 5,
    }
    DEFAULT_SMTP_TIMEOUT = 30

    def __init__(self, config_file: str):
        """Load ``config_file``, configure logging and reset alert cooldowns."""
        self.load_config(config_file)
        self.setup_logging()
        # cooldown key -> time.time() of the last alert that was actually sent
        self.cooldowns: Dict[str, float] = {}

    def load_config(self, config_file: str):
        """Parse the JSON config into ``self.config``.

        Prints an error and exits with status 1 when the file cannot be read,
        is not valid JSON, or does not contain a JSON object at the top level.
        """
        try:
            with open(config_file, "r", encoding="utf-8") as f:
                config = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and decoding errors.
            print(f"Error loading config: {e}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(config, dict):
            print("Error loading config: top-level JSON value must be an object",
                  file=sys.stderr)
            sys.exit(1)
        self.config = config

    def setup_logging(self):
        """Log to the configured file and to stdout with one shared format.

        If the log file cannot be opened (for example when not running as
        root), logging continues on stdout only instead of crashing.
        """
        log_file = self.config.get("log_file", self.DEFAULT_LOG_FILE)
        handlers = []
        file_error = None
        try:
            handlers.append(logging.FileHandler(log_file))
        except OSError as e:
            file_error = e
        handlers.append(logging.StreamHandler(sys.stdout))
        # Passing the handlers to basicConfig applies LOG_FORMAT to both.
        logging.basicConfig(
            level=logging.INFO,
            format=self.LOG_FORMAT,
            handlers=handlers,
        )
        if file_error is not None:
            logging.warning(
                f"Cannot open log file {log_file}: {file_error}; "
                "logging to stdout only"
            )

    def _monitoring_setting(self, name: str):
        """Return a value from the "monitoring" section, or its default."""
        section = self.config.get("monitoring", {})
        return section.get(name, self.MONITORING_DEFAULTS[name])

    def _is_active(self, service: str) -> bool:
        """Return True when ``systemctl is-active`` exits with status 0."""
        result = subprocess.run(
            ["systemctl", "is-active", "--quiet", service],
            check=False,
        )
        return result.returncode == 0

    def get_metrics(self) -> Dict[str, float]:
        """Sample CPU, memory, root-disk usage and the first load average.

        Note: ``cpu_percent(interval=1)`` samples over a one-second window,
        so this call takes about a second.
        """
        return {
            "cpu_percent": psutil.cpu_percent(interval=1),
            "memory_percent": psutil.virtual_memory().percent,
            "disk_percent": psutil.disk_usage("/").percent,
            "load_average": psutil.getloadavg()[0],
        }

    def check_services(self) -> Dict[str, bool]:
        """Map every configured service name to whether it is active.

        A service whose status cannot be determined is reported as inactive,
        and the reason is logged rather than silently discarded.
        """
        results = {}
        for service in self.config.get("services", []):
            try:
                results[service] = self._is_active(service)
            except Exception as e:
                # Best effort: one broken entry must not stop the other checks.
                logging.error(f"Could not check service {service}: {e}")
                results[service] = False
        return results

    def attempt_recovery(self, service: str) -> bool:
        """Restart ``service`` and report whether it is active afterwards.

        Returns False (after logging the error) when the restart command
        fails or cannot be run.
        """
        logging.warning(f"Attempting to restart service: {service}")
        try:
            subprocess.run(["systemctl", "restart", service], check=True)
            # Give the unit time to settle before judging the restart.
            time.sleep(self._monitoring_setting("recovery_wait"))
            return self._is_active(service)
        except Exception as e:
            logging.error(f"Recovery failed for {service}: {e}")
            return False

    def send_alert(self, subject: str, body: str,
                   cooldown_key: Optional[str] = None):
        """Email an alert unless one with the same cooldown key was sent recently.

        Args:
            subject: Subject line of the email.
            body: Plain-text body of the email.
            cooldown_key: Identifier used for rate limiting. Defaults to
                ``subject``; pass a more specific key so that unrelated
                problems sharing a subject do not silence each other.

        Does nothing when email is disabled in the config. Delivery failures
        are logged, never raised, and do not start the cooldown, so the next
        cycle retries.
        """
        email_conf = self.config.get("email", {})
        if not email_conf.get("enabled", False):
            return

        key = subject if cooldown_key is None else cooldown_key
        now = time.time()
        last_alert = self.cooldowns.get(key)
        if (last_alert is not None
                and now - last_alert < self._monitoring_setting("cooldown_period")):
            return

        try:
            msg = MIMEMultipart()
            msg["From"] = email_conf["username"]
            msg["To"] = ", ".join(email_conf["to_addresses"])
            msg["Subject"] = subject
            msg.attach(MIMEText(body, "plain"))
            # The timeout keeps an unreachable mail server from blocking the
            # monitoring loop indefinitely.
            timeout = email_conf.get("timeout", self.DEFAULT_SMTP_TIMEOUT)
            with smtplib.SMTP(email_conf["smtp_server"],
                              email_conf["smtp_port"],
                              timeout=timeout) as server:
                server.starttls()
                server.login(email_conf["username"], email_conf["password"])
                server.send_message(msg)
        except Exception as e:
            # Alerting is best effort; it must never take the monitor down.
            logging.error(f"Failed to send alert: {e}")
            return

        # Start the cooldown only once the alert has really gone out.
        self.cooldowns[key] = now
        logging.info(f"Alert sent: {subject}")

    def monitor_once(self):
        """Run one cycle: check thresholds, then recover any inactive services.

        An alert is sent for each metric above its configured threshold and
        for each service still down after all recovery attempts.
        """
        metrics = self.get_metrics()
        services = self.check_services()
        logging.info("System Metrics: " + str(metrics))

        thresholds = self.config.get("thresholds", {})
        for key, value in metrics.items():
            limit = thresholds.get(key)
            if limit is None or value <= limit:
                continue
            # Only the *_percent metrics are percentages; load average is not.
            unit = "%" if key.endswith("_percent") else ""
            msg = f"{key} at {value}{unit} exceeds threshold {limit}{unit}"
            logging.warning(msg)
            self.send_alert("System Threshold Exceeded", msg,
                            cooldown_key=f"threshold:{key}")

        attempts = self._monitoring_setting("recovery_attempts")
        for service, is_active in services.items():
            if is_active:
                continue
            logging.error(f"Service down: {service}")
            # any() stops at the first successful restart.
            if any(self.attempt_recovery(service) for _ in range(attempts)):
                logging.info(f"Service {service} recovered successfully")
            else:
                self.send_alert("Service Down", f"Failed to recover {service}",
                                cooldown_key=f"service:{service}")

    def run(self):
        """Run monitoring cycles forever, sleeping ``interval`` seconds between."""
        interval = self._monitoring_setting("interval")
        while True:
            try:
                self.monitor_once()
            except Exception:
                # Keep the daemon alive; the traceback goes to the log.
                logging.exception(
                    "Monitoring cycle failed; retrying after the next interval"
                )
            time.sleep(interval)
def main():
    """Parse the command line and run the monitor once or continuously.

    Options:
        --config   Path to the JSON config file (default: monitor_config.json).
        --once     Run a single monitoring cycle and exit.
        --verbose  Lower the root logger level to DEBUG.
    """
    parser = argparse.ArgumentParser(description="System Monitor")
    parser.add_argument("--config", default="monitor_config.json", help="Path to config file")
    parser.add_argument("--once", action="store_true", help="Run once and exit")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    monitor = SystemMonitor(args.config)
    if args.verbose:
        # Applied after construction: the monitor's logging setup sets the
        # root logger to INFO and would otherwise override this.
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        if args.once:
            monitor.monitor_once()
        else:
            monitor.run()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground run; exit quietly.
        logging.info("Interrupted; shutting down")


# Start the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Configuration File (monitor_config.json)
{
"thresholds": {
"cpu_percent": 85.0,
"memory_percent": 90.0,
"disk_percent": 85.0,
"load_average": 8.0
},
"services": [
"nginx",
"mysql",
"docker",
"redis-server"
],
"email": {
"enabled": true,
"smtp_server": "smtp.gmail.com",
"smtp_port": 587,
"username": "your-email@gmail.com",
"password": "your-app-password",
"to_addresses": ["admin@yourcompany.com"]
},
"monitoring": {
"interval": 60,
"recovery_attempts": 3,
"cooldown_period": 300
}
}
Systemd Service Example
[Unit]
Description=Intelligent System Monitor
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/bin/python3 /opt/system-monitor/system_monitor.py --config /opt/system-monitor/monitor_config.json
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
Conclusion
This script provides:
- Real-time monitoring of CPU, memory, disk, and load average
- Service supervision with automatic restart
- Configurable thresholds and alerts through JSON
- Email notifications with cooldown to avoid spam
- Systemd integration for reliable background execution
Automation scripts like this are a cornerstone of DevOps practices. They ensure system resilience, reduce downtime, and free engineers from repetitive tasks.
Pro tip: Always test in a staging environment before production deployment.
Originally written in Bulgarian and translated to English for the dev.to community.
Read the original Bulgarian version: DevOps автоматизация със скриптове
Top comments (0)