#!/usr/bin/env python3
"""
Backuply Health Check Plugin for Icinga

Description:
    - Checks if backup servers are configured
    - Verifies backup jobs exist
    - Monitors recent backup job executions via master process data
    - Reports per-job success/failed/warnings/running counts with perfdata
    - Lists failed user accounts in output

Usage:
    ./check_backuply.py [--chours 24] [--whours 12]

Exit Codes:
    0 OK
    1 WARNING
    2 CRITICAL
    3 UNKNOWN
"""

import argparse
import json
import os
import shutil
import sqlite3
import subprocess
import sys
import time


def check_backup_servers():
    """Check that at least one backup server is configured.

    Reads /var/backuply/conf/backup_servers.json and returns a
    Nagios-style exit code: 0 (OK) or 2 (CRITICAL).
    """
    config_path = "/var/backuply/conf/backup_servers.json"

    if not os.path.exists(config_path):
        print(f"CRITICAL: Backup servers config not found at {config_path}")
        return 2

    try:
        with open(config_path, 'r') as f:
            backup_servers = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f"CRITICAL: Failed to read backup servers config: {e}")
        return 2

    if not backup_servers:
        print("CRITICAL: No backup servers configured")
        return 2

    print(f"OK: {len(backup_servers)} backup server(s) configured")
    return 0


def check_backup_jobs():
    """Check that backup jobs exist and at least one is actively scheduled.

    Reads /var/backuply/conf/backup.json and returns 0 (OK),
    1 (WARNING: jobs exist but none are active) or 2 (CRITICAL).
    """
    config_path = "/var/backuply/conf/backup.json"

    if not os.path.exists(config_path):
        print(f"CRITICAL: Backup jobs config not found at {config_path}")
        return 2

    try:
        with open(config_path, 'r') as f:
            backup_jobs = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f"CRITICAL: Failed to read backup jobs config: {e}")
        return 2

    if not backup_jobs:
        print("CRITICAL: No backup jobs configured")
        return 2

    # schedule_status == 1 marks a job whose schedule is enabled.
    active_jobs = [job for job in backup_jobs if job.get('schedule_status') == 1]
    if not active_jobs:
        print("WARNING: No active backup jobs found")
        return 1

    print(f"OK: {len(active_jobs)} active backup job(s) configured")
    return 0


def parse_job_stats(data_str):
    """Parse master process data JSON into job-level stats.

    Status codes in schedule.users per-user dict:
        1    = success
        0    = failed
        -100 = completed with warnings
        -1 / [] (empty list) = running or queued

    Returns a dict with per-job counters and the list of failed user
    names, or None when *data_str* is not parseable JSON.
    """
    try:
        data = json.loads(data_str)
    except (json.JSONDecodeError, TypeError):
        return None

    schedule = data.get('schedule', {})
    users = schedule.get('users', {})
    total_proc = schedule.get('total_proc', 0)

    failed_users = []
    success = failed = warnings = 0
    for username, u in users.items():
        # Running/queued entries may be an empty list instead of a dict.
        if not isinstance(u, dict):
            continue
        st = u.get('status')
        if st == 1:
            success += 1
        elif st == 0:
            failed += 1
            failed_users.append(username)
        elif st == -100:
            warnings += 1

    # Whatever is not accounted for yet is considered still running.
    running = total_proc - success - failed - warnings
    return {
        'job_name': schedule.get('schedule_name', 'Unknown'),
        'total_proc': total_proc,
        'success': success,
        'failed': failed,
        'warnings': warnings,
        'running': running,
        'failed_users': failed_users,
    }


def _get_current_users_count():
    """Best-effort cPanel account count via whmapi1; None when unavailable."""
    if not os.path.exists("/usr/local/cpanel/cpanel"):
        return None
    try:
        # argv list with shell=False (the default) avoids shell
        # injection/quoting issues; timeout keeps the check from hanging.
        result = subprocess.run(
            ["whmapi1", "--output=jsonpretty", "get_current_users_count"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60,
        )
        data = json.loads(result.stdout)
        return data['data']['users']
    except (OSError, subprocess.SubprocessError, ValueError, KeyError, TypeError):
        # Best effort only: a missing or failing whmapi1 must not break
        # the check (replaces the original bare `except: pass`).
        return None


def _read_uptime_seconds():
    """Return system uptime in seconds, or +inf when /proc/uptime is unreadable.

    Unknown uptime is treated as "old server" so the new-server grace
    period in check_recent_tasks() is never granted by accident.
    """
    try:
        with open('/proc/uptime', 'r') as f:
            return float(f.read().split()[0])
    except (IOError, ValueError):
        return float('inf')


def check_recent_tasks():
    """Check recent backup job executions via master process data in SQLite.

    Command-line thresholds (parsed with parse_known_args so unknown
    Icinga wrapper flags are ignored):
        --chours  look-back window / critical age in hours (default 24)
        --whours  warning age in hours (default 12)

    Prints one status line with perfdata for the most recent job and
    returns a Nagios-style exit code (0/1/2).
    """
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--chours", type=int, default=24,
                        help="Critical hours threshold (default: 24)")
    parser.add_argument("--whours", type=int, default=12,
                        help="Warning hours threshold (default: 12)")
    args, _ = parser.parse_known_args()

    users_count = _get_current_users_count()
    critical_hours = args.chours
    warning_hours = args.whours

    db_path = "/var/backuply/db/tasks.db"
    if not os.path.exists(db_path):
        print(f"WARNING: Database not found at {db_path}")
        return 1

    now = int(time.time())
    critical_threshold = now - (critical_hours * 3600)
    warning_threshold = now - (warning_hours * 3600)

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()

            cursor.execute("""
                SELECT 1 FROM tasks
                WHERE action = 'admin_backup_master_process'
                LIMIT 1
            """)
            has_backup_history = cursor.fetchone() is not None

            # Grace period: a freshly provisioned server (uptime < 24h)
            # with no backup history at all should not alert.
            uptime_seconds = _read_uptime_seconds()
            if uptime_seconds < (24 * 3600) and not has_backup_history:
                uptime_hours = uptime_seconds / 3600
                print(f"OK: New server - uptime {uptime_hours:.1f}h and no backup history")
                return 0

            cursor.execute("""
                SELECT actid, data, created FROM tasks
                WHERE action = 'admin_backup_master_process'
                AND created > ?
                ORDER BY created DESC
            """, (critical_threshold,))
            master_tasks = cursor.fetchall()
        finally:
            # Close on every path — the original leaked the connection
            # when a query raised sqlite3.Error.
            conn.close()
    except sqlite3.Error as e:
        print(f"CRITICAL: Failed to query database: {e}")
        return 2

    if not master_tasks and users_count is not None and users_count > 0:
        print(f"CRITICAL: No backup jobs executed in last {critical_hours}h but {users_count} user(s) exist")
        return 2
    if not master_tasks:
        print(f"OK: No backup jobs in last {critical_hours}h")
        return 0

    jobs = []
    for actid, data_str, created in master_tasks:
        stats = parse_job_stats(data_str)
        if stats is None:
            continue
        stats['actid'] = actid
        stats['created'] = created
        jobs.append(stats)

    if not jobs:
        print(f"OK: {len(master_tasks)} backup job(s) in last {critical_hours}h (no parseable data)")
        return 0

    # Severity buckets: failures newer than the warning threshold are
    # CRITICAL; older failures still inside the look-back window and
    # warnings-only jobs are WARNING.
    critical_failed = [j for j in jobs
                       if j['failed'] > 0 and j['created'] > warning_threshold]
    warning_failed = [j for j in jobs
                      if j['failed'] > 0
                      and warning_threshold >= j['created'] > critical_threshold]
    warning_only = [j for j in jobs
                    if j['warnings'] > 0 and j['failed'] == 0
                    and j['created'] > warning_threshold]

    latest = jobs[0]  # newest first (ORDER BY created DESC)
    perfdata = (
        f"| success={latest['success']};;;0;{latest['total_proc']}"
        f" failed={latest['failed']};;;0;{latest['total_proc']}"
        f" warnings={latest['warnings']};;;0;{latest['total_proc']}"
        f" running={latest['running']};;;0;{latest['total_proc']}"
        f" total={latest['total_proc']}"
    )

    if critical_failed:
        job = critical_failed[0]
        # Show at most 5 failed account names to keep the line readable.
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"CRITICAL: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok, {job['warnings']} warn"
            f" {perfdata}"
        )
        return 2

    if warning_failed:
        job = warning_failed[0]
        sample = ', '.join(job['failed_users'][:5])
        extra = f" (+{len(job['failed_users']) - 5} more)" if len(job['failed_users']) > 5 else ""
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['failed']}/{job['total_proc']} failed"
            f" [{sample}{extra}],"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1

    if warning_only:
        job = warning_only[0]
        print(
            f"WARNING: {job['job_name']}(id:{job['actid']})"
            f" {job['warnings']}/{job['total_proc']} warnings,"
            f" {job['success']} ok"
            f" {perfdata}"
        )
        return 1

    msg = (
        f"OK: {len(jobs)} job(s) in {critical_hours}h,"
        f" latest {latest['job_name']}"
        f" {latest['success']}/{latest['total_proc']} ok"
    )
    if latest['running'] > 0:
        msg += f" ({latest['running']} running)"
    if users_count is not None:
        msg += f", {users_count} users"
    print(f"{msg} {perfdata}")
    return 0


if __name__ == "__main__":
    # When Backuply itself is not installed there is nothing to monitor.
    if not shutil.which("backuply"):
        print("OK: Backuply is not installed on this server")
        sys.exit(0)

    checks = [check_backup_servers, check_backup_jobs, check_recent_tasks]
    for check_fn in checks:
        result = check_fn()
        if result != 0:
            sys.exit(result)