The event handler script lives in /etc/nagios/conf.d:
Code: Select all
4 -rwxr-xr-x 1 nagios nagios 2946 Mar 30 13:29 restarting-services.sh
Code: Select all
#!/usr/bin/env bash
#
# Nagios event handler: restarts a service on a remote host through NRPE when
# its check goes CRITICAL.
#
# Positional arguments:
#   $1  service state        (OK | WARNING | UNKNOWN | CRITICAL)
#   $2  service state type   (SOFT | HARD)
#   $3  current check attempt number
#   $4  host handed to check_nrpe -H
#   $5  argument handed to the NRPE "restart-service" command (-a)
#   $6  service description, used only in messages and the log
#
# Always exits 0, whatever the outcome of the restart.

date=$(date)
log_file=/tmp/nagios-autorestart.log

# Record the restart attempt in the log, then ask NRPE to restart the service.
#   $1 - state type label written to the log (SOFT or HARD)
#   $2 - remote host
#   $3 - argument for the NRPE restart-service command
#   $4 - service description
restart_remote_service() {
    local label=$1 host=$2 nrpe_arg=$3 desc=$4
    local rc=0

    echo "${date} - restart ${desc} - ${label}" >> "${log_file}"

    # Call NRPE to restart the service on the remote machine.
    /usr/lib/nagios/plugins/check_nrpe -H "${host}" -c restart-service -a "${nrpe_arg}" || rc=$?

    # Leave a trace when the restart request itself failed, instead of
    # silently discarding the exit status.
    if (( rc != 0 )); then
        echo "${date} - restart ${desc} - ${label} - check_nrpe failed (exit ${rc})" >> "${log_file}"
    fi
}

# What state is the service in?
case "${1}" in
    OK)
        # The service just came back up, so don't do anything...
        ;;
    WARNING)
        # We don't really care about warning states, since the service is
        # probably still running...
        ;;
    UNKNOWN)
        # We don't know what might be causing an unknown error, so don't do
        # anything...
        ;;
    CRITICAL)
        # The service appears to have a problem - perhaps we should restart it.
        # Is this a "soft" or a "hard" state?
        case "${2}" in
            SOFT)
                # Nagios is still retrying the check before the state turns
                # "hard" and contacts get notified. Don't restart on the first
                # attempts, because the failure may just be a fluke: wait until
                # the check has been tried 3 times. If the restart works, the
                # 4th check yields a "soft" recovery and no one is notified.
                case "${3}" in
                    3)
                        # The message is passed as a %s argument, so the newline
                        # must live in the format string to be interpreted.
                        printf '%s\n' "Restarting service ${6} (3rd soft critical state)..."
                        restart_remote_service SOFT "${4}" "${5}" "${6}"
                        ;;
                esac
                ;;
            HARD)
                # The 4th attempt failed as well (state is now "hard"): try one
                # more restart.
                case "${3}" in
                    4)
                        printf '%s\n' "Restarting ${6} service..."
                        restart_remote_service HARD "${4}" "${5}" "${6}"
                        ;;
                esac
                ;;
        esac
        ;;
esac

exit 0
Code: Select all
# Command "check_salt": runs check_nrpe against the host's address and asks the
# remote NRPE daemon to execute its "check_procs" command with the single
# argument "salt".
# NOTE(review): "-t 60" is presumably the check_nrpe timeout in seconds and
# "-u" its connection-problem handling flag - confirm against check_nrpe --help.
define command {
command_name check_salt
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -u -t 60 -c check_procs -a salt
}
Code: Select all
# Service that checks salt on my.server.com via check_salt and, on every state
# change, runs the restart-service event handler with "salt" as $ARG1$.
define service {
    name                    salt_alive
    use                     generic-service
    host_name               my.server.com
    # Nagios identifies a service by "service_description"; a bare
    # "description" directive is not part of the service object definition.
    service_description     Check salt is alive
    check_command           check_salt
    # The event handler script only acts on SOFT attempt 3 and HARD attempt 4,
    # so the service must go HARD on the 4th attempt.
    # NOTE(review): set explicitly because the value inherited from
    # generic-service is not visible here - confirm it matches your intent.
    max_check_attempts      4
    notification_options    w,c,r
    notifications_enabled   0
    contact_groups          linux-team
    event_handler           restart-service!salt
}
Code: Select all
# Command "restart-service": event handler entry point. Invokes
# restarting-services.sh with six positional arguments, in this order:
#   $1 = $SERVICESTATE$      (the script branches on OK/WARNING/UNKNOWN/CRITICAL)
#   $2 = $SERVICESTATETYPE$  (the script branches on SOFT/HARD)
#   $3 = $SERVICEATTEMPT$    (the script acts on 3 for SOFT and 4 for HARD)
#   $4 = $HOSTADDRESS$       (passed to check_nrpe -H)
#   $5 = $ARG1$              (passed to the NRPE restart-service command via -a)
#   $6 = $SERVICEDESC$       (used in the script's messages and log lines)
define command {
command_name restart-service
command_line /etc/nagios/conf.d/restarting-services.sh "$SERVICESTATE$" "$SERVICESTATETYPE$" "$SERVICEATTEMPT$" "$HOSTADDRESS$" "$ARG1$" "$SERVICEDESC$"
}
Code: Select all
# NRPE command "restart-service": runs "service <ARG1> restart" through sudo,
# where <ARG1> is the argument the caller supplies with check_nrpe -a.
# NOTE(review): $ARG1$ substitution presumably requires command arguments to be
# enabled in the NRPE daemon configuration - confirm on the remote host.
command[restart-service]=/usr/bin/sudo /usr/sbin/service $ARG1$ restart
EDIT:
I forgot to mention that, on the remote host, I have allowed the nagios user to run the service command via sudoers.