Issue with custom event handler
Posted: Wed Jul 04, 2012 6:18 am
I have written a script that will check the state of a virtual machine, and if critical will connect to the vsphere server and start the VM.
The vsphere side code is:
The NSC.ini file references this bat file as vmstart.
On the nagios server the command is:
The script is:
The service is defined as
The host is defined as:
The VM is currently turned off.
I can (as the nagios user) run the command ./restartvm.sh Nagios-Slave QAESX1 CRITICAL SOFT 3 0 0 and the VM will start up.
I can also call the check_nrpe command /usr/local/nagios/libexec/check_nrpe -H 192.168.2.97 -c vmstart -a QAESX1 Nagios-Slave and the VM will start
However, the VM does not start during normal Nagios checks, and the script does not even update the text file that echoes the values. I have tried various combinations of adding the variables to the service definition, but can't figure out what's causing the event handler not to fire.
I can run the simple check_nrpe commands against the same server and it works, and a simpler script to restart a windows service works fine, so I know NRPE itself is working.
Can anyone help me on this? My head is getting sore from banging it against the desk...
The vsphere side code is:
Code: Select all
@echo off
rem Starts a virtual machine on an ESX host unless it is already powered on.
rem
rem Usage: <this script> <esx-host> <vm-name>
rem   %1  ESX host to connect to (selects the root password below)
rem   %2  text used to find the VM's config path in the "vmware-cmd -l" listing
rem
rem Exits with code 3 when the host is not in the password table or the VM
rem cannot be found, instead of calling vmware-cmd with empty arguments.
rem NOTE(review): 3 is intended as the plugin UNKNOWN status - confirm that
rem NSClient++ passes the script exit code through unchanged.
setlocal

rem "set /p" leaves a variable untouched when its input is empty, so start
rem from a clean slate to avoid acting on stale values.
set "pass="
set "txtConfig="
set "vmState="

rem Scratch files live in %TEMP% so the script does not depend on the
rem working directory it happens to be started from.
set "listFile=%TEMP%\vmstart_config.txt"
set "stateFile=%TEMP%\vmstart_state.txt"

if "%~1" == "QAESX1" set pass=xxx
if "%~1" == "QAESX2" set pass=xxx
if "%~1" == "Mirage" set pass=xxx
if "%~1" == "LTNESX01" set pass=xxx

rem Debug trace of the arguments; the password is deliberately not logged.
echo "I was passed %1 %2" > "C:\Program Files\NSClient++\scripts\result.txt"

if not defined pass (
echo "Unknown ESX host %1"
exit /b 3
)

rem List the registered VMs and keep the first config path matching %2.
"C:\Program Files (x86)\VMware\VMware vSphere CLI\bin\vmware-cmd.pl" -H %1 --username root --password %pass% -l | Find "%~2" > "%listFile%"
set /p txtConfig=<"%listFile%"

if not defined txtConfig (
echo "VM %2 not found on %1"
exit /b 3
)

rem Query the power state; the config path is quoted because datastore
rem paths may contain spaces.
"C:\Program Files (x86)\VMware\VMware vSphere CLI\bin\vmware-cmd.pl" -H %1 --username root --password %pass% "%txtConfig%" getstate > "%stateFile%"
set /p vmState=<"%stateFile%"

if "%vmState%" == "getstate() = on" (
echo "VM is up"
) else (
echo "VM is off, starting..."
"C:\Program Files (x86)\VMware\VMware vSphere CLI\bin\vmware-cmd.pl" -H %1 --username root --password %pass% "%txtConfig%" start hard
)
On the nagios server the command is:
Code: Select all
# Event handler command: hands the host name, the ESX parent and the current
# service state information to restartvm.sh.
# The ESX parent comes from the host's custom variable _PARENT. Nagios exposes
# custom host variables as $_HOST<NAME>$, so the macro is $_HOSTPARENT$
# ($_PARENT$ is not a valid macro and is never substituted).
define command{
command_name restartvm
command_line $USER1$/restartvm.sh $HOSTNAME$ $_HOSTPARENT$ $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTDOWNTIME$ $SERVICEDOWNTIME$
}
Code: Select all
#!/bin/sh
# Nagios event handler: asks the vSphere helper (NRPE command "vmstart" on
# 192.168.2.97) to start a VM when its service goes CRITICAL.
#
# Positional arguments:
#   $1  host name of the VM
#   $2  parent (ESX host the VM runs on)
#   $3  service state       (OK, WARNING, UNKNOWN, CRITICAL)
#   $4  service state type  (SOFT, HARD)
#   $5  service check attempt number
#   $6  host downtime value
#   $7  service downtime value
#
# The VM is started only when the state is CRITICAL, both downtime values
# are 0, and either the state type is HARD or this is the 3rd SOFT attempt.
# Always exits 0.

HostName=$1
Parent=$2
ServiceState=$3
ServiceStateType=$4
ServiceAttempt=$5
HostDownTime=$6
ServiceDownTime=$7

# Concatenation of both downtime values; "00" means neither is in downtime.
servicestatus="$HostDownTime""$ServiceDownTime"

# Write the debug log next to this script rather than into the current
# working directory: when started by the Nagios daemon the working directory
# is not necessarily the script's directory (and may not be writable), so a
# relative path made the log appear to never update.
LogFile="$(dirname "$0")/vmcheck.txt"
{
    echo "Parent is $Parent"
    echo "Hostname is $HostName"
    echo "Service State is $ServiceState"
    echo "Service state type is $ServiceStateType"
    echo "Service attempt is $ServiceAttempt"
    echo "Host down time is $HostDownTime"
    echo "Service downtime is $ServiceDownTime"
    echo "Service Status is $servicestatus"
} > "$LogFile"

# Print a progress message ($1) and request the VM start through NRPE.
start_vm() {
    # Plain echo: "echo -e" is not portable under /bin/sh and the messages
    # contain no escape sequences.
    echo "$1"
    /usr/local/nagios/libexec/check_nrpe -H 192.168.2.97 -c vmstart -a "$Parent" "$HostName"
}

# OK, WARNING and UNKNOWN need no action; only CRITICAL outside of any
# downtime triggers a start.
if [ "$ServiceState" = "CRITICAL" ] && [ "$servicestatus" = "00" ]; then
    case "$ServiceStateType" in
        SOFT)
            # Act early, on the 3rd soft failure, before the state goes HARD.
            if [ "$ServiceAttempt" = "3" ]; then
                start_vm "Starting VM (3rd soft critical state)..."
            fi
            ;;
        HARD)
            start_vm "Starting VM..."
            ;;
    esac
fi

exit 0
Code: Select all
# PING service for the Nagios-Slave VM. State changes of this service run the
# "restartvm" event handler.
define service{
    use                     generic-service
    host_name               Nagios-Slave
    service_description     PING

    # Check definition.
    check_command           check_ping!950.0,20%!1200.0,60%
    max_check_attempts      4

    # Event handling.
    event_handler_enabled   1
    event_handler           restartvm
}
Code: Select all
# The Nagios-Slave virtual machine, which runs on the ESX host QAESX1.
define host{
    use                 apache-server
    host_name           Nagios-Slave
    alias               Nagios-Slave
    address             192.168.2.148

    # Topology: "parents" is the Nagios parent relationship; _PARENT is a
    # custom host variable holding the same ESX host name.
    parents             QAESX1
    _PARENT             QAESX1

    # Presentation.
    icon_image          rack_linux.png
    statusmap_image     rack_linux.png
}
I can (as the nagios user) run the command ./restartvm.sh Nagios-Slave QAESX1 CRITICAL SOFT 3 0 0 and the VM will start up.
I can also call the check_nrpe command /usr/local/nagios/libexec/check_nrpe -H 192.168.2.97 -c vmstart -a QAESX1 Nagios-Slave and the VM will start
However, the VM does not start during normal Nagios checks, and the script does not even update the text file that echoes the values. I have tried various combinations of adding the variables to the service definition, but can't figure out what's causing the event handler not to fire.
I can run the simple check_nrpe commands against the same server and it works, and a simpler script to restart a windows service works fine, so I know NRPE itself is working.
Can anyone help me on this? My head is getting sore from banging it against the desk...