This hp_insight_agents plugin runs fine on the command line; however, it returns null in Nagios.
Code: Select all
#!/bin/bash
#########################################################
# #
# HP/Compaq Insight Management Agents Checker #
# #
# check_insight_agents #
# Version 2.0 (October 7, 2011) #
# #
# Authored by Jason Leonard #
# E-mail: [email protected] #
# #
# Overview #
# ----------------------------------------------------- #
# This plugin started out as individual rewrites #
# of the check_hp plugins - 6 scripts that each provide #
# information on different HP component statuses. I #
# found myself quite a fan of the plugin's output, and #
# was using them to learn SNMP and Nagios scripting. #
# Yet in the plugins, I saw where the code used many #
# if/then conditions, where case/select statements #
# could provide a more efficient analysis and reduce #
# redundancy in the code. #
# #
# As things developed, I had a strong desire to #
# find/use 1 plugin that could report all information. #
# A few were already developed, but as I tested them, #
# they each seemed to lack output that was meaningful. #
# In addition, a more amateur programmer myself, I #
# found myself struggling with the logic of other code. #
# So I set to try and combine the 6 individual plugins #
# into 1. This is the result of that work. #
# #
# The basic flow of the plugin is that it checks #
# global health first, and reports on that condition #
# most of the time. This leaves the program to only #
# make 1 check on most iterations. Individual compon- #
# ents are checked only when the global status might #
# indicate something wrong. At this point, although #
# chances are only 1 item has failed, the plugin makes #
# its best attempt to assume nothing and give info. on #
# ANY component that has failed. This also differs from #
# the check_hp plugin, which tended to output each #
# components status as checked, whether failed or not. #
# #
# I readily admit that the plugins have had #
# limited testing in our own environment, but we did #
# have a horrible old Compaq server that I was able to #
# use to test a number of scenarios. Its multiple #
# hardware failures actually encouraged me to make sure #
# the plugin could report more than 1 failed component. #
# #
# Hopefully, as this plugin is used, I can get #
# more feedback on where I may have missed something #
# in my logic - I took what seems to be a very unique #
# approach in checking "global" health before checking #
# any other conditions. #
# #
# As with every piece of code I write, I made a #
# strong effort for the code to be easy to follow, and #
# employed meaningful variable names to help clarify #
# whatever might be going on in the code. #
# #
# This plugin is distributed under the GNU GPL license. #
# You may re-distribute only according to the terms of #
# the GNU GPL v2. #
# #
#########################################################
#########################################################
## GLOBAL VARIABLES ##
#########################################################
# APPNAME     - script name, used in help and error output
# VERSION     - plugin version reported by -V
# COMMUNITY   - SNMP read community (default "public", overridden by -C)
# EXIT_CODE   - Nagios return code accumulated by the checks
# EXIT_STRING - status text accumulated by the checks
APPNAME=$(basename "$0")
VERSION="2.0"
COMMUNITY="public"
EXIT_CODE=0
EXIT_STRING=""
#########################################################
## print_help Function ##
#########################################################
# Prints out user help and gives examples of proper #
# plugin usage #
#########################################################
function print_help () {
    # Print the usage/help text to stdout and return 3 (Nagios UNKNOWN) so a
    # misinvoked plugin is flagged rather than treated as OK.
    # Globals read: APPNAME
    # (Fixed user-facing typo: "developped" -> "developed")
    cat <<EOF
Insight Management Agent plugin for Nagios

This plugin is not developed by the Nagios Plugin group.
Please do not e-mail them for support on this plugin.

For contact info, please read the plugin script file.

Usage of $APPNAME
 $APPNAME -H <host/IP> -C <community> | -h | -V 
---------------------------------------------------------------------
Usable Options:
 -C public
 (required option)
 The SNMP Community variable - use the name of an SNMP community with read privileges
 By default, the community is assumed to be public
 -H hostname
 (required option)
 The IP address or hostname of the system to check
 -h
 This help screen
 -V
 show the current version of the plugin

Example:
 $APPNAME -H 10.0.1.10 -C public

---------------------------------------------------------------------
EOF
    return 3
}
#########################################################
## ThermalChecks function ##
#########################################################
# Checks all thermal components - fans, temperatures, #
# etc. Reports details of all failed components. #
#########################################################
function ThermalChecks () {
    # Check all thermal components (temperature sensors and fans) and append
    # details of any degraded/failed component to EXIT_STRING.
    #   Globals read:    COMMUNITY, HOST_NAME
    #   Globals written: EXIT_STRING (appended to)
    #   Returns: Nagios code - 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
    THERMAL_EXIT_CODE=0
    THERMAL_STATUS_TEXT=""
    # BUG FIX: initialize both sub-check codes so the final comparison below
    # never operates on unset variables
    TEMP_EXIT_CODE=0
    FAN_EXIT_CODE=0
    # check fans / temps
    GLOBAL_THERMAL_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.1 2>/dev/null | cut -f4 -d ' ')
    case "$GLOBAL_THERMAL_STATUS" in
    1)
        THERMAL_STATUS_TEXT="UNKNOWN: Checking thermal conditions is not supported for $HOST_NAME! Make sure SNMP is properly configured and that all Insight Management Agents are installed.\n"
        THERMAL_EXIT_CODE=3
        ;;
    2) # Thermal status is okay - we only report anything degraded/failed
        THERMAL_EXIT_CODE=0
        ;;
    3|4) # Failed/degraded status - check individual temperatures AND fans
        # ----- Check our temperatures first -----
        GLOBAL_TEMP_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.3 2>/dev/null | cut -f4 -d ' ')
        case "$GLOBAL_TEMP_STATUS" in
        1) # "unknown" - implicit if GLOBAL_THERMAL_STATUS is 1
            TEMP_EXIT_CODE=3
            ;;
        2) # okay - nothing further to check
            TEMP_EXIT_CODE=0
            ;;
        3)
            THERMAL_STATUS_TEXT="CRITICAL:"
            TEMPERATURE_CONDITIONS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.8.1.6 2>/dev/null)
            # Keep only sensors in degraded state
            SENSORS_DEGRADED=$(echo "$TEMPERATURE_CONDITIONS" | grep "INTEGER: 3")
            # BUG FIX: 'wc -l' reports 1 for an empty string, which made the
            # loop below run once with no sensor data
            if [ -n "$SENSORS_DEGRADED" ]; then
                NUM_SENSORS_DEGRADED=$(echo "$SENSORS_DEGRADED" | wc -l)
            else
                NUM_SENSORS_DEGRADED=0
            fi
            CURRENT_SENSOR=1
            while [ "$CURRENT_SENSOR" -le "$NUM_SENSORS_DEGRADED" ]; do
                # Sensor index comes from the 10th dot-field of the OID
                SENSOR_NUM=$(echo "$SENSORS_DEGRADED" | cut -f1 -d ' ' | cut -f10 -d '.' | awk "NR==$CURRENT_SENSOR")
                SENSOR_TYPE=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.8.1.3.0.$SENSOR_NUM 2>/dev/null | cut -f4 -d ' ')
                SENSOR_TEMP=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.8.1.4.0.$SENSOR_NUM 2>/dev/null | cut -f4 -d ' ')
                SENSOR_THRESHOLD=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.8.1.5.0.$SENSOR_NUM 2>/dev/null | cut -f4 -d ' ')
                # Sensor locale enumeration from the Compaq health MIB
                case "$SENSOR_TYPE" in
                1) SENSOR_DESCR="other";;
                2) SENSOR_DESCR="unknown";;
                3) SENSOR_DESCR="system";;
                4) SENSOR_DESCR="systemBoard";;
                5) SENSOR_DESCR="ioBoard";;
                6) SENSOR_DESCR="cpu";;
                7) SENSOR_DESCR="memory";;
                8) SENSOR_DESCR="storage";;
                9) SENSOR_DESCR="removableMedia";;
                10) SENSOR_DESCR="powerSupply";;
                11) SENSOR_DESCR="ambient";;
                12) SENSOR_DESCR="chassis";;
                13) SENSOR_DESCR="bridgeCard";;
                esac
                # List all problem sensors sequentially, comma-separated
                if [ "$CURRENT_SENSOR" -lt "$NUM_SENSORS_DEGRADED" ]; then
                    THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT $SENSOR_DESCR temperature is $SENSOR_TEMP degrees Celsius - system will shut down at $SENSOR_THRESHOLD degrees Celsius!,"
                else
                    THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT $SENSOR_DESCR temperature is $SENSOR_TEMP degrees Celsius - system will shut down at $SENSOR_THRESHOLD degrees Celsius!"
                fi
                CURRENT_SENSOR=$(( CURRENT_SENSOR + 1 ))
            done
            TEMP_EXIT_CODE=2
            ;;
        4) # status 4 will shut the system down - nothing to handle here
            ;;
        esac
        # ----- Check our fans next -----
        GLOBAL_FAN_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.4 2>/dev/null | cut -f4 -d ' ')
        FAN_CONDITIONS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.7.1.9 2>/dev/null)
        case "$GLOBAL_FAN_STATUS" in
        1) # Implicit from the global thermal status - no details needed
            FAN_EXIT_CODE=3
            ;;
        2) # Implicit from the global thermal status - no details needed
            FAN_EXIT_CODE=0
            ;;
        3|4)
            # BUG FIX: append the severity tag instead of overwriting, so any
            # temperature detail collected above is not discarded
            if [ "$GLOBAL_FAN_STATUS" -eq 3 ]; then
                THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT WARNING:"
                FAN_EXIT_CODE=1
            else
                # Theoretically never seen when 4 - the system shuts down,
                # UNLESS 1.3.6.1.4.1.232.6.2.6.2.0 = 2
                THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT CRITICAL:"
                FAN_EXIT_CODE=2
            fi
            # Keep only fans in degraded/failed state
            FANS_AFFECTED=$(echo "$FAN_CONDITIONS" | egrep "INTEGER: 3|INTEGER: 4")
            # BUG FIX: same empty-list guard as for the sensors above
            if [ -n "$FANS_AFFECTED" ]; then
                NUM_FANS_AFFECTED=$(echo "$FANS_AFFECTED" | wc -l)
            else
                NUM_FANS_AFFECTED=0
            fi
            CURRENT_FAN=1
            while [ "$CURRENT_FAN" -le "$NUM_FANS_AFFECTED" ]; do
                # The real fan index comes from the OID of the walk line
                FAN_NUM=$(echo "$FANS_AFFECTED" | cut -f1 -d ' ' | cut -f10 -d '.' | awk "NR==$CURRENT_FAN")
                # Differentiate degraded from failed
                FAN_STATUS=$(echo "$FANS_AFFECTED" | cut -f4 -d ' ' | awk "NR==$CURRENT_FAN")
                # BUG FIX: the per-fan lookups below previously used the loop
                # counter and even $PSU_NUM (a PowerSupplyCheck variable);
                # they must all use this fan's own index, $FAN_NUM
                FAN_LOCATION=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.7.1.3.0.$FAN_NUM 2>/dev/null | cut -f4 -d ' ')
                # Descriptive text for the fan's location
                case "$FAN_LOCATION" in
                1) FAN_DESCR="other";;
                2) FAN_DESCR="unknown";;
                3) FAN_DESCR="system";;
                4) FAN_DESCR="systemBoard";;
                5) FAN_DESCR="ioBoard";;
                6) FAN_DESCR="cpu";;
                7) FAN_DESCR="memory";;
                8) FAN_DESCR="storage";;
                9) FAN_DESCR="removableMedia";;
                10) FAN_DESCR="powerSupply";;
                11) FAN_DESCR="ambient";;
                12) FAN_DESCR="chassis";;
                13) FAN_DESCR="bridgeCard";;
                esac
                if [ "$FAN_STATUS" -eq 3 ]; then
                    FAN_STATUS_TEXT="is degraded"
                else
                    FAN_STATUS_TEXT="has failed"
                fi
                # The MIBs say degraded implies redundant; double-check anyway
                FAN_REDUNDANCY=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.7.1.7.0.$FAN_NUM 2>/dev/null | cut -f4 -d ' ')
                case "$FAN_REDUNDANCY" in
                2) FAN_REDUNDANCY_TEXT="NOT redundant";;
                3) FAN_REDUNDANCY_TEXT="redundant";;
                esac
                # Hot-swappable fans do not require taking the system down
                IS_FAN_HOT_SWAPPABLE=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.6.7.1.10.0.$FAN_NUM 2>/dev/null | cut -f4 -d ' ')
                case "$IS_FAN_HOT_SWAPPABLE" in
                2) FAN_SWAPPABLE_TEXT="NOT hot-swappable";;
                3) FAN_SWAPPABLE_TEXT="hot-swappable";;
                esac
                # Report each fan by its real index, comma-separated
                if [ "$CURRENT_FAN" -lt "$NUM_FANS_AFFECTED" ]; then
                    THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT System Fan # $FAN_NUM ($FAN_DESCR) $FAN_STATUS_TEXT (fan is $FAN_REDUNDANCY_TEXT, $FAN_SWAPPABLE_TEXT),"
                else
                    THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT System Fan # $FAN_NUM ($FAN_DESCR) $FAN_STATUS_TEXT (fan is $FAN_REDUNDANCY_TEXT, $FAN_SWAPPABLE_TEXT)."
                fi
                CURRENT_FAN=$(( CURRENT_FAN + 1 ))
            done
            ;;
        esac
        # BUG FIX: the original '[ $FAN_EXIT_CODE -o $TEMP_EXIT_CODE -eq 2 ]'
        # was true whenever FAN_EXIT_CODE was any non-empty string
        if [ "$FAN_EXIT_CODE" -eq 2 ] || [ "$TEMP_EXIT_CODE" -eq 2 ]; then
            THERMAL_EXIT_CODE=2
        else
            # Nothing was worse than a warning
            THERMAL_EXIT_CODE=1
        fi
        ;;
    esac
    # NOTE(review): CPU Fan Status is not checked - the agent for it appears
    # to be uncommon (the original author left only a commented-out stub)
    EXIT_STRING="$EXIT_STRING $THERMAL_STATUS_TEXT"
    return $THERMAL_EXIT_CODE
}
#########################################################
## PowerSupplyCheck function ##
#########################################################
# Checks all power supplies. Returns status of all #
# failed components. #
#########################################################
function PowerSupplyCheck () {
    # Check all power supplies and append details of any degraded/failed PSU
    # to EXIT_STRING.
    #   Globals read:    COMMUNITY, HOST_NAME
    #   Globals written: EXIT_STRING (appended to)
    #   Returns: Nagios code - 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
    PSU_EXIT_CODE=0
    GLOBAL_PSU_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.9.1 2>/dev/null | cut -f4 -d ' ')
    PSU_CONDITIONS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.9.3.1.4 2>/dev/null)
    case "$GLOBAL_PSU_STATUS" in
    1)
        # BUG FIX: append rather than overwrite, so output from checks run
        # before this one is not thrown away
        EXIT_STRING="$EXIT_STRING UNKNOWN: Checking power supplies is not supported for $HOST_NAME! Make sure SNMP is properly configured and that all Insight Management Agents are installed.\n"
        PSU_EXIT_CODE=3
        ;;
    2) # Power supplies are okay - only degraded/failed states are reported
        PSU_EXIT_CODE=0
        ;;
    3|4)
        if [ "$GLOBAL_PSU_STATUS" -eq 3 ]; then
            EXIT_STRING="$EXIT_STRING WARNING:"
            PSU_EXIT_CODE=1
        else
            EXIT_STRING="$EXIT_STRING CRITICAL:"
            PSU_EXIT_CODE=2
        fi
        # Keep only PSUs in degraded/failed state
        PSUs_AFFECTED=$(echo "$PSU_CONDITIONS" | egrep "INTEGER: 3|INTEGER: 4")
        # BUG FIX: 'wc -l' reports 1 for an empty string - guard it
        if [ -n "$PSUs_AFFECTED" ]; then
            NUM_PSUs_AFFECTED=$(echo "$PSUs_AFFECTED" | wc -l)
        else
            NUM_PSUs_AFFECTED=0
        fi
        CURRENT_PSU=1
        # For each degraded PSU, output its status
        while [ "$CURRENT_PSU" -le "$NUM_PSUs_AFFECTED" ]; do
            # The real PSU index comes from the OID of the walk line
            PSU_NUM=$(echo "$PSUs_AFFECTED" | cut -f1 -d ' ' | cut -f10 -d '.' | awk "NR==$CURRENT_PSU")
            # Differentiate degraded from failed
            PSU_STATUS=$(echo "$PSUs_AFFECTED" | cut -f4 -d ' ' | awk "NR==$CURRENT_PSU")
            # Error code for the current power supply
            PSU_ERROR_CODE=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.9.3.1.5.0.$PSU_NUM 2>/dev/null | cut -f4 -d ' ')
            # PSU error-code enumeration from the Compaq health MIB
            case "$PSU_ERROR_CODE" in
            1) PSU_ERROR_DESCR="No Error";;
            2) PSU_ERROR_DESCR="2 (generalFailure)";;
            3) PSU_ERROR_DESCR="3 (bistFailure)";;
            4) PSU_ERROR_DESCR="4 (fanFailure)";;
            5) PSU_ERROR_DESCR="5 (tempFailure)";;
            6) PSU_ERROR_DESCR="6 (interlockOpen)";;
            7) PSU_ERROR_DESCR="7 (epromFailed)";;
            8) PSU_ERROR_DESCR="8 (vrefFailed)";;
            9) PSU_ERROR_DESCR="9 (dacFailed)";;
            10) PSU_ERROR_DESCR="10 (ramTestFailed)";;
            11) PSU_ERROR_DESCR="11 (voltageChannelFailed)";;
            12) PSU_ERROR_DESCR="12 (orringdiodeFailed)";;
            13) PSU_ERROR_DESCR="13 (brownOut)";;
            14) PSU_ERROR_DESCR="14 (giveupOnStartup)";;
            15) PSU_ERROR_DESCR="15 (nvramInvalid)";;
            16) PSU_ERROR_DESCR="16 (calibrationTableInvalid)";;
            esac
            if [ "$PSU_STATUS" -eq 3 ]; then
                PSU_STATUS_TEXT="is degraded"
            else
                PSU_STATUS_TEXT="has failed"
            fi
            # Redundancy status of the current PSU
            PSU_REDUNDANCY=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.9.3.1.9.0.$PSU_NUM 2>/dev/null | cut -f4 -d ' ')
            case "$PSU_REDUNDANCY" in
            2) PSU_REDUNDANCY_TEXT="Non-redundant";;
            3) PSU_REDUNDANCY_TEXT="Redundant";;
            *) PSU_REDUNDANCY_TEXT=""
            esac
            # BUG FIX: the loop bound previously referenced the undefined
            # NUM_PSUs_DEGRADED; also report the real PSU index ($PSU_NUM)
            # rather than the loop counter
            if [ "$CURRENT_PSU" -lt "$NUM_PSUs_AFFECTED" ]; then
                EXIT_STRING="$EXIT_STRING $PSU_REDUNDANCY_TEXT Power Supply # $PSU_NUM $PSU_STATUS_TEXT with error $PSU_ERROR_DESCR, "
            else
                EXIT_STRING="$EXIT_STRING $PSU_REDUNDANCY_TEXT Power Supply # $PSU_NUM $PSU_STATUS_TEXT with error $PSU_ERROR_DESCR. "
            fi
            CURRENT_PSU=$(( CURRENT_PSU + 1 ))
        done
        ;;
    esac
    return $PSU_EXIT_CODE
}
#########################################################
## DriveChecks function ##
#########################################################
# Checks all hard drives AND drive controllers. #
# Returns details on any failed components. #
#########################################################
function DriveChecks () {
    # Check all array controllers plus their logical and physical drives, and
    # append details of any degraded/failed component to EXIT_STRING.
    #   Globals read:    COMMUNITY, HOST_NAME
    #   Globals written: EXIT_STRING (appended to)
    #   Returns: Nagios code - 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
    DRIVE_EXIT_CODE=0
    DRIVE_STATUS_TEXT=""
    CONTROLLERS_AFFECTED=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.2.1.1.6 2>/dev/null | egrep "INTEGER: 3|INTEGER: 4")
    # BUG FIX: 'wc -l' reports 1 for an empty string, so the original looped
    # once with no controller data even when every controller was healthy
    if [ -n "$CONTROLLERS_AFFECTED" ]; then
        NUM_CONTROLLERS_AFFECTED=$(echo "$CONTROLLERS_AFFECTED" | wc -l)
    else
        NUM_CONTROLLERS_AFFECTED=0
    fi
    CURRENT_CONTROLLER=1
    while [ "$CURRENT_CONTROLLER" -le "$NUM_CONTROLLERS_AFFECTED" ]; do
        # Controller index is the 9th dot-field of the walk line's OID
        CONTROLLER_INDEX=$(echo "$CONTROLLERS_AFFECTED" | cut -f1 -d ' ' | cut -f9 -d '.' | awk "NR==$CURRENT_CONTROLLER")
        CONTROLLER_LOCATION=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.2.1.1.20.$CONTROLLER_INDEX 2>/dev/null | cut -f4 -d ':' | tr -d '"')
        CONTROLLER_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.2.1.1.6.$CONTROLLER_INDEX 2>/dev/null | cut -f4 -d ' ')
        # BUG FIX: the walk results get their own variables - the original
        # reused LOGICAL_DRIVE_ERROR / PHYSICAL_DRIVE_STATUS for both the
        # full walk and the per-drive values, clobbering the lists
        LOGICAL_DRIVE_ERRORS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.3.1.1.11 2>/dev/null)
        PHYSICAL_DRIVE_STATES=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.5.1.1.37 2>/dev/null)
        CONTROLLER_TEXT=""
        LOGICAL_DRIVE_STATUS=0
        PHYSICAL_DRIVE_STATUS=0
        case "$CONTROLLER_STATUS" in
        1) # Unknown
            CONTROLLER_TEXT="UNKNOWN: Checking drive conditions is not supported for $HOST_NAME! Make sure SNMP is properly configured and that all Insight Management Agents are installed.\n"
            DRIVE_EXIT_CODE=3
            ;;
        2) # Everything is okay, no need to report anything
            ;;
        3) # Controller is still usable but degraded - drill into the drives
            CONTROLLER_TEXT="Array Controller at$CONTROLLER_LOCATION is degraded. REASON -"
            #------------------------------------
            #----- Check out logical drives -----
            #------------------------------------
            LOGICAL_DRIVES_AFFECTED=$(echo "$LOGICAL_DRIVE_ERRORS" | egrep "INTEGER: 3|INTEGER: 4")
            if [ -n "$LOGICAL_DRIVES_AFFECTED" ]; then
                NUM_LOGICAL_DRIVES_AFFECTED=$(echo "$LOGICAL_DRIVES_AFFECTED" | wc -l)
            else
                NUM_LOGICAL_DRIVES_AFFECTED=0
            fi
            CURRENT_LOGICAL_DRIVE=1
            while [ "$CURRENT_LOGICAL_DRIVE" -le "$NUM_LOGICAL_DRIVES_AFFECTED" ]; do
                # Drive number within the array
                LOGICAL_DRIVE_NUM=$(echo "$LOGICAL_DRIVES_AFFECTED" | cut -f1 -d ' ' | cut -f10 -d '.' | awk "NR==$CURRENT_LOGICAL_DRIVE")
                # Degraded (3) or failed (4)
                LOGICAL_DRIVE_STATUS=$(echo "$LOGICAL_DRIVES_AFFECTED" | cut -f4 -d ' ' | awk "NR==$CURRENT_LOGICAL_DRIVE")
                # Basic identity/properties of this drive
                LOGICAL_DRIVE_TEXT=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.3.1.1.14.$CONTROLLER_INDEX.$LOGICAL_DRIVE_NUM 2>/dev/null | cut -f4 -d ':' | tr -d '"')
                LOGICAL_DRIVE_ERROR=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.3.1.1.4.$CONTROLLER_INDEX.$LOGICAL_DRIVE_NUM 2>/dev/null | cut -f4 -d ' ')
                # Logical-drive status enumeration from the Compaq IDA MIB
                case "$LOGICAL_DRIVE_ERROR" in
                1) LOGICAL_DRIVE_ERROR_DESCR="1 (other)";;
                2) LOGICAL_DRIVE_ERROR_DESCR="2 (ok)";;
                3) LOGICAL_DRIVE_ERROR_DESCR="3 (failed)";;
                4) LOGICAL_DRIVE_ERROR_DESCR="4 (unconfigured)";;
                5) LOGICAL_DRIVE_ERROR_DESCR="5 (recovering)";;
                6) LOGICAL_DRIVE_ERROR_DESCR="6 (readyForRebuild)";;
                7) LOGICAL_DRIVE_ERROR_DESCR="7 (rebuilding)";;
                8) LOGICAL_DRIVE_ERROR_DESCR="8 (wrongDrive)";;
                9) LOGICAL_DRIVE_ERROR_DESCR="9 (badConnect)";;
                10) LOGICAL_DRIVE_ERROR_DESCR="10 (overheating)";;
                11) LOGICAL_DRIVE_ERROR_DESCR="11 (shutdown)";;
                12) LOGICAL_DRIVE_ERROR_DESCR="12 (expanding)";;
                13) LOGICAL_DRIVE_ERROR_DESCR="13 (notAvailable)";;
                14) LOGICAL_DRIVE_ERROR_DESCR="14 (queuedForExpansion)";;
                esac
                if [ "$LOGICAL_DRIVE_STATUS" -eq 3 ]; then
                    LOGICAL_DRIVE_STATUS_TEXT="is degraded"
                else
                    LOGICAL_DRIVE_STATUS_TEXT="has failed"
                fi
                # List all drives affected, comma-separated
                if [ "$CURRENT_LOGICAL_DRIVE" -lt "$NUM_LOGICAL_DRIVES_AFFECTED" ]; then
                    CONTROLLER_TEXT="$CONTROLLER_TEXT Logical Drive$LOGICAL_DRIVE_TEXT $LOGICAL_DRIVE_STATUS_TEXT with error $LOGICAL_DRIVE_ERROR_DESCR,"
                else
                    CONTROLLER_TEXT="$CONTROLLER_TEXT Logical Drive$LOGICAL_DRIVE_TEXT $LOGICAL_DRIVE_STATUS_TEXT with error $LOGICAL_DRIVE_ERROR_DESCR."
                fi
                CURRENT_LOGICAL_DRIVE=$(( CURRENT_LOGICAL_DRIVE + 1 ))
            done
            #-------------------------------------
            #----- Check out physical drives -----
            #-------------------------------------
            PHYSICAL_DRIVES_AFFECTED=$(echo "$PHYSICAL_DRIVE_STATES" | egrep "INTEGER: 3|INTEGER: 4")
            if [ -n "$PHYSICAL_DRIVES_AFFECTED" ]; then
                NUM_PHYSICAL_DRIVES_AFFECTED=$(echo "$PHYSICAL_DRIVES_AFFECTED" | wc -l)
            else
                NUM_PHYSICAL_DRIVES_AFFECTED=0
            fi
            CURRENT_PHYSICAL_DRIVE=1
            while [ "$CURRENT_PHYSICAL_DRIVE" -le "$NUM_PHYSICAL_DRIVES_AFFECTED" ]; do
                # Drive number within the array
                PHYSICAL_DRIVE_NUM=$(echo "$PHYSICAL_DRIVES_AFFECTED" | cut -f1 -d ' ' | cut -f10 -d '.' | awk "NR==$CURRENT_PHYSICAL_DRIVE")
                # Degraded (3) or failed (4)
                PHYSICAL_DRIVE_STATUS=$(echo "$PHYSICAL_DRIVES_AFFECTED" | cut -f4 -d ' ' | awk "NR==$CURRENT_PHYSICAL_DRIVE")
                # Basic identity/properties of this drive
                PHYSICAL_DRIVE_BAY=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.5.1.1.64.$CONTROLLER_INDEX.$PHYSICAL_DRIVE_NUM 2>/dev/null | cut -f4 -d ':' | tr -d '"')
                PHYSICAL_DRIVE_ERROR=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.3.2.5.1.1.6.$CONTROLLER_INDEX.$PHYSICAL_DRIVE_NUM 2>/dev/null | cut -f4 -d ' ')
                # Physical-drive condition enumeration from the Compaq IDA MIB
                case "$PHYSICAL_DRIVE_ERROR" in
                1) PHYSICAL_DRIVE_ERROR_DESCR="1 (other)";;
                2) PHYSICAL_DRIVE_ERROR_DESCR="2 (ok)";;
                3) PHYSICAL_DRIVE_ERROR_DESCR="3 (failed)";;
                4) PHYSICAL_DRIVE_ERROR_DESCR="4 (predictiveFailure)";;
                esac
                if [ "$PHYSICAL_DRIVE_STATUS" -eq 3 ]; then
                    PHYSICAL_DRIVE_STATUS_TEXT="degraded"
                else
                    PHYSICAL_DRIVE_STATUS_TEXT="failed"
                fi
                # List all drives affected, comma-separated
                if [ "$CURRENT_PHYSICAL_DRIVE" -lt "$NUM_PHYSICAL_DRIVES_AFFECTED" ]; then
                    CONTROLLER_TEXT="$CONTROLLER_TEXT $PHYSICAL_DRIVE_BAY is $PHYSICAL_DRIVE_STATUS_TEXT with error $PHYSICAL_DRIVE_ERROR_DESCR,"
                else
                    CONTROLLER_TEXT="$CONTROLLER_TEXT $PHYSICAL_DRIVE_BAY is $PHYSICAL_DRIVE_STATUS_TEXT with error $PHYSICAL_DRIVE_ERROR_DESCR."
                fi
                CURRENT_PHYSICAL_DRIVE=$(( CURRENT_PHYSICAL_DRIVE + 1 ))
            done
            # Report the WORST subcomponent condition: a failed physical drive
            # should alert as critical even when redundancy keeps the
            # controller itself merely "degraded".
            # NOTE(review): as in the original, only the LAST drive examined in
            # each loop is considered here - confirm against a system with
            # several drives down
            if [ "${LOGICAL_DRIVE_STATUS:-0}" -eq 4 ] || [ "${PHYSICAL_DRIVE_STATUS:-0}" -eq 4 ]; then
                CONTROLLER_TEXT="CRITICAL: $CONTROLLER_TEXT\n"
                DRIVE_EXIT_CODE=2
            else
                if [ "${LOGICAL_DRIVE_STATUS:-0}" -eq 3 ] || [ "${PHYSICAL_DRIVE_STATUS:-0}" -eq 3 ]; then
                    CONTROLLER_TEXT="WARNING: $CONTROLLER_TEXT\n"
                else
                    CONTROLLER_TEXT="WARNING: There appears to be a problem on the controller at $CONTROLLER_LOCATION, but no drives are affected. Possible causes are outdated firmware, a tape drive needs cleaned, etc. Check Systems Management homepage for more details.\n"
                fi
                # Don't downgrade a CRITICAL raised by an earlier controller
                if [ "$DRIVE_EXIT_CODE" -lt 1 ]; then
                    DRIVE_EXIT_CODE=1
                fi
            fi
            ;;
        4) # "failed" means every individual component has failed
            CONTROLLER_TEXT="CRITICAL: Array Controller at $CONTROLLER_LOCATION has failed!\n"
            DRIVE_EXIT_CODE=2
            ;;
        esac
        # BUG FIX: accumulate text across controllers - the original reset it
        # each iteration and reported only the last controller checked
        DRIVE_STATUS_TEXT="$DRIVE_STATUS_TEXT$CONTROLLER_TEXT"
        CURRENT_CONTROLLER=$(( CURRENT_CONTROLLER + 1 ))
    done
    EXIT_STRING="$EXIT_STRING $DRIVE_STATUS_TEXT"
    return $DRIVE_EXIT_CODE
}
#########################################################
## MemoryCheck function ##
#########################################################
# Checks memory sticks for any errors they might be #
# reporting to the system. I hope to be able to #
# implement more checks in the future but did #
# not find much useful information in the Compaq #
# MIBs for memory information. #
#########################################################
function MemoryCheck () {
    # Check the correctable-memory error counters reported by the agents.
    #   Globals read:    COMMUNITY, HOST_NAME
    #   Globals written: EXIT_STRING (appended to)
    #   Returns: Nagios code - 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
    MEMORY_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.3.2.0 2>/dev/null | cut -f4 -d ' ')
    MEMORY_EXIT_CODE=0
    case "$MEMORY_STATUS" in
    1)
        MEMORY_EXIT_CODE=3
        EXIT_STRING="$EXIT_STRING UNKNOWN: Condition of correctable memory is undetermined for this machine.\n"
        ;;
    3|4)
        # How many correctable errors have occurred on the memory chips
        NUM_MEM_ERRORS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.3.3 2>/dev/null | cut -f4 -d ' ')
        # The agent-supplied critical error count for the installed sticks
        MEM_CRITICAL=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.2.3.5 2>/dev/null | cut -f4 -d ' ')
        # Warn at 80% of the critical value.
        # BUG FIX: computed with shell integer arithmetic - the original piped
        # through 'bc', a binary the script never checks for and which may be
        # missing on minimal hosts (same truncated-integer result)
        MEM_WARNING=$(( MEM_CRITICAL * 8 / 10 ))
        if [ "$NUM_MEM_ERRORS" -ge "$MEM_CRITICAL" ]; then
            EXIT_STRING="$EXIT_STRING CRITICAL: $NUM_MEM_ERRORS memory errors occured! Please replace the faulty memory!\n"
            MEMORY_EXIT_CODE=2
        else
            if [ "$NUM_MEM_ERRORS" -ge "$MEM_WARNING" ]; then
                EXIT_STRING="$EXIT_STRING WARNING: $NUM_MEM_ERRORS memory errors occured! Memory will need to be replaced soon. (CRITICAL = $MEM_CRITICAL)\n"
                MEMORY_EXIT_CODE=1
            fi
        fi
        ;;
    esac
    return $MEMORY_EXIT_CODE
}
#########################################################
## MAIN CODE ##
#########################################################
# Check that all required binaries for the script are available
# EXIT with an UNKNOWN status if not
# Nagios executes plugins with a minimal PATH, so a script that works from an
# interactive shell can silently fail to find snmpwalk when run by the daemon
# (a common cause of "plugin works on the CLI but returns null in Nagios").
# Append the usual tool locations before probing for the required binaries.
PATH="$PATH:/usr/bin:/usr/local/bin:/usr/sbin:/usr/local/sbin"
export PATH
binaries="snmpwalk awk cut grep wc"
for required_binary in $binaries; do
    # 'command -v' is the POSIX-portable replacement for 'which'
    if ! command -v "$required_binary" > /dev/null 2>&1; then
        printf "UNKNOWN: %s: No usable '%s' binary in '%s'\n" "$APPNAME" "$required_binary" "$PATH"
        exit 3
    fi
done
# Parse the command-line options; -h and -V short-circuit and exit here.
while getopts "C:H:hV" OPTION; do
    case "$OPTION" in
    C)
        # SNMP read community used for every query
        COMMUNITY="$OPTARG"
        ;;
    H)
        # Target host (IP address or name)
        HOST_NAME="$OPTARG"
        ;;
    h)
        # Show the help screen and propagate its return code
        print_help
        exit $?
        ;;
    V)
        # Report the plugin version and leave with the current exit code
        printf "$APPNAME - version $VERSION\n"
        exit $EXIT_CODE
        ;;
    esac
done
# Make sure all necessary arguments were given; EXIT with an UNKNOWN status if not
if [ -z "$COMMUNITY" ] || [ -z "$HOST_NAME" ]; then
    EXIT_STRING="UNKNOWN: Hostname and/or Community variables have not been set!\n"
    EXIT_CODE=3
else
    # Determine health of most major components (hard drives are under a
    # different MIB tree and are not included)
    COMPONENT_SYSTEM_STATUS=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.6.1.3 2>/dev/null | cut -f4 -d ' ')
    # BUG FIX: '$?' after the pipeline above reflected 'cut' (always 0), never
    # snmpwalk, so an unreachable host was not detected - test for empty
    # output instead
    if [ -z "$COMPONENT_SYSTEM_STATUS" ]; then
        EXIT_STRING="WARNING: No SNMP response from $HOST_NAME! Make sure host is up and SNMP is configured properly.\n"
        EXIT_CODE=1
    else
        case "$COMPONENT_SYSTEM_STATUS" in
        1) # Agents aren't working or SNMP is broken - no sense going further
            EXIT_STRING="UNKNOWN: Check your SNMP configuration and be sure all Insight Agents are installed.\n"
            EXIT_CODE=3
            ;;
        2) # Regular components are sweet; drive status is tracked separately
            DriveChecks
            DRIVE_STATUS=$?
            if [ "$DRIVE_STATUS" -gt 0 ]; then
                # Other components are okay but drives are not - report them
                EXIT_CODE=$DRIVE_STATUS
            else
                # Everything is fine - output some info on the server
                SERVER_SERIAL_NUMBER=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.2.2.2.5 2>/dev/null | cut -f4 -d ':' | tr -d '"')
                SERVER_MODEL_NUMBER=$(snmpwalk -v 2c -c "$COMMUNITY" "$HOST_NAME" .1.3.6.1.4.1.232.2.2.4.2 2>/dev/null | cut -f4 -d ':' | tr -d '"')
                EXIT_STRING="OK: All system components are normal. Server is a$SERVER_MODEL_NUMBER with serial #$SERVER_SERIAL_NUMBER.\n"
                EXIT_CODE=0
            fi
            ;;
        3|4) # 3 = degraded, 4 = failed - run every component check and build
            # a cumulative report in case more than 1 component has failed,
            # reporting details only on warnings/failures
            ThermalChecks
            PowerSupplyCheck
            MemoryCheck
            DriveChecks
            if [ "$COMPONENT_SYSTEM_STATUS" -eq 3 ]; then
                EXIT_CODE=1
            else
                EXIT_CODE=2
            fi
            ;;
        esac
    fi
fi
# BUG FIX: never emit empty/whitespace-only output (shows as null in the
# Nagios UI) - fall back to a generic message if no check produced any text
if [ -z "${EXIT_STRING// /}" ]; then
    EXIT_STRING="UNKNOWN: Plugin produced no status text for $HOST_NAME - check SNMP and agent configuration.\n"
    EXIT_CODE=3
fi
# BUG FIX: don't use the message itself as the printf format string (a '%' in
# the data would break it); '%b' still expands the embedded \n escapes
printf '%b' "$EXIT_STRING"
exit $EXIT_CODE