"grep -r command_file" Nagios process throttling CPUs
Posted: Thu Feb 18, 2021 2:37 am
Hello everyone,
Lately I've had weird performance issues on my Nagios Core server running on CentOS 7.9.2009. The server has 8 vCPUs and 8GB of RAM.
This is what the incident looks like in htop:

As you can see, a process by the nagios user running a command "grep -r command_file" is putting tremendous strain on the server.
Running a grep search reveals that the strenuous command is related to an HPE iLO plugin:
Here is that file in question, fully:
This performance issue started about a week or two ago, and it seems to go hand-in-hand with the issue of my iLO card checks timing out. This problem happens about every two or three days, and it is causing a lot of false alerts for our team. Usually, all of a sudden, all of my HPE iLO cards get service check timeouts, and re-running the checks makes no difference - this usually lasts about an hour or so.
You would think a timeout threshold of 120 seconds would be enough:

Here's what my Outlook inbox looked like when I came to work this morning. Looks like the iLOs have been going at it all night long:

I would really appreciate any input or ideas on this matter. I've opened an issue on the plugin's GitHub page, but it seems nobody is reading them, as every single issue case there is without any answers.
I wasn't able to find any relevant information in the iLO card's or my Nagios Core's logs. Here's my iLO.cfg if it helps:
Please let me know if I can provide more details, logs, configuration files etc.
Lately I've had weird performance issues on my Nagios Core server running on CentOS 7.9.2009. The server has 8 vCPUs and 8GB of RAM.
This is what the incident looks like in htop:

As you can see, a process by the nagios user running a command "grep -r command_file" is putting tremendous strain on the server.
Running a grep search reveals that the strenuous command is related to an HPE iLO plugin:
Code: Select all
[root@nagios]# grep -R "grep -r command_file" /usr/local/nagios/
/usr/local/nagios/libexec/nagios_hpeilo_traps: NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
/usr/local/nagios/libexec/nagios_hpeilo_traps: NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
Here is that file in question, fully:
Code: Select all
#!/bin/bash
#
# nagios_hpeilo_traps -- The script aims to collect the SNMP traps
# received from HP ProLiant Server and to
# update the corresponding status in real-time
#
# (C) Copyright [2015] Hewlett-Packard Enterprise Development Company, L.P.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to:
# Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor
# Boston, MA 02110-1301, USA.
#
# Written by Adrian Huang <[email protected]>.
prefix=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
NagiosLibexec="${prefix}"
NagiosHpiloEngine="nagios_hpeilo_engine"
HpiloLogFile="/var/log/nagios_hpeilo_traps.log"
HpiloSnmpComm="public" # hpeilo ilo community string
HpiloServices=("System Status" "Fans" "Memory" "Network" "Power Supplies" "Processors" "Storage" "Temperatures")
SnmpTrapOid="iso.3.6.1.6.3.1.1.4.1.0"
HlthStatusChgTrap="HlthStuatsChangeTrap"
function get_hlth_service_type () {
case $1 in
603[5-9] | 6055 ) # Fan Group Trap
echo "${HpiloServices[1]}"
;;
603[0-4] | 604[8-9] | 6050 | 605[4-5] ) # Power Supply Group Trap
echo "${HpiloServices[4]}"
;;
604[0-2] ) # Temperature Group Trap
echo "${HpiloServices[7]}"
;;
602[3-4] ) # Processors Group Trap
echo "${HpiloServices[5]}"
;;
11020 ) # Health Status Array Change Trap
echo "$HlthStatusChgTrap"
;;
*)
echo "None"
;;
esac
}
# argument 1 ($1): the iLO IP which generated the trap this time
# argument 2 ($2): service type
# argument 3 ($3): service status (NAGIOS_OK or NAGIOS_WARNING or ...)
# argument 4 ($4): status information
function nagios_passive_check () {
write_log "PROCESS_SERVICE_CHECK_RESULT;$1;$2;$3;$4"
now=$((`date +%s`))
printf "[%lu] PROCESS_SERVICE_CHECK_RESULT;$1;$2;$3;$4\n" $now > $NagiosCmdFile
}
# argument 1 ($1): The data to be logged
function write_log () {
if [ "$VerboseMode" = "1" ]
then
echo -e "[`date "+%b %d %T"`] $1" >> $HpiloLogFile
fi
}
function get_nagios_cfg_file() {
local nagios_cfg=
# The format might be like:
# 1) NagiosIloCfgFile=${prefix}/etc/nagios.cfg
# 2) NAGIOSCFG="/etc/nagios3/nagios.cfg"
nagios_cfg=`grep -E "=.*etc.*nagios.cfg" /etc/init.d/nagios* | \
cut -d '=' -f 2`
# trim double quotes
nagios_cfg=`echo $nagios_cfg | tr -d '"'`
if [[ $nagios_cfg = *prefix* ]]; then
# The /etc/init.d/nagios* has the fomrat like this:
# NagiosIloCfgFile=${prefix}/etc/nagios.cfg
# So, we need to replace ${prefix} with the actual value
# Find the nagios prefix path
local nagios_prefix=`grep -i "^prefix=" /etc/init.d/nagios* | \
cut -d '=' -f 2`
# ${prefix}/etc/nagios.cfg -> will be etc/nagios.cfg
local nagios_suffix=`echo $nagios_cfg | cut -d '/' -f 2-`
nagios_cfg="${nagios_prefix}/$nagios_suffix"
fi
echo "$nagios_cfg"
}
function get_nagios_ilo_cfg_path() {
local nagios_cfg=$(get_nagios_cfg_file)
local nagios_path=`dirname $nagios_cfg`
local NagiosIloCfgFile="$nagios_path/ilo/ilo.cfg"
echo $NagiosIloCfgFile
}
# argument 1 ($1): the iLO IP which generated the trap this time
# argument 2 ($2): the iLO IP trap number
# argument 3 ($3): service type
function update_hlth_status () {
local nagios_cfg_file=$(get_nagios_cfg_file)
local oid_idx=
local nagios_service_status=
write_log "$nagios_cfg_file"
NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
awk -F = '{print $2}'`
for((idx=0;idx<${#HpiloServices[@]};idx++))
do
if [ "$3" = "${HpiloServices[$idx]}" ]; then
oid_idx=$idx
break
fi
done
# The oid_idx must be increased by 1 since the oid index of the
# NagiosHpiloEngine is started from 1 rather than 0.
write_log "$NagiosLibexec/$NagiosHpiloEngine -H $IloIP -C $HpiloSnmpComm -o $((oid_idx+1))"
output=`$NagiosLibexec/$NagiosHpiloEngine -H $IloIP -C $HpiloSnmpComm -o $((oid_idx+1))`
nagios_service_status=$?
nagios_passive_check "$1" "$3" "$nagios_service_status" "$output"
}
# argument 1 ($1): the iLO IP
# argument 2 ($2): the iLO IP RO community string
function update_hlth_status_from_status_array () {
local nagios_cfg_file=$(get_nagios_cfg_file)
local oid_idx=0
local curr_oid_idx=$3
local curr_oid_idx=$((curr_oid_idx-1))
local curr_odi_status=
local curr_oid_disc=
nagios_service_status=0
local hlth_array_count=0
write_log "$nagios_cfg_file"
NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
awk -F = '{print $2}'`
# The oid_idx must be increased by 1 since the oid index of the
# NagiosHpiloEngine is started from 1 rather than 0.
write_log "$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((oid_idx))"
output=`$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((oid_idx))`
nagios_service_status=$?
write_log "${HpiloServices[0]} nagios_service_status : $nagios_service_status - $output"
IFS=';' read -a hlth_status_array <<< "$output"
hlth_array_count=${#hlth_status_array[@]}
for ((xy=0;xy<${#HpiloServices[@]};xy++));do
service_key=${HpiloServices[$xy]}
case $service_key in
"System Status") service_index=0 ;;
"Fans") service_index=4 ;;
"Memory") service_index=3 ;;
"Network") service_index=14 ;;
"Power Supplies") service_index=6 ;;
"Processors") service_index=2 ;;
"Storage") service_index=9 ;;
"Temperatures") service_index=5 ;;
*) ;;
esac
write_log "hlth_array_count : $hlth_array_count"
if [ $hlth_array_count = 1 ]; then
status_key=${hlth_status_array[0]}
status_value=3
write_log "status_key : $status_key"
else
status_key=${hlth_status_array[$service_index]}
case $status_key in
"OK") status_value=0 ;;
"Degraded") status_value=1 ;;
"Failed") status_value=2 ;;
"NA" | "Other") status_value=3 ;;
*) ;;
esac
fi
if [ "$status_value" == "1" ] || [ "$status_value" == "2" ]; then
case "$xy" in
[1] | [4] | [6])
status_key=`$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((xy+1))`
nagios_service_status=$?
;;
*)
;;
esac
elif [ "$status_value" == "3" ]; then
case "$xy" in
3)
status_key="Other: Probable Cause might be AMS service is not installed or started on host OS"
;;
4)
status_key="Other: Verify status on OA for Power Supplies of blade server"
;;
*)
;;
esac
fi
if [ $curr_oid_idx = $xy ]; then
curr_odi_status=$status_value
curr_oid_disc="$status_key"
continue
fi
nagios_passive_check "$1" "${HpiloServices[$xy]}" "$status_value" "$status_key"
sleep 0.001
done
output=$curr_oid_disc
nagios_service_status=$curr_odi_status
write_log "5. $1 ${HpiloServices[0]} $nagios_service_status $output "
}
# $1: file name
# $2: search pattern
function search_string() {
local str=
if [ -f $1 ]; then
str=`grep "$2" -A 1 $1`
fi
echo $str
}
#start here
VerboseMode=0
ActiveMode=0
# parse the arguments
while [ "x$1" != "x" ]; do
option=$1
case $option in
-H)
# Get the Host IP
shift
host_ip=$1
;;
-C)
# Get the community string
shift
comm_str=$1
;;
-o)
# Get the OID
shift
oid_id=$1
;;
-v)
# enable verbose mode
VerboseMode=1
;;
-A)
# Active mode: Get Health Status Array and Update to web page
ActiveMode=1
;;
*)
echo "ERROR: unknown option $option"
exit 1
;;
esac
shift
done
if [ "$ActiveMode" = "0" ]
then
read IloIP
read protocol
# Get the iLO trap ID
while read oid val
do
if [ "$oid" = "$SnmpTrapOid" ]; then
# get the iLO trap ID
IloTrapNumber=`echo $val | awk -F . '{print $NF}'`
break;
fi
done
ServiceType=$(get_hlth_service_type $IloTrapNumber)
write_log "ilo_ip: $IloIP trap_number: $IloTrapNumber, type: $ServiceType"
NagiosIloCfgFile=$(get_nagios_ilo_cfg_path)
has_configured=$(search_string $NagiosIloCfgFile $IloIP)
if [ "x${has_configured}" != "x" ]; then
comm_str_id=`echo $has_configured | awk '{print $3}'`
if [ "x${comm_str_id}" != "x" ]; then
HpiloSnmpComm=`echo $has_configured | awk '{print $4}'`
fi
fi
if [ "$ServiceType" != "None" ]
then
if [ "$ServiceType" != "$HlthStatusChgTrap" ]
then
update_hlth_status $IloIP $IloTrapNumber "$ServiceType"
fi
# update system health status
update_hlth_status $IloIP $IloTrapNumber "${HpiloServices[0]}"
fi
else
write_log "host_ip : $host_ip comm_str: $comm_str oid_idx: $oid_id"
#if [ "$oid_id" = "1" ]; then
write_log "Active Schedule Mode"
update_hlth_status_from_status_array $host_ip $comm_str $oid_id
printf "$output\n"
exit $nagios_service_status
#fi
fi
You would think a timeout threshold of 120 seconds would be enough:

Here's what my Outlook inbox looked like when I came to work this morning. Looks like the iLOs have been going at it all night long:

I would really appreciate any input or ideas on this matter. I've opened an issue on the plugin's GitHub page, but it seems nobody is reading them, as every single issue case there is without any answers.
I wasn't able to find any relevant information in the iLO card's or my Nagios Core's logs. Here's my iLO.cfg if it helps:
Code: Select all
define host {
name generic-iLO-host
use generic-host-template
check_interval 5
max_check_attempts 10
check_command check-iLO-host-alive
action_url https://$HOSTADDRESS$
_nagiosip 192.4.61.36
icon_image_alt HP Integrated Lights-Out (iLO)
register 0
contact_groups admins,admins-sms
icon_image hpe.png
first_notification_delay 1
}
define service {
name generic-iLO-service
use generic-service
passive_checks_enabled 1
active_checks_enabled 1
check_interval 5
register 0
notification_options c,w
first_notification_delay 1
contact_groups admins,admins-sms
}
define service {
name generic-iLO-Passive-service
use generic-service
passive_checks_enabled 1
active_checks_enabled 1
check_period 24x7
check_interval 2880
normal_check_interval 2880
check_freshness 1
freshness_threshold 10800
register 0
notification_options c,w
}
define host {
host_name ProLiant DL360 Gen9 VMWARE-FOM
address 10.88.255.15
_UUID 818208CZJ6490T71
_iLO_SNMP_community corpsnmp
use generic-iLO-host
}
define hostgroup {
hostgroup_name HPE-iLO
members ProLiant DL360 Gen9 VMWARE-FOM, ProLiant DL360 Gen9 VMWARE1, ProLiant DL360 Gen9 VMWARE2, ProLiant DL380 Gen9 BACKUP, ProLiant ML350 Gen9 DC
}
define servicegroup {
servicegroup_name System Status
members ProLiant DL360 Gen9 VMWARE-FOM,System Status, ProLiant DL360 Gen9 VMWARE1,System Status, ProLiant DL360 Gen9 VMWARE2,System Status, ProLiant DL380 Gen9 BACKUP,System Status, ProLiant ML350 Gen9 DC,System Status
}
define servicegroup {
servicegroup_name Fans
members ProLiant DL360 Gen9 VMWARE-FOM,Fans, ProLiant DL360 Gen9 VMWARE1,Fans, ProLiant DL360 Gen9 VMWARE2,Fans, ProLiant DL380 Gen9 BACKUP,Fans, ProLiant ML350 Gen9 DC,Fans
}
define servicegroup {
servicegroup_name Memory
members ProLiant DL360 Gen9 VMWARE-FOM,Memory, ProLiant DL360 Gen9 VMWARE1,Memory, ProLiant DL360 Gen9 VMWARE2,Memory, ProLiant DL380 Gen9 BACKUP,Memory, ProLiant ML350 Gen9 DC,Memory
}
define servicegroup {
servicegroup_name Network
members ProLiant DL360 Gen9 VMWARE-FOM,Network, ProLiant DL360 Gen9 VMWARE1,Network, ProLiant DL360 Gen9 VMWARE2,Network, ProLiant DL380 Gen9 BACKUP,Network, ProLiant ML350 Gen9 DC,Network
}
define servicegroup {
servicegroup_name Power Supplies
members ProLiant DL360 Gen9 VMWARE-FOM,Power Supplies, ProLiant DL360 Gen9 VMWARE1,Power Supplies, ProLiant DL360 Gen9 VMWARE2,Power Supplies, ProLiant DL380 Gen9 BACKUP,Power Supplies, ProLiant ML350 Gen9 DC,Power Supplies
}
define servicegroup {
servicegroup_name Processors
members ProLiant DL360 Gen9 VMWARE-FOM,Processors, ProLiant DL360 Gen9 VMWARE1,Processors, ProLiant DL360 Gen9 VMWARE2,Processors, ProLiant DL380 Gen9 BACKUP,Processors, ProLiant ML350 Gen9 DC,Processors
}
define servicegroup {
servicegroup_name Storage
members ProLiant DL360 Gen9 VMWARE-FOM,Storage, ProLiant DL360 Gen9 VMWARE1,Storage, ProLiant DL360 Gen9 VMWARE2,Storage, ProLiant DL380 Gen9 BACKUP,Storage, ProLiant ML350 Gen9 DC,Storage
}
define servicegroup {
servicegroup_name Temperatures
members ProLiant DL360 Gen9 VMWARE-FOM,Temperatures, ProLiant DL360 Gen9 VMWARE1,Temperatures, ProLiant DL360 Gen9 VMWARE2,Temperatures, ProLiant DL380 Gen9 BACKUP,Temperatures, ProLiant ML350 Gen9 DC,Temperatures
}
define service {
use generic-iLO-service
hostgroup_name HPE-iLO
service_description System Status
check_command nagios_hpeilo_health_service!1
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Fans
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=2
check_command nagios_hpeilo_health_service!2
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Memory
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=5
check_command nagios_hpeilo_health_service!3
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Network
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=7
check_command nagios_hpeilo_health_service!4
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Power Supplies
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=1
check_command nagios_hpeilo_health_service!5
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Processors
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=6
check_command nagios_hpeilo_health_service!6
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Storage
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=4
check_command nagios_hpeilo_health_service!7
}
define service {
use generic-iLO-Passive-service
hostgroup_name HPE-iLO
service_description Temperatures
action_url http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=3
check_command nagios_hpeilo_health_service!8
}
define command {
command_name nagios_hpeilo_engine
command_line /usr/local/nagios/libexec/nagios_hpeilo_engine -H $HOSTADDRESS$ -C $_HOSTILO_SNMP_COMMUNITY$ -o $ARG1$
}
define command {
command_name check-iLO-host-alive
command_line /usr/local/nagios/libexec/nagios_hpeilo_engine -H $HOSTADDRESS$ -C $_HOSTILO_SNMP_COMMUNITY$ -o 11
}
define command {
command_name nagios_hpeilo_health_service
command_line /usr/local/nagios/libexec/nagios_hpeilo_traps -A -H $HOSTADDRESS$ -C $_HOSTILO_SNMP_COMMUNITY$ -o $ARG1$
}
define host {
host_name ProLiant DL360 Gen9 VMWARE1
address 10.88.255.16
_UUID 8654865486548654
_iLO_SNMP_community corpsnmp
use generic-iLO-host
}
define host {
host_name ProLiant DL360 Gen9 VMWARE2
address 10.88.255.17
_UUID 8654865486548654
_iLO_SNMP_community corpsnmp
use generic-iLO-host
}
define host {
host_name ProLiant DL380 Gen9 BACKUP
address 10.88.255.18
_UUID 8654865486548654
_iLO_SNMP_community corpsnmp
use generic-iLO-host
}
define host {
host_name ProLiant ML350 Gen9 DC
address 192.4.63.158
_UUID 865486548654865
_iLO_SNMP_community corpsnmp
use generic-iLO-host
}