Page 1 of 1

"grep -r command_file" Nagios process throttling CPUs

Posted: Thu Feb 18, 2021 2:37 am
by Koja
Hello everyone,

Lately I've had weird performance issues on my Nagios Core server running on CentOS 7.9.2009. The server has 8 vCPUs and 8GB of RAM.

This is what the incident looks like in htop:
Image

As you can see, a process by the nagios user running a command "grep -r command_file" is putting tremendous strain on the server.

Running a grep search reveals that the strenuous command is related to a HPE iLO plugin:

Code: Select all

[root@nagios]# grep -R "grep -r command_file" /usr/local/nagios/
/usr/local/nagios/libexec/nagios_hpeilo_traps:  NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
/usr/local/nagios/libexec/nagios_hpeilo_traps:  NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
Here is that file in question, fully:

Code: Select all

#!/bin/bash
#
#  nagios_hpeilo_traps -- The script aims to collect the SNMP traps 
#			 received from HP ProLiant Server and to 
#			 update the corresponding status in real-time
#	        
#  (C) Copyright [2015] Hewlett-Packard Enterprise Development Company, L.P.
#
#  This program is free software; you can redistribute it and/or modify 
#  it under the terms of version 2 of the GNU General Public License as 
#  published by the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
#
#  See the GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License 
#  along with this program; if not, write to:
#   Free Software Foundation, Inc.
#   51 Franklin Street, Fifth Floor
#   Boston, MA 02110-1301, USA.  
#
#   Written by Adrian Huang <[email protected]>.

prefix=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

NagiosLibexec="${prefix}"
NagiosHpiloEngine="nagios_hpeilo_engine"

HpiloLogFile="/var/log/nagios_hpeilo_traps.log"
HpiloSnmpComm="public" # hpeilo ilo community string
HpiloServices=("System Status" "Fans" "Memory" "Network" "Power Supplies" "Processors" "Storage" "Temperatures")

SnmpTrapOid="iso.3.6.1.6.3.1.1.4.1.0"

HlthStatusChgTrap="HlthStuatsChangeTrap"

function get_hlth_service_type () {
	case $1 in
		603[5-9] | 6055 ) # Fan Group Trap
			echo "${HpiloServices[1]}"
			;;
		603[0-4] | 604[8-9] | 6050 |  605[4-5] ) # Power Supply Group Trap
			echo "${HpiloServices[4]}"
			;;
		604[0-2] ) # Temperature Group Trap
			echo "${HpiloServices[7]}"
			;;
		602[3-4] ) # Processors Group Trap
			echo "${HpiloServices[5]}"
			;;
		11020 ) # Health Status Array Change Trap
			echo "$HlthStatusChgTrap"
			;;
		*) 
			echo "None"
			;;
	esac
}

# argument 1 ($1): the iLO IP which generated the trap this time 
# argument 2 ($2): service type
# argument 3 ($3): service status (NAGIOS_OK or NAGIOS_WARNING or ...)
# argument 4 ($4): status information 
function nagios_passive_check () {

	
	write_log "PROCESS_SERVICE_CHECK_RESULT;$1;$2;$3;$4" 

	now=$((`date +%s`))
	printf "[%lu] PROCESS_SERVICE_CHECK_RESULT;$1;$2;$3;$4\n" $now > $NagiosCmdFile

}

# argument 1 ($1): The data to be logged
function write_log () {
	if [ "$VerboseMode" = "1" ]
	then
		echo -e "[`date "+%b %d %T"`] $1" >> $HpiloLogFile
	fi
}

function get_nagios_cfg_file() {
	local	nagios_cfg=


	# The format might be like: 
	#	1)  NagiosIloCfgFile=${prefix}/etc/nagios.cfg
	#	2)  NAGIOSCFG="/etc/nagios3/nagios.cfg"
        nagios_cfg=`grep -E "=.*etc.*nagios.cfg" /etc/init.d/nagios* | \
			cut -d '=' -f 2`

	# trim double quotes
	nagios_cfg=`echo $nagios_cfg | tr -d '"'`

	if [[ $nagios_cfg = *prefix* ]]; then
		# The /etc/init.d/nagios* has the fomrat like this:
		# 	NagiosIloCfgFile=${prefix}/etc/nagios.cfg
		# So, we need to replace ${prefix} with the actual value

		# Find the nagios prefix path
		local nagios_prefix=`grep -i "^prefix=" /etc/init.d/nagios* | \
				cut -d '=' -f 2`

		# ${prefix}/etc/nagios.cfg -> will be etc/nagios.cfg
		local nagios_suffix=`echo $nagios_cfg | cut -d '/' -f 2-`

		nagios_cfg="${nagios_prefix}/$nagios_suffix"
	fi

	echo "$nagios_cfg"

}

function get_nagios_ilo_cfg_path() {
	local   nagios_cfg=$(get_nagios_cfg_file)
	local nagios_path=`dirname $nagios_cfg`
	local NagiosIloCfgFile="$nagios_path/ilo/ilo.cfg"
	echo $NagiosIloCfgFile
}

# argument 1 ($1): the iLO IP which generated the trap this time 
# argument 2 ($2): the iLO IP trap number 
# argument 3 ($3): service type
function update_hlth_status () {
	local nagios_cfg_file=$(get_nagios_cfg_file)
	local oid_idx=
	local nagios_service_status=

	write_log "$nagios_cfg_file"

	NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
		awk -F = '{print $2}'`


	for((idx=0;idx<${#HpiloServices[@]};idx++)) 
	do
		if [ "$3" = "${HpiloServices[$idx]}" ]; then
			oid_idx=$idx
			break
		fi
	done
		
	# The oid_idx must be increased by 1 since the oid index of the 
	# NagiosHpiloEngine is started from 1 rather than 0. 
	write_log "$NagiosLibexec/$NagiosHpiloEngine -H $IloIP -C $HpiloSnmpComm -o $((oid_idx+1))" 
	output=`$NagiosLibexec/$NagiosHpiloEngine -H $IloIP -C $HpiloSnmpComm -o $((oid_idx+1))`

	nagios_service_status=$?

	nagios_passive_check "$1" "$3" "$nagios_service_status" "$output"
		

}

# argument 1 ($1): the iLO IP  
# argument 2 ($2): the iLO IP  RO community string
function update_hlth_status_from_status_array () {
	local nagios_cfg_file=$(get_nagios_cfg_file)
	local oid_idx=0
	local curr_oid_idx=$3
	local curr_oid_idx=$((curr_oid_idx-1))
	local curr_odi_status=
	local curr_oid_disc=
	nagios_service_status=0
	local hlth_array_count=0
	write_log "$nagios_cfg_file"

	NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
		awk -F = '{print $2}'`


	# The oid_idx must be increased by 1 since the oid index of the 
	# NagiosHpiloEngine is started from 1 rather than 0. 
	write_log "$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((oid_idx))" 
	output=`$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((oid_idx))`
	nagios_service_status=$?
	write_log "${HpiloServices[0]} nagios_service_status : $nagios_service_status - $output"
	IFS=';' read -a hlth_status_array <<< "$output"	
	hlth_array_count=${#hlth_status_array[@]}
	for ((xy=0;xy<${#HpiloServices[@]};xy++));do
		service_key=${HpiloServices[$xy]}
		case $service_key in
			"System Status") service_index=0 ;;
			"Fans") service_index=4 ;;
			"Memory") service_index=3 ;;
			"Network") service_index=14 ;;
			"Power Supplies") service_index=6 ;;
			"Processors") service_index=2 ;;
			"Storage") service_index=9 ;;
			"Temperatures") service_index=5 ;;
			*) ;; 
		esac

		write_log "hlth_array_count : $hlth_array_count"
		if [ $hlth_array_count = 1 ]; then
			status_key=${hlth_status_array[0]}
			status_value=3
			write_log "status_key : $status_key"
		else
			status_key=${hlth_status_array[$service_index]}
			case $status_key in
				"OK") status_value=0 ;;
				"Degraded") status_value=1 ;; 
				"Failed") status_value=2 ;;
				"NA" | "Other") status_value=3 ;;
				*) ;;
			esac
		fi

		if [ "$status_value" == "1" ] || [ "$status_value" == "2" ]; then
			case "$xy" in
				[1] | [4] | [6])
					status_key=`$NagiosLibexec/$NagiosHpiloEngine -H $1 -C $2 -o $((xy+1))`
					nagios_service_status=$?
					;;
				*)
					;;
			esac
		elif [ "$status_value" == "3" ]; then
			case "$xy" in
				3)
					status_key="Other: Probable Cause might be AMS service is not installed or started on host OS"
				;;
				4)
					status_key="Other: Verify status on OA for Power Supplies of blade server" 
				;;
				*)
				;;
 			esac
		fi
		if [ $curr_oid_idx = $xy ]; then
			curr_odi_status=$status_value
			curr_oid_disc="$status_key"
			continue
		fi
		nagios_passive_check "$1" "${HpiloServices[$xy]}" "$status_value" "$status_key"
		sleep 0.001
	done
	output=$curr_oid_disc
	nagios_service_status=$curr_odi_status
	write_log "5. $1 ${HpiloServices[0]} $nagios_service_status $output "

}


# $1: file name
# $2: search pattern
function search_string() {
local   str=

if [ -f $1 ]; then
	str=`grep "$2" -A 1 $1`
fi
echo $str
}

#start here

	VerboseMode=0
	ActiveMode=0
	

# parse the arguments
while [ "x$1" != "x" ]; do
	option=$1

	case $option in
		-H)

		# Get the Host IP
		shift

		host_ip=$1
		;;

		-C)
		# Get the community string
		shift
		comm_str=$1

		;;

		-o)
		# Get the OID
		shift
		oid_id=$1

		;;

		-v)
		# enable verbose mode
		VerboseMode=1
		;;
		
		-A)
		# Active mode: Get Health Status Array and Update to web page 
		ActiveMode=1
		;;
		*)
		echo "ERROR: unknown option $option"
		
		exit 1
		;;

	esac
	shift
done


if [ "$ActiveMode" = "0" ]
then

	read IloIP 
	read protocol

# Get the iLO trap ID 
	while read oid val
	do
		if [ "$oid" = "$SnmpTrapOid" ]; then
			# get the iLO trap ID
			IloTrapNumber=`echo $val | awk -F . '{print $NF}'`
			break;
		fi
	done


ServiceType=$(get_hlth_service_type $IloTrapNumber)

write_log "ilo_ip: $IloIP trap_number: $IloTrapNumber, type: $ServiceType"


NagiosIloCfgFile=$(get_nagios_ilo_cfg_path)
has_configured=$(search_string $NagiosIloCfgFile $IloIP)
if [ "x${has_configured}" != "x" ]; then
	comm_str_id=`echo $has_configured | awk '{print $3}'`
	if [ "x${comm_str_id}" != "x" ]; then
		HpiloSnmpComm=`echo $has_configured | awk '{print $4}'`
	fi
fi


if [ "$ServiceType" != "None" ]
then

	if [ "$ServiceType" != "$HlthStatusChgTrap" ]
	then
		update_hlth_status $IloIP $IloTrapNumber "$ServiceType" 
	fi

	# update system health status
	update_hlth_status $IloIP $IloTrapNumber "${HpiloServices[0]}"
fi
else

	write_log "host_ip : $host_ip  comm_str: $comm_str  oid_idx: $oid_id"
	#if [ "$oid_id" = "1" ]; then
	write_log "Active Schedule Mode"
		update_hlth_status_from_status_array $host_ip $comm_str $oid_id
		printf "$output\n"
		exit $nagios_service_status
	#fi
	
fi
This performance issue started about a week or two ago, and it seems to go hand-in-hand with the issue of my iLO card checks timing out. This problem happens about every two or three days, and it is causing a lot of false alerts for our team. Usually, all of the sudden all of my HPE iLO cards get service check timeouts, and re-running the checks makes no difference - this usually lasts about an hour or so.

You would think a timeout threshold of 120 seconds would be enough:
Image

Here's what my Outlook inbox looked like when I came to work this morning. Looks like the iLOs have been going at it all night long:
Image

I would really appreciate any input or ideas on this matter. I've opened an issue on the plugin's GitHub page, but it seems nobody is reading them, as every single issue case there is without any answers.

I wasn't able to find any relevant information in the iLO card's or my Nagios Core's logs. Here's my iLO.cfg if it helps:

Code: Select all

define host {
	name generic-iLO-host
	use generic-host-template
	check_interval 5
	max_check_attempts 10
	check_command check-iLO-host-alive
	action_url https://$HOSTADDRESS$
	_nagiosip	192.4.61.36
	icon_image_alt HP Integrated Lights-Out (iLO)
	register 0
	contact_groups admins,admins-sms
	icon_image hpe.png
        first_notification_delay 1
}

define service {
	name generic-iLO-service
	use generic-service
	passive_checks_enabled	1
	active_checks_enabled	1
	check_interval 5
	register 0
	notification_options    c,w
	first_notification_delay 1
        contact_groups admins,admins-sms
}

define service {
	name generic-iLO-Passive-service
	use generic-service
	passive_checks_enabled	1
	active_checks_enabled	1
	check_period	24x7
	check_interval 2880
	normal_check_interval 2880
	check_freshness                 1
	freshness_threshold		10800
	register 0
        notification_options    c,w
}

define host {
	host_name ProLiant DL360 Gen9 VMWARE-FOM
	address 10.88.255.15
	_UUID 818208CZJ6490T71
	_iLO_SNMP_community corpsnmp
	use generic-iLO-host
}

define hostgroup {
	hostgroup_name	HPE-iLO
	members ProLiant DL360 Gen9 VMWARE-FOM, ProLiant DL360 Gen9 VMWARE1, ProLiant DL360 Gen9 VMWARE2, ProLiant DL380 Gen9 BACKUP, ProLiant ML350 Gen9 DC
}

define servicegroup {
	servicegroup_name	System Status
	members 	ProLiant DL360 Gen9 VMWARE-FOM,System Status, ProLiant DL360 Gen9 VMWARE1,System Status, ProLiant DL360 Gen9 VMWARE2,System Status, ProLiant DL380 Gen9 BACKUP,System Status, ProLiant ML350 Gen9 DC,System Status
}

define servicegroup {
	servicegroup_name	Fans
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Fans, ProLiant DL360 Gen9 VMWARE1,Fans, ProLiant DL360 Gen9 VMWARE2,Fans, ProLiant DL380 Gen9 BACKUP,Fans, ProLiant ML350 Gen9 DC,Fans
}

define servicegroup {
	servicegroup_name	Memory
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Memory, ProLiant DL360 Gen9 VMWARE1,Memory, ProLiant DL360 Gen9 VMWARE2,Memory, ProLiant DL380 Gen9 BACKUP,Memory, ProLiant ML350 Gen9 DC,Memory
}

define servicegroup {
	servicegroup_name	Network
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Network, ProLiant DL360 Gen9 VMWARE1,Network, ProLiant DL360 Gen9 VMWARE2,Network, ProLiant DL380 Gen9 BACKUP,Network, ProLiant ML350 Gen9 DC,Network
}

define servicegroup {
	servicegroup_name	Power Supplies
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Power Supplies, ProLiant DL360 Gen9 VMWARE1,Power Supplies, ProLiant DL360 Gen9 VMWARE2,Power Supplies, ProLiant DL380 Gen9 BACKUP,Power Supplies, ProLiant ML350 Gen9 DC,Power Supplies
}

define servicegroup {
	servicegroup_name	Processors
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Processors, ProLiant DL360 Gen9 VMWARE1,Processors, ProLiant DL360 Gen9 VMWARE2,Processors, ProLiant DL380 Gen9 BACKUP,Processors, ProLiant ML350 Gen9 DC,Processors
}

define servicegroup {
	servicegroup_name	Storage
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Storage, ProLiant DL360 Gen9 VMWARE1,Storage, ProLiant DL360 Gen9 VMWARE2,Storage, ProLiant DL380 Gen9 BACKUP,Storage, ProLiant ML350 Gen9 DC,Storage
}

define servicegroup {
	servicegroup_name	Temperatures
	members 	ProLiant DL360 Gen9 VMWARE-FOM,Temperatures, ProLiant DL360 Gen9 VMWARE1,Temperatures, ProLiant DL360 Gen9 VMWARE2,Temperatures, ProLiant DL380 Gen9 BACKUP,Temperatures, ProLiant ML350 Gen9 DC,Temperatures
}

define service {
	use generic-iLO-service
	hostgroup_name HPE-iLO
	service_description System Status
	check_command nagios_hpeilo_health_service!1
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Fans
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=2
	check_command nagios_hpeilo_health_service!2
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Memory
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=5
	check_command nagios_hpeilo_health_service!3
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Network
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=7
	check_command nagios_hpeilo_health_service!4
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Power Supplies
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=1
	check_command nagios_hpeilo_health_service!5
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Processors
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=6
	check_command nagios_hpeilo_health_service!6
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Storage
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=4
	check_command nagios_hpeilo_health_service!7
}

define service {
	use generic-iLO-Passive-service
	hostgroup_name HPE-iLO
	service_description Temperatures
	action_url  http://$_HOSTNAGIOSIP$/nagios/hpeilo/nagios_hpeilo_service_details.php?ip=$HOSTADDRESS$&comm=$_HOSTILO_SNMP_COMMUNITY$&id=3
	check_command nagios_hpeilo_health_service!8
}

define command {
	command_name nagios_hpeilo_engine
	command_line /usr/local/nagios/libexec/nagios_hpeilo_engine -H $HOSTADDRESS$ -C  $_HOSTILO_SNMP_COMMUNITY$ -o $ARG1$
}

define command {
	command_name check-iLO-host-alive
	command_line /usr/local/nagios/libexec/nagios_hpeilo_engine -H $HOSTADDRESS$ -C  $_HOSTILO_SNMP_COMMUNITY$ -o 11
}

define command {
	command_name nagios_hpeilo_health_service
	command_line /usr/local/nagios/libexec/nagios_hpeilo_traps -A -H $HOSTADDRESS$ -C  $_HOSTILO_SNMP_COMMUNITY$ -o $ARG1$
}

define host {
	host_name ProLiant DL360 Gen9 VMWARE1
	address 10.88.255.16
	_UUID 8654865486548654
	_iLO_SNMP_community corpsnmp
	use generic-iLO-host
}

define host {
	host_name ProLiant DL360 Gen9 VMWARE2
	address 10.88.255.17
	_UUID 8654865486548654
	_iLO_SNMP_community corpsnmp
	use generic-iLO-host
}

define host {
	host_name ProLiant DL380 Gen9 BACKUP
	address 10.88.255.18
	_UUID 8654865486548654
	_iLO_SNMP_community corpsnmp
	use generic-iLO-host
}

define host {
	host_name ProLiant ML350 Gen9 DC
	address 192.4.63.158
	_UUID 865486548654865
	_iLO_SNMP_community corpsnmp
	use generic-iLO-host
}
Please let me know if I can provide more details, logs, configuration files etc.

Re: "grep -r command_file" Nagios process throttling CPUs

Posted: Thu Feb 18, 2021 6:18 am
by Koja
Hello,

I believe I've figured out the issue. A plugin called nagios_hpeilo_traps contains a function which searches for the Nagios command file (nagios.cmd) and it uses the command grep -r command_file which is for some reason very resource intensive.

Here is a snippet of the function:

Code: Select all

NagiosCmdFile=`grep -r command_file $nagios_cfg_file | \
                awk -F = '{print $2}'`
And I've replace it with a full path to the command file like this:

Code: Select all

NagiosCmdFile="/usr/local/nagios/var/rw/nagios.cmd"
This has completely resolved the issue for me, I'm no longer getting timeouts and nothing is hogging my server's CPU resources!

This thread can be closed.

Re: "grep -r command_file" Nagios process throttling CPUs

Posted: Fri Feb 19, 2021 5:40 pm
by benjaminsmith
Hi @koja,
This has completely resolved the issue for me, I'm no longer getting timeouts and nothing is hogging my server's CPU resources!
This thread can be closed.
Glad you got it figured out.

Thanks for using the Nagios Community Forum.