Passive alerts returning as missing randomly and then OK
Posted: Tue Dec 15, 2015 2:45 pm
I have 4 servers sending passive checks to Nagios Core 3. The checks are arriving, but sometimes a service shows "plugin out of bounds"; these are critical alerts, so we receive emails. Within 30 seconds it refreshes and returns to an OK state. Below are the cron entry and the script that sends the data via NSCA.
Thanks.
(Additional information added by Jesse):
-Upgraded recently from CentOS 5 to CentOS 7
-Upgraded recently from Nagios Core 3.x to a higher version of 3.x
-send_nsca was updated
-send_nsca is running via cron every minute
-Issue happens to the four servers below that all have these checks, other passive checks are not alerting:
Thanks.
Code: Select all
*/2 * * * * /opt/home/cl/cron_for_aws
# --- Settings shared by every check below -----------------------------------
# Port and address of the NSCA listener the results are sent to.
# NOTE(review): "ip" looks like a redacted placeholder - confirm the real value.
MASTER_PORT_A=5667
MASTER_HOST_AWS=ip
# Directory holding the check plugins and the send_nsca binaries.
ROOT=~cloverleaf/libexec
NSCA=${ROOT}/send_nsca
NSCA_AWS=${ROOT}/send_nsca_1.9
# Kept as a single string on purpose: it is expanded unquoted at the call
# sites so that it splits into separate command-line arguments.
NSCA_AWS_CMD="-H ${MASTER_HOST_AWS} -p ${MASTER_PORT_A} -c ${NSCA}.cfg"
ECHO="/bin/echo -e"
# Scratch file holding the one result line handed to send_nsca.
tempfile=/tmp/cldp-nsca-aws.$$
# Remove the scratch file on exit; -f keeps this quiet if it was never created.
trap 'rm -f "$tempfile"' 0
# On HUP/INT/TERM actually stop (the EXIT trap above then cleans up).
# Previously the handler only removed the file and the script carried on.
trap 'exit 1' 1 2 15
# these must get changed per host and per service
this_host='saturn'
# format of the file must be (fields separated by tabs, one result per line):
# hostname_in_nagios service_name_in_nagios status extended
# check for mysql replication
# Runs check_mysql with -S and submits its exit status and output as the
# passive result for service "mysql_replication".
# Fixed: user and password were fused into one argument ("-ucl-ppass").
x=$("${ROOT}/check_mysql" -ucl -ppass -S)
status=$?
x=${x##REPLICATION }
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
x=$(printf '%s' "$x" | tr '\n' ' ')
# Plugin output is still printed to stdout, as before (presumably a debug aid).
printf '%s\n' "$x"
# printf with quoted arguments: no glob expansion of the plugin output and no
# backslash-escape interpretation inside it.
printf '%s\tmysql_replication\t%s\t%s\n' "$this_host" "$status" "$x" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for mysql at ALL
# Runs check_mysql and submits its exit status and output as the passive
# result for service "mysql_status".
x=$("${ROOT}/check_mysql" -ucl -ppass)
status=$?
# Drop a leading "MYSQL " and then a leading "OK " from the plugin output.
x=${x##MYSQL }
message=${x##OK }
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# printf with quoted arguments: no glob expansion of the plugin output and no
# backslash-escape interpretation inside it.
printf '%s\tmysql_status\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for disk space
# Runs check_disk on / (warn below 10% free, critical below 5%) and submits
# the result for service "disk_space".
x=$("${ROOT}/check_disk" -w 10% -c 5% -p /)
status=$?
x=${x##DISK }
# Keep only what follows the last space in the plugin output.
message=${x##* }
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# printf with quoted arguments: no glob expansion of the plugin output, no
# backslash-escape interpretation, and no stray space before the status field.
printf '%s\tdisk_space\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for /opt space
# Runs check_disk on /opt (warn below 10% free, critical below 5%) and submits
# the result for service "opt_space".
x=$("${ROOT}/check_disk" -w 10% -c 5% -p /opt)
status=$?
x=${x##DISK }
# Keep only what follows the last space in the plugin output.
message=${x##* }
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# printf with quoted arguments: no glob expansion of the plugin output, no
# backslash-escape interpretation, and no stray space before the status field.
printf '%s\topt_space\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for file
# Runs check_file_age on the sentinel file (warn at 600 s, critical at 900 s)
# and submits the result for service "check_rsync".
MAGIC_FILE=~cl/html/images/SENTINEL
x=$("${ROOT}/check_file_age" -w 600 -c 900 -f "${MAGIC_FILE}")
status=$?
# Abandoned alternative kept from the original: take the status from the
# first word of the plugin output instead of the exit code.
#status=${x%% *}
# Keep only what follows the last "-" in the plugin output.
message=${x##*-}
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# printf with quoted arguments: no glob expansion of the plugin output and no
# backslash-escape interpretation inside it.
printf '%s\tcheck_rsync\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for activity
# Runs ck_activity.php from inside ~cl/html/riviera/ and submits its exit
# status and output for service "web_active".
# NOTE(review): the original comment said "needs to be in the ginger
# directory", but the code changes into riviera - confirm which is meant.
cd ~cl/html/riviera/
x=$(/usr/bin/php ~cl/html/riviera/ck_activity.php)
status=$?
# Abandoned alternative kept from the original: take the status from the
# first word of the script output instead of the exit code.
#status=${x%% *}
# Keep only what follows the last "-" in the script output.
message=${x##*-}
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# Quoting matters here in particular: the working directory is now the web
# tree, so an unquoted "*" or "?" in the output would expand to its file names.
printf '%s\tweb_active\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
"$NSCA_AWS" $NSCA_AWS_CMD < "$tempfile"
# check for News update
# Runs saturn-mysql-check.php and writes its exit status and output to
# $tempfile as the result line for service "news_update"; the send_nsca call
# that submits the file follows immediately after this section.
x=$(/usr/bin/php ~cl/libexec/saturn-mysql-check.php)
status=$?
# Keep only what follows the last "-" in the script output.
message=${x##*-}
# The original unquoted expansion folded newlines into spaces; keep the
# result on a single line explicitly now that everything is quoted.
message=$(printf '%s' "$message" | tr '\n' ' ')
# printf with quoted arguments: no glob expansion of the script output and no
# backslash-escape interpretation inside it.
printf '%s\tnews_update\t%s\t%s\n' "$this_host" "$status" "$message" > "$tempfile"
$NSCA_AWS $NSCA_AWS_CMD < $tempfile
(Additional information added by Jesse):
-Upgraded recently from CentOS 5 to CentOS 7
-Upgraded recently from Nagios Core 3.x to a higher version of 3.x
-send_nsca was updated
-send_nsca is running via cron every minute
-Issue happens to the four servers below that all have these checks, other passive checks are not alerting:
Code: Select all
# ---------------------------------------------------------------------------
# Services on host mercury.
# The first six inherit from the "passive-check" template and the last two
# from "12-hour-active-check"; both templates are defined elsewhere.
# ---------------------------------------------------------------------------
define service{
    use                     passive-check
    host_name               mercury
    service_description     disk_space
}
define service{
    use                     passive-check
    host_name               mercury
    service_description     opt_space
}
define service{
    use                     passive-check
    host_name               mercury
    service_description     mysql_status
}
define service{
    use                     passive-check
    host_name               mercury
    service_description     mysql_replication
}
# NOTE(review): "register 0" keeps this definition from being registered as a
# real service. If the host's cron script still submits web_active results,
# they have no service to attach to - confirm this is intended.
define service{
    use                     passive-check
    host_name               mercury
    service_description     web_active
    register                0
}
define service{
    use                     passive-check
    host_name               mercury
    service_description     check_rsync
}
# Actively checked services.
define service{
    use                     12-hour-active-check
    host_name               mercury
    service_description     Awstats Copy
    check_command           check_file_age!90000!180000!/var/www/awstats/digeo-mercury
}
# Not registered (register 0), so this check is currently inactive.
define service{
    use                     12-hour-active-check
    host_name               mercury
    service_description     Log Schlep
    check_command           check_logs!mercury
    register                0
}
# Service groups used by the escalation rules.
# "members" is a flat list of host,service pairs.
define servicegroup{
    servicegroup_name       mercury_noncrit
    alias                   Mercury Non-Critical
    members                 mercury, opt_space, mercury, disk_space, mercury, mysql_replication, mercury, check_rsync
}
define servicegroup{
    servicegroup_name       mercury_crit
    alias                   Mercury Critical
    members                 mercury, mysql_status
}
#####################################################
# Non-Critical Service Alerts
#   Daytime:     immediate e-mail
#                pagers if not acknowledged in 30 min.
#                e-mail every hour
#   After Hours: single e-mail
#####################################################
# Notification 1, any time: e-mail cld and digeo; next notification after 30.
define serviceescalation{
    servicegroup_name       mercury_noncrit
    first_notification      1
    last_notification       1
    notification_interval   30
    escalation_period       24x7
    contact_groups          cld, digeo
}
# Notification 2, during "days": page.
# NOTE(review): unlike its siblings this escalation sets no
# notification_interval - confirm the default is what is wanted.
define serviceescalation{
    servicegroup_name       mercury_noncrit
    first_notification      2
    last_notification       2
    escalation_period       days
    contact_groups          pagers
}
# Notification 3 onwards (last_notification 0 = no upper limit): e-mail cld.
define serviceescalation{
    servicegroup_name       mercury_noncrit
    first_notification      3
    last_notification       0
    notification_interval   120
    escalation_period       24x7
    contact_groups          cld
}
# Notification 1, during "nights": e-mail cld.
define serviceescalation{
    servicegroup_name       mercury_noncrit
    first_notification      1
    last_notification       1
    notification_interval   60
    escalation_period       nights
    contact_groups          cld
}
#####################################################
# Critical Service Alerts
#   Daytime:     immediate page
#                immediate e-mail
#                hourly e-mails
#
#   After Hours: immediate e-mail
#                immediate page
#                follow-up page in 30 min.
#####################################################
# Notification 1, any time: e-mail cld and digeo; next notification after 1.
define serviceescalation{
    servicegroup_name       mercury_crit
    first_notification      1
    last_notification       1
    notification_interval   1
    escalation_period       24x7
    contact_groups          cld, digeo
}
# Notification 2, during "days": page.
define serviceescalation{
    servicegroup_name       mercury_crit
    first_notification      2
    last_notification       2
    notification_interval   59
    escalation_period       days
    contact_groups          pagers
}
# Notification 3 onwards, during "days" (last_notification 0 = no upper
# limit): e-mail cld every 60.
define serviceescalation{
    servicegroup_name       mercury_crit
    first_notification      3
    last_notification       0
    notification_interval   60
    escalation_period       days
    contact_groups          cld
}
# Notifications 2-3, during "nights": page, 30 apart.
define serviceescalation{
    servicegroup_name       mercury_crit
    first_notification      2
    last_notification       3
    notification_interval   30
    escalation_period       nights
    contact_groups          pagers
}
# Notification 3, during "nights": e-mail cld.
define serviceescalation{
    servicegroup_name       mercury_crit
    first_notification      3
    last_notification       3
    notification_interval   60
    escalation_period       nights
    contact_groups          cld
}