I have a check running that does not use the retry_interval when in a soft state. I've gone over my configs and have been unable to determine the issue. Debug confirms that the issue is definitely happening.
The check is sent to the host via NRPE:
Code: Select all
[1530676320.673831] [4096.2] [pid=25078] ## 1 descriptors had input
[1530676320.673839] [12288.1] [pid=25078] ## Polling 1500ms; sockets=8; events=180; iobs=0x25ea1c0
[1530676322.175406] [4096.2] [pid=25078] ## 0 descriptors had input
[1530676322.175558] [12288.1] [pid=25078] ## Polling 1500ms; sockets=8; events=180; iobs=0x25ea1c0
[1530676322.236477] [256.2] [pid=25078] Read 77 bytes from command worker
[1530676322.236516] [256.1] [pid=25078] Read raw external command '[1530676322] SCHEDULE_FORCED_SVC_CHECK;NUT-R1;Dogfood Enforcement;1530676320'
[1530676322.236529] [001.0] [pid=25078] process_external_command1()
[1530676322.236541] [128.2] [pid=25078] Raw command entry: [1530676322] SCHEDULE_FORCED_SVC_CHECK;NUT-R1;Dogfood Enforcement;1530676320
[1530676322.236612] [064.1] [pid=25078] Making callbacks (type 2)...
[1530676322.236627] [064.1] [pid=25078] Making callbacks (type 17)...
[1530676322.236638] [001.0] [pid=25078] process_external_command2()
[1530676322.236647] [128.1] [pid=25078] External Command Type: 54
[1530676322.236657] [128.1] [pid=25078] Command Entry Time: 1530676322
[1530676322.236666] [128.1] [pid=25078] Command Arguments: NUT-R1;Dogfood Enforcement;1530676320
[1530676322.236680] [001.0] [pid=25078] schedule_service_check()
[1530676322.236777] [016.0] [pid=25078] Scheduling a forced, active check of service 'Dogfood Enforcement' on host 'NUT-R1' @ Wed Jul 4 03:52:00 2018
[1530676322.236808] [016.2] [pid=25078] Found another service check event for this service @ Wed Jul 4 03:53:16 2018
[1530676322.236844] [016.2] [pid=25078] New service check event is forced, so it will be used instead of the existing event.
[1530676322.236865] [064.1] [pid=25078] Making callbacks (type 1)...
[1530676322.236883] [016.2] [pid=25078] Scheduling new service check event.
[1530676322.236895] [001.0] [pid=25078] add_event()
[1530676322.236908] [064.1] [pid=25078] Making callbacks (type 13)...
[1530676322.236921] [064.1] [pid=25078] Making callbacks (type 17)...
[1530676322.236933] [4096.2] [pid=25078] ## 1 descriptors had input
[1530676322.236982] [008.1] [pid=25078] ** Event Check Loop
[1530676322.237076] [008.1] [pid=25078] Next Event Time: Wed Jul 4 03:52:00 2018
[1530676322.237116] [008.1] [pid=25078] Current/Max Service Checks: 1/0 (inf% saturation)
[1530676322.237130] [12288.1] [pid=25078] ## Polling 0ms; sockets=8; events=180; iobs=0x25ea1c0
[1530676322.237166] [4096.2] [pid=25078] ## 0 descriptors had input
[1530676322.237187] [001.0] [pid=25078] handle_timed_event() start
[1530676322.237198] [064.1] [pid=25078] Making callbacks (type 1)...
[1530676322.237247] [008.0] [pid=25078] ** Timed Event ** Type: EVENT_SERVICE_CHECK, Run Time: Wed Jul 4 03:52:00 2018
[1530676322.237290] [008.0] [pid=25078] ** Service Check Event ==> Host: 'NUT-R1', Service: 'Dogfood Enforcement', Options: 1, Latency: 0.000384 sec
[1530676322.237308] [001.0] [pid=25078] run_scheduled_service_check() start
[1530676322.237348] [016.0] [pid=25078] Attempting to run scheduled check of service 'Dogfood Enforcement' on host 'NUT-R1': check options=1, latency=0.000384
[1530676322.237362] [001.0] [pid=25078] run_async_service_check()
[1530676322.237372] [001.0] [pid=25078] check_service_check_viability()
[1530676322.237381] [001.0] [pid=25078] check_service_parents()
[1530676322.237392] [064.1] [pid=25078] Making callbacks (type 6)...
[1530676322.237409] [016.0] [pid=25078] Checking service 'Dogfood Enforcement' on host 'NUT-R1'...
[1530676322.237422] [001.0] [pid=25078] get_raw_command_line_r()
[1530676322.237432] [2320.2] [pid=25078] Raw Command Input: $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
[1530676322.237442] [001.0] [pid=25078] process_macros_r()
[1530676322.237452] [2048.1] [pid=25078] **** BEGIN MACRO PROCESSING ***********
[1530676322.237461] [2048.1] [pid=25078] Processing: 'check_df_enf'
[1530676322.237471] [2048.2] [pid=25078] Processing part: 'check_df_enf'
[1530676322.237481] [2048.2] [pid=25078] Not currently in macro. Running output (12): 'check_df_enf'
[1530676322.237498] [2048.1] [pid=25078] Done. Final output: 'check_df_enf'
[1530676322.237508] [2048.1] [pid=25078] **** END MACRO PROCESSING *************
[1530676322.237517] [2320.2] [pid=25078] Expanded Command Output: $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
[1530676322.237527] [001.0] [pid=25078] process_macros_r()
[1530676322.237537] [2048.1] [pid=25078] **** BEGIN MACRO PROCESSING ***********
[1530676322.237545] [2048.1] [pid=25078] Processing: '$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$'
[1530676322.237555] [2048.2] [pid=25078] Processing part: ''
[1530676322.237565] [2048.2] [pid=25078] Not currently in macro. Running output (0): ''
[1530676322.237582] [2048.2] [pid=25078] Processing part: 'USER1'
[1530676322.237593] [2048.2] [pid=25078] Processed 'USER1', Free: 0
[1530676322.237611] [2048.2] [pid=25078] Processed 'USER1', Free: 0, Cleaning options: 3
[1530676322.237624] [2048.2] [pid=25078] Uncleaned macro. Running output (19): '/opt/nagios/plugins'
[1530676322.237634] [2048.2] [pid=25078] Just finished macro. Running output (19): '/opt/nagios/plugins'
[1530676322.237644] [2048.2] [pid=25078] Processing part: '/check_nrpe -H '
[1530676322.237654] [2048.2] [pid=25078] Not currently in macro. Running output (34): '/opt/nagios/plugins/check_nrpe -H '
[1530676322.237670] [2048.2] [pid=25078] Processing part: 'HOSTADDRESS'
[1530676322.237681] [2048.2] [pid=25078] Processed 'HOSTADDRESS', Free: 0
[1530676322.237690] [2048.2] [pid=25078] Processed 'HOSTADDRESS', Free: 0, Cleaning options: 3
[1530676322.237701] [2048.2] [pid=25078] Uncleaned macro. Running output (47): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108'
[1530676322.237710] [2048.2] [pid=25078] Just finished macro. Running output (47): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108'
[1530676322.237720] [2048.2] [pid=25078] Processing part: ' -c '
[1530676322.237729] [2048.2] [pid=25078] Not currently in macro. Running output (51): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108 -c '
[1530676322.237739] [2048.2] [pid=25078] Processing part: 'ARG1'
[1530676322.237749] [2048.2] [pid=25078] Processed 'ARG1', Free: 0
[1530676322.237768] [2048.2] [pid=25078] Processed 'ARG1', Free: 0, Cleaning options: 3
[1530676322.237779] [2048.2] [pid=25078] Uncleaned macro. Running output (63): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108 -c check_df_enf'
[1530676322.237789] [2048.2] [pid=25078] Just finished macro. Running output (63): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108 -c check_df_enf'
[1530676322.237798] [2048.2] [pid=25078] Processing part: ''
[1530676322.237809] [2048.2] [pid=25078] Not currently in macro. Running output (63): '/opt/nagios/plugins/check_nrpe -H 10.100.10.108 -c check_df_enf'
[1530676322.237825] [2048.1] [pid=25078] Done. Final output: '/opt/nagios/plugins/check_nrpe -H 10.100.10.108 -c check_df_enf'
[1530676322.237835] [2048.1] [pid=25078] **** END MACRO PROCESSING *************
Code: Select all
[1530676330.667881] [4096.2] [pid=25078] ## 1 descriptors had input
[1530676330.667889] [064.1] [pid=25078] Making callbacks (type 11)...
[1530676330.667926] [12288.1] [pid=25078] ## Polling 1500ms; sockets=8; events=179; iobs=0x25ea1c0
[1530676331.281520] [016.2] [pid=25078] Processing check result for service 'Dogfood Enforcement' on host 'NUT-R1'
[1530676331.281617] [001.0] [pid=25078] handle_async_service_check_result()
[1530676331.281630] [016.0] [pid=25078] ** Handling ACTIVE async check result for service 'Dogfood Enforcement' on host 'NUT-R1' from 'Core Worker 25085'...
[1530676331.281639] [016.1] [pid=25078] * OPTIONS: 1, SCHEDULED: 1, RESCHEDULE: 1, EXITED OK: 1, RETURN CODE: 2, OUTPUT:
CRITICAL - Dogfood wireless is down.
[1530676331.281649] [016.2] [pid=25078] Parsing check output...
Short Output: CRITICAL - Dogfood wireless is down.
Long Output: NULL
Perf Data: NULL
[1530676331.281657] [001.0] [pid=25078] get_service_check_return_code()
[1530676331.281664] [016.1] [pid=25078] Service is in a non-OK state!
[1530676331.281671] [016.1] [pid=25078] Host is currently UP, so we'll recheck its state to make sure...
[1530676331.281678] [016.2] [pid=25078] Service not ok, host is up but cached data isn't valid, scheduling host check
[1530676331.281685] [001.0] [pid=25078] schedule_host_check()
[1530676331.281701] [016.0] [pid=25078] Scheduling a non-forced, active check of host 'NUT-R1' @ Wed Jul 4 03:52:11 2018
[1530676331.281720] [016.2] [pid=25078] Found another host check event for this host @ Wed Jul 4 03:55:17 2018
[1530676331.281728] [016.2] [pid=25078] New host check event occurs before the existing (older) event, so it will be used instead.
[1530676331.281736] [016.2] [pid=25078] Scheduling new host check event.
[1530676331.281746] [064.1] [pid=25078] Making callbacks (type 1)...
[1530676331.281758] [001.0] [pid=25078] add_event()
[1530676331.281782] [064.1] [pid=25078] Making callbacks (type 12)...
[1530676331.281794] [016.1] [pid=25078] Service was OK at last check.
[1530676331.281805] [016.1] [pid=25078] Service is a non-OK state (CRITICAL)![1530676331.281843] [016.2] [pid=25078] Service experienced a state change
[1530676331.282007] [001.0] [pid=25078] check_for_service_flapping()
[1530676331.282037] [016.1] [pid=25078] Checking service 'Dogfood Enforcement' on host 'NUT-R1' for flapping...
[1530676331.282046] [001.0] [pid=25078] check_for_host_flapping()
[1530676331.282053] [016.1] [pid=25078] Checking host 'NUT-R1' for flapping...
[1530676331.282060] [016.2] [pid=25078] LFT=5.00, HFT=20.00, CPC=0.00, PSC=0.00%
[1530676331.282123] [016.1] [pid=25078] Host is not flapping (0.00% state change).
[1530676331.282184] [016.1] [pid=25078] Rescheduling next check of service at Wed Jul 4 03:57:02 2018
[1530676331.282202] [001.0] [pid=25078] get_next_valid_time()
[1530676331.282216] [001.0] [pid=25078] _get_matching_timerange()
[1530676331.282227] [001.0] [pid=25078] schedule_service_check()
[1530676331.282258] [016.0] [pid=25078] Scheduling a non-forced, active check of service 'Dogfood Enforcement' on host 'NUT-R1' @ Wed Jul 4 03:57:02 2018
[1530676331.282275] [016.2] [pid=25078] Scheduling new service check event.
[1530676331.282283] [001.0] [pid=25078] add_event()
[1530676331.282294] [064.1] [pid=25078] Making callbacks (type 13)...
[1530676331.282378] [064.1] [pid=25078] Making callbacks (type 2)...
[1530676331.282391] [001.0] [pid=25078] handle_service_event()
[1530676331.282417] [064.1] [pid=25078] Making callbacks (type 23)...
[1530676331.282428] [001.0] [pid=25078] run_global_service_event_handler()
[1530676331.282435] [001.0] [pid=25078] clear_volatile_macros_r()
[1530676331.282443] [016.2] [pid=25078] STATE: 2, TYPE: SOFT, CUR: 1, MAX: 4, LAST_STATE: 0, LAST_HARD: 2, NOTIFY: 0, LOGGED: 1, HANDLED: 1
[1530676331.282452] [064.1] [pid=25078] Making callbacks (type 6)...
[1530676331.282459] [064.1] [pid=25078] Making callbacks (type 13)...
[1530676331.282467] [16384.0] [pid=25078] destroy_job(0=CHECK)
[1530676331.282475] [16384.1] [pid=25078] * removing job->wp
From my searching and the documentation it would seem that the check should be scheduled once per minute (retry_interval until a hard state is reached, however, it looks like it checks the host and schedules the next check 5 minutes later (check_interval).
Below are the host and service templates in use:
Code: Select all
define host{
name generic-host ; The name of this host template
notifications_enabled 1 ; Host notifications are enabled
event_handler_enabled 1 ; Host event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
notification_period 24x7 ; Send host notifications at any time
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}
# Linux host definition template - This is NOT a real host, just a template!
define host{
name linux-server ; The name of this host template
use generic-host ; This template inherits other values from the generic-host template
check_period 24x7 ; By default, Linux hosts are checked round the clock
check_interval 5 ; Actively check the host every 5 minutes
retry_interval 1 ; Schedule host check retries at 1 minute intervals
max_check_attempts 10 ; Check each Linux host 10 times (max)
check_command check-host-alive ; Default command to check Linux hosts
# notification_period workhours ; Linux admins hate to be woken up, so we only notify during the day
# ; Note that the notification_period variable is being overridden from
# ; the value that is inherited from the generic-host template!
notification_period 24x7 ; Send host notifications at any time
notification_interval 120 ; Resend notifications every 2 hours
notification_options d,u,r ; Only send notifications for specific host states
contact_groups admins ; Notifications get sent to the admins by default
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}
define service{
name generic-service ; The 'name' of this service template
active_checks_enabled 1 ; Active service checks are enabled
passive_checks_enabled 1 ; Passive service checks are enabled/accepted
parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems)
obsess_over_service 1 ; We should obsess over this service (if necessary)
check_freshness 0 ; Default is to NOT check service 'freshness'
notifications_enabled 1 ; Service notifications are enabled
event_handler_enabled 1 ; Service event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
is_volatile 0 ; The service is not volatile
check_period 24x7 ; The service can be checked at any time of the day
max_check_attempts 3 ; Re-check the service up to 3 times in order to determine its final (hard) state
check_interval 10 ; Check the service every 10 minutes under normal conditions
retry_interval 2 ; Re-check the service every two minutes until a hard state can be determined
contact_groups admins ; Notifications get sent out to everyone in the 'admins' group
notification_options w,u,c,r ; Send notifications about warning, unknown, critical, and recovery events
notification_interval 60 ; Re-notify about service problems every hour
notification_period 24x7 ; Notifications can be sent out at any time
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}
# Local service definition template - This is NOT a real service, just a template!
define service{
name local-service ; The name of this service template
use generic-service ; Inherit default values from the generic-service definition
max_check_attempts 4 ; Re-check the service up to 4 times in order to determine its final (hard) state
check_interval 5 ; Check the service every 5 minutes under normal conditions
retry_interval 1 ; Re-check the service every minute until a hard state can be determined
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}
Code: Select all
efine host{
use linux-server ; Name of host template to use
; This host definition will inherit all variables that are defined
; in (or inherited by) the linux-server host template definition.
host_name NUT-R1
parents IMP-2960-TOR
alias NUT-R1
address 10.100.10.108
contact_groups admins
icon_image raspbian.gif
statusmap_image raspbian.gd2
}
define service{
use local-service ; Name of service template to use
host_name NUT-R1
service_description Dogfood Enforcement
contact_groups admins,support
check_command check_nrpe!check_df_enf
}