Page 1 of 2

retry interval

Posted: Tue Jun 25, 2013 11:32 am
by paul.jobb
It appears that with some of our checks the retry interval is being ignored and the check is retrying every 10 seconds. This is having a negative impact on our availability numbers.

Attached is my service template. I have the retry interval set at 1 minute, but based on the attached screenshot it appears the check is retrying after 10 seconds.

define service {
name GOA-Service-check_mssql_qry
check_command GOA_mssql_query
max_check_attempts 3
check_interval 10
retry_interval 1
check_period 24x7
process_perf_data 1
notification_interval 60
notification_period 24x7
notification_options w,u,r,c,f
_CRIT 200
_DB DB
_QUERY select ReturnField from table
_RESULT ZYXWVUTSRQPONMLKJIHGFEDCBA
_WARN 50
register 0

}
Capture.PNG
From what I can tell there isn't any latency taking place. We are using mod_gearman to broker the service checks.
Capture2.PNG

Re: retry interval

Posted: Tue Jun 25, 2013 12:08 pm
by abrist
Have you changed the "interval_length" directive in the nagios.cfg? The default is 60, as in 1 interval = 60 seconds. If you have changed this to 10, that would explain why 1 interval is 10 seconds instead of 60.

Code: Select all

grep interval_length /usr/local/nagios/etc/nagios.cfg

Re: retry interval

Posted: Tue Jun 25, 2013 12:14 pm
by paul.jobb
No, I just checked; it is set to 60.

Code: Select all

# MODIFIED
admin_email=root@localhost
admin_pager=root@localhost
translate_passive_host_checks=1
log_event_handlers=0
use_large_installation_tweaks=1
enable_environment_macros=0


# NDOUtils module
broker_module=/usr/local/nagios/bin/ndomod.o config_file=/usr/local/nagios/etc/ndomod.cfg


# PNP settings - bulk mode with NCPD
process_performance_data=1
# service performance data
service_perfdata_file=/usr/local/nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICESTATETYPE$\tSERVICEOUTPUT::$SERVICEOUTPUT$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file-bulk
# host performance data
host_perfdata_file=/usr/local/nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tHOSTOUTPUT::$HOSTOUTPUT$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file-bulk


# OBJECTS - UNMODIFIED
#cfg_file=/usr/local/nagios/etc/objects/commands.cfg
#cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
#cfg_file=/usr/local/nagios/etc/objects/localhost.cfg
#cfg_file=/usr/local/nagios/etc/objects/templates.cfg
#cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg


# STATIC OBJECT DEFINITIONS (THESE DON'T GET EXPORTED/IMPORTED BY NAGIOSQL)
cfg_dir=/usr/local/nagios/etc/static

# OBJECTS EXPORTED FROM NAGIOSQL
cfg_file=/usr/local/nagios/etc/contacttemplates.cfg
cfg_file=/usr/local/nagios/etc/contactgroups.cfg
cfg_file=/usr/local/nagios/etc/contacts.cfg
cfg_file=/usr/local/nagios/etc/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/commands.cfg
cfg_file=/usr/local/nagios/etc/hostgroups.cfg
cfg_file=/usr/local/nagios/etc/servicegroups.cfg
cfg_file=/usr/local/nagios/etc/hosttemplates.cfg
cfg_file=/usr/local/nagios/etc/servicetemplates.cfg
cfg_file=/usr/local/nagios/etc/servicedependencies.cfg
cfg_file=/usr/local/nagios/etc/serviceescalations.cfg
cfg_file=/usr/local/nagios/etc/hostdependencies.cfg
cfg_file=/usr/local/nagios/etc/hostescalations.cfg
cfg_file=/usr/local/nagios/etc/hostextinfo.cfg
cfg_file=/usr/local/nagios/etc/serviceextinfo.cfg
cfg_dir=/usr/local/nagios/etc/hosts
cfg_dir=/usr/local/nagios/etc/services

# GLOBAL EVENT HANDLERS
global_host_event_handler=xi_host_event_handler
global_service_event_handler=xi_service_event_handler



# UNMODIFIED
accept_passive_host_checks=1
accept_passive_service_checks=1
additional_freshness_latency=15
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
bare_update_check=0
cached_host_check_horizon=15
cached_service_check_horizon=15
check_external_commands=1
check_for_orphaned_hosts=1
check_for_orphaned_services=1
check_for_updates=1
check_host_freshness=0
check_result_path=/usr/local/nagios/var/spool/checkresults
check_result_reaper_frequency=10
check_service_freshness=1
command_check_interval=-1
command_file=/usr/local/nagios/var/rw/nagios.cmd
daemon_dumps_core=0
date_format=us
debug_file=/usr/local/nagios/var/nagios.debug
debug_level=0
debug_verbosity=1
enable_embedded_perl=1
enable_event_handlers=1
enable_flap_detection=1
enable_notifications=1
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
event_broker_options=-1
event_handler_timeout=30
execute_host_checks=1
execute_service_checks=1
external_command_buffer_slots=4096
high_host_flap_threshold=20.0
high_service_flap_threshold=20.0
host_check_timeout=30
host_freshness_check_interval=60
host_inter_check_delay_method=s
illegal_macro_output_chars=`~$&|'"<>
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
interval_length=60
lock_file=/usr/local/nagios/var/nagios.lock
log_archive_path=/usr/local/nagios/var/archives
log_external_commands=0
log_file=/usr/local/nagios/var/nagios.log
log_host_retries=1
log_initial_states=0
log_notifications=1
log_passive_checks=0
log_rotation_method=d
log_service_retries=1
low_host_flap_threshold=5.0
low_service_flap_threshold=5.0
max_check_result_file_age=3600
max_check_result_reaper_time=30
max_concurrent_checks=0
max_debug_file_size=1000000
max_host_check_spread=30
max_service_check_spread=30
nagios_group=nagios
nagios_user=nagios
notification_timeout=30
object_cache_file=/usr/local/nagios/var/objects.cache
obsess_over_hosts=0
obsess_over_services=0
ocsp_timeout=5
p1_file=/usr/local/nagios/bin/p1.pl
passive_host_checks_are_soft=0
perfdata_timeout=5
precached_object_file=/usr/local/nagios/var/objects.precache
resource_file=/usr/local/nagios/etc/resource.cfg
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
retained_host_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_service_attribute_mask=0
retain_state_information=1
retention_update_interval=60
service_check_timeout=60
service_freshness_check_interval=60
service_inter_check_delay_method=s
service_interleave_factor=s
sleep_time=0.25
soft_state_dependencies=0
state_retention_file=/usr/local/nagios/var/retention.dat
status_file=/usr/local/nagios/var/status.dat
status_update_interval=10
temp_file=/usr/local/nagios/var/nagios.tmp
temp_path=/tmp
use_aggressive_host_checking=0
use_embedded_perl_implicitly=1
use_regexp_matching=0
use_retained_program_state=1
use_retained_scheduling_info=1
use_syslog=1
use_true_regexp_matching=0
broker_module=/usr/lib64/mod_gearman/mod_gearman.o config=/etc/mod_gearman/mod_gearman_neb.conf eventhandler=no

Re: retry interval

Posted: Tue Jun 25, 2013 12:20 pm
by slansing
Is that the complete service information? It sounds like you may have freshness checking enabled:

http://nagios.sourceforge.net/docs/3_0/freshness.html

Re: retry interval

Posted: Tue Jun 25, 2013 12:26 pm
by paul.jobb
This is the service record; it uses the template I attached earlier.

###############################################################################
#
# Service configuration file
#
# Created by: Nagios QL Version 3.0.3
# Date: 2013-06-21 15:22:17
# Version: Nagios 3.x config file
#
# --- DO NOT EDIT THIS FILE BY HAND ---
# Nagios QL will overwite all manual settings during the next update
#
###############################################################################

define service {
host_name EDM-GOA-SQL-101
service_description MSSQL Query - EDM-GOA-SQL-101
use GOA-Service-check_mssql_qry
register 1
}

###############################################################################
#
# Service configuration file
#
# END OF FILE
#
###############################################################################

Re: retry interval

Posted: Tue Jun 25, 2013 2:16 pm
by abrist
Can we see the contents of the template:

Code: Select all

GOA-Service-check_mssql_qry

Re: retry interval

Posted: Tue Jun 25, 2013 2:33 pm
by paul.jobb
define service {
name GOA-Service-check_mssql_qry
check_command GOA_mssql_query
max_check_attempts 3
check_interval 10
retry_interval 1
check_period 24x7
process_perf_data 1
notification_interval 60
notification_period 24x7
notification_options w,u,r,c,f
_CRIT 200
_DB CTOPC
_PORT 1179
_QUERY select ReturnField from CTOPC_MON
_RESULT ZYXWVUTSRQPONMLKJIHGFEDCBA
_WARN 50
register 0

}

Re: retry interval

Posted: Wed Jun 26, 2013 7:37 am
by scottwilkerson
Can you attach a screenshot of the Advanced tab of the service status detail page for one of these servers that is checking every 10 seconds?

Thanks

Re: retry interval

Posted: Wed Jun 26, 2013 9:49 am
by paul.jobb
The check times seem to be running per schedule; it's the retries that seem to be the issue...
screenshot-sql-101.png

Re: retry interval

Posted: Wed Jun 26, 2013 10:00 am
by slansing
Hmm, could you force this host, or another with this issue, into a down state and then post the same screenshot?