Page 1 of 1
Check-service going on same seconds no matter what
Posted: Sun Feb 23, 2014 1:25 am
by voicelink
We are having problem with our Nagios service checks. Software was running without problems for few years, but now we don't know what happened and causes our Nagios server have huge loads from time to time.
I think the problem is now that many of the checks are somehow running on same second, and that what causes huge load at that time.
We are checking about 4000 services and when looking at monitoring engine event queue we can see huge spikes, 600-800 at some time.
We first had service_inter_check_delay_method as SMART, when we started to have this problem we changed it to dumb, but didn't help. Still we get hundreds of checks at the same second. We also changed max_service_check_spread from 30 > 60 and didn't help.
Any ideas how we get service checks more evenly?
Re: Check-service going on same seconds no matter what
Posted: Mon Feb 24, 2014 11:55 am
by sreinhardt
Could you post your nagios.cfg please so we can have a look at exactly how it is configured presently?
Re: Check-service going on same seconds no matter what
Posted: Mon Feb 24, 2014 12:22 pm
by voicelink
Code: Select all
[root@nagiosxi etc]# more nagios.cfg
# MODIFIED
admin_email=root@localhost
admin_pager=root@localhost
translate_passive_host_checks=1
log_event_handlers=0
use_large_installation_tweaks=1
enable_environment_macros=0
# NDOUtils module
broker_module=/usr/local/nagios/bin/ndomod.o config_file=/usr/local/nagios/etc/
ndomod.cfg
# PNP settings - bulk mode with NCPD
process_performance_data=1
# service performance data
service_perfdata_file=/usr/local/nagios/var/service-perfdata
service_perfdata_file_template=DATATYPE::SERVICEPERFDATA\tTIMET::$TIMET$\tHOSTN
AME::$HOSTNAME$\tSERVICEDESC::$SERVICEDESC$\tSERVICEPERFDATA::$SERVICEPERFDATA$
\tSERVICECHECKCOMMAND::$SERVICECHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATE
TYPE::$HOSTSTATETYPE$\tSERVICESTATE::$SERVICESTATE$\tSERVICESTATETYPE::$SERVICE
STATETYPE$\tSERVICEOUTPUT::$SERVICEOUTPUT$
service_perfdata_file_mode=a
service_perfdata_file_processing_interval=15
service_perfdata_file_processing_command=process-service-perfdata-file-bulk
# host performance data
host_perfdata_file=/usr/local/nagios/var/host-perfdata
host_perfdata_file_template=DATATYPE::HOSTPERFDATA\tTIMET::$TIMET$\tHOSTNAME::$HOSTNAME$\tHOSTPERFDATA::$HOSTPERFDATA$\tHOSTCHECKCOMMAND::$HOSTCHECKCOMMAND$\tHOSTSTATE::$HOSTSTATE$\tHOSTSTATETYPE::$HOSTSTATETYPE$\tHOSTOUTPUT::$HOSTOUTPUT
$
host_perfdata_file_mode=a
host_perfdata_file_processing_interval=15
host_perfdata_file_processing_command=process-host-perfdata-file-bulk
# OBJECTS - UNMODIFIED
#cfg_file=/usr/local/nagios/etc/objects/commands.cfg
#cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
#cfg_file=/usr/local/nagios/etc/objects/localhost.cfg
#cfg_file=/usr/local/nagios/etc/objects/templates.cfg
#cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
# STATIC OBJECT DEFINITIONS (THESE DON'T GET EXPORTED/IMPORTED BY NAGIOSQL)
cfg_dir=/usr/local/nagios/etc/static
# OBJECTS EXPORTED FROM NAGIOSQL
cfg_file=/usr/local/nagios/etc/contacttemplates.cfg
cfg_file=/usr/local/nagios/etc/contactgroups.cfg
cfg_file=/usr/local/nagios/etc/contacts.cfg
cfg_file=/usr/local/nagios/etc/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/commands.cfg
cfg_file=/usr/local/nagios/etc/hostgroups.cfg
cfg_file=/usr/local/nagios/etc/servicegroups.cfg
cfg_file=/usr/local/nagios/etc/hosttemplates.cfg
cfg_file=/usr/local/nagios/etc/servicetemplates.cfg
cfg_file=/usr/local/nagios/etc/servicedependencies.cfg
cfg_file=/usr/local/nagios/etc/serviceescalations.cfg
cfg_file=/usr/local/nagios/etc/hostdependencies.cfg
cfg_file=/usr/local/nagios/etc/hostescalations.cfg
cfg_file=/usr/local/nagios/etc/hostextinfo.cfg
cfg_file=/usr/local/nagios/etc/serviceextinfo.cfg
cfg_dir=/usr/local/nagios/etc/hosts
cfg_dir=/usr/local/nagios/etc/services
# GLOBAL EVENT HANDLERS
global_host_event_handler=xi_host_event_handler
global_service_event_handler=xi_service_event_handler
# UNMODIFIED
accept_passive_host_checks=1
accept_passive_service_checks=1
additional_freshness_latency=15
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
bare_update_check=0
cached_host_check_horizon=15
cached_service_check_horizon=15
check_external_commands=1
check_for_orphaned_hosts=1
check_for_orphaned_services=1
check_for_updates=1
check_host_freshness=0
check_result_path=/usr/local/nagios/var/spool/checkresults
check_result_reaper_frequency=3
check_service_freshness=1
command_check_interval=-1
command_file=/usr/local/nagios/var/rw/nagios.cmd
daemon_dumps_core=0
date_format=us
debug_file=/usr/local/nagios/var/nagios.debug
debug_level=0
debug_verbosity=1
enable_embedded_perl=1
enable_event_handlers=1
enable_flap_detection=1
enable_notifications=1
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
event_broker_options=-1
event_handler_timeout=30
execute_host_checks=1
execute_service_checks=1
external_command_buffer_slots=4096
high_host_flap_threshold=20.0
high_service_flap_threshold=20.0
host_check_timeout=30
host_freshness_check_interval=60
host_inter_check_delay_method=s
illegal_macro_output_chars=`~$&|'"<>
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
interval_length=60
lock_file=/usr/local/nagios/var/nagios.lock
log_archive_path=/usr/local/nagios/var/archives
log_external_commands=0
log_file=/usr/local/nagios/var/nagios.log
log_host_retries=1
log_initial_states=0
log_notifications=1
log_passive_checks=0
log_rotation_method=d
log_service_retries=1
low_host_flap_threshold=5.0
low_service_flap_threshold=5.0
max_check_result_file_age=3600
max_check_result_reaper_time=10
max_concurrent_checks=0
max_debug_file_size=1000000
max_host_check_spread=60
max_service_check_spread=60
nagios_group=nagios
nagios_user=nagios
notification_timeout=30
object_cache_file=/usr/local/nagios/var/objects.cache
obsess_over_hosts=0
obsess_over_services=0
ocsp_timeout=5
p1_file=/usr/local/nagios/bin/p1.pl
passive_host_checks_are_soft=0
perfdata_timeout=5
precached_object_file=/usr/local/nagios/var/objects.precache
resource_file=/usr/local/nagios/etc/resource.cfg
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
retained_host_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_service_attribute_mask=0
retain_state_information=1
retention_update_interval=60
service_check_timeout=60
service_freshness_check_interval=60
service_inter_check_delay_method=d
service_interleave_factor=s
sleep_time=0.25
soft_state_dependencies=0
state_retention_file=/usr/local/nagios/var/retention.dat
status_file=/usr/local/nagios/var/status.dat
status_update_interval=10
temp_file=/usr/local/nagios/var/nagios.tmp
temp_path=/tmp
use_aggressive_host_checking=0
use_embedded_perl_implicitly=1
use_regexp_matching=0
use_retained_program_state=1
use_retained_scheduling_info=1
use_syslog=1
use_true_regexp_matching=0
Moderator note: Please use code wraps around long output. See the changes I made to your post for an example.
Re: Check-service going on same seconds no matter what
Posted: Mon Feb 24, 2014 5:05 pm
by abrist
A few questions:
1) What does io wait look like on this server?
2) Have you tried to update any xi component (specifically core)? (don't update core if you have not done so as it will break xi)
3) Any issues with the dbs:
4) Is ndo running normally?
5) Does the issue go away for a while when you restart ndo:
Re: Check-service going on same seconds no matter what
Posted: Mon Feb 24, 2014 11:02 pm
by voicelink
i/o wait is 0-1% usually
We will check the logs
Re: Check-service going on same seconds no matter what
Posted: Tue Feb 25, 2014 1:18 am
by voicelink
[root@nagiosxi ~]# grep crashed /var/log/mysqld.log
[root@nagiosxi ~]# service ndo2db status
ndo2db (pid 2014) is running...
[root@nagiosxi ~]# service ndo2db restart
Stopping ndo2db: head: cannot open `/usr/local/nagios/var/ndo2db.lock' for reading: No such file or directory
done.
Starting ndo2db: done.
Re: Check-service going on same seconds no matter what
Posted: Tue Feb 25, 2014 1:52 pm
by slansing
Did the issue go away for a while as abrist asked? If so, for how long?
Re: Check-service going on same seconds no matter what
Posted: Tue Feb 25, 2014 11:07 pm
by voicelink
Yesterday we got five times load warning and day before six times. So pretty much same amount after restarting ndo
Re: Check-service going on same seconds no matter what
Posted: Wed Feb 26, 2014 1:34 pm
by lmiltchev
Run "top" and place the output in code wraps. Thanks!