From time to time gearmand stops receiving check results from all remote servers, and Nagios stops working until I restart the gearmand and nagios services on the central server. Is this an issue with Nagios, with Gearmand, or with my configuration? Could anyone help me make it work stably?
2305 hosts and 7421 services are monitored on the central Nagios. ~90% of them are received passively.
My configuration points:
1. Central server:
1.1 /usr/local/nagios/etc/nagios.cfg:
Code: Select all
broker_module=/usr/lib/mod_gearman/mod_gearman.o config=/etc/mod_gearman/mod_gearman_neb.conf
Code: Select all
debug=1
logfile=/var/log/mod_gearman/mod_gearman_neb.log
server=localhost:4730
#dupserver=<host>:<port>
eventhandler=no
services=no
hosts=no
#hostgroups=name1
#hostgroups=name2,name3
#servicegroups=name1,name2,name3
do_hostchecks=no
encryption=yes
key=my_pass
#keyfile=/path/to/secret.file
use_uniq_jobs=on
# NEB Module Config
localhostgroups=
localservicegroups=
#queue_custom_variable=WORKER
result_workers=1
perfdata=no
perfdata_mode=1
orphan_host_checks=yes
accept_clear_results=no
2.1 /usr/local/nagios/etc/nagios.cfg:
Code: Select all
ocsp_command=gmlonp-submit_service_send_gearman
ochp_command=gmlonp-submit_host_send_gearman
Code: Select all
define command{
command_name gmlonp-submit_host_send_gearman
command_line /usr/bin/send_gearman --server=10.93.1.51:4730 --encryption=yes --key=my_pass --host="$HOSTNAME$" --returncode=$HOSTSTATEID$ --message="$HOSTOUTPUT$|$HOSTPERFDATA$"
}
define command{
command_name gmlonp-submit_service_send_gearman
command_line /usr/bin/send_gearman --server=10.93.1.51:4730 --encryption=yes --key=my_pass --host="$HOSTNAME$" --service="$SERVICEDESC$" --returncode=$SERVICESTATEID$ --message="$SERVICEOUTPUT$|$SERVICEPERFDATA$"
}
Code: Select all
[root@rl-nms-01 ~]# tail -f /var/log/mod_gearman/mod_gearman_neb.log
[2015-10-06 07:30:37][10993][DEBUG] service job completed: vtbonp-sql-1c Disk E Space: 0
[2015-10-06 07:30:37][10993][DEBUG] service job completed: AZS160004 Memory Usage: 0
[2015-10-06 07:30:37][10993][DEBUG] host job completed: mnsonp-kis-ib: 0
[2015-10-06 07:30:37][10993][DEBUG] service job completed: AZS800047 Disk C Usage: 0
[2015-10-06 07:30:37][10993][DEBUG] host job completed: AZS160034: 0
[2015-10-06 07:30:37][10993][DEBUG] host job completed: POS650045: 0
[2015-10-06 07:30:37][10993][DEBUG] service job completed: AZS800015 PING: 0
[2015-10-06 07:30:37][10993][DEBUG] host job completed: DSL370082: 1
[2015-10-06 07:30:37][10993][DEBUG] service job completed: AZS370056 CPU Load: 0
[2015-10-06 07:30:37][10993][DEBUG] service job completed: NET370449 Memory Usage: 0
The output of netstat -na | grep :4730 is attached.
Code: Select all
[root@rl-nms-01 ~]# /etc/init.d/nagios status
nagios (pid 25269) is running...
[root@rl-nms-01 ~]# /etc/init.d/gearmand status
gearmand (pid 10897) is running...