At the company I work at we have a cluster of four independent Nagios Core installations, made high-available and distributed using Merlin. Furthermore the only extra add-on enabled is Nconf.
All servers have powerful specifications and are overkill for the amount of host and service checks (+- 178 / 1876) the nodes each perform. CPU is 2x 8-core Intel E5640 @ 2.67GHz with 16GB RAM.
Now we see the Nagios daemon crash frequently on the different nodes, at different times. Generally the daemon crashes on one node at least once in two days, but often more (daily). We restart the daemon and it may stay up for days, but sooner or later it will crash again in the same week.
I have no clue why this happens, and why all 4 servers have the same behavior but never at the same moment. I saved log files for each crash of: dmesg, strace, nagios.log, nagios.debug and nagios.debug.old. I see this pattern, the last thing Nagios performs before crashing is:
Code: Select all
Moving temp check result file '/var/log/nagios/spool/checkresults/checkt97WCW' to queue file '/var/log/nagios/spool/checkresults/c3eiCkE'...Strace shows the same interesting pattern each time, which I find very strange:
Code: Select all
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO (No such device or address)
writev(2, [{"*** glibc detected *** ", 23}, {"/usr/sbin/nagios", 16}, {": ", 2}, {"double free or corruption (fastt"..., 35}, {": 0x", 4}, {"0000000002ca6170", 16}, {" ***\n", 5}], 7) = 101
I also attached the other log files: nagios.debug,, dmesg and strace.
I hope someone has seen this before and knows of a solution or something to try to debug these weird and nasty crashes.
Thanks a lot in advance!
Logs following here:
NAGIOS.DEBUG
Code: Select all
tail padm121_2014_01_14_16\:10\:01_nagios.debug | perl -pe 's/(\d+)/localtime($1)/e'
[Tue Jan 14 16:08:46 2014.361224] [016.2] [pid=20022] Moving temp check result file '/var/log/nagios/spool/checkresults/checkRdPYVP' to queue file '/var/log/nagios/spool/checkresults/cNnYx2w'...
[Tue Jan 14 16:08:46 2014.379104] [016.2] [pid=20025] Moving temp check result file '/var/log/nagios/spool/checkresults/checkPQKNml' to queue file '/var/log/nagios/spool/checkresults/c6CRAw2'...
[Tue Jan 14 16:08:46 2014.392623] [016.2] [pid=20027] Moving temp check result file '/var/log/nagios/spool/checkresults/checkVei3OQ' to queue file '/var/log/nagios/spool/checkresults/c0hFh1x'...
[Tue Jan 14 16:08:46 2014.418309] [016.2] [pid=20034] Moving temp check result file '/var/log/nagios/spool/checkresults/checkPajSlm' to queue file '/var/log/nagios/spool/checkresults/cXldKC3'...
[Tue Jan 14 16:08:46 2014.427440] [016.2] [pid=20037] Moving temp check result file '/var/log/nagios/spool/checkresults/checkhiKqWR' to queue file '/var/log/nagios/spool/checkresults/cGANWez'...
[Tue Jan 14 16:08:46 2014.454840] [016.2] [pid=20040] Moving temp check result file '/var/log/nagios/spool/checkresults/checkL1JMAn' to queue file '/var/log/nagios/spool/checkresults/cJlofY4'...
[Tue Jan 14 16:08:46 2014.471208] [016.2] [pid=20043] Moving temp check result file '/var/log/nagios/spool/checkresults/checkj2nLiT' to queue file '/var/log/nagios/spool/checkresults/cMrdbJA'...
[Tue Jan 14 16:08:46 2014.497833] [016.2] [pid=20047] Moving temp check result file '/var/log/nagios/spool/checkresults/check7yYi4o' to queue file '/var/log/nagios/spool/checkresults/cSZHwz6'...
[Tue Jan 14 16:08:46 2014.507346] [016.2] [pid=20049] Moving temp check result file '/var/log/nagios/spool/checkresults/checkHfYTRU' to queue file '/var/log/nagios/spool/checkresults/cYJ9PoC'...
[Tue Jan 14 16:08:46 2014.566789] [016.2] [pid=20058] Moving temp check result file '/var/log/nagios/spool/checkresults/checkt97WCW' to queue file '/var/log/nagios/spool/checkresults/c3eiCkE'...
Code: Select all
Call Trace:
nagios/23148: potentially unexpected fatal signal 6.
CPU 2
Modules linked in: mptctl mptbase nfs lockd fscache auth_rpcgss nfs_acl sunrpc bonding 8021q garp stp llc ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack iptable_filter ip_tables power_meter e1000e be2iscsi iscsi_boot_sysfs libiscsi scsi_transport_iscsi be2net serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt sg i7core_edac edac_core shpchp ext4 jbd2 mbcache sd_mod crc_t10dif hpsa dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
Pid: 23148, comm: nagios Not tainted 2.6.32-358.23.2.el6.x86_64 #1 HP ProLiant BL460c G7
RIP: 0033:[<00007f78f85fc8e5>] [<00007f78f85fc8e5>] 0x7f78f85fc8e5
RSP: 002b:00007fffd1d6e018 EFLAGS: 00000206
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffffffffffff
RDX: 0000000000000006 RSI: 0000000000005a6c RDI: 0000000000005a6c
RBP: 00007fffd1d6ea60 R08: 0000000000000000 R09: 00000000004281c0
R10: 0000000000000008 R11: 0000000000000206 R12: 000000000000001b
R13: 00007fffd1d6e340 R14: 0000000000000065 R15: 0000000000000007
FS: 00007f78f9214700(0000) GS:ffff880028220000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00007f78f8724ae0 CR3: 00000003fa424000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process nagios (pid: 23148, threadinfo ffff8803692dc000, task ffff880406cbe040)
Call Trace:
Code: Select all
stat("/var/log/nagios/spool/checkresults/cfe4FDU", {st_mode=S_IFREG|0600, st_size=437, ...}) = 0
stat("/var/log/nagios/spool/checkresults/cfe4FDU.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.188359] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623099
open("/var/log/nagios/spool/checkresults/cfe4FDU", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=437, ...}) = 0
mmap(NULL, 437, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
munmap(0x7f78f9181000, 437) = 0
close(26) = 0
unlink("/var/log/nagios/spool/checkresults/cfe4FDU") = 0
unlink("/var/log/nagios/spool/checkresults/cfe4FDU.ok") = 0
stat("/var/log/nagios/spool/checkresults/cXwtOh0", {st_mode=S_IFREG|0600, st_size=415, ...}) = 0
stat("/var/log/nagios/spool/checkresults/cXwtOh0.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.188941] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623214
open("/var/log/nagios/spool/checkresults/cXwtOh0", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=415, ...}) = 0
mmap(NULL, 415, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
munmap(0x7f78f9181000, 415) = 0
close(26) = 0
unlink("/var/log/nagios/spool/checkresults/cXwtOh0") = 0
unlink("/var/log/nagios/spool/checkresults/cXwtOh0.ok") = 0
stat("/var/log/nagios/spool/checkresults/ckGvgEr", {st_mode=S_IFREG|0600, st_size=571, ...}) = 0
stat("/var/log/nagios/spool/checkresults/ckGvgEr.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.189443] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623624
open("/var/log/nagios/spool/checkresults/ckGvgEr", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=571, ...}) = 0
mmap(NULL, 571, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
munmap(0x7f78f9181000, 571) = 0
close(26) = 0
unlink("/var/log/nagios/spool/checkresults/ckGvgEr") = 0
unlink("/var/log/nagios/spool/checkresults/ckGvgEr.ok") = 0
stat("/var/log/nagios/spool/checkresults/cosNr1h", {st_mode=S_IFREG|0600, st_size=378, ...}) = 0
stat("/var/log/nagios/spool/checkresults/cosNr1h.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.190093] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623739
open("/var/log/nagios/spool/checkresults/cosNr1h", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=378, ...}) = 0
mmap(NULL, 378, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
munmap(0x7f78f9181000, 378) = 0
close(26) = 0
unlink("/var/log/nagios/spool/checkresults/cosNr1h") = 0
unlink("/var/log/nagios/spool/checkresults/cosNr1h.ok") = 0
stat("/var/log/nagios/spool/checkresults/c8a1h2R", {st_mode=S_IFREG|0600, st_size=461, ...}) = 0
stat("/var/log/nagios/spool/checkresults/c8a1h2R.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.190605] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623854
open("/var/log/nagios/spool/checkresults/c8a1h2R", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=461, ...}) = 0
mmap(NULL, 461, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
munmap(0x7f78f9181000, 461) = 0
close(26)
unlink("/var/log/nagios/spool/checkresults/c8a1h2R") = 0
unlink("/var/log/nagios/spool/checkresults/c8a1h2R.ok") = 0
stat("/var/log/nagios/spool/checkresults/cAJJ2LR", {st_mode=S_IFREG|0600, st_size=427, ...}) = 0
stat("/var/log/nagios/spool/checkresults/cAJJ2LR.ok", {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(3, "[1389712125.191116] [016.1] [pid"..., 115) = 115
lseek(3, 0, SEEK_CUR) = 38623969
open("/var/log/nagios/spool/checkresults/cAJJ2LR", O_RDONLY) = 26
fstat(26, {st_mode=S_IFREG|0600, st_size=427, ...}) = 0
mmap(NULL, 427, PROT_READ, MAP_PRIVATE, 26, 0) = 0x7f78f9181000
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = -1 ENXIO (No such device or address)
writev(2, [{"*** glibc detected *** ", 23}, {"/usr/sbin/nagios", 16}, {": ", 2}, {"double free or corruption (fastt"..., 35}, {": 0x", 4}, {"0000000002ca6170", 16}, {" ***\n", 5}], 7) = 101
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f78f9180000
futex(0x7f78f895b600, FUTEX_WAKE_PRIVATE, 2147483647) = 0
futex(0x7f78f7c3fa90, FUTEX_WAKE_PRIVATE, 2147483647) = 0
write(2, "======= Backtrace: =========\n", 29) = 29
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"76126", 5}, {")", 1}, {"[0x", 3}, {"7f78f8640126", 12}, {"]\n", 2}], 8) = 43
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"my_strtok", 9}, {"+0x", 3}, {"26", 2}, {")", 1}, {"[0x", 3}, {"4160a6", 6}, {"]\n", 2}], 9) = 43
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"process_check_result_file", 25}, {"+0x", 3}, {"bd", 2}, {")", 1}, {"[0x", 3}, {"43fb3d", 6}, {"]\n", 2}], 9) = 59
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"process_check_result_queue", 26}, {"+0x", 3}, {"151", 3}, {")", 1}, {"[0x", 3}, {"4404a1", 6}, {"]\n", 2}], 9) = 61
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"reap_check_results", 18}, {"+0x", 3}, {"5c", 2}, {")", 1}, {"[0x", 3}, {"41db0c", 6}, {"]\n", 2}], 9) = 52
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"handle_timed_event", 18}, {"+0x", 3}, {"2d8", 3}, {")", 1}, {"[0x", 3}, {"42b118", 6}, {"]\n", 2}], 9) = 53
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"event_execution_loop", 20}, {"+0x", 3}, {"474", 3}, {")", 1}, {"[0x", 3}, {"42b804", 6}, {"]\n", 2}], 9) = 55
writev(2, [{"/usr/sbin/nagios", 16}, {"(", 1}, {"main", 4}, {"+0x", 3}, {"609", 3}, {")", 1}, {"[0x", 3}, {"412a89", 6}, {"]\n", 2}], 9) = 39
writev(2, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"__libc_start_main", 17}, {"+0x", 3}, {"fd", 2}, {")", 1}, {"[0x", 3}, {"7f78f85e8cdd", 12}, {"]\n", 2}], 9) = 57
writev(2, [{"/usr/sbin/nagios", 16}, {"[0x", 3}, {"4123b9", 6}, {"]\n", 2}], 4) = 27
write(2, "======= Memory map: ========\n", 29) = 29
open("/proc/self/maps", O_RDONLY) = 27
read(27, "00400000-0049b000 r-xp 00000000 "..., 1024) = 1024
write(2, "00400000-0049b000 r-xp 00000000 "..., 1024) = 1024
read(27, "0000 00:00 0 \n7f78cc15e000-7f78d"..., 1024) = 1024
write(2, "0000 00:00 0 \n7f78cc15e000-7f78d"..., 1024) = 1024
read(27, " 00000000 00:00 0 \n7f78f73cb000-"..., 1024) = 1024
write(2, " 00000000 00:00 0 \n7f78f73cb000-"..., 1024) = 1024
read(27, "00 fd:02 524657 "..., 1024) = 1024
write(2, "00 fd:02 524657 "..., 1024) = 1024
read(27, "31000-7f78f7f46000 rw-p 00000000"..., 1024) = 1024
write(2, "31000-7f78f7f46000 rw-p 00000000"..., 1024) = 1024
read(27, "60 /lib64/li"..., 1024) = 1024
write(2, "60 /lib64/li"..., 1024) = 1024
read(27, "ib64/libpthread-2.12.so\n7f78f8b7"..., 1024) = 1024
write(2, "ib64/libpthread-2.12.so\n7f78f8b7"..., 1024) = 1024
read(27, "rw-p 00000000 00:00 0 \n7f78f918b"..., 1024) = 1024
write(2, "rw-p 00000000 00:00 0 \n7f78f918b"..., 1024) = 1024
read(27, "2000 r--p 0001f000 fd:00 394747 "..., 1024) = 455
write(2, "2000 r--p 0001f000 fd:00 394747 "..., 455) = 455
read(27, "", 1024) = 0
close(27) = 0
rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
tgkill(23148, 23148, SIGABRT) = 0
--- SIGABRT (Aborted) @ 0 (0) ---
Process 23148 detached