To keep things simple, I'm presenting all the information for a single host/service check.
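This service is in a HARD CRITICAL state, but current_notification_number=0 (and current_notification_id, last_notification and next_notification are all 0 as well), so no notification appears to have been sent for it.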
Having pored over all of this, I can't see what is preventing it from triggering a notification.
Sample logs:
Debug output (full, grepped for this host/service):
Code: Select all
[Tue May 7 16:52:38 2019.418912] [016.1] [pid=112134] Check results for service 'MEMORY' on host 'REDACTED_HOST' are fresh.
[Tue May 7 16:53:38 2019.079681] [016.1] [pid=112134] Check results for service 'MEMORY' on host 'REDACTED_HOST' are stale by 0d 0h 0m 5s (threshold=0d 0h 10m 0s). Forcing an immediate check of the service...
[Tue May 7 16:53:38 2019.079696] [016.0] [pid=112134] Scheduling a forced, active check of service 'MEMORY' on host 'REDACTED_HOST' @ Tue May 7 16:53:37 2019
[Tue May 7 16:53:38 2019.514079] [008.0] [pid=112134] ** Service Check Event ==> Host: 'REDACTED_HOST', Service: 'MEMORY', Options: 3, Latency: 0.434368 sec
[Tue May 7 16:53:38 2019.514091] [016.0] [pid=112134] Attempting to run scheduled check of service 'MEMORY' on host 'REDACTED_HOST': check options=3, latency=0.434368
[Tue May 7 16:53:38 2019.514120] [016.0] [pid=112134] Checking service 'MEMORY' on host 'REDACTED_HOST'...
[Tue May 7 16:53:53 2019.538269] [016.0] [pid=112134] ** Handling ACTIVE async check result for service 'MEMORY' on host 'REDACTED_HOST' from 'Core Worker 112144'...
[Tue May 7 16:53:53 2019.538284] [016.1] [pid=112134] Check results for service 'MEMORY' on host 'REDACTED_HOST' are stale by 0d 0h 0m 21s (threshold=0d 0h 10m 0s). Forcing an immediate check of the service...
[Tue May 7 16:53:53 2019.538419] [016.1] [pid=112134] Checking service 'MEMORY' on host 'REDACTED_HOST' for flapping...
[Tue May 7 16:54:38 2019.101310] [016.1] [pid=112134] Check results for service 'MEMORY' on host 'REDACTED_HOST' are fresh.
[Tue May 7 16:55:37 2019.660318] [016.1] [pid=112134] Check results for service 'MEMORY' on host 'REDACTED_HOST' are fresh.
Code: Select all
[Tue May 7 16:53:38 2019] Warning: The results of service 'MEMORY' on host 'REDACTED_HOST' are stale by 0d 0h 0m 5s (threshold=0d 0h 10m 0s). I'm forcing an immediate check of the service.
Config information (everything except nagios.cfg is taken from objects.cache, so that inherited settings from 'use' templates etc. are accounted for):
nagios.cfg (cat nagios.cfg | grep -v -e '^#' -e '^$' | sort):
Code: Select all
accept_passive_host_checks=1
accept_passive_service_checks=1
additional_freshness_latency=15
admin_email=root@localhost
admin_pager=pagenagios@localhost
auto_reschedule_checks=0
auto_rescheduling_interval=30
auto_rescheduling_window=180
bare_update_check=0
broker_module=/usr/local/nagios/lib/mk-livestatus/livestatus.o /usr/local/nagios/var/rw/live max_response_size=1048576000
cached_host_check_horizon=15
cached_service_check_horizon=15
cfg_file=/etc/nagios.live/datacat_commands.cfg
cfg_file=/etc/nagios.live/datacat_hostgroups.cfg
cfg_file=/etc/nagios.live/datacat_hosts.cfg
cfg_file=/etc/nagios.live/datacat_servicegroups.cfg
cfg_file=/etc/nagios.live/datacat_services.cfg
cfg_file=/etc/nagios.live/nagios_command.cfg
cfg_file=/etc/nagios.live/nagios_contact.cfg
cfg_file=/etc/nagios.live/nagios_contactgroup.cfg
cfg_file=/etc/nagios.live/nagios_host.cfg
cfg_file=/etc/nagios.live/nagios_hostdependency.cfg
cfg_file=/etc/nagios.live/nagios_hostgroup.cfg
cfg_file=/etc/nagios.live/nagios_service.cfg
cfg_file=/etc/nagios.live/nagios_servicedependency.cfg
cfg_file=/etc/nagios.live/nagios_servicegroup.cfg
cfg_file=/etc/nagios.live/nagios_timeperiod.cfg
cfg_file=/etc/nagios.live/priority_service.cfg
check_external_commands=1
check_for_orphaned_hosts=1
check_for_orphaned_services=1
check_for_updates=1
check_host_freshness=0
check_result_path=/var/log/nagios/spool/checkresults
check_result_reaper_frequency=10
check_service_freshness=1
command_check_interval=-1
command_file=/usr/local/nagios/var/nagios.cmd
daemon_dumps_core=0
date_format=iso8601
debug_file=/var/log/nagios/nagios.debug
debug_level=-1
debug_verbosity=1
enable_embedded_perl=1
enable_environment_macros=0
enable_event_handlers=1
enable_flap_detection=0
enable_notifications=1
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
event_broker_options=-1
event_handler_timeout=30
execute_host_checks=1
execute_service_checks=1
external_command_buffer_slots=4096
high_host_flap_threshold=20.0
high_service_flap_threshold=20.0
host_check_timeout=30
host_freshness_check_interval=60
host_inter_check_delay_method=s
illegal_macro_output_chars=`~$&|'"<>
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
interval_length=60
lock_file=/usr/local/nagios/var/nagios.lock
log_archive_path=/var/log/nagios/archives
log_event_handlers=1
log_external_commands=1
log_file=/var/log/nagios/nagios.log
log_host_retries=1
log_initial_states=0
log_notifications=1
log_passive_checks=1
log_rotation_method=h
log_service_retries=1
low_host_flap_threshold=5.0
low_service_flap_threshold=5.0
max_check_result_file_age=3600
max_check_result_reaper_time=30
max_concurrent_checks=0
max_debug_file_size=100000000
max_host_check_spread=30
max_service_check_spread=30
nagios_group=nagios
nagios_user=nagios
notification_timeout=30
object_cache_file=/var/log/nagios/objects.cache
obsess_over_hosts=0
obsess_over_services=0
ocsp_timeout=5
p1_file=/usr/sbin/p1.pl
passive_host_checks_are_soft=0
perfdata_timeout=5
precached_object_file=/var/log/nagios/objects.precache
process_performance_data=0
resource_file=/usr/local/nagios/etc/private/resource.cfg
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0
retained_host_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_service_attribute_mask=0
retain_state_information=1
retention_update_interval=1
service_check_timeout=60
service_freshness_check_interval=60
service_inter_check_delay_method=s
service_interleave_factor=s
service_perfdata_file_mode=a
sleep_time=0.25
soft_state_dependencies=0
state_retention_file=/var/log/nagios/retention.dat
status_file=/var/log/nagios/status.dat
status_update_interval=10
temp_file=/var/log/nagios/nagios.tmp
temp_path=/tmp
translate_passive_host_checks=0
use_aggressive_host_checking=0
use_embedded_perl_implicitly=1
use_large_installation_tweaks=0
use_regexp_matching=0
use_retained_program_state=1
use_retained_scheduling_info=1
use_syslog=0
use_true_regexp_matching=0
Service definition:
Code: Select all
define service {
host_name REDACTED_HOST
service_description MEMORY
check_period 24x7
check_command check_MEMORY
contact_groups opsalerts
notification_period 24x7
initial_state o
importance 0
check_interval 5.000000
retry_interval 2.000000
max_check_attempts 3
is_volatile 0
parallelize_check 1
active_checks_enabled 0
passive_checks_enabled 1
obsess 1
event_handler_enabled 1
low_flap_threshold 0.000000
high_flap_threshold 0.000000
flap_detection_enabled 1
flap_detection_options a
freshness_threshold 600
check_freshness 1
notification_options r,w,u,c
notifications_enabled 1
notification_interval 60.000000
first_notification_delay 0.000000
stalking_options n
process_perf_data 1
notes_url REDACTED_RUNBOOK_URL
retain_status_information 1
retain_nonstatus_information 1
}
Contact group and contact definitions:
Code: Select all
define contactgroup {
contactgroup_name opsalerts
alias Ops Alert Mailing List
members opsalerts
}
define contact {
contact_name opsalerts
alias opsalerts
service_notification_period 24x7
host_notification_period 24x7
service_notification_options r,w,u,c
host_notification_options r,d,u
service_notification_commands notify-service-by-email
host_notification_commands notify-host-by-email
email REDACTED_EMAIL
minimum_importance 0
host_notifications_enabled 1
service_notifications_enabled 1
can_submit_commands 1
retain_status_information 1
retain_nonstatus_information 1
}
Extracts from status.dat:
System status:
Code: Select all
info {
created=1557248618
version=4.4.3
last_update_check=1557170975
update_available=0
last_version=4.4.3
new_version=4.4.3
}
programstatus {
modified_host_attributes=0
modified_service_attributes=0
nagios_pid=112134
daemon_mode=1
program_start=1557247713
last_log_rotation=1557248399
enable_notifications=1
active_service_checks_enabled=1
passive_service_checks_enabled=1
active_host_checks_enabled=1
passive_host_checks_enabled=1
enable_event_handlers=1
obsess_over_services=0
obsess_over_hosts=0
check_service_freshness=1
check_host_freshness=0
enable_flap_detection=0
process_performance_data=0
global_host_event_handler=
global_service_event_handler=
next_comment_id=945118
next_downtime_id=170282
next_event_id=1559258
next_problem_id=739973
next_notification_id=505547
active_scheduled_host_check_stats=224,1491,4555
active_ondemand_host_check_stats=79,450,1396
passive_host_check_stats=949,4645,14018
active_scheduled_service_check_stats=92,141,419
active_ondemand_service_check_stats=0,0,0
passive_service_check_stats=4314,21706,65361
cached_host_check_stats=79,450,1396
cached_service_check_stats=0,0,0
external_command_stats=5375,26967,81220
parallel_host_check_stats=224,1491,4555
serial_host_check_stats=0,0,0
}
Host status:
Code: Select all
hoststatus {
host_name=REDACTED_HOST
modified_attributes=0
check_command=check-host-alive-production!500.0,10%!900.0,20%!10!1
check_period=24x7
notification_period=24x7
importance=0
check_interval=5.000000
retry_interval=1.000000
event_handler=
has_been_checked=1
should_be_scheduled=1
check_execution_time=20.002
check_latency=0.354
check_type=0
current_state=1
last_hard_state=1
last_event_id=1544961
current_event_id=1558724
current_problem_id=739598
last_problem_id=732699
plugin_output=CRITICAL - Plugin timed out
long_plugin_output=
performance_data=
last_check=1557248400
next_check=1557248700
check_options=0
current_attempt=5
max_attempts=5
state_type=1
last_state_change=1557182576
last_hard_state_change=1557182896
last_time_up=1557182576
last_time_down=1557248400
last_time_unreachable=0
last_notification=1557248033
next_notification=1557255233
no_more_notifications=0
current_notification_number=10
current_notification_id=505545
notifications_enabled=1
problem_has_been_acknowledged=0
acknowledgement_type=0
active_checks_enabled=1
passive_checks_enabled=1
event_handler_enabled=1
flap_detection_enabled=1
process_performance_data=1
obsess=1
last_update=1557248618
is_flapping=0
percent_state_change=0.00
scheduled_downtime_depth=0
}
Service status:
Code: Select all
servicestatus {
host_name=REDACTED_HOST
service_description=MEMORY
modified_attributes=0
check_command=check_MEMORY
check_period=24x7
notification_period=24x7
importance=0
check_interval=5.000000
retry_interval=2.000000
event_handler=
has_been_checked=1
should_be_scheduled=0
check_execution_time=15.022
check_latency=0.434
check_type=0
current_state=2
last_hard_state=2
last_event_id=1492215
current_event_id=1558908
current_problem_id=739782
last_problem_id=708061
current_attempt=3
max_attempts=3
state_type=1
last_state_change=1557182940
last_hard_state_change=1557182940
last_time_ok=1557182940
last_time_warning=0
last_time_unknown=0
last_time_critical=1557248018
plugin_output=CHECK_NRPE STATE CRITICAL: Socket timeout after 15 seconds.
long_plugin_output=
performance_data=
last_check=1557248018
next_check=1557248017
check_options=0
current_notification_number=0
current_notification_id=0
last_notification=0
next_notification=0
no_more_notifications=0
notifications_enabled=1
active_checks_enabled=0
passive_checks_enabled=1
event_handler_enabled=1
problem_has_been_acknowledged=0
acknowledgement_type=0
flap_detection_enabled=1
process_performance_data=1
obsess=1
last_update=1557248618
is_flapping=0
percent_state_change=0.00
scheduled_downtime_depth=0
}
Contact status:
Code: Select all
contactstatus {
contact_name=opsalerts
modified_attributes=0
modified_host_attributes=0
modified_service_attributes=0
host_notification_period=24x7
service_notification_period=24x7
last_host_notification=1557248212
last_service_notification=1557206818
host_notifications_enabled=1
service_notifications_enabled=1
}