2019-02-25 08:48:22 -05:00
/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2013-02-09 12:39:43 -05:00
2014-05-25 10:23:35 -04:00
# include "icinga/checkable.hpp"
# include "icinga/service.hpp"
# include "icinga/host.hpp"
# include "icinga/checkcommand.hpp"
# include "icinga/icingaapplication.hpp"
# include "icinga/cib.hpp"
2015-10-19 11:31:18 -04:00
# include "icinga/clusterevents.hpp"
2014-05-25 10:23:35 -04:00
# include "remote/messageorigin.hpp"
2014-11-13 05:23:57 -05:00
# include "remote/apilistener.hpp"
2014-05-25 10:23:35 -04:00
# include "base/objectlock.hpp"
2014-10-19 08:21:12 -04:00
# include "base/logger.hpp"
2014-05-25 10:23:35 -04:00
# include "base/convert.hpp"
# include "base/utility.hpp"
# include "base/context.hpp"
2025-05-20 07:02:26 -04:00
# include <shared_mutex>
2013-02-09 12:39:43 -05:00
using namespace icinga ;
2015-08-04 08:47:44 -04:00
/* Emitted by ProcessCheckResult() once a check result has been stored as the last check result. */
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , const MessageOrigin : : Ptr & ) > Checkable : : OnNewCheckResult ;
/* Emitted by ProcessCheckResult() for hard and soft state changes; the StateType argument tells which one. */
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , StateType , const MessageOrigin : : Ptr & ) > Checkable : : OnStateChange ;
/* Emitted by ProcessCheckResult() together with the set of child checkables after a state change that affects them. */
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , const CheckResult : : Ptr & , std : : set < Checkable : : Ptr > , const MessageOrigin : : Ptr & ) > Checkable : : OnReachabilityChanged ;
2016-06-07 06:44:12 -04:00
/* Emitted by ProcessCheckResult() to request notifications of the given NotificationType (flapping start/end, problem, recovery). */
boost : : signals2 : : signal < void ( const Checkable : : Ptr & , NotificationType , const CheckResult : : Ptr & , const String & , const String & , const MessageOrigin : : Ptr & ) > Checkable : : OnNotificationsRequested ;
2016-01-22 12:42:15 -05:00
/* NOTE(review): not emitted in this part of the file; presumably fired when the next check time changes - confirm. */
boost : : signals2 : : signal < void ( const Checkable : : Ptr & ) > Checkable : : OnNextCheckUpdated ;
2014-04-03 09:36:13 -04:00
2019-08-14 11:43:06 -04:00
/* Counter initialised to 0. NOTE(review): not modified in this part of the file; presumably the number of checks currently executing - confirm. */
Atomic < uint_fast64_t > Checkable : : CurrentConcurrentChecks ( 0 ) ;
2021-02-02 04:16:04 -05:00
/* Guards m_PendingChecks; locked by the *PendingCheck* functions. */
std : : mutex Checkable : : m_StatsMutex ;
2016-05-12 07:46:22 -04:00
/* Pending check counter, changed by IncreasePendingChecks(), DecreasePendingChecks() and AquirePendingCheckSlot(). */
int Checkable : : m_PendingChecks = 0 ;
2021-02-02 04:16:04 -05:00
/* Notified by DecreasePendingChecks(); AquirePendingCheckSlot() waits on it until a slot is available. */
std : : condition_variable Checkable : : m_PendingChecksCV ;
2016-05-12 07:46:22 -04:00
2018-01-03 22:25:35 -05:00
/**
 * Returns the check command of this checkable.
 *
 * @returns The object returned by NavigateCheckCommandRaw(), cast to CheckCommand.
 */
CheckCommand::Ptr Checkable::GetCheckCommand() const
{
	auto rawCommand = NavigateCheckCommandRaw();

	return dynamic_pointer_cast<CheckCommand>(rawCommand);
}
2018-01-03 22:25:35 -05:00
/**
 * Returns the check period of this checkable.
 *
 * @returns The TimePeriod looked up by the name stored in the raw check period attribute.
 */
TimePeriod::Ptr Checkable::GetCheckPeriod() const
{
	auto periodName = GetCheckPeriodRaw();

	return TimePeriod::GetByName(periodName);
}
2014-04-03 09:36:13 -04:00
void Checkable : : SetSchedulingOffset ( long offset )
2013-02-09 12:39:43 -05:00
{
2013-02-26 04:13:54 -05:00
m_SchedulingOffset = offset ;
2013-02-09 12:39:43 -05:00
}
2018-01-03 22:25:35 -05:00
long Checkable : : GetSchedulingOffset ( )
2013-02-09 12:39:43 -05:00
{
2013-02-26 04:13:54 -05:00
return m_SchedulingOffset ;
2013-02-09 12:39:43 -05:00
}
2016-03-15 08:02:38 -04:00
void Checkable : : UpdateNextCheck ( const MessageOrigin : : Ptr & origin )
2013-02-09 12:39:43 -05:00
{
double interval ;
2017-12-14 09:37:20 -05:00
if ( GetStateType ( ) = = StateTypeSoft & & GetLastCheckResult ( ) ! = nullptr )
2013-02-09 12:39:43 -05:00
interval = GetRetryInterval ( ) ;
else
interval = GetCheckInterval ( ) ;
double now = Utility : : GetTime ( ) ;
double adj = 0 ;
if ( interval > 1 )
2013-03-20 10:25:53 -04:00
adj = fmod ( now * 100 + GetSchedulingOffset ( ) , interval * 100 ) / 100.0 ;
2013-02-09 12:39:43 -05:00
2019-01-09 05:27:33 -05:00
if ( adj ! = 0.0 )
adj = std : : min ( 0.5 + fmod ( GetSchedulingOffset ( ) , interval * 5 ) / 100.0 , adj ) ;
2016-05-24 05:05:29 -04:00
2018-07-02 10:17:33 -04:00
double nextCheck = now - adj + interval ;
2019-01-16 11:27:38 -05:00
double lastCheck = GetLastCheck ( ) ;
2018-07-02 10:17:33 -04:00
Log ( LogDebug , " Checkable " )
2025-02-26 10:36:57 -05:00
< < std : : fixed < < std : : setprecision ( 0 )
2018-07-02 10:17:33 -04:00
< < " Update checkable ' " < < GetName ( ) < < " ' with check interval ' " < < GetCheckInterval ( )
2019-01-16 11:27:38 -05:00
< < " ' from last check time at " < < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , ( lastCheck < 0 ? 0 : lastCheck ) )
2025-02-26 10:36:57 -05:00
< < " ( " < < lastCheck < < " ) to next check time at "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , nextCheck ) < < " ( " < < nextCheck < < " ). " ;
2018-07-02 10:17:33 -04:00
SetNextCheck ( nextCheck , false , origin ) ;
2013-02-09 12:39:43 -05:00
}
2018-01-03 22:25:35 -05:00
bool Checkable : : HasBeenChecked ( ) const
2013-07-05 03:35:49 -04:00
{
2017-12-14 09:37:20 -05:00
return GetLastCheckResult ( ) ! = nullptr ;
2013-07-05 03:35:49 -04:00
}
2018-01-03 22:25:35 -05:00
double Checkable : : GetLastCheck ( ) const
2013-07-05 03:35:49 -04:00
{
2013-11-09 08:22:38 -05:00
CheckResult : : Ptr cr = GetLastCheckResult ( ) ;
2013-07-05 03:35:49 -04:00
double schedule_end = - 1 ;
2013-10-26 03:41:45 -04:00
if ( cr )
2013-11-09 08:22:38 -05:00
schedule_end = cr - > GetScheduleEnd ( ) ;
2013-07-05 03:35:49 -04:00
return schedule_end ;
}
2025-05-22 11:56:16 -04:00
/**
 * Processes a new check result for this checkable: fills in missing timestamps and source
 * information, updates the state, state type, attempt counter and the related "last ..."
 * attributes, stores the result, reschedules the next check and emits the corresponding
 * signals and notification requests.
 *
 * @param cr The check result to process. Must not be null (checked with VERIFY).
 * @param producer Wait group of the producer of the result. Must not be null (checked with VERIFY).
 *                 If it cannot be shared-locked without blocking, the result is discarded.
 * @param origin Message origin of the result; passed on to the emitted signals.
 * @returns Result::Ok if the result was processed or sent to the command endpoint,
 *          Result::CheckableInactive if it was discarded because the producer could not be
 *          locked or this checkable is not active, Result::NewerCheckResultPresent if the
 *          stored check result has a later execution start than the new one.
 */
Checkable : : ProcessingResult Checkable : : ProcessCheckResult ( const CheckResult : : Ptr & cr , const WaitGroup : : Ptr & producer , const MessageOrigin : : Ptr & origin )
2013-02-09 12:39:43 -05:00
{
2022-04-26 07:33:59 -04:00
using Result = Checkable : : ProcessingResult ;
2025-05-20 04:41:26 -04:00
VERIFY ( cr ) ;
2025-05-20 07:02:26 -04:00
VERIFY ( producer ) ;
2025-05-20 04:41:26 -04:00
2025-03-10 08:51:25 -04:00
/* A result has arrived, so clear the flag ExecuteCheck() uses to avoid starting a second check. */
ObjectLock olock ( this ) ;
m_CheckRunning = false ;
2014-03-12 05:05:36 -04:00
2013-03-19 11:20:13 -04:00
double now = Utility : : GetTime ( ) ;
2013-11-09 08:22:38 -05:00
/* Timestamps the producer left at 0 default to the current time. */
if ( cr - > GetScheduleStart ( ) = = 0 )
cr - > SetScheduleStart ( now ) ;
2013-03-19 11:20:13 -04:00
2013-11-09 08:22:38 -05:00
if ( cr - > GetScheduleEnd ( ) = = 0 )
cr - > SetScheduleEnd ( now ) ;
2013-03-19 11:20:13 -04:00
2013-11-09 08:22:38 -05:00
if ( cr - > GetExecutionStart ( ) = = 0 )
cr - > SetExecutionStart ( now ) ;
2013-03-19 11:20:13 -04:00
2013-11-09 08:22:38 -05:00
if ( cr - > GetExecutionEnd ( ) = = 0 )
cr - > SetExecutionEnd ( now ) ;
2013-03-19 11:20:13 -04:00
2021-07-20 05:10:26 -04:00
/* Results without a (remote) origin were scheduled by this node. */
if ( ! origin | | origin - > IsLocal ( ) )
cr - > SetSchedulingSource ( IcingaApplication : : GetInstance ( ) - > GetNodeName ( ) ) ;
2014-11-13 05:23:57 -05:00
Endpoint : : Ptr command_endpoint = GetCommandEndpoint ( ) ;
2021-01-25 10:05:03 -05:00
if ( cr - > GetCheckSource ( ) . IsEmpty ( ) ) {
if ( ( ! origin | | origin - > IsLocal ( ) ) )
cr - > SetCheckSource ( IcingaApplication : : GetInstance ( ) - > GetNodeName ( ) ) ;
/* override check source if command_endpoint was defined */
if ( command_endpoint & & ! GetExtension ( " agent_check " ) )
cr - > SetCheckSource ( command_endpoint - > GetName ( ) ) ;
}
2015-09-05 09:18:10 -04:00
2025-05-20 07:02:26 -04:00
std : : shared_lock producerLock ( * producer , std : : try_to_lock ) ;
if ( ! producerLock ) {
// Discard the check result to not delay the current reload.
// We'll re-run the check immediately after the reload.
return Result : : CheckableInactive ;
}
2015-09-05 09:18:10 -04:00
/* agent checks go through the api */
2015-01-18 16:15:35 -05:00
if ( command_endpoint & & GetExtension ( " agent_check " ) ) {
2014-11-13 05:23:57 -05:00
ApiListener : : Ptr listener = ApiListener : : GetInstance ( ) ;
if ( listener ) {
2015-01-18 16:15:35 -05:00
/* send message back to its origin */
2015-10-19 11:31:18 -04:00
Dictionary : : Ptr message = ClusterEvents : : MakeCheckResultMessage ( this , cr ) ;
2014-11-13 05:23:57 -05:00
listener - > SyncSendMessage ( command_endpoint , message ) ;
}
2022-04-26 07:33:59 -04:00
return Result : : Ok ;
2015-01-18 16:15:35 -05:00
2014-11-13 05:23:57 -05:00
}
2018-05-29 07:51:34 -04:00
if ( ! IsActive ( ) )
2022-04-26 07:33:59 -04:00
return Result : : CheckableInactive ;
2018-05-29 07:51:34 -04:00
2013-03-06 05:03:50 -05:00
bool reachable = IsReachable ( ) ;
2014-02-27 05:05:55 -05:00
bool notification_reachable = IsReachable ( DependencyNotification ) ;
2013-03-06 05:03:50 -05:00
2025-02-06 02:58:03 -05:00
// Cache whether the previous state of this Checkable affects its children before overwriting the last check result.
// This will be used to determine whether the on reachability changed event should be triggered.
bool affectsPreviousStateChildren ( reachable & & AffectsChildren ( ) ) ;
2013-11-09 08:22:38 -05:00
/* Snapshot of the values that are about to be overwritten; all comparisons below use these. */
CheckResult : : Ptr old_cr = GetLastCheckResult ( ) ;
2014-04-03 09:36:13 -04:00
ServiceState old_state = GetStateRaw ( ) ;
2013-03-07 06:04:20 -05:00
StateType old_stateType = GetStateType ( ) ;
2013-10-26 03:41:45 -04:00
long old_attempt = GetCheckAttempt ( ) ;
2014-07-22 08:13:21 -04:00
bool recovery = false ;
2013-02-09 12:39:43 -05:00
2019-02-08 07:32:13 -05:00
/* When we have a check result already (not after fresh start),
 * prevent accepting old check results and allow overrides for
 * CRs that happened in the future.
 */
if ( old_cr ) {
double currentCRTimestamp = old_cr - > GetExecutionStart ( ) ;
double newCRTimestamp = cr - > GetExecutionStart ( ) ;
/* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
if ( currentCRTimestamp > now ) {
/* our current CR is from the future, let the new CR override it. */
Log ( LogDebug , " Checkable " )
< < std : : fixed < < std : : setprecision ( 6 ) < < " Processing check result for checkable ' " < < GetName ( ) < < " ' from "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , newCRTimestamp ) < < " ( " < < newCRTimestamp
< < " ). Overriding since ours is from the future at "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , currentCRTimestamp ) < < " ( " < < currentCRTimestamp < < " ). " ;
} else {
/* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
if ( newCRTimestamp < currentCRTimestamp ) {
Log ( LogDebug , " Checkable " )
< < std : : fixed < < std : : setprecision ( 6 ) < < " Skipping check result for checkable ' " < < GetName ( ) < < " ' from "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , newCRTimestamp ) < < " ( " < < newCRTimestamp
< < " ). It is in the past compared to ours at "
< < Utility : : FormatDateTime ( " %Y-%m-%d %H:%M:%S %z " , currentCRTimestamp ) < < " ( " < < currentCRTimestamp < < " ). " ;
2022-04-26 07:33:59 -04:00
return Result : : NewerCheckResultPresent ;
2019-02-08 07:32:13 -05:00
}
}
}
2013-09-12 11:39:29 -04:00
2013-03-25 13:36:15 -04:00
/* The ExecuteCheck function already sets the old state, but we need to do it again
 * in case this was a passive check result. */
2014-04-03 09:36:13 -04:00
SetLastStateRaw ( old_state ) ;
2013-03-07 06:04:20 -05:00
SetLastStateType ( old_stateType ) ;
2013-03-19 08:04:30 -04:00
SetLastReachable ( reachable ) ;
2013-03-07 06:04:20 -05:00
2016-03-10 08:32:57 -05:00
Host : : Ptr host ;
Service : : Ptr service ;
tie ( host , service ) = GetHostService ( this ) ;
/* The checkable is treated as a host unless GetHostService() yields a service. */
CheckableType checkableType = CheckableHost ;
if ( service )
checkableType = CheckableService ;
2014-03-10 03:56:31 -04:00
long attempt = 1 ;
2013-02-09 12:39:43 -05:00
2014-12-12 09:53:10 -05:00
std : : set < Checkable : : Ptr > children = GetChildren ( ) ;
2016-08-08 05:02:08 -04:00
if ( IsStateOK ( cr - > GetState ( ) ) ) {
2016-05-31 11:03:49 -04:00
SetStateType ( StateTypeHard ) ; // NOT-OK -> HARD OK
2013-02-09 12:39:43 -05:00
2016-08-24 05:13:19 -04:00
if ( ! IsStateOK ( old_state ) )
2016-05-31 11:03:49 -04:00
recovery = true ;
2014-07-22 08:13:21 -04:00
2013-07-18 11:04:09 -04:00
ResetNotificationNumbers ( ) ;
2020-07-09 04:44:38 -04:00
SaveLastState ( ServiceOK , cr - > GetExecutionEnd ( ) ) ;
2013-02-09 12:39:43 -05:00
} else {
2016-09-27 05:29:25 -04:00
/* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
if ( IsStateOK ( old_state ) ) {
2013-02-09 12:39:43 -05:00
SetStateType ( StateTypeSoft ) ;
2016-09-27 05:29:25 -04:00
attempt = 1 ;
}
/* SOFT state change, increase attempt counter. */
if ( old_stateType = = StateTypeSoft & & ! IsStateOK ( old_state ) ) {
2015-02-20 15:30:14 -05:00
SetStateType ( StateTypeSoft ) ;
2016-09-27 05:29:25 -04:00
attempt = old_attempt + 1 ;
}
/* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
if ( attempt > = GetMaxCheckAttempts ( ) ) {
SetStateType ( StateTypeHard ) ;
attempt = 1 ;
2013-02-09 12:39:43 -05:00
}
2013-02-23 19:10:34 -05:00
2016-03-15 04:46:20 -04:00
if ( ! IsStateOK ( cr - > GetState ( ) ) ) {
2020-07-09 04:44:38 -04:00
SaveLastState ( cr - > GetState ( ) , cr - > GetExecutionEnd ( ) ) ;
2013-11-09 08:22:38 -05:00
}
2013-02-09 12:39:43 -05:00
}
2013-07-18 12:28:23 -04:00
if ( ! reachable )
2020-07-09 04:44:38 -04:00
SetLastStateUnreachable ( cr - > GetExecutionEnd ( ) ) ;
2013-07-18 12:28:23 -04:00
2013-10-26 03:41:45 -04:00
SetCheckAttempt ( attempt ) ;
2013-02-09 12:39:43 -05:00
2014-04-03 09:36:13 -04:00
ServiceState new_state = cr - > GetState ( ) ;
2020-03-04 04:55:07 -05:00
SetStateRaw ( new_state ) ;
2013-02-09 12:39:43 -05:00
2016-03-10 08:32:57 -05:00
bool stateChange ;
/* Exception on state change calculation for hosts. */
if ( checkableType = = CheckableService )
stateChange = ( old_state ! = new_state ) ;
else
stateChange = ( Host : : CalculateState ( old_state ) ! = Host : : CalculateState ( new_state ) ) ;
2019-03-27 06:43:14 -04:00
/* Store the current last state change for the next iteration. */
SetPreviousStateChange ( GetLastStateChange ( ) ) ;
2013-06-21 04:20:29 -04:00
if ( stateChange ) {
2020-07-09 04:44:38 -04:00
SetLastStateChange ( cr - > GetExecutionEnd ( ) ) ;
2013-02-09 12:39:43 -05:00
/* remove acknowledgements */
if ( GetAcknowledgement ( ) = = AcknowledgementNormal | |
2017-12-19 09:50:05 -05:00
( GetAcknowledgement ( ) = = AcknowledgementSticky & & IsStateOK ( new_state ) ) ) {
2019-11-28 11:46:12 -05:00
ClearAcknowledgement ( " " ) ;
2013-02-09 12:39:43 -05:00
}
}
2013-06-19 04:57:07 -04:00
bool remove_acknowledgement_comments = false ;
if ( GetAcknowledgement ( ) = = AcknowledgementNone )
remove_acknowledgement_comments = true ;
2013-03-20 10:25:53 -04:00
/* A hard change is either SOFT -> HARD or a state change between two HARD states. */
bool hardChange = ( GetStateType ( ) = = StateTypeHard & & old_stateType = = StateTypeSoft ) ;
2014-04-03 09:36:13 -04:00
if ( stateChange & & old_stateType = = StateTypeHard & & GetStateType ( ) = = StateTypeHard )
2013-03-21 06:37:34 -04:00
hardChange = true ;
2015-01-08 10:18:11 -05:00
bool is_volatile = GetVolatile ( ) ;
2013-06-13 06:24:20 -04:00
2015-01-08 10:18:11 -05:00
if ( hardChange | | is_volatile ) {
2014-04-03 09:36:13 -04:00
SetLastHardStateRaw ( new_state ) ;
2020-07-09 04:44:38 -04:00
SetLastHardStateChange ( cr - > GetExecutionEnd ( ) ) ;
2019-10-23 09:22:54 -04:00
/* Two states are packed into one integer: "/ 100" moves the previously stored upper part
 * to the lower part, "* 100" puts the new state into the upper part. */
SetLastHardStatesRaw ( GetLastHardStatesRaw ( ) / 100u + new_state * 100u ) ;
2013-07-05 03:35:49 -04:00
}
2013-03-02 03:07:47 -05:00
2019-11-04 05:02:07 -05:00
if ( stateChange ) {
SetLastSoftStatesRaw ( GetLastSoftStatesRaw ( ) / 100u + new_state * 100u ) ;
}
2022-01-10 13:18:11 -05:00
/* The lower part ("% 100") of the packed hard states is stored in the check result as previous hard state. */
cr - > SetPreviousHardState ( ServiceState ( GetLastHardStatesRaw ( ) % 100u ) ) ;
2016-03-15 04:46:20 -04:00
if ( ! IsStateOK ( new_state ) )
2021-12-08 05:49:42 -05:00
TriggerDowntimes ( cr - > GetExecutionEnd ( ) ) ;
2013-02-09 12:39:43 -05:00
2014-05-26 14:56:34 -04:00
/* statistics for external tools */
2016-03-10 08:32:57 -05:00
Checkable : : UpdateStatistics ( cr , checkableType ) ;
2013-02-09 12:39:43 -05:00
2013-03-18 07:55:41 -04:00
bool in_downtime = IsInDowntime ( ) ;
2016-03-11 07:19:03 -05:00
bool send_notification = false ;
2019-07-02 05:23:16 -04:00
bool suppress_notification = ! notification_reachable | | in_downtime | | IsAcknowledged ( ) ;
2016-03-11 07:19:03 -05:00
2019-07-02 05:23:16 -04:00
/* Send notifications when a hard state change occurred. */
if ( hardChange & & ! ( old_stateType = = StateTypeSoft & & IsStateOK ( new_state ) ) )
send_notification = true ;
/* Or if the checkable is volatile and in a HARD state. */
else if ( is_volatile & & GetStateType ( ) = = StateTypeHard )
send_notification = true ;
2013-03-18 07:55:41 -04:00
2016-03-15 04:46:20 -04:00
if ( IsStateOK ( old_state ) & & old_stateType = = StateTypeSoft )
2013-03-20 10:25:53 -04:00
send_notification = false ; /* Don't send notifications for SOFT-OK -> HARD-OK. */
2016-03-15 04:46:20 -04:00
if ( is_volatile & & IsStateOK ( old_state ) & & IsStateOK ( new_state ) )
2015-01-08 10:18:11 -05:00
send_notification = false ; /* Don't send notifications for volatile OK -> OK changes. */
2013-06-19 04:57:07 -04:00
if ( remove_acknowledgement_comments )
2023-03-03 06:07:37 -05:00
RemoveAckComments ( String ( ) , cr - > GetExecutionEnd ( ) ) ;
2013-06-19 04:57:07 -04:00
2018-01-11 05:17:38 -05:00
Dictionary : : Ptr vars_after = new Dictionary ( {
{ " state " , new_state } ,
{ " state_type " , GetStateType ( ) } ,
{ " attempt " , GetCheckAttempt ( ) } ,
{ " reachable " , reachable }
} ) ;
2013-03-19 08:04:30 -04:00
/* The "after" values of the previous check result become the "before" values of this one. */
if ( old_cr )
2013-11-09 08:22:38 -05:00
cr - > SetVarsBefore ( old_cr - > GetVarsAfter ( ) ) ;
2013-03-19 08:04:30 -04:00
2013-11-09 08:22:38 -05:00
cr - > SetVarsAfter ( vars_after ) ;
2013-03-19 08:04:30 -04:00
2020-03-04 04:55:07 -05:00
if ( service ) {
SetLastCheckResult ( cr ) ;
} else {
/* For hosts, tell every service of the host when storing the result changed the host's problem flag. */
bool wasProblem = GetProblem ( ) ;
2025-04-16 06:04:53 -04:00
2020-03-04 04:55:07 -05:00
SetLastCheckResult ( cr ) ;
2025-04-16 06:04:53 -04:00
if ( GetProblem ( ) ! = wasProblem ) {
auto services = host - > GetServices ( ) ;
for ( auto & service : services ) {
Service : : OnHostProblemChanged ( service , cr , origin ) ;
}
}
2020-03-04 04:55:07 -05:00
}
2025-04-16 06:04:53 -04:00
2017-10-19 11:32:52 -04:00
bool was_flapping = IsFlapping ( ) ;
2013-06-21 04:20:29 -04:00
2020-11-11 11:43:30 -05:00
UpdateFlappingStatus ( cr - > GetState ( ) ) ;
2016-03-10 08:32:57 -05:00
2017-10-19 11:32:52 -04:00
bool is_flapping = IsFlapping ( ) ;
2013-06-21 04:20:29 -04:00
2024-02-29 04:47:55 -05:00
// Don't recompute the next check when the current check isn't generated by this endpoint. When the check is
// remotely generated we should've already received the "SetNextCheck" event before the "event::CheckResult"
// cluster event. Otherwise, the next check received before this check will be invalidated and cause the Checkable
2024-10-24 03:44:36 -04:00
// "next_check/next_update" in an HA setup to always be different from the other endpoint as the "m_SchedulingOffset"
2024-02-29 04:47:55 -05:00
// is randomly initialised on each node.
if ( ! origin ) {
if ( cr - > GetActive ( ) ) {
UpdateNextCheck ( ) ;
} else {
/* Reschedule the next check for external passive check results. The side effect of
 * this is that for as long as we receive results for a service we
 * won't execute any active checks. */
double offset ;
double ttl = cr - > GetTtl ( ) ;
if ( ttl > 0 )
offset = ttl ;
else
offset = GetCheckInterval ( ) ;
SetNextCheck ( Utility : : GetTime ( ) + offset ) ;
}
2016-03-15 08:02:38 -04:00
}
2017-11-08 06:12:27 -05:00
# ifdef I2_DEBUG /* I2_DEBUG */
Log ( LogDebug , " Checkable " )
2017-12-19 09:50:05 -05:00
< < " Flapping: Checkable " < < GetName ( )
< < " was: " < < was_flapping
< < " is: " < < is_flapping
< < " threshold low: " < < GetFlappingThresholdLow ( )
< < " threshold high: " < < GetFlappingThresholdHigh ( )
< < " % current: " < < GetFlappingCurrent ( ) < < " %. " ;
2017-11-08 06:12:27 -05:00
# endif /* I2_DEBUG */
2013-07-01 11:25:30 -04:00
2014-11-08 15:17:16 -05:00
OnNewCheckResult ( this , cr , origin ) ;
2014-03-09 13:06:24 -04:00
/* signal status updates to for example db_ido */
2014-11-08 15:17:16 -05:00
OnStateChanged ( this ) ;
2013-03-02 03:07:47 -05:00
2014-05-25 06:45:29 -04:00
String old_state_str = ( service ? Service : : StateToString ( old_state ) : Host : : StateToString ( Host : : CalculateState ( old_state ) ) ) ;
String new_state_str = ( service ? Service : : StateToString ( new_state ) : Host : : StateToString ( Host : : CalculateState ( new_state ) ) ) ;
2016-05-21 07:41:43 -04:00
/* Whether a hard state change or a volatile state change except OK -> OK happened. */
2016-05-21 08:16:47 -04:00
if ( hardChange | | ( is_volatile & & ! ( IsStateOK ( old_state ) & & IsStateOK ( new_state ) ) ) ) {
2014-11-08 15:17:16 -05:00
OnStateChange ( this , cr , StateTypeHard , origin ) ;
2014-10-19 11:52:17 -04:00
Log ( LogNotice , " Checkable " )
2017-12-19 09:50:05 -05:00
< < " State Change: Checkable ' " < < GetName ( ) < < " ' hard state change from " < < old_state_str < < " to " < < new_state_str < < " detected. " < < ( is_volatile ? " Checkable is volatile. " : " " ) ;
2016-06-13 04:09:18 -04:00
}
/* Whether a state change happened or the state type is SOFT (must be logged too). */
else if ( stateChange | | GetStateType ( ) = = StateTypeSoft ) {
2014-11-08 15:17:16 -05:00
OnStateChange ( this , cr , StateTypeSoft , origin ) ;
2014-10-19 11:52:17 -04:00
Log ( LogNotice , " Checkable " )
2017-12-19 09:50:05 -05:00
< < " State Change: Checkable ' " < < GetName ( ) < < " ' soft state change from " < < old_state_str < < " to " < < new_state_str < < " detected. " ;
2014-05-22 17:47:03 -04:00
}
2013-09-25 12:01:08 -04:00
2016-05-21 07:41:43 -04:00
if ( GetStateType ( ) = = StateTypeSoft | | hardChange | | recovery | |
2017-12-19 09:50:05 -05:00
( is_volatile & & ! ( IsStateOK ( old_state ) & & IsStateOK ( new_state ) ) ) )
2013-06-13 05:33:00 -04:00
ExecuteEventHandler ( ) ;
2019-07-02 05:23:16 -04:00
/* Bitmask of the notification types that were suppressed while processing this result. */
int suppressed_types = 0 ;
2016-06-13 04:12:38 -04:00
/* Flapping start/end notifications */
2019-07-02 05:23:16 -04:00
if ( ! was_flapping & & is_flapping ) {
2016-11-10 08:02:02 -05:00
/* FlappingStart notifications happen on state changes, not in downtimes */
2019-07-02 05:23:16 -04:00
if ( ! IsPaused ( ) ) {
if ( in_downtime ) {
suppressed_types | = NotificationFlappingStart ;
} else {
OnNotificationsRequested ( this , NotificationFlappingStart , cr , " " , " " , nullptr ) ;
}
}
2016-06-13 04:12:38 -04:00
Log ( LogNotice , " Checkable " )
2017-12-19 09:50:05 -05:00
< < " Flapping Start: Checkable ' " < < GetName ( ) < < " ' started flapping (Current flapping value "
< < GetFlappingCurrent ( ) < < " % > high threshold " < < GetFlappingThresholdHigh ( ) < < " %). " ;
2016-06-13 04:12:38 -04:00
NotifyFlapping ( origin ) ;
2019-07-02 05:23:16 -04:00
} else if ( was_flapping & & ! is_flapping ) {
2016-11-10 08:02:02 -05:00
/* FlappingEnd notifications are independent from state changes, must not happen in downtime */
2019-07-02 05:23:16 -04:00
if ( ! IsPaused ( ) ) {
if ( in_downtime ) {
suppressed_types | = NotificationFlappingEnd ;
} else {
OnNotificationsRequested ( this , NotificationFlappingEnd , cr , " " , " " , nullptr ) ;
}
}
2016-06-13 04:12:38 -04:00
Log ( LogNotice , " Checkable " )
2017-12-19 09:50:05 -05:00
< < " Flapping Stop: Checkable ' " < < GetName ( ) < < " ' stopped flapping (Current flapping value "
< < GetFlappingCurrent ( ) < < " % < low threshold " < < GetFlappingThresholdLow ( ) < < " %). " ;
2016-06-13 04:12:38 -04:00
NotifyFlapping ( origin ) ;
}
2016-08-24 05:13:19 -04:00
if ( send_notification & & ! is_flapping ) {
2019-07-02 05:23:16 -04:00
if ( ! IsPaused ( ) ) {
2022-01-28 09:15:38 -05:00
/* If there are still some pending suppressed state notifications, keep the suppression until these are
 * handled by Checkable::FireSuppressedNotifications().
 */
bool pending = GetSuppressedNotifications ( ) & ( NotificationRecovery | NotificationProblem ) ;
if ( suppress_notification | | pending ) {
2019-07-02 05:23:16 -04:00
suppressed_types | = ( recovery ? NotificationRecovery : NotificationProblem ) ;
} else {
OnNotificationsRequested ( this , recovery ? NotificationRecovery : NotificationProblem , cr , " " , " " , nullptr ) ;
}
}
}
if ( suppressed_types ) {
/* If some notifications were suppressed, but just because of e.g. a downtime,
 * stash them into a notification types bitmask for maybe re-sending later.
 */
int suppressed_types_before ( GetSuppressedNotifications ( ) ) ;
int suppressed_types_after ( suppressed_types_before | suppressed_types ) ;
2022-01-28 09:15:38 -05:00
const int conflict = NotificationFlappingStart | NotificationFlappingEnd ;
if ( ( suppressed_types_after & conflict ) = = conflict ) {
/* Flapping start and end cancel out each other. */
suppressed_types_after & = ~ conflict ;
}
2019-07-02 05:23:16 -04:00
2022-01-28 09:15:38 -05:00
const int stateNotifications = NotificationRecovery | NotificationProblem ;
if ( ! ( suppressed_types_before & stateNotifications ) & & ( suppressed_types & stateNotifications ) ) {
/* A state-related notification is suppressed for the first time, store the previous state. When
 * notifications are no longer suppressed, this can be compared with the current state to determine
 * if a notification must be sent. This is done differently compared to flapping notifications just above
 * as for state notifications, problem and recovery don't always cancel each other. For example,
 * WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification.
 */
SetStateBeforeSuppression ( old_stateType = = StateTypeHard ? old_state : ServiceOK ) ;
2019-07-02 05:23:16 -04:00
}
if ( suppressed_types_after ! = suppressed_types_before ) {
SetSuppressedNotifications ( suppressed_types_after ) ;
}
2016-02-22 13:43:44 -05:00
}
2022-01-25 05:38:05 -05:00
/* update reachability for child objects */
2025-02-06 02:58:03 -05:00
if ( ( stateChange | | hardChange ) & & ! children . empty ( ) & & ( affectsPreviousStateChildren | | AffectsChildren ( ) ) )
2022-01-25 05:38:05 -05:00
OnReachabilityChanged ( this , cr , children , origin ) ;
2022-04-26 07:33:59 -04:00
2025-04-16 06:04:53 -04:00
/* Release the lock on this object before children and parents are locked below. */
olock . Unlock ( ) ;
2025-03-10 08:51:25 -04:00
/* On recovery, pull forward the next check of children that have a problem and active checks enabled.
 * The new time is "now" plus a random number of seconds below 60 and is only applied if it is earlier
 * than the child's current next check. */
if ( recovery ) {
for ( auto & child : children ) {
if ( child - > GetProblem ( ) & & child - > GetEnableActiveChecks ( ) ) {
auto nextCheck ( now + Utility : : Random ( ) % 60 ) ;
ObjectLock oLock ( child ) ;
if ( nextCheck < child - > GetNextCheck ( ) ) {
child - > SetNextCheck ( nextCheck ) ;
}
}
}
}
if ( stateChange ) {
/* reschedule direct parents */
for ( const Checkable : : Ptr & parent : GetParents ( ) ) {
if ( parent . get ( ) = = this )
continue ;
if ( ! parent - > GetEnableActiveChecks ( ) )
continue ;
/* Only parents whose next check is at least one retry interval away are checked right now. */
if ( parent - > GetNextCheck ( ) > = now + parent - > GetRetryInterval ( ) ) {
ObjectLock olock ( parent ) ;
parent - > SetNextCheck ( now ) ;
}
}
}
2022-04-26 07:33:59 -04:00
return Result : : Ok ;
2013-02-09 12:39:43 -05:00
}
2025-05-22 11:56:16 -04:00
void Checkable : : ExecuteRemoteCheck ( const WaitGroup : : Ptr & producer , const Dictionary : : Ptr & resolvedMacros )
2015-01-18 16:15:35 -05:00
{
2022-11-24 06:40:36 -05:00
CONTEXT ( " Executing remote check for object ' " < < GetName ( ) < < " ' " ) ;
2015-01-18 16:15:35 -05:00
double scheduled_start = GetNextCheck ( ) ;
double before_check = Utility : : GetTime ( ) ;
CheckResult : : Ptr cr = new CheckResult ( ) ;
cr - > SetScheduleStart ( scheduled_start ) ;
cr - > SetExecutionStart ( before_check ) ;
2025-05-22 11:56:16 -04:00
GetCheckCommand ( ) - > Execute ( this , cr , producer , resolvedMacros , true ) ;
2015-01-18 16:15:35 -05:00
}
2025-05-22 11:56:16 -04:00
void Checkable : : ExecuteCheck ( const WaitGroup : : Ptr & producer )
2013-02-09 12:39:43 -05:00
{
2022-11-24 06:40:36 -05:00
CONTEXT ( " Executing check for object ' " < < GetName ( ) < < " ' " ) ;
2013-11-19 01:49:41 -05:00
2016-05-10 05:12:37 -04:00
/* keep track of scheduling info in case the check type doesn't provide its own information */
double scheduled_start = GetNextCheck ( ) ;
double before_check = Utility : : GetTime ( ) ;
2020-03-05 09:42:07 -05:00
SetLastCheckStarted ( Utility : : GetTime ( ) ) ;
2018-07-02 10:17:53 -04:00
/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
* queues and ensures that checks are not fired multiple times . ProcessCheckResult ( )
* is called too late . See # 6421.
*/
UpdateNextCheck ( ) ;
2013-03-19 09:13:58 -04:00
bool reachable = IsReachable ( ) ;
2013-03-06 05:03:50 -05:00
{
ObjectLock olock ( this ) ;
/* don't run another check if there is one pending */
2013-03-25 13:36:15 -04:00
if ( m_CheckRunning )
2013-03-06 05:03:50 -05:00
return ;
m_CheckRunning = true ;
2013-03-08 08:43:48 -05:00
2014-04-03 09:36:13 -04:00
SetLastStateRaw ( GetStateRaw ( ) ) ;
2013-03-08 08:43:48 -05:00
SetLastStateType ( GetLastStateType ( ) ) ;
2013-03-19 09:13:58 -04:00
SetLastReachable ( reachable ) ;
2013-02-09 12:39:43 -05:00
}
2015-01-18 16:15:35 -05:00
CheckResult : : Ptr cr = new CheckResult ( ) ;
2013-03-25 13:36:15 -04:00
2015-01-18 16:15:35 -05:00
cr - > SetScheduleStart ( scheduled_start ) ;
cr - > SetExecutionStart ( before_check ) ;
2013-02-09 12:39:43 -05:00
2014-11-13 05:23:57 -05:00
Endpoint : : Ptr endpoint = GetCommandEndpoint ( ) ;
2015-01-18 16:15:35 -05:00
bool local = ! endpoint | | endpoint = = Endpoint : : GetLocalEndpoint ( ) ;
2014-11-13 05:23:57 -05:00
2015-01-18 16:15:35 -05:00
if ( local ) {
2025-05-22 11:56:16 -04:00
GetCheckCommand ( ) - > Execute ( this , cr , producer , nullptr , false ) ;
2015-01-18 16:15:35 -05:00
} else {
Dictionary : : Ptr macros = new Dictionary ( ) ;
2025-05-22 11:56:16 -04:00
GetCheckCommand ( ) - > Execute ( this , cr , producer , macros , false ) ;
2014-11-13 05:23:57 -05:00
2015-10-22 04:52:38 -04:00
if ( endpoint - > GetConnected ( ) ) {
2015-01-18 16:15:35 -05:00
/* perform check on remote endpoint */
2014-11-13 05:23:57 -05:00
Dictionary : : Ptr message = new Dictionary ( ) ;
message - > Set ( " jsonrpc " , " 2.0 " ) ;
message - > Set ( " method " , " event::ExecuteCommand " ) ;
Host : : Ptr host ;
Service : : Ptr service ;
tie ( host , service ) = GetHostService ( this ) ;
Dictionary : : Ptr params = new Dictionary ( ) ;
message - > Set ( " params " , params ) ;
params - > Set ( " command_type " , " check_command " ) ;
params - > Set ( " command " , GetCheckCommand ( ) - > GetName ( ) ) ;
params - > Set ( " host " , host - > GetName ( ) ) ;
if ( service )
params - > Set ( " service " , service - > GetShortName ( ) ) ;
2025-05-19 08:07:33 -04:00
double checkTimeout = GetCheckCommand ( ) - > GetTimeout ( ) ;
2020-02-27 05:46:52 -05:00
/*
* If the host / service object specifies the ' check_timeout ' attribute ,
* forward this to the remote endpoint to limit the command execution time .
*/
2025-05-19 08:07:33 -04:00
if ( auto ckCheckTimeout ( GetCheckTimeout ( ) ) ; ! ckCheckTimeout . IsEmpty ( ) ) {
checkTimeout = Convert : : ToDouble ( ckCheckTimeout ) ;
params - > Set ( " check_timeout " , ckCheckTimeout ) ;
}
2020-02-27 05:46:52 -05:00
2014-11-13 05:23:57 -05:00
params - > Set ( " macros " , macros ) ;
ApiListener : : Ptr listener = ApiListener : : GetInstance ( ) ;
if ( listener )
listener - > SyncSendMessage ( endpoint , message ) ;
2015-01-18 16:15:35 -05:00
2016-01-21 04:32:38 -05:00
/* Re-schedule the check so we don't run it again until after we've received
2017-12-19 09:50:05 -05:00
* a check result from the remote instance . The check will be re - scheduled
* using the proper check interval once we ' ve received a check result .
*/
2025-05-19 08:07:33 -04:00
SetNextCheck ( Utility : : GetTime ( ) + checkTimeout + 30 ) ;
2020-02-11 06:49:40 -05:00
/*
* Let the user know that there was a problem with the check if
* 1 ) The endpoint is not syncing ( replay log , etc . )
* 2 ) Outside of the cold startup window ( 5 min )
*/
2016-10-24 02:38:58 -04:00
} else if ( ! endpoint - > GetSyncing ( ) & & Application : : GetInstance ( ) - > GetStartTime ( ) < Utility : : GetTime ( ) - 300 ) {
2015-01-18 16:15:35 -05:00
/* fail to perform check on unconnected endpoint */
cr - > SetState ( ServiceUnknown ) ;
2016-04-19 03:35:48 -04:00
String output = " Remote Icinga instance ' " + endpoint - > GetName ( ) + " ' is not connected to " ;
Endpoint : : Ptr localEndpoint = Endpoint : : GetLocalEndpoint ( ) ;
if ( localEndpoint )
output + = " ' " + localEndpoint - > GetName ( ) + " ' " ;
else
output + = " this instance " ;
cr - > SetOutput ( output ) ;
2015-01-18 16:15:35 -05:00
2025-05-22 11:56:16 -04:00
ProcessCheckResult ( cr , producer ) ;
2014-11-13 05:23:57 -05:00
}
{
ObjectLock olock ( this ) ;
m_CheckRunning = false ;
}
}
2013-02-09 12:39:43 -05:00
}
2013-02-11 17:37:39 -05:00
2014-05-26 14:56:34 -04:00
void Checkable : : UpdateStatistics ( const CheckResult : : Ptr & cr , CheckableType type )
2013-02-11 17:37:39 -05:00
{
2013-11-09 08:22:38 -05:00
time_t ts = cr - > GetScheduleEnd ( ) ;
2014-05-26 14:56:34 -04:00
if ( type = = CheckableHost ) {
if ( cr - > GetActive ( ) )
CIB : : UpdateActiveHostChecksStatistics ( ts , 1 ) ;
else
CIB : : UpdatePassiveHostChecksStatistics ( ts , 1 ) ;
} else if ( type = = CheckableService ) {
if ( cr - > GetActive ( ) )
CIB : : UpdateActiveServiceChecksStatistics ( ts , 1 ) ;
else
CIB : : UpdatePassiveServiceChecksStatistics ( ts , 1 ) ;
} else {
2014-05-28 07:45:45 -04:00
Log ( LogWarning , " Checkable " , " Unknown checkable type for statistic update. " ) ;
2014-05-26 14:56:34 -04:00
}
2013-02-11 17:37:39 -05:00
}
2016-05-12 07:46:22 -04:00
2018-01-03 22:25:35 -05:00
void Checkable : : IncreasePendingChecks ( )
2016-05-12 07:46:22 -04:00
{
2021-02-02 04:16:04 -05:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 07:46:22 -04:00
m_PendingChecks + + ;
}
2018-01-03 22:25:35 -05:00
void Checkable : : DecreasePendingChecks ( )
2016-05-12 07:46:22 -04:00
{
2021-02-02 04:16:04 -05:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 07:46:22 -04:00
m_PendingChecks - - ;
2018-01-16 04:40:08 -05:00
m_PendingChecksCV . notify_one ( ) ;
2016-05-12 07:46:22 -04:00
}
2018-01-03 22:25:35 -05:00
int Checkable : : GetPendingChecks ( )
2016-05-12 07:46:22 -04:00
{
2021-02-02 04:16:04 -05:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2016-05-12 07:46:22 -04:00
return m_PendingChecks ;
}
2018-01-16 04:40:08 -05:00
void Checkable : : AquirePendingCheckSlot ( int maxPendingChecks )
{
2021-02-02 04:16:04 -05:00
std : : unique_lock < std : : mutex > lock ( m_StatsMutex ) ;
2018-01-16 04:40:08 -05:00
while ( m_PendingChecks > = maxPendingChecks )
m_PendingChecksCV . wait ( lock ) ;
m_PendingChecks + + ;
}