/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// The Controller applies taints to nodes.
// Tainted nodes should not be used for new workloads, and
// some effort should be made to move existing workloads
// off of tainted nodes.
package nodelifecycle

import (
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/klog/v2"

	coordv1 "k8s.io/api/coordination/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	appsv1informers "k8s.io/client-go/informers/apps/v1"
	coordinformers "k8s.io/client-go/informers/coordination/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appsv1listers "k8s.io/client-go/listers/apps/v1"
	coordlisters "k8s.io/client-go/listers/coordination/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/client-go/util/workqueue"
	nodetopology "k8s.io/component-helpers/node/topology"
	kubeletapis "k8s.io/kubelet/pkg/apis"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
	"k8s.io/kubernetes/pkg/controller/tainteviction"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
	"k8s.io/kubernetes/pkg/features"
	taintutils "k8s.io/kubernetes/pkg/util/taints"
)

func init() {
	// Register prometheus metrics
	Register()
}

var (
	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
	UnreachableTaintTemplate = &v1.Taint{
		Key:    v1.TaintNodeUnreachable,
		Effect: v1.TaintEffectNoExecute,
	}

	// NotReadyTaintTemplate is the taint for when a node is not ready for
	// executing pods.
	NotReadyTaintTemplate = &v1.Taint{
		Key:    v1.TaintNodeNotReady,
		Effect: v1.TaintEffectNoExecute,
	}

	// nodeConditionToTaintKeyStatusMap is a map of
	// {NodeConditionType: {ConditionStatus: TaintKey}}. It describes which
	// NodeConditionType, under which ConditionStatus, should be tainted with
	// which TaintKey. For certain NodeConditionTypes there are multiple
	// {ConditionStatus, TaintKey} pairs.
	nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{
		v1.NodeReady: {
			v1.ConditionFalse:   v1.TaintNodeNotReady,
			v1.ConditionUnknown: v1.TaintNodeUnreachable,
		},
		v1.NodeMemoryPressure: {
			v1.ConditionTrue: v1.TaintNodeMemoryPressure,
		},
		v1.NodeDiskPressure: {
			v1.ConditionTrue: v1.TaintNodeDiskPressure,
		},
		v1.NodeNetworkUnavailable: {
			v1.ConditionTrue: v1.TaintNodeNetworkUnavailable,
		},
		v1.NodePIDPressure: {
			v1.ConditionTrue: v1.TaintNodePIDPressure,
		},
	}

	taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
		v1.TaintNodeNotReady:           v1.NodeReady,
		v1.TaintNodeUnreachable:        v1.NodeReady,
		v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
		v1.TaintNodeMemoryPressure:     v1.NodeMemoryPressure,
		v1.TaintNodeDiskPressure:       v1.NodeDiskPressure,
		v1.TaintNodePIDPressure:        v1.NodePIDPressure,
	}
)
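
// Illustrative note (not used by the controller itself): the two maps above are
// consulted in opposite directions. For example, a Ready condition with status
// False maps to the not-ready NoSchedule taint key, and that key maps back to
// the Ready condition type, assuming the standard v1 taint-key constants:
//
//	key := nodeConditionToTaintKeyStatusMap[v1.NodeReady][v1.ConditionFalse]
//	// key == v1.TaintNodeNotReady ("node.kubernetes.io/not-ready")
//	cond := taintKeyToNodeConditionMap[key]
//	// cond == v1.NodeReady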

// ZoneState is the state of a given zone.
type ZoneState string

const (
	stateInitial           = ZoneState("Initial")
	stateNormal            = ZoneState("Normal")
	stateFullDisruption    = ZoneState("FullDisruption")
	statePartialDisruption = ZoneState("PartialDisruption")
)

const (
	// retrySleepTime is the amount of time the node controller should sleep
	// between retrying node health updates.
	retrySleepTime = 20 * time.Millisecond
	// podUpdateWorkerSize assumes that in most cases a pod will be handled by
	// the monitorNodeHealth pass. Pod update workers only handle pods with a
	// lagging cache, so 4 workers should be enough.
	podUpdateWorkerSize = 4
	// nodeUpdateWorkerSize defines the number of workers for node updates
	// and/or pod updates.
	nodeUpdateWorkerSize = 8
	// taintEvictionController is defined here in order to avoid importing
	// k8s.io/kubernetes/cmd/kube-controller-manager/names, which would result
	// in validation errors. This constant will be removed upon graduation of
	// the SeparateTaintEvictionController feature.
	taintEvictionController = "taint-eviction-controller"
)

// labelReconcileInfo lists Node labels to reconcile, and how to reconcile them.
// primaryKey and secondaryKey are the keys of the labels to reconcile.
//   - If both keys exist but their values don't match, the value of the
//     primaryKey is used as the source of truth to reconcile.
//   - If ensureSecondaryExists is true and the secondaryKey does not
//     exist, the secondaryKey is added with the value of the primaryKey.
var labelReconcileInfo = []struct {
	primaryKey            string
	secondaryKey          string
	ensureSecondaryExists bool
}{
	{
		// Reconcile the beta and the stable OS label using the stable label as the source of truth.
		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
		primaryKey:            v1.LabelOSStable,
		secondaryKey:          kubeletapis.LabelOS,
		ensureSecondaryExists: true,
	},
	{
		// Reconcile the beta and the stable arch label using the stable label as the source of truth.
		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
		primaryKey:            v1.LabelArchStable,
		secondaryKey:          kubeletapis.LabelArch,
		ensureSecondaryExists: true,
	},
}
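
// Illustrative note: with the entries above, a node labeled
// kubernetes.io/os=linux (v1.LabelOSStable) whose beta.kubernetes.io/os label
// (kubeletapis.LabelOS) is missing or disagrees would have the beta label
// added or overwritten with "linux", because the stable label is the source of
// truth and ensureSecondaryExists is true. The label value "linux" is only an
// example, not something this table depends on.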

type nodeHealthData struct {
	probeTimestamp           metav1.Time
	readyTransitionTimestamp metav1.Time
	status                   *v1.NodeStatus
	lease                    *coordv1.Lease
}

func (n *nodeHealthData) deepCopy() *nodeHealthData {
	if n == nil {
		return nil
	}
	return &nodeHealthData{
		probeTimestamp:           n.probeTimestamp,
		readyTransitionTimestamp: n.readyTransitionTimestamp,
		status:                   n.status.DeepCopy(),
		lease:                    n.lease.DeepCopy(),
	}
}

type nodeHealthMap struct {
	lock        sync.RWMutex
	nodeHealths map[string]*nodeHealthData
}

func newNodeHealthMap() *nodeHealthMap {
	return &nodeHealthMap{
		nodeHealths: make(map[string]*nodeHealthData),
	}
}

// getDeepCopy returns a copy of the node health data.
// It prevents the data from being changed after it is retrieved from the map.
func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
	n.lock.RLock()
	defer n.lock.RUnlock()
	return n.nodeHealths[name].deepCopy()
}

func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
	n.lock.Lock()
	defer n.lock.Unlock()
	n.nodeHealths[name] = data
}

type podUpdateItem struct {
	namespace string
	name      string
}
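
// Illustrative note: nodeHealthMap hands out deep copies so callers can read
// and mutate the returned data without holding the lock, then write it back.
// A hypothetical caller (not part of the controller) might do:
//
//	health := healthMap.getDeepCopy("node-1") // nil if the node is unknown
//	if health != nil {
//		health.probeTimestamp = metav1.Now()
//		healthMap.set("node-1", health)
//	}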

// Controller is the controller that manages a node's lifecycle.
type Controller struct {
	taintManager *tainteviction.Controller

	podLister         corelisters.PodLister
	podInformerSynced cache.InformerSynced
	kubeClient        clientset.Interface

	// This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
	// to avoid the problem with time skew across the cluster.
	now func() metav1.Time

	enterPartialDisruptionFunc func(nodeNum int) float32
	enterFullDisruptionFunc    func(nodeNum int) float32
	computeZoneStateFunc       func(nodeConditions []*v1.NodeCondition) (int, ZoneState)

	knownNodeSet map[string]*v1.Node
	// per-Node map storing the last observed health together with a local time when it was observed.
	nodeHealthMap *nodeHealthMap

	// evictorLock protects zoneNoExecuteTainter.
	evictorLock sync.Mutex
	// workers that are responsible for tainting nodes.
	zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue

	nodesToRetry sync.Map

	zoneStates map[string]ZoneState

	daemonSetStore          appsv1listers.DaemonSetLister
	daemonSetInformerSynced cache.InformerSynced

	leaseLister         coordlisters.LeaseLister
	leaseInformerSynced cache.InformerSynced
	nodeLister          corelisters.NodeLister
	nodeInformerSynced  cache.InformerSynced

	getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error)

	broadcaster record.EventBroadcaster
	recorder    record.EventRecorder

	// Value controlling the Controller monitoring period, i.e. how often the Controller
	// checks the node health signal posted from the kubelet. This value should be lower than
	// nodeMonitorGracePeriod.
	// TODO: Change node health monitor to watch based.
	nodeMonitorPeriod time.Duration

	// When a node is just created, e.g. at cluster bootstrap or node creation, we give
	// it a longer grace period.
	nodeStartupGracePeriod time.Duration

	// Controller will not proactively sync node health, but will monitor node
	// health signals updated from the kubelet. There are 2 kinds of node healthiness
	// signals: NodeStatus and NodeLease. If it doesn't receive an update for this amount
	// of time, it will start posting "NodeReady==ConditionUnknown".
	// Note: be cautious when changing the constant, it must work with
	// nodeStatusUpdateFrequency in the kubelet and renewInterval in the NodeLease
	// controller. The node health signal update frequency is the minimum of the
	// two.
	// There are several constraints:
	// 1. nodeMonitorGracePeriod must be N times more than the node health signal
	//    update frequency, where N means the number of retries allowed for the kubelet to
	//    post node status/lease. It is pointless to make nodeMonitorGracePeriod
	//    less than the node health signal update frequency, since there will
	//    only be fresh values from the kubelet at an interval of the node health signal
	//    update frequency.
	// 2. nodeMonitorGracePeriod should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (30s)
	//    and HTTP2_READ_IDLE_TIMEOUT_SECONDS (15s) from the http2 health check
	//    to ensure that the server has adequate time to handle slow or idle connections
	//    properly before marking a node as unhealthy.
	// 3. nodeMonitorGracePeriod can't be too large for user experience - a larger
	//    value takes longer for the user to see up-to-date node health.
	nodeMonitorGracePeriod time.Duration

	// Number of workers the Controller uses to process node monitor health updates.
	// Defaults to scheduler.UpdateWorkerSize.
	nodeUpdateWorkerSize int

	evictionLimiterQPS          float32
	secondaryEvictionLimiterQPS float32
	largeClusterThreshold       int32
	unhealthyZoneThreshold      float32

	nodeUpdateQueue workqueue.TypedInterface[string]
	podUpdateQueue  workqueue.TypedRateLimitingInterface[podUpdateItem]
}
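
// Illustrative note on the nodeMonitorGracePeriod constraints above (example
// numbers only, not values taken from this file): if the kubelet posts a
// health signal every 10s and we want to tolerate N=4 missed updates, the
// grace period must be at least 4*10s = 40s; it must also exceed the
// 30s+15s = 45s HTTP/2 health-check window, so a value of 50s would satisfy
// both constraints.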

// NewNodeLifecycleController returns a new taint controller.
func NewNodeLifecycleController(
	ctx context.Context,
	leaseInformer coordinformers.LeaseInformer,
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	daemonSetInformer appsv1informers.DaemonSetInformer,
	kubeClient clientset.Interface,
	nodeMonitorPeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorGracePeriod time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
) (*Controller, error) {
	logger := klog.FromContext(ctx)
	if kubeClient == nil {
		logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})

	nc := &Controller{
		kubeClient:                  kubeClient,
		now:                         metav1.Now,
		knownNodeSet:                make(map[string]*v1.Node),
		nodeHealthMap:               newNodeHealthMap(),
		broadcaster:                 eventBroadcaster,
		recorder:                    recorder,
		nodeMonitorPeriod:           nodeMonitorPeriod,
		nodeStartupGracePeriod:      nodeStartupGracePeriod,
		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
		nodeUpdateWorkerSize:        nodeUpdateWorkerSize,
		zoneNoExecuteTainter:        make(map[string]*scheduler.RateLimitedTimedQueue),
		nodesToRetry:                sync.Map{},
		zoneStates:                  make(map[string]ZoneState),
		evictionLimiterQPS:          evictionLimiterQPS,
		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
		largeClusterThreshold:       largeClusterThreshold,
		unhealthyZoneThreshold:      unhealthyZoneThreshold,
		nodeUpdateQueue:             workqueue.NewTypedWithConfig(workqueue.TypedQueueConfig[string]{Name: "node_lifecycle_controller"}),
		podUpdateQueue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[podUpdateItem](),
			workqueue.TypedRateLimitingQueueConfig[podUpdateItem]{
				Name: "node_lifecycle_controller_pods",
			},
		),
	}

	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
	nc.computeZoneStateFunc = nc.ComputeZoneState

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod := obj.(*v1.Pod)
			nc.podUpdated(nil, pod)
		},
		UpdateFunc: func(prev, obj interface{}) {
			prevPod := prev.(*v1.Pod)
			newPod := obj.(*v1.Pod)
			nc.podUpdated(prevPod, newPod)
		},
		DeleteFunc: func(obj interface{}) {
			pod, isPod := obj.(*v1.Pod)
			// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
			if !isPod {
				deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
				if !ok {
					logger.Error(nil, "Received unexpected object", "object", obj)
					return
				}
				pod, ok = deletedState.Obj.(*v1.Pod)
				if !ok {
					logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj)
					return
				}
			}
			nc.podUpdated(pod, nil)
		},
	})
	nc.podInformerSynced = podInformer.Informer().HasSynced
	controller.AddPodNodeNameIndexer(podInformer.Informer())
	podIndexer := podInformer.Informer().GetIndexer()
	nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) {
		objs, err := podIndexer.ByIndex(controller.PodNodeNameKeyIndex, nodeName)
		if err != nil {
			return nil, err
		}
		pods := make([]*v1.Pod, 0, len(objs))
		for _, obj := range objs {
			pod, ok := obj.(*v1.Pod)
			if !ok {
				continue
			}
			pods = append(pods, pod)
		}
		return pods, nil
	}
	nc.podLister = podInformer.Lister()
	nc.nodeLister = nodeInformer.Lister()

	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
		logger.Info("Running TaintEvictionController as part of NodeLifecycleController")
		tm, err := tainteviction.New(ctx, kubeClient, podInformer, nodeInformer, taintEvictionController)
		if err != nil {
			return nil, err
		}
		nc.taintManager = tm
	}

	logger.Info("Controller will reconcile labels")
	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
			nc.nodeUpdateQueue.Add(node.Name)
			return nil
		}),
		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			nc.nodeUpdateQueue.Add(newNode.Name)
			return nil
		}),
		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
			nc.nodesToRetry.Delete(node.Name)
			return nil
		}),
	})

	nc.leaseLister = leaseInformer.Lister()
	nc.leaseInformerSynced = leaseInformer.Informer().HasSynced

	nc.nodeInformerSynced = nodeInformer.Informer().HasSynced

	nc.daemonSetStore = daemonSetInformer.Lister()
	nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced

	return nc, nil
}
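
// For illustration only - a hedged sketch of how a caller such as the
// kube-controller-manager might wire this controller up. The informer factory,
// client, and numeric values below are assumptions for the example, not values
// taken from this file:
//
//	nc, err := NewNodeLifecycleController(
//		ctx,
//		informerFactory.Coordination().V1().Leases(),
//		informerFactory.Core().V1().Pods(),
//		informerFactory.Core().V1().Nodes(),
//		informerFactory.Apps().V1().DaemonSets(),
//		client,
//		5*time.Second,  // nodeMonitorPeriod
//		time.Minute,    // nodeStartupGracePeriod
//		40*time.Second, // nodeMonitorGracePeriod
//		0.1,            // evictionLimiterQPS
//		0.01,           // secondaryEvictionLimiterQPS
//		50,             // largeClusterThreshold
//		0.55,           // unhealthyZoneThreshold
//	)
//	if err != nil {
//		return err
//	}
//	go nc.Run(ctx)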

// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *Controller) Run(ctx context.Context) {
	defer utilruntime.HandleCrashWithContext(ctx)

	// Start events processing pipeline.
	nc.broadcaster.StartStructuredLogging(3)
	logger := klog.FromContext(ctx)
	logger.Info("Sending events to api server")
	nc.broadcaster.StartRecordingToSink(
		&v1core.EventSinkImpl{
			Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""),
		})
	defer nc.broadcaster.Shutdown()

	logger.Info("Starting node controller")
	// Shut down the update queues to clean up their goroutines.
	var wg sync.WaitGroup
	defer func() {
		logger.Info("Shutting down node controller")
		nc.nodeUpdateQueue.ShutDown()
		nc.podUpdateQueue.ShutDown()
		wg.Wait()
	}()

	if !cache.WaitForNamedCacheSyncWithContext(ctx, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
		return
	}

	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
		logger.Info("Starting", "controller", taintEvictionController)
		wg.Go(func() {
			nc.taintManager.Run(ctx)
		})
	}

	// Start workers to reconcile labels and/or update the NoSchedule taint for nodes.
	for i := 0; i < nodeUpdateWorkerSize; i++ {
		// Thanks to "workqueue", each worker just needs to get an item from the queue,
		// because the item is flagged when taken from the queue: if a new event comes,
		// the new item will be re-queued until "Done", so no more than one worker
		// handles the same item and no event is missed.
		wg.Go(func() {
			wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
		})
	}

	for i := 0; i < podUpdateWorkerSize; i++ {
		wg.Go(func() {
			wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
		})
	}

	// Handle taint-based evictions. Because we don't want dedicated logic in TaintManager for NC-originated
	// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
	wg.Go(func() {
		wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)
	})

	// Incorporate the results of node health signals pushed from the kubelet to the master.
	wg.Go(func() {
		wait.UntilWithContext(ctx, func(ctx context.Context) {
			if err := nc.monitorNodeHealth(ctx); err != nil {
				logger.Error(err, "Error monitoring node health")
			}
		}, nc.nodeMonitorPeriod)
	})

	<-ctx.Done()
}

func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
	logger := klog.FromContext(ctx)
	for {
		obj, shutdown := nc.nodeUpdateQueue.Get()
		// "nodeUpdateQueue" will be shut down when Run exits;
		// we do not need to re-check the context here.
		if shutdown {
			return
		}
		nodeName := obj
		if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(k82cn): Add nodeName back to the queue
		}
		// TODO: re-evaluate whether there are any labels that need to be
		// reconciled in 1.19. Remove this function if it's no longer necessary.
		if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(yujuhong): Add nodeName back to the queue
		}
		nc.nodeUpdateQueue.Done(nodeName)
	}
}

func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If the node is not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	// Map the node's conditions to taints.
	var taints []v1.Taint
	for _, condition := range node.Status.Conditions {
		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
			if taintKey, found := taintMap[condition.Status]; found {
				taints = append(taints, v1.Taint{
					Key:    taintKey,
					Effect: v1.TaintEffectNoSchedule,
				})
			}
		}
	}
	if node.Spec.Unschedulable {
		// If unschedulable, append the related taint.
		taints = append(taints, v1.Taint{
			Key:    v1.TaintNodeUnschedulable,
			Effect: v1.TaintEffectNoSchedule,
		})
	}

	// Get the existing taints of the node.
	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
		// Only NoSchedule taints are candidates to be compared with "taints" later.
		if t.Effect != v1.TaintEffectNoSchedule {
			return false
		}
		// Find the unschedulable taint of the node.
		if t.Key == v1.TaintNodeUnschedulable {
			return true
		}
		// Find the node condition taints of the node.
		_, found := taintKeyToNodeConditionMap[t.Key]
		return found
	})
	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
	// If there is nothing to add or delete, return directly.
	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
		return nil
	}
	if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
		return fmt.Errorf("failed to swap taints of node %+v", node)
	}
	return nil
}
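
// Illustrative example of the pass above: a node reporting MemoryPressure=True
// with spec.unschedulable set would be reconciled to carry
// node.kubernetes.io/memory-pressure:NoSchedule and
// node.kubernetes.io/unschedulable:NoSchedule, while a stale
// node.kubernetes.io/disk-pressure:NoSchedule taint left over from an earlier
// condition would be removed by the TaintSetDiff/SwapNodeControllerTaint call.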

func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
	// Extract the keys of the map so that we do not hold the evictorLock
	// for the entire function, but only when necessary.
	var zoneNoExecuteTainterKeys []string
	func() {
		nc.evictorLock.Lock()
		defer nc.evictorLock.Unlock()

		zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
		for k := range nc.zoneNoExecuteTainter {
			zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
		}
	}()
	logger := klog.FromContext(ctx)
	for _, k := range zoneNoExecuteTainterKeys {
		var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue
		func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			// Extracting the value without checking if the key
			// exists or not is safe to do here since zones do
			// not get removed, and consequently pod evictors for
			// these zones also do not get removed, only added.
			zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k]
		}()
		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
		zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
			node, err := nc.nodeLister.Get(value.Value)
			if apierrors.IsNotFound(err) {
				logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value))
				return true, 0
			} else if err != nil {
				logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err)
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			}
			_, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
			if condition == nil {
				logger.Info("Failed to get NodeCondition from the node status", "node", klog.KRef("", value.Value))
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			}
			// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
			taintToAdd := v1.Taint{}
			oppositeTaint := v1.Taint{}
			switch condition.Status {
			case v1.ConditionFalse:
				taintToAdd = *NotReadyTaintTemplate
				oppositeTaint = *UnreachableTaintTemplate
			case v1.ConditionUnknown:
				taintToAdd = *UnreachableTaintTemplate
				oppositeTaint = *NotReadyTaintTemplate
			default:
				// It seems that the Node is ready again, so there's no need to taint it.
				logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value))
				return true, 0
			}
			result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
			if result {
				// Count the number of evictions.
				zone := nodetopology.GetZoneKey(node)
				evictionsTotal.WithLabelValues(zone).Inc()
			}
			return result, 0
		})
	}
}

// monitorNodeHealth verifies that node health is constantly updated by the kubelet and, if not, posts "NodeReady==ConditionUnknown".
// This function will
//   - add nodes which are not ready or not reachable for a long period of time to a rate-limited
//     queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function,
//   - update the PodReady condition of Pods according to the state of the Node Ready condition.
func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
	start := nc.now()
	defer func() {
		updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds())
	}()

	// We are listing nodes from the local cache as we can tolerate some small delays
	// compared to state from etcd and there is eventual consistency anyway.
	nodes, err := nc.nodeLister.List(labels.Everything())
	if err != nil {
		return err
	}
	added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
	logger := klog.FromContext(ctx)
	for i := range newZoneRepresentatives {
		nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i])
	}
	for i := range added {
		logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name))
		controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		nc.addPodEvictorForNewZone(logger, added[i])
		nc.markNodeAsReachable(ctx, added[i])
	}

	for i := range deleted {
		logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name))
		controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	var zoneToNodeConditionsLock sync.Mutex
	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
	updateNodeFunc := func(piece int) {
		start := nc.now()
		defer func() {
			updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds())
		}()

		var observedReadyCondition v1.NodeCondition
		var currentReadyCondition *v1.NodeCondition
		node := nodes[piece].DeepCopy()
		if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
			var err error
			_, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node)
			if err == nil {
				return true, nil
			}
			name := node.Name
			node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
			if err != nil {
				logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name))
				return false, err
			}
			return false, nil
		}); err != nil {
			logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node))
			return
		}

		// Some nodes may be excluded from disruption checking
		if !isNodeExcludedFromDisruptionChecks(node) {
			zoneToNodeConditionsLock.Lock()
			zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition)
			zoneToNodeConditionsLock.Unlock()
		}

		if currentReadyCondition != nil {
			pods, err := nc.getPodsAssignedToNode(node.Name)
			if err != nil {
				utilruntime.HandleErrorWithContext(ctx, err, "Unable to list pods of node", "node", node.Name)
				if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
					// If an error happened during the node status transition (Ready -> NotReady),
					// we need to mark the node for retry to force MarkPodsNotReady execution
					// in the next iteration.
					nc.nodesToRetry.Store(node.Name, struct{}{})
				}
				return
			}
			nc.processTaintBaseEviction(ctx, node, currentReadyCondition)

			_, needsRetry := nc.nodesToRetry.Load(node.Name)
			switch {
			case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue:
				// Report the node event only once, when the status changed.
				controllerutil.RecordNodeStatusChange(logger, nc.recorder, node, "NodeNotReady")
				fallthrough
			case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
				if err = controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
					utilruntime.HandleErrorWithContext(ctx, err, "Unable to mark all pods NotReady on node; queuing for retry", "node", node.Name)
					nc.nodesToRetry.Store(node.Name, struct{}{})
					return
				}
			}
		}
		nc.nodesToRetry.Delete(node.Name)
	}

	// Marking the pods not ready on a node requires looping over them and
	// updating each pod's status one at a time. This is performed serially, and
	// can take a while if we're processing each node serially as well. So we
	// process them with bounded concurrency instead, since most of the time is
	// spent waiting on io.
	workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc)

	nc.handleDisruption(ctx, zoneToNodeConditions, nodes)

	return nil
}

func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, currentReadyCondition *v1.NodeCondition) {
	decisionTimestamp := nc.now()
	// Check eviction timeout against decisionTimestamp
	logger := klog.FromContext(ctx)
	switch currentReadyCondition.Status {
	case v1.ConditionFalse:
		// We want to update the taint straight away if the Node is already tainted with the UnreachableTaint
		if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
			taintToAdd := *NotReadyTaintTemplate
			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
				logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle")
			}
		} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
			logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
		}
	case v1.ConditionUnknown:
		// We want to update the taint straight away if the Node is already tainted with the NotReadyTaint
		if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
			taintToAdd := *UnreachableTaintTemplate
			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
				logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle")
			}
		} else if nc.markNodeForTainting(node, v1.ConditionUnknown) {
			logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
		}
	case v1.ConditionTrue:
		removed, err := nc.markNodeAsReachable(ctx, node)
		if err != nil {
			logger.Error(err, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node))
		}
		if removed {
			logger.V(2).Info("Node is healthy again, removed all taints", "node", klog.KObj(node))
		}
	}
}

// labelNodeDisruptionExclusion is a label on nodes that controls whether they are
// excluded from being considered for disruption checks by the node controller.
const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"

func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
	if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
		return true
	}
	return false
}
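
// Illustrative note: only the presence of the label matters; its value is
// ignored. For example, labeling a node with
// node.kubernetes.io/exclude-disruption="" (or any other value) excludes it
// from the zone disruption accounting done in monitorNodeHealth.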

// tryUpdateNodeHealth checks a given node's conditions and tries to update it. It returns the grace period to
// which the given node is entitled, the state of the current and last observed Ready Condition, and an error if one occurred.
func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
	nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
	defer func() {
		nc.nodeHealthMap.set(node.Name, nodeHealth)
	}()

	var gracePeriod time.Duration
	var observedReadyCondition v1.NodeCondition
	_, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
	if currentReadyCondition == nil {
		// If the ready condition is nil, then the kubelet (or nodecontroller) never posted node status.
		// A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime are set
		// to node.CreationTimestamp to avoid handling that corner case.
		observedReadyCondition = v1.NodeCondition{
			Type:               v1.NodeReady,
			Status:             v1.ConditionUnknown,
			LastHeartbeatTime:  node.CreationTimestamp,
			LastTransitionTime: node.CreationTimestamp,
		}
		gracePeriod = nc.nodeStartupGracePeriod
		if nodeHealth != nil {
			nodeHealth.status = &node.Status
		} else {
			nodeHealth = &nodeHealthData{
				status:                   &node.Status,
				probeTimestamp:           node.CreationTimestamp,
				readyTransitionTimestamp: node.CreationTimestamp,
			}
		}
	} else {
		// If the ready condition is not nil, make a copy of it, since we may modify it in place later.
		observedReadyCondition = *currentReadyCondition
		gracePeriod = nc.nodeMonitorGracePeriod
	}
	// There are the following cases to check:
	// - both the saved and new status have no Ready Condition set - we leave everything as it is,
	// - the saved status has no Ready Condition, but the current one does - the Controller was restarted with Node data already present in etcd,
	// - the saved status has some Ready Condition, but the current one does not - it's an error, but we fill it up because that's probably a good thing to do,
	// - both the saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
	//   unresponsive, so we leave it as it is,
	// - both the saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
	//   everything is in order, no transition occurred, we update only probeTimestamp,
	// - both the saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition States -
	//   the Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
	// TODO: things to consider:
	// - if 'LastProbeTime' has gone back in time it's probably an error, currently we ignore it,
	// - currently the only correct Ready State transition outside of the Node Controller is marking it ready by the Kubelet, we don't check
	//   if that's the case, but it does not seem necessary.
	var savedCondition *v1.NodeCondition
	var savedLease *coordv1.Lease
	if nodeHealth != nil {
		_, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
		savedLease = nodeHealth.lease
	}
	logger := klog.FromContext(ctx)
	if nodeHealth == nil {
		logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition == nil && currentReadyCondition != nil {
		logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition == nil {
		logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node))
		// TODO: figure out what to do in this case. For now we do the same thing as above.
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
		var transitionTime metav1.Time
		// If the ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
		// otherwise we leave it as it is.
		if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
			logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition)
			transitionTime = nc.now()
		} else {
			transitionTime = nodeHealth.readyTransitionTimestamp
		}
		if loggerV := logger.V(5); loggerV.Enabled() {
			loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status)
		} else {
			logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node))
		}
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: transitionTime,
		}
	}
	// Always update the probe time if the node lease is renewed.
	// Note: If the kubelet never posted the node status, but continues renewing the
	// heartbeat leases, the node controller will assume the node is healthy and
	// take no action.
	observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
	if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
		nodeHealth.lease = observedLease
		nodeHealth.probeTimestamp = nc.now()
	}

	if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
		// The NodeReady condition or lease was last set longer ago than gracePeriod, so
		// update it to Unknown (regardless of its current value) in the master.

		nodeConditionTypes := []v1.NodeConditionType{
			v1.NodeReady,
			v1.NodeMemoryPressure,
			v1.NodeDiskPressure,
			v1.NodePIDPressure,
			// We don't change the 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
			// v1.NodeNetworkUnavailable,
		}

		nowTimestamp := nc.now()
		for _, nodeConditionType := range nodeConditionTypes {
			_, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType)
			if currentCondition == nil {
				logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node))
				node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
					Type:               nodeConditionType,
					Status:             v1.ConditionUnknown,
					Reason:             "NodeStatusNeverUpdated",
					Message:            "Kubelet never posted node status.",
					LastHeartbeatTime:  node.CreationTimestamp,
					LastTransitionTime: nowTimestamp,
				})
			} else {
				logger.V(2).Info("Node hasn't been updated",
					"node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition)
				if currentCondition.Status != v1.ConditionUnknown {
					currentCondition.Status = v1.ConditionUnknown
					currentCondition.Reason = "NodeStatusUnknown"
					currentCondition.Message = "Kubelet stopped posting node status."
					currentCondition.LastTransitionTime = nowTimestamp
				}
			}
		}
		// We need to update currentReadyCondition due to its value potentially having changed.
		_, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)

		if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
			if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
				logger.Error(err, "Error updating node", "node", klog.KObj(node))
				return gracePeriod, observedReadyCondition, currentReadyCondition, err
			}
			nodeHealth = &nodeHealthData{
				status:                   &node.Status,
				probeTimestamp:           nodeHealth.probeTimestamp,
				readyTransitionTimestamp: nc.now(),
				lease:                    observedLease,
			}
			return gracePeriod, observedReadyCondition, currentReadyCondition, nil
		}
	}

	return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}
func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
	newZoneStates := map[string]ZoneState{}
	allAreFullyDisrupted := true
	logger := klog.FromContext(ctx)
	for k, v := range zoneToNodeConditions {
		zoneSize.WithLabelValues(k).Set(float64(len(v)))
		unhealthy, newState := nc.computeZoneStateFunc(v)
		zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
		unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
		if newState != stateFullDisruption {
			allAreFullyDisrupted = false
		}
		newZoneStates[k] = newState
		if _, had := nc.zoneStates[k]; !had {
			logger.Error(nil, "Setting initial state for unseen zone", "zone", k)
			nc.zoneStates[k] = stateInitial
		}
	}
	allWasFullyDisrupted := true
	for k, v := range nc.zoneStates {
		if _, have := zoneToNodeConditions[k]; !have {
			zoneSize.WithLabelValues(k).Set(0)
			zoneHealth.WithLabelValues(k).Set(100)
			unhealthyNodes.WithLabelValues(k).Set(0)
			delete(nc.zoneStates, k)
			continue
		}
		if v != stateFullDisruption {
			allWasFullyDisrupted = false
			break
		}
	}
	// At least one node was responding in the previous pass or in the current pass. The semantics are:
	// - if the new state is "partialDisruption" we call a user-defined function that returns a new limiter to use,
	// - if the new state is "normal" we resume normal operation (go back to default limiter settings),
	// - if the new state is "fullDisruption" we restore the normal eviction rate,
	//   unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
	if !allAreFullyDisrupted || !allWasFullyDisrupted {
		// We're switching to full disruption mode
		if allAreFullyDisrupted {
			logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode")
			for i := range nodes {
				_, err := nc.markNodeAsReachable(ctx, nodes[i])
				if err != nil {
					logger.Error(err, "Failed to remove taints from Node", "node", klog.KObj(nodes[i]))
				}
			}
			// We stop all evictions.
			for k := range nc.zoneStates {
				nc.zoneNoExecuteTainter[k].SwapLimiter(0)
			}
			for k := range nc.zoneStates {
				nc.zoneStates[k] = stateFullDisruption
			}
			// All rate limiters are updated, so we can return early here.
			return
		}
		// We're exiting full disruption mode
		if allWasFullyDisrupted {
			logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode")
			// When exiting disruption mode update probe timestamps on all Nodes.
			now := nc.now()
			for i := range nodes {
				v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
				v.probeTimestamp = now
				v.readyTransitionTimestamp = now
				nc.nodeHealthMap.set(nodes[i].Name, v)
			}
			// We reset all rate limiters to settings appropriate for the given state.
			for k := range nc.zoneStates {
				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
				nc.zoneStates[k] = newZoneStates[k]
			}
			return
		}
		// We know that there's at least one zone that isn't fully disrupted, so
		// we can use the default behavior for rate limiters.
		for k, v := range nc.zoneStates {
			newState := newZoneStates[k]
			if v == newState {
				continue
			}
			logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState)
			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
			nc.zoneStates[k] = newState
		}
	}
}

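// podUpdated enqueues a pod for processing when it has been (re)assigned to a node, i.e. when the
// new pod has a non-empty NodeName that differs from the old pod's (or there was no old pod).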
func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
	if newPod == nil {
		return
	}
	if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) {
		podItem := podUpdateItem{newPod.Namespace, newPod.Name}
		nc.podUpdateQueue.Add(podItem)
	}
}

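// doPodProcessingWorker keeps pulling pod update items from podUpdateQueue and processes them
// until the queue is shut down.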
func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
	for {
		obj, shutdown := nc.podUpdateQueue.Get()
		// "podUpdateQueue" will be shut down when "stopCh" is closed;
		// we do not need to re-check "stopCh" again.
		if shutdown {
			return
		}
		podItem := obj
		nc.processPod(ctx, podItem)
	}
}

// processPod processes events of pods being assigned to nodes. In particular:
//  1. for a NodeReady=true node, taint eviction for this pod will be cancelled
//  2. for a NodeReady=false or unknown node, taint eviction of the pod will happen and the pod will be marked as not ready
//  3. if the node doesn't exist in the cache, the pod will be skipped.
func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) {
	defer nc.podUpdateQueue.Done(podItem)
	pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
	logger := klog.FromContext(ctx)
	if err != nil {
		if apierrors.IsNotFound(err) {
			// If the pod was deleted, there is no need to requeue.
			return
		}
		logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}
	nodeName := pod.Spec.NodeName
	nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName)
	if nodeHealth == nil {
		// Node data is not gathered yet or node has been removed in the meantime.
		return
	}
	_, err = nc.nodeLister.Get(nodeName)
	if err != nil {
		logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}
	_, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
	if currentReadyCondition == nil {
		// Lack of a NodeReady condition may only happen after node addition (or if it was maliciously deleted).
		// In both cases, the pod will be handled correctly (evicted if needed) during processing
		// of the next node update event.
		return
	}
	pods := []*v1.Pod{pod}
	if currentReadyCondition.Status != v1.ConditionTrue {
		if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
			logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err)
			nc.podUpdateQueue.AddRateLimited(podItem)
		}
	}
}

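// setLimiterInZone swaps the zone's NoExecute tainter rate limiter to the eviction rate that
// corresponds to the given zone state (normal, partial disruption, or full disruption).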
func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
	switch state {
	case stateNormal:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
	case statePartialDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterPartialDisruptionFunc(zoneSize))
	case stateFullDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterFullDisruptionFunc(zoneSize))
	}
}

// classifyNodes classifies allNodes into three categories:
//  1. added: the nodes that are in 'allNodes', but not in 'knownNodeSet'
//  2. deleted: the nodes that are in 'knownNodeSet', but not in 'allNodes'
//  3. newZoneRepresentatives: the nodes that are in both 'knownNodeSet' and 'allNodes', but have no zone state yet
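//
// For example, a node seen for the first time is returned in 'added'; a node that was in
// 'knownNodeSet' but no longer appears in 'allNodes' is returned in 'deleted'; and a known node
// whose zone has no entry in 'zoneStates' yet is returned in 'newZoneRepresentatives'.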
func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
	for i := range allNodes {
		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
			added = append(added, allNodes[i])
		} else {
			// Currently, we only consider a new zone as updated.
			zone := nodetopology.GetZoneKey(allNodes[i])
			if _, found := nc.zoneStates[zone]; !found {
				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
			}
		}
	}

	// If there's a difference between lengths of known Nodes and observed nodes
	// we must have removed some Node.
	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
		knowSetCopy := map[string]*v1.Node{}
		for k, v := range nc.knownNodeSet {
			knowSetCopy[k] = v
		}
		for i := range allNodes {
			delete(knowSetCopy, allNodes[i].Name)
		}
		for i := range knowSetCopy {
			deleted = append(deleted, knowSetCopy[i])
		}
	}
	return
}

// HealthyQPSFunc returns the default value for cluster eviction rate - we take
// nodeNum for consistency with ReducedQPSFunc.
func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
	return nc.evictionLimiterQPS
}

// ReducedQPSFunc returns the QPS to use when a zone is large: make
// evictions slower; if the zone is small, stop evictions altogether.
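//
// For illustration, assuming largeClusterThreshold=50 and secondaryEvictionLimiterQPS=0.01
// (illustrative values): a zone of 100 nodes keeps evicting at 0.01 QPS, while a zone of
// 20 nodes has evictions stopped entirely (QPS 0).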
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}

// addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	zone := nodetopology.GetZoneKey(node)
	if _, found := nc.zoneStates[zone]; !found {
		nc.zoneStates[zone] = stateInitial
		nc.zoneNoExecuteTainter[zone] =
			scheduler.NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		// Init the metric for the new zone.
		logger.Info("Initializing eviction metric for zone", "zone", zone)
		evictionsTotal.WithLabelValues(zone).Add(0)
	}
}

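// markNodeForTainting queues the node in its zone's NoExecute tainter so the appropriate
// not-ready/unreachable taint gets applied. If the node does not yet carry the taint matching
// the observed condition status, any stale queue entry is dropped first so the node is
// re-queued according to the current status.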
func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	if status == v1.ConditionFalse {
		if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}
	if status == v1.ConditionUnknown {
		if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}
	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

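// markNodeAsReachable removes the unreachable and not-ready NoExecute taints from the node and
// drops it from the zone's NoExecute tainter queue. It returns the result of removing the node
// from the queue and any error encountered while removing the taints.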
func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
	err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
	logger := klog.FromContext(ctx)
	if err != nil {
		logger.Error(err, "Failed to remove unreachable taint from node", "node", klog.KObj(node))
		return false, err
	}
	err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
	if err != nil {
		logger.Error(err, "Failed to remove not-ready taint from node", "node", klog.KObj(node))
		return false, err
	}
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil
}

// ComputeZoneState computes the state of a zone based on node ready conditions.
// It returns the number of not-ready nodes and the zone state.
// The zone is considered:
// - fullyDisrupted if there are no Ready Nodes,
// - partiallyDisrupted if at least nc.unhealthyZoneThreshold (as a fraction) of Nodes are not Ready,
// - normal otherwise
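//
// For example, assuming unhealthyZoneThreshold=0.55 (an illustrative value): in a zone with
// 10 nodes of which 6 are not Ready, 6/10 >= 0.55 and notReadyNodes > 2, so the zone is
// statePartialDisruption; with only 2 not-Ready nodes the notReadyNodes > 2 guard keeps the
// zone stateNormal; and with all 10 not Ready the zone is stateFullDisruption.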
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}

// reconcileNodeLabels makes sure the node labels listed in labelReconcileInfo stay consistent:
// the value of each primary label is copied to its secondary (deprecated) label when the two
// disagree or when the secondary label is missing.
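//
// For example, assuming labelReconcileInfo pairs the GA label "kubernetes.io/os" (primary) with
// the deprecated "beta.kubernetes.io/os" (secondary): a node labeled kubernetes.io/os=linux and
// beta.kubernetes.io/os=windows gets beta.kubernetes.io/os rewritten to "linux".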
func (nc *Controller) reconcileNodeLabels(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If node not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}
	if node.Labels == nil {
		// Nothing to reconcile.
		return nil
	}

	labelsToUpdate := map[string]string{}
	for _, r := range labelReconcileInfo {
		primaryValue, primaryExists := node.Labels[r.primaryKey]
		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]
		if !primaryExists {
			// The primary label key does not exist. This should not happen
			// within our supported version skew range, when no external
			// components/factors are modifying the node object. Ignore this case.
			continue
		}
		if secondaryExists && primaryValue != secondaryValue {
			// The secondary label exists, but is not consistent with the primary
			// label. Need to reconcile.
			labelsToUpdate[r.secondaryKey] = primaryValue
		} else if !secondaryExists && r.ensureSecondaryExists {
			// Apply the secondary label based on the primary label.
			labelsToUpdate[r.secondaryKey] = primaryValue
		}
	}

	if len(labelsToUpdate) == 0 {
		return nil
	}
	if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) {
		return fmt.Errorf("failed to update labels for node %+v", node)
	}
	return nil
}