/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node

import (
	"context"
	"fmt"
	"regexp"
	"time"

	"github.com/onsi/gomega"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"

	"k8s.io/kubernetes/test/e2e/framework"
)
const sleepTime = 20 * time.Second

var requiredPerNodePods = []*regexp.Regexp{
	regexp.MustCompile(".*kube-proxy.*"),
	regexp.MustCompile(".*fluentd-elasticsearch.*"),
	regexp.MustCompile(".*node-problem-detector.*"),
}
// WaitForReadyNodes waits up to timeout for the cluster to reach the desired
// size and for all nodes in it to be ready. By cluster size we mean the number
// of schedulable Nodes.
func WaitForReadyNodes(ctx context.Context, c clientset.Interface, size int, timeout time.Duration) error {
	_, err := CheckReady(ctx, c, size, timeout)
	return err
}
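
// A minimal usage sketch (illustrative only; the node count, the timeout, and
// how the clientset is obtained are assumptions, e.g. a framework.Framework's
// ClientSet in a typical e2e test):
//
//	if err := WaitForReadyNodes(ctx, c, 3, 10*time.Minute); err != nil {
//		framework.Failf("cluster did not reach 3 ready nodes: %v", err)
//	}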

// WaitForTotalHealthy checks whether all registered nodes are ready and all required Pods are running on them.
func WaitForTotalHealthy(ctx context.Context, c clientset.Interface, timeout time.Duration) error {
	logger := klog.FromContext(ctx)
	framework.Logf("Waiting up to %v for all nodes to be ready", timeout)

	var notReady []v1.Node
	var missingPodsPerNode map[string][]string
	err := wait.PollUntilContextTimeout(ctx, poll, timeout, true, func(ctx context.Context) (bool, error) {
		notReady = nil
		// It should be OK to list unschedulable Nodes here.
		nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{ResourceVersion: "0"})
		if err != nil {
			return false, err
		}
		for _, node := range nodes.Items {
			if !IsConditionSetAsExpected(&node, v1.NodeReady, true) {
				notReady = append(notReady, node)
			}
		}
		pods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{ResourceVersion: "0"})
		if err != nil {
			return false, err
		}

		systemPodsPerNode := make(map[string][]string)
		for _, pod := range pods.Items {
			if pod.Namespace == metav1.NamespaceSystem && pod.Status.Phase == v1.PodRunning {
				if pod.Spec.NodeName != "" {
					systemPodsPerNode[pod.Spec.NodeName] = append(systemPodsPerNode[pod.Spec.NodeName], pod.Name)
				}
			}
		}
		missingPodsPerNode = make(map[string][]string)
		for _, node := range nodes.Items {
			if isNodeSchedulableWithoutTaints(logger, &node) {
				for _, requiredPod := range requiredPerNodePods {
					foundRequired := false
					for _, presentPod := range systemPodsPerNode[node.Name] {
						if requiredPod.MatchString(presentPod) {
							foundRequired = true
							break
						}
					}
					if !foundRequired {
						missingPodsPerNode[node.Name] = append(missingPodsPerNode[node.Name], requiredPod.String())
					}
				}
			}
		}
		return len(notReady) == 0 && len(missingPodsPerNode) == 0, nil
	})

	if err != nil && !wait.Interrupted(err) {
		return err
	}

	if len(notReady) > 0 {
		return fmt.Errorf("not ready nodes: %v", notReady)
	}
	if len(missingPodsPerNode) > 0 {
		return fmt.Errorf("not running system Pods: %v", missingPodsPerNode)
	}
	return nil
}
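
// A minimal usage sketch (illustrative; the timeout is an assumption). This
// fails a suite early if any registered node is unhealthy or a required
// per-node system Pod (e.g. kube-proxy) is not running:
//
//	if err := WaitForTotalHealthy(ctx, c, 5*time.Minute); err != nil {
//		framework.Failf("cluster not healthy: %v", err)
//	}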

// WaitConditionToBe returns whether node "name's" condition state matches wantTrue
// within timeout. If wantTrue is true, it will ensure the node condition status
// is ConditionTrue; if it's false, it ensures the node condition is in any state
// other than ConditionTrue (e.g. false or unknown).
func WaitConditionToBe(ctx context.Context, c clientset.Interface, name string, conditionType v1.NodeConditionType, wantTrue bool, timeout time.Duration) bool {
	framework.Logf("Waiting up to %v for node %s condition %s to be %t", timeout, name, conditionType, wantTrue)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
		node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			framework.Logf("Couldn't get node %s", name)
			continue
		}
		if IsConditionSetAsExpected(node, conditionType, wantTrue) {
			return true
		}
	}
	framework.Logf("Node %s didn't reach desired %s condition status (%t) within %v", name, conditionType, wantTrue, timeout)
	return false
}
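
// A minimal usage sketch (illustrative; the node name, condition type, and
// timeout are assumptions). Note the bool return: a false result means the
// condition never matched within the timeout, not that an API error occurred:
//
//	if !WaitConditionToBe(ctx, c, "node-1", v1.NodeMemoryPressure, false, 2*time.Minute) {
//		framework.Failf("node-1 still reports memory pressure")
//	}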

// WaitForNodeToBeNotReady returns whether node name is not ready (i.e. the
// readiness condition is anything but ready, e.g. false or unknown) within
// timeout.
func WaitForNodeToBeNotReady(ctx context.Context, c clientset.Interface, name string, timeout time.Duration) bool {
	return WaitConditionToBe(ctx, c, name, v1.NodeReady, false, timeout)
}

// WaitForNodeToBeReady returns whether node name is ready within timeout.
func WaitForNodeToBeReady(ctx context.Context, c clientset.Interface, name string, timeout time.Duration) bool {
	return WaitConditionToBe(ctx, c, name, v1.NodeReady, true, timeout)
}

// WaitForNodeSchedulable returns whether node name's schedulable state matches
// wantSchedulable within timeout.
func WaitForNodeSchedulable(ctx context.Context, c clientset.Interface, name string, timeout time.Duration, wantSchedulable bool) bool {
	logger := klog.FromContext(ctx)
	framework.Logf("Waiting up to %v for node %s to be schedulable: %t", timeout, name, wantSchedulable)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
		node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			framework.Logf("Couldn't get node %s", name)
			continue
		}
		if IsNodeSchedulable(logger, node) == wantSchedulable {
			return true
		}
	}
	framework.Logf("Node %s didn't reach desired schedulable status (%t) within %v", name, wantSchedulable, timeout)
	return false
}

// WaitForNodeHeartbeatAfter waits up to timeout for node to send the next
// heartbeat after the given timestamp.
//
// To ensure the node status is posted by a restarted kubelet process,
// after should be retrieved by [GetNodeHeartbeatTime] while the kubelet is down.
func WaitForNodeHeartbeatAfter(ctx context.Context, c clientset.Interface, name string, after metav1.Time, timeout time.Duration) {
	framework.Logf("Waiting up to %v for node %s to send a heartbeat after %v", timeout, name, after)
	gomega.Eventually(ctx, func() (time.Time, error) {
		node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			framework.Logf("Couldn't get node %s", name)
			return time.Time{}, err
		}
		return GetNodeHeartbeatTime(node).Time, nil
	}, timeout, poll).Should(gomega.BeTemporally(">", after.Time), "Node %s didn't send a heartbeat", name)
}
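
// A minimal sketch of the intended call pattern (illustrative; restartKubelet
// is a hypothetical helper, not a function provided by this package):
//
//	// Capture the last heartbeat while the kubelet is down, then require a
//	// strictly newer one, proving the restarted kubelet posted node status.
//	before := GetNodeHeartbeatTime(node)
//	restartKubelet(node.Name) // hypothetical helper
//	WaitForNodeHeartbeatAfter(ctx, c, node.Name, before, 5*time.Minute)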

// CheckReady waits up to timeout for the cluster to reach the desired size and
// for all nodes in it to be ready. By cluster size we mean the number of
// schedulable Nodes.
func CheckReady(ctx context.Context, c clientset.Interface, size int, timeout time.Duration) ([]v1.Node, error) {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(sleepTime) {
		nodes, err := waitListSchedulableNodes(ctx, c)
		if err != nil {
			framework.Logf("Failed to list nodes: %v", err)
			continue
		}
		numNodes := len(nodes.Items)

		// Filter out not-ready nodes.
		Filter(nodes, func(node v1.Node) bool {
			nodeReady := IsConditionSetAsExpected(&node, v1.NodeReady, true)
			networkReady := isConditionUnset(&node, v1.NodeNetworkUnavailable) || IsConditionSetAsExpected(&node, v1.NodeNetworkUnavailable, false)
			return nodeReady && networkReady
		})
		numReady := len(nodes.Items)

		if numNodes == size && numReady == size {
			framework.Logf("Cluster has reached the desired number of ready nodes %d", size)
			return nodes.Items, nil
		}
		framework.Logf("Waiting for ready nodes %d, current ready %d, not ready nodes %d", size, numReady, numNodes-numReady)
	}
	return nil, fmt.Errorf("timeout waiting %v for number of ready nodes to be %d", timeout, size)
}

// waitListSchedulableNodes is a wrapper around listing nodes supporting retries.
func waitListSchedulableNodes(ctx context.Context, c clientset.Interface) (*v1.NodeList, error) {
	var nodes *v1.NodeList
	var err error
	if wait.PollUntilContextTimeout(ctx, poll, singleCallTimeout, true, func(ctx context.Context) (bool, error) {
		nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{
			"spec.unschedulable": "false",
		}.AsSelector().String()})
		if err != nil {
			return false, err
		}
		return true, nil
	}) != nil {
		return nodes, err
	}
	return nodes, nil
}

// checkWaitListSchedulableNodes is a wrapper around listing nodes supporting retries.
func checkWaitListSchedulableNodes(ctx context.Context, c clientset.Interface) (*v1.NodeList, error) {
	nodes, err := waitListSchedulableNodes(ctx, c)
	if err != nil {
		return nil, fmt.Errorf("error: %s. Non-retryable failure or timed out while listing nodes for e2e cluster", err)
	}
	return nodes, nil
}

// CheckReadyForTests returns a function which will return 'true' once the number
// of not-ready nodes drops to allowedNotReadyNodes or below (i.e. to be used as
// a global gate for starting the tests).
func CheckReadyForTests(ctx context.Context, c clientset.Interface, nonblockingTaints string, allowedNotReadyNodes, largeClusterThreshold int) func(ctx context.Context) (bool, error) {
	logger := klog.FromContext(ctx)
	attempt := 0
	return func(ctx context.Context) (bool, error) {
		if allowedNotReadyNodes == -1 {
			return true, nil
		}
		attempt++
		var nodesNotReadyYet []v1.Node
		opts := metav1.ListOptions{
			ResourceVersion: "0",
			// Filter out cordoned (unschedulable) nodes from our calculation. TODO: refactor if a node v2 API removes that semantic.
			FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector().String(),
		}
		allNodes, err := c.CoreV1().Nodes().List(ctx, opts)
		if err != nil {
			var terminalListNodesErr error
			framework.Logf("Unexpected error listing nodes: %v", err)
			if attempt >= 3 {
				terminalListNodesErr = err
			}
			return false, terminalListNodesErr
		}
		for _, node := range allNodes.Items {
			if !readyForTests(logger, &node, nonblockingTaints) {
				nodesNotReadyYet = append(nodesNotReadyYet, node)
			}
		}
		// The framework allows for <TestContext.AllowedNotReadyNodes> nodes to be non-ready,
		// to make it possible e.g. for incorrect deployment of some small percentage
		// of nodes (which we allow in cluster validation). Some nodes that are not
		// provisioned correctly at startup will never become ready (e.g. when something
		// won't install correctly), so we can't expect them to be ready at any point.
		//
		// We log the *reason* why nodes are not schedulable; specifically, it's usually the network not being available.
		if len(nodesNotReadyYet) > 0 {
			// In large clusters, log them only every 10th pass.
			if len(nodesNotReadyYet) < largeClusterThreshold || attempt%10 == 0 {
				framework.Logf("Unschedulable nodes= %v, maximum value for starting tests= %v", len(nodesNotReadyYet), allowedNotReadyNodes)
				for _, node := range nodesNotReadyYet {
					framework.Logf(" -> Node %s [[[ Ready=%t, Network(available)=%t, Taints=%v, NonblockingTaints=%v ]]]",
						node.Name,
						IsConditionSetAsExpectedSilent(&node, v1.NodeReady, true),
						IsConditionSetAsExpectedSilent(&node, v1.NodeNetworkUnavailable, false),
						node.Spec.Taints,
						nonblockingTaints,
					)
				}
				if len(nodesNotReadyYet) > allowedNotReadyNodes {
					ready := len(allNodes.Items) - len(nodesNotReadyYet)
					remaining := len(nodesNotReadyYet) - allowedNotReadyNodes
					framework.Logf("==== node wait: %v out of %v nodes are ready, max notReady allowed %v. Need %v more before starting.", ready, len(allNodes.Items), allowedNotReadyNodes, remaining)
				}
			}
		}
		return len(nodesNotReadyYet) <= allowedNotReadyNodes, nil
	}
}
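
// A minimal usage sketch (illustrative; the taint key, thresholds, and poll
// parameters are assumptions). The returned closure matches
// wait.ConditionWithContextFunc, so it can gate a suite like this:
//
//	check := CheckReadyForTests(ctx, c, "node-role.kubernetes.io/control-plane", 0, 100)
//	if err := wait.PollUntilContextTimeout(ctx, 30*time.Second, 20*time.Minute, true, check); err != nil {
//		framework.Failf("nodes never became ready for tests: %v", err)
//	}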

// readyForTests determines whether or not we should continue waiting for the nodes
// to enter a testable state. By default this means it is schedulable, NodeReady, and untainted.
// Nodes with nonblocking taints are permitted to have those taints and
// also have their node.Spec.Unschedulable field ignored for the purposes of this function.
func readyForTests(logger klog.Logger, node *v1.Node, nonblockingTaints string) bool {
	if hasNonblockingTaint(node, nonblockingTaints) {
		// If the node has one of the nonblockingTaints, just check that it is ready
		// and don't require node.Spec.Unschedulable to be set either way.
		if !IsNodeReady(logger, node) || !isNodeUntaintedWithNonblocking(logger, node, nonblockingTaints) {
			return false
		}
	} else {
		if !IsNodeSchedulable(logger, node) || !isNodeUntainted(logger, node) {
			return false
		}
	}
	return true
}