/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node

import (
	"context"
	"fmt"
	"net"
	"sort"
	"strconv"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)
// This test checks that node-problem-detector (NPD) runs without error on up
// to 10 nodes in the cluster. NPD's functionality is tested in e2e_node tests.
var _ = SIGDescribe("NodeProblemDetector", feature.NodeProblemDetector, func() {
	const (
		pollInterval      = 1 * time.Second
		pollTimeout       = 1 * time.Minute
		maxNodesToProcess = 10
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		e2eskipper.SkipUnlessSSHKeyPresent()
		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
		e2eskipper.SkipUnlessProviderIs("gce")
		e2eskipper.SkipUnlessNodeOSDistroIs("gci", "ubuntu")
		e2enode.WaitForTotalHealthy(ctx, f.ClientSet, time.Minute)
	})

	ginkgo.It("should run without error", func(ctx context.Context) {
		ginkgo.By("Getting all nodes and their SSH-able IP addresses")
		readyNodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		nodes := []v1.Node{}
		hosts := []string{}
		for _, node := range readyNodes.Items {
			host := ""
			for _, addr := range node.Status.Addresses {
				if addr.Type == v1.NodeExternalIP {
					host = net.JoinHostPort(addr.Address, "22")
					break
				}
			}
			// Not every node has to have an external IP address.
			if len(host) > 0 {
				nodes = append(nodes, node)
				hosts = append(hosts, host)
			}
		}

		if len(nodes) == 0 {
			ginkgo.Skip("Skipping test due to lack of ready nodes with public IP")
		}
		if len(nodes) > maxNodesToProcess {
			nodes = nodes[:maxNodesToProcess]
			hosts = hosts[:maxNodesToProcess]
		}

		isStandaloneMode := make(map[string]bool)
		cpuUsageStats := make(map[string][]float64)
		uptimeStats := make(map[string][]float64)
		rssStats := make(map[string][]float64)
		workingSetStats := make(map[string][]float64)

		// Some test suites run for days.
		// This test is not marked as Disruptive or Serial, so we do not want to
		// restart the kubelet during the test to check for KubeletStart event
		// detection. Instead we use a heuristic to decide whether to validate the
		// KubeletStart event, since there is no easy way to check when the test actually started.
		checkForKubeletStart := false

		for _, host := range hosts {
			cpuUsageStats[host] = []float64{}
			uptimeStats[host] = []float64{}
			rssStats[host] = []float64{}
			workingSetStats[host] = []float64{}

			cmd := "systemctl status node-problem-detector.service"
			result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
			isStandaloneMode[host] = (err == nil && result.Code == 0)

			if isStandaloneMode[host] {
				ginkgo.By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
				// Using brackets "[n]" is a trick to prevent the grep command itself from
				// showing up, because the literal text "[n]ode-problem-detector" does not
				// match the regular expression "[n]ode-problem-detector".
				psCmd := "ps aux | grep [n]ode-problem-detector"
				result, err = e2essh.SSH(ctx, psCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).To(gomega.ContainSubstring("node-problem-detector"))

				ginkgo.By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
				journalctlCmd := "sudo journalctl -r -u node-problem-detector"
				result, err = e2essh.SSH(ctx, journalctlCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).NotTo(gomega.ContainSubstring("node-problem-detector.service: Failed"))

				// We will only check for the KubeletStart event if parsing the date here succeeds.
				ginkgo.By(fmt.Sprintf("Check when node-problem-detector started on node %q", host))
				npdStartTimeCommand := "sudo systemctl show --timestamp=utc node-problem-detector -P ActiveEnterTimestamp"
				result, err = e2essh.SSH(ctx, npdStartTimeCommand, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))

				// The time format matches the systemd format.
				// 'utc': 'Day YYYY-MM-DD HH:MM:SS UTC' (see https://www.freedesktop.org/software/systemd/man/systemd.time.html)
				st, err := time.Parse("Mon 2006-01-02 15:04:05 MST", result.Stdout)
				if err != nil {
					framework.Logf("Failed to parse when NPD started. Got exit code: %v and stdout: %v, error: %v. Will skip check for kubelet start event.", result.Code, result.Stdout, err)
				} else {
					checkForKubeletStart = time.Since(st) < time.Hour
				}

				cpuUsage, uptime := getCPUStat(ctx, f, host)
				cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
				uptimeStats[host] = append(uptimeStats[host], uptime)
			}

			ginkgo.By(fmt.Sprintf("Inject log to trigger DockerHung on node %q", host))
			log := "INFO: task docker:12345 blocked for more than 120 seconds."
			injectLogCmd := "sudo sh -c \"echo 'kernel: " + log + "' >> /dev/kmsg\""
			result, err = e2essh.SSH(ctx, injectLogCmd, host, framework.TestContext.Provider)
			framework.ExpectNoError(err)
			gomega.Expect(result.Code).To(gomega.Equal(0))
		}

		ginkgo.By("Gather node-problem-detector cpu and memory stats")
		numIterations := 60
		for i := 1; i <= numIterations; i++ {
			for j, host := range hosts {
				if isStandaloneMode[host] {
					rss, workingSet := getMemoryStat(ctx, f, host)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
					if i == numIterations {
						cpuUsage, uptime := getCPUStat(ctx, f, host)
						cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
						uptimeStats[host] = append(uptimeStats[host], uptime)
					}
				} else {
					cpuUsage, rss, workingSet := getNpdPodStat(ctx, f, nodes[j].Name)
					cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
				}
			}
			time.Sleep(time.Second)
		}

		cpuStatsMsg := "CPU (core):"
		rssStatsMsg := "RSS (MB):"
		workingSetStatsMsg := "WorkingSet (MB):"
		for i, host := range hosts {
			if isStandaloneMode[host] {
				// When in standalone mode, NPD runs as a systemd service. We
				// calculate its cpu usage from cgroup cpuacct value differences.
				cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
				totaltime := uptimeStats[host][1] - uptimeStats[host][0]
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes[i].Name, cpuUsage/totaltime)
			} else {
				sort.Float64s(cpuUsageStats[host])
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes[i].Name,
					cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
			}

			sort.Float64s(rssStats[host])
			rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])

			sort.Float64s(workingSetStats[host])
			workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
		}
		framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)

		ginkgo.By("Check node-problem-detector can post conditions and events to API server")
		for _, node := range nodes {
			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
			gomega.Eventually(ctx, func() error {
				return verifyNodeCondition(ctx, f, "KernelDeadlock", v1.ConditionTrue, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted DockerHung event on node %q", node.Name))
			eventListOptions := metav1.ListOptions{FieldSelector: fields.Set{"involvedObject.kind": "Node"}.AsSelector().String()}
			gomega.Eventually(ctx, func(ctx context.Context) error {
				return verifyEvents(ctx, f, eventListOptions, 1, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			if checkForKubeletStart {
				// Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+.
				// Since the kubelet may be restarted a few times after the node boots, we only check that the
				// event is detected, not how many times the kubelet was started.
				//
				// Some test suites run for hours, in which case the KubeletStart event will already have been cleaned up.
				ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEventExists(ctx, f, eventListOptions, "KubeletStart", node.Name)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
			} else {
				ginkgo.By("KubeletStart event will NOT be checked")
			}
		}
	})
})
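
// verifyEvents checks that the total count of events with the given reason
// reported for the given node matches num.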
func verifyEvents(ctx context.Context, f *framework.Framework, options metav1.ListOptions, num int, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
	for _, event := range events.Items {
		if event.Reason != reason || event.Source.Host != nodeName {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
	}
	return nil
}
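
// verifyEventExists checks that at least one event with the given reason has
// been reported for the given node.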
func verifyEventExists(ctx context.Context, f *framework.Framework, options metav1.ListOptions, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	for _, event := range events.Items {
		if event.Reason == reason && event.Source.Host == nodeName && event.Count > 0 {
			return nil
		}
	}
	return fmt.Errorf("Event %s does not exist: %v", reason, events.Items)
}
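
// verifyNodeCondition checks that the given node reports the expected
// condition type with the expected status and reason.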
func verifyNodeCondition(ctx context.Context, f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error {
	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	_, c := testutils.GetNodeCondition(&node.Status, condition)
	if c == nil {
		return fmt.Errorf("node condition %q not found", condition)
	}
	if c.Status != status || c.Reason != reason {
		return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
	}
	return nil
}
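
// getMemoryStat returns the RSS and working set (both in MB) of the
// node-problem-detector systemd service on the given host, read from its
// memory cgroup over SSH.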
func getMemoryStat(ctx context.Context, f *framework.Framework, host string) (rss, workingSet float64) {
	var memCmd string

	isCgroupV2 := isHostRunningCgroupV2(ctx, f, host)
	if isCgroupV2 {
		memCmd = "cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.current && cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.stat"
	} else {
		memCmd = "cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.usage_in_bytes && cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.stat"
	}

	result, err := e2essh.SSH(ctx, memCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))

	lines := strings.Split(result.Stdout, "\n")
	memoryUsage, err := strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err)

	var rssToken, inactiveFileToken string
	if isCgroupV2 {
		// Use anon memory for RSS, as cAdvisor does on cgroup v2:
		// https://github.com/google/cadvisor/blob/a9858972e75642c2b1914c8d5428e33e6392c08a/container/libcontainer/handler.go#L799
		rssToken = "anon"
		inactiveFileToken = "inactive_file"
	} else {
		rssToken = "total_rss"
		inactiveFileToken = "total_inactive_file"
	}

	var totalInactiveFile float64
	for _, line := range lines[1:] {
		tokens := strings.Split(line, " ")
		if tokens[0] == rssToken {
			rss, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
		if tokens[0] == inactiveFileToken {
			totalInactiveFile, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
	}

	workingSet = memoryUsage
	if workingSet < totalInactiveFile {
		workingSet = 0
	} else {
		workingSet -= totalInactiveFile
	}

	// Convert to MB
	rss = rss / 1024 / 1024
	workingSet = workingSet / 1024 / 1024
	return
}
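
// getCPUStat reads, over SSH, a cumulative CPU usage counter (from the
// node-problem-detector service cgroup on cgroup v1 hosts, or from the
// top-level cpu.stat on cgroup v2 hosts) and the host uptime in seconds.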
func getCPUStat(ctx context.Context, f *framework.Framework, host string) (usage, uptime float64) {
	var cpuCmd string
	if isHostRunningCgroupV2(ctx, f, host) {
		cpuCmd = " cat /sys/fs/cgroup/cpu.stat | grep 'usage_usec' | sed 's/[^0-9]*//g' && cat /proc/uptime | awk '{print $1}'"
	} else {
		cpuCmd = "cat /sys/fs/cgroup/cpu/system.slice/node-problem-detector.service/cpuacct.usage && cat /proc/uptime | awk '{print $1}'"
	}

	result, err := e2essh.SSH(ctx, cpuCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))

	lines := strings.Split(result.Stdout, "\n")
	usage, err = strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err, "Cannot parse float for usage")
	uptime, err = strconv.ParseFloat(lines[1], 64)
	framework.ExpectNoError(err, "Cannot parse float for uptime")

	// Convert from nanoseconds to seconds
	usage *= 1e-9
	return
}
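
// isHostRunningCgroupV2 checks over SSH whether /sys/fs/cgroup on the given
// host is mounted as a cgroup v2 (unified) hierarchy.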
func isHostRunningCgroupV2(ctx context.Context, f *framework.Framework, host string) bool {
	result, err := e2essh.SSH(ctx, "stat -fc %T /sys/fs/cgroup/", host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))

	// 0x63677270 == CGROUP2_SUPER_MAGIC
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	return strings.Contains(result.Stdout, "cgroup2") || strings.Contains(result.Stdout, "0x63677270")
}
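
// getNpdPodStat returns the CPU usage (in cores), RSS, and working set (both
// in MB) of the node-problem-detector pod on the given node, taken from the
// kubelet stats summary.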
func getNpdPodStat(ctx context.Context, f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := e2ekubelet.GetStatsSummary(ctx, f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	hasNpdPod := false
	for _, pod := range summary.Pods {
		if !strings.HasPrefix(pod.PodRef.Name, "node-problem-detector") {
			continue
		}
		if pod.CPU != nil && pod.CPU.UsageNanoCores != nil {
			cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
		}
		if pod.Memory != nil {
			if pod.Memory.RSSBytes != nil {
				rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
			}
			if pod.Memory.WorkingSetBytes != nil {
				workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
			}
		}
		hasNpdPod = true
		break
	}
	if !hasNpdPod {
		framework.Failf("No node-problem-detector pod is present in %+v", summary.Pods)
	}
	return
}