2022-08-25 07:43:04 -04:00
/ *
Copyright 2014 The Kubernetes Authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package debug
import (
"context"
"fmt"
"sort"
"time"
"github.com/onsi/ginkgo/v2"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
)
// EventsLister is a func that lists events.
type EventsLister func ( opts metav1 . ListOptions , ns string ) ( * v1 . EventList , error )
// dumpEventsInNamespace dumps events in the given namespace.
func dumpEventsInNamespace ( eventsLister EventsLister , namespace string ) {
ginkgo . By ( fmt . Sprintf ( "Collecting events from namespace %q." , namespace ) )
events , err := eventsLister ( metav1 . ListOptions { } , namespace )
framework . ExpectNoError ( err , "failed to list events in namespace %q" , namespace )
ginkgo . By ( fmt . Sprintf ( "Found %d events." , len ( events . Items ) ) )
// Sort events by their first timestamp
sortedEvents := events . Items
if len ( sortedEvents ) > 1 {
sort . Sort ( byFirstTimestamp ( sortedEvents ) )
}
for _ , e := range sortedEvents {
framework . Logf ( "At %v - event for %v: %v %v: %v" , e . FirstTimestamp , e . InvolvedObject . Name , e . Source , e . Reason , e . Message )
}
// Note that we don't wait for any Cleanup to propagate, which means
// that if you delete a bunch of pods right before ending your test,
// you may or may not see the killing/deletion/Cleanup events.
}
// DumpAllNamespaceInfo dumps events, pods and nodes information in the given namespace.
2022-12-12 04:11:10 -05:00
func DumpAllNamespaceInfo ( ctx context . Context , c clientset . Interface , namespace string ) {
2022-08-25 07:43:04 -04:00
dumpEventsInNamespace ( func ( opts metav1 . ListOptions , ns string ) ( * v1 . EventList , error ) {
2022-12-12 04:11:10 -05:00
return c . CoreV1 ( ) . Events ( ns ) . List ( ctx , opts )
2022-08-25 07:43:04 -04:00
} , namespace )
2022-12-12 04:11:10 -05:00
e2epod . DumpAllPodInfoForNamespace ( ctx , c , namespace , framework . TestContext . ReportDir )
2022-08-25 07:43:04 -04:00
// If cluster is large, then the following logs are basically useless, because:
// 1. it takes tens of minutes or hours to grab all of them
// 2. there are so many of them that working with them are mostly impossible
// So we dump them only if the cluster is relatively small.
maxNodesForDump := framework . TestContext . MaxNodesToGather
2022-12-12 04:11:10 -05:00
nodes , err := c . CoreV1 ( ) . Nodes ( ) . List ( ctx , metav1 . ListOptions { } )
2022-08-25 07:43:04 -04:00
if err != nil {
framework . Logf ( "unable to fetch node list: %v" , err )
return
}
if len ( nodes . Items ) <= maxNodesForDump {
2022-12-12 04:11:10 -05:00
dumpAllNodeInfo ( ctx , c , nodes )
2022-08-25 07:43:04 -04:00
} else {
framework . Logf ( "skipping dumping cluster info - cluster too large" )
}
}
// byFirstTimestamp sorts a slice of events by first timestamp, using their involvedObject's name as a tie breaker.
type byFirstTimestamp [ ] v1 . Event
func ( o byFirstTimestamp ) Len ( ) int { return len ( o ) }
func ( o byFirstTimestamp ) Swap ( i , j int ) { o [ i ] , o [ j ] = o [ j ] , o [ i ] }
func ( o byFirstTimestamp ) Less ( i , j int ) bool {
if o [ i ] . FirstTimestamp . Equal ( & o [ j ] . FirstTimestamp ) {
return o [ i ] . InvolvedObject . Name < o [ j ] . InvolvedObject . Name
}
return o [ i ] . FirstTimestamp . Before ( & o [ j ] . FirstTimestamp )
}
2022-12-12 04:11:10 -05:00
func dumpAllNodeInfo ( ctx context . Context , c clientset . Interface , nodes * v1 . NodeList ) {
2022-08-25 07:43:04 -04:00
names := make ( [ ] string , len ( nodes . Items ) )
for ix := range nodes . Items {
names [ ix ] = nodes . Items [ ix ] . Name
}
2022-12-12 04:11:10 -05:00
DumpNodeDebugInfo ( ctx , c , names , framework . Logf )
2022-08-25 07:43:04 -04:00
}
// DumpNodeDebugInfo dumps debug information of the given nodes.
2022-12-12 04:11:10 -05:00
func DumpNodeDebugInfo ( ctx context . Context , c clientset . Interface , nodeNames [ ] string , logFunc func ( fmt string , args ... interface { } ) ) {
2022-08-25 07:43:04 -04:00
for _ , n := range nodeNames {
logFunc ( "\nLogging node info for node %v" , n )
2022-12-12 04:11:10 -05:00
node , err := c . CoreV1 ( ) . Nodes ( ) . Get ( ctx , n , metav1 . GetOptions { } )
2022-08-25 07:43:04 -04:00
if err != nil {
logFunc ( "Error getting node info %v" , err )
}
logFunc ( "Node Info: %v" , node )
logFunc ( "\nLogging kubelet events for node %v" , n )
2022-12-12 04:11:10 -05:00
for _ , e := range getNodeEvents ( ctx , c , n ) {
2022-08-25 07:43:04 -04:00
logFunc ( "source %v type %v message %v reason %v first ts %v last ts %v, involved obj %+v" ,
e . Source , e . Type , e . Message , e . Reason , e . FirstTimestamp , e . LastTimestamp , e . InvolvedObject )
}
logFunc ( "\nLogging pods the kubelet thinks is on node %v" , n )
2022-12-12 04:11:10 -05:00
podList , err := getKubeletPods ( ctx , c , n )
2022-08-25 07:43:04 -04:00
if err != nil {
logFunc ( "Unable to retrieve kubelet pods for node %v: %v" , n , err )
continue
}
for _ , p := range podList . Items {
logFunc ( "%v started at %v (%d+%d container statuses recorded)" , p . Name , p . Status . StartTime , len ( p . Status . InitContainerStatuses ) , len ( p . Status . ContainerStatuses ) )
for _ , c := range p . Status . InitContainerStatuses {
logFunc ( "\tInit container %v ready: %v, restart count %v" ,
c . Name , c . Ready , c . RestartCount )
}
for _ , c := range p . Status . ContainerStatuses {
logFunc ( "\tContainer %v ready: %v, restart count %v" ,
c . Name , c . Ready , c . RestartCount )
}
}
2022-12-12 04:11:10 -05:00
_ , err = e2emetrics . HighLatencyKubeletOperations ( ctx , c , 10 * time . Second , n , logFunc )
framework . ExpectNoError ( err )
2022-08-25 07:43:04 -04:00
// TODO: Log node resource info
}
}
// getKubeletPods retrieves the list of pods on the kubelet.
2022-12-12 04:11:10 -05:00
func getKubeletPods ( ctx context . Context , c clientset . Interface , node string ) ( * v1 . PodList , error ) {
2022-08-25 07:43:04 -04:00
var client restclient . Result
finished := make ( chan struct { } , 1 )
go func ( ) {
// call chain tends to hang in some cases when Node is not ready. Add an artificial timeout for this call. #22165
client = c . CoreV1 ( ) . RESTClient ( ) . Get ( ) .
Resource ( "nodes" ) .
SubResource ( "proxy" ) .
Name ( fmt . Sprintf ( "%v:%v" , node , framework . KubeletPort ) ) .
Suffix ( "pods" ) .
2022-12-12 04:11:10 -05:00
Do ( ctx )
2022-08-25 07:43:04 -04:00
finished <- struct { } { }
} ( )
select {
case <- finished :
result := & v1 . PodList { }
if err := client . Into ( result ) ; err != nil {
return & v1 . PodList { } , err
}
return result , nil
case <- time . After ( framework . PodGetTimeout ) :
return & v1 . PodList { } , fmt . Errorf ( "Waiting up to %v for getting the list of pods" , framework . PodGetTimeout )
}
}
// logNodeEvents logs kubelet events from the given node. This includes kubelet
// restart and node unhealthy events. Note that listing events like this will mess
// with latency metrics, beware of calling it during a test.
2022-12-12 04:11:10 -05:00
func getNodeEvents ( ctx context . Context , c clientset . Interface , nodeName string ) [ ] v1 . Event {
2022-08-25 07:43:04 -04:00
selector := fields . Set {
"involvedObject.kind" : "Node" ,
"involvedObject.name" : nodeName ,
"involvedObject.namespace" : metav1 . NamespaceAll ,
"source" : "kubelet" ,
} . AsSelector ( ) . String ( )
options := metav1 . ListOptions { FieldSelector : selector }
2022-12-12 04:11:10 -05:00
events , err := c . CoreV1 ( ) . Events ( metav1 . NamespaceSystem ) . List ( ctx , options )
2022-08-25 07:43:04 -04:00
if err != nil {
framework . Logf ( "Unexpected error retrieving node events %v" , err )
return [ ] v1 . Event { }
}
return events . Items
}