From 146357aa395de84c0e4e28de4be174eadc6300ba Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Fri, 13 Dec 2024 23:21:19 +0000 Subject: [PATCH 1/8] Evict terminated pods on disk pressure Before evicting running pods, terminal phase pods will be evicted first --- pkg/kubelet/eviction/eviction_manager.go | 43 +++++++++++++++++++++--- pkg/kubelet/eviction/types.go | 5 ++- pkg/kubelet/kubelet.go | 2 +- pkg/kubelet/kubelet_pods.go | 23 +++++++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index cb981dbb7e8..99ba3ae0f56 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -181,10 +181,10 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd } // Start starts the control loop to observe and response to low compute resources. -func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) { +func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) { thresholdHandler := func(message string) { klog.InfoS(message) - m.synchronize(diskInfoProvider, podFunc) + m.synchronize(diskInfoProvider, podFunc, terminatedPodFunc) } klog.InfoS("Eviction manager: starting control loop") if m.config.KernelMemcgNotification || runtime.GOOS == "windows" { @@ -203,7 +203,7 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod // start the eviction manager monitoring go func() { for { - evictedPods, err := m.synchronize(diskInfoProvider, podFunc) + evictedPods, err := m.synchronize(diskInfoProvider, podFunc, terminatedPodFunc) if evictedPods != nil && err == nil { klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods)) m.waitForPodsCleanup(podCleanedUpFunc, evictedPods) @@ -240,7 +240,7 @@ func (m *managerImpl) IsUnderPIDPressure() bool { // synchronize is the main control loop that enforces eviction thresholds. // Returns the pod that was killed, or nil if no pod was killed. -func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) { +func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc) ([]*v1.Pod, error) { ctx := context.Background() // if we have nothing to do, just return thresholds := m.config.Thresholds @@ -378,6 +378,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) + // Evict terminated pods, so their resources can be reclaimed + if evictedPods := m.evictTerminatedPods(thresholds, statsFunc, resourceToReclaim, terminatedPodFunc, observations, thresholdToReclaim); len(evictedPods) != 0 { + klog.InfoS("Eviction manager: evicted", "terminated pods", klog.KObjSlice(evictedPods)) + } + // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. 
if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) { klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim) @@ -440,6 +445,36 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act return nil, nil } +func (m *managerImpl) evictTerminatedPods(thresholds []evictionapi.Threshold, statsFunc statsFunc, resourceToReclaim v1.ResourceName, terminatedPodFunc TerminatedPodsFunc, observations signalObservations, thresholdToReclaim evictionapi.Threshold) []*v1.Pod { + evictedPods := []*v1.Pod{} + if resourceToReclaim == v1.ResourceName(v1.NodeDiskPressure) { + return evictedPods + } + + terminatedPods := terminatedPodFunc() + + gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds) + + for i := range terminatedPods { + pod := terminatedPods[i] + klog.InfoS("Eviction manager: evicted terminated pod", "pod", pod.Name) + + message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations) + condition := &v1.PodCondition{ + Type: v1.DisruptionTarget, + Status: v1.ConditionTrue, + Reason: v1.PodReasonTerminationByKubelet, + Message: message, + } + if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) { + metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc() + evictedPods = append(evictedPods, pod) + } + } + + return evictedPods +} + func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) { timeout := m.clock.NewTimer(podCleanupTimeout) defer timeout.Stop() diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 83cdfcdf853..da8a6062ba7 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -59,7 +59,7 @@ type Config struct { // Manager evaluates when an eviction threshold for node stability has been met on the node. type Manager interface { // Start starts the control loop to monitor eviction thresholds at specified interval. - Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) + Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) // IsUnderMemoryPressure returns true if the node is under memory pressure. IsUnderMemoryPressure() bool @@ -107,6 +107,9 @@ type MirrorPodFunc func(*v1.Pod) (*v1.Pod, bool) // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state) type ActivePodsFunc func() []*v1.Pod +// TerminatedPodsFunc return pods bound to the kubelet that are inactive (i.e. terminal state) +type TerminatedPodsFunc func() []*v1.Pod + // PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed. type PodCleanedUpFunc func(*v1.Pod) bool diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 600a006f054..3d581feaa77 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1705,7 +1705,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() { } // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs // Eviction decisions are based on the allocated (rather than desired) pod resources. 
- kl.evictionManager.Start(kl.StatsProvider, kl.getAllocatedPods, kl.PodIsFinished, evictionMonitoringPeriod) + kl.evictionManager.Start(kl.StatsProvider, kl.getAllocatedPods, kl.getTerminatedPods, kl.PodIsFinished, evictionMonitoringPeriod) // container log manager must start after container runtime is up to retrieve information from container runtime // and inform container to reopen log file after log rotation. diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index 86eeb0bd26d..dacfa0282f6 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -219,6 +219,14 @@ func (kl *Kubelet) getAllocatedPods() []*v1.Pod { return allocatedPods } +// Returns all terminated pods. +func (kl *Kubelet) getTerminatedPods() []*v1.Pod { + allPods := kl.podManager.GetPods() + + terminatedPods := kl.filterOutActivePods(allPods) + return terminatedPods +} + // makeBlockVolumes maps the raw block devices specified in the path of the container // Experimental func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) { @@ -1144,6 +1152,21 @@ func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod { return filteredPods } +// filterOutActivePods returns pods that are in a terminal phase +func (kl *Kubelet) filterOutActivePods(pods []*v1.Pod) []*v1.Pod { + filteredPods := make([]*v1.Pod, 0, len(pods)) + for _, p := range pods { + // skip pod if it is not in a terminal phase + if !(p.Status.Phase == v1.PodFailed || p.Status.Phase == v1.PodSucceeded) { + continue + } + + klog.InfoS("Eviction manager - Test: filterOutActivePods", "terminated pod", p) + filteredPods = append(filteredPods, p) + } + return filteredPods +} + // isAdmittedPodTerminal returns true if the provided config source pod is in // a terminal phase, or if the Kubelet has already indicated the pod has reached // a terminal phase but the config source has not accepted it yet. 
This method From df54470e9b86f3861a34f8306bd77be4374c0fb8 Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Fri, 3 Jan 2025 18:21:01 +0000 Subject: [PATCH 2/8] Test terminated pods are evicted on disk pressure --- pkg/kubelet/eviction/eviction_manager_test.go | 468 +++++++++++++++--- 1 file changed, 399 insertions(+), 69 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index a72c145409b..4c8f02879bf 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -65,6 +65,36 @@ func (m *mockPodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride return nil } +// While for running pods only one pod is killed per eviction interval, +// when there is disk pressure, all terminated pods are killed in the same cycle +// mockPodsKiller is used to test which pods are killed +type mockPodsKiller struct { + pods []*v1.Pod + evict bool + statusFn func(*v1.PodStatus) + gracePeriodOverride *int64 +} + +// killPodsNow records the pods that were killed +func (m *mockPodsKiller) killPodsNow(pod *v1.Pod, evict bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error { + m.pods = append(m.pods, pod) + m.statusFn = statusFn + m.evict = evict + m.gracePeriodOverride = gracePeriodOverride + return nil +} + +func setDiskStatsBasedOnFs(whichFs string, diskPressure string, diskStat diskStats) diskStats { + if whichFs == "nodefs" { + diskStat.rootFsAvailableBytes = diskPressure + } else if whichFs == "imagefs" { + diskStat.imageFsAvailableBytes = diskPressure + } else if whichFs == "containerfs" { + diskStat.containerFsAvailableBytes = diskPressure + } + return diskStat +} + // mockDiskInfoProvider is used to simulate testing. type mockDiskInfoProvider struct { dedicatedImageFs *bool @@ -296,6 +326,10 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -329,7 +363,7 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { } // synchronize to detect the memory pressure - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -396,6 +430,10 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false), dedicatedContainerFs: ptr.To(false)} @@ -429,7 +467,7 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { } // synchronize to detect the PID pressure - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -574,6 +612,10 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: 
&tc.writeableSeparateFromReadOnly} @@ -605,7 +647,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { } // synchronize - pods, synchErr := manager.synchronize(diskInfoProvider, activePodsFunc) + pods, synchErr := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if synchErr == nil && tc.expectErr != "" { t.Fatalf("Manager should report error but did not") @@ -663,6 +705,10 @@ func TestMemoryPressure(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -709,7 +755,7 @@ func TestMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -731,7 +777,7 @@ func TestMemoryPressure(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -750,7 +796,7 @@ func TestMemoryPressure(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -779,7 +825,7 @@ func TestMemoryPressure(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -793,7 +839,7 @@ func TestMemoryPressure(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -825,7 +871,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -853,7 +899,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -933,6 +979,10 @@ func TestPIDPressure(t *testing.T) { podToEvict := pods[tc.evictPodIndex] activePodsFunc := func() []*v1.Pod { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -979,7 +1029,7 @@ func TestPIDPressure(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, 50) // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -998,7 +1048,7 @@ func TestPIDPressure(t *testing.T) { // induce soft threshold for PID pressure fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithGracePeriod, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1017,7 +1067,7 @@ func TestPIDPressure(t *testing.T) { // step forward in time past the grace period fakeClock.Step(3 * time.Minute) // no change in PID stats to simulate continued pressure - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1047,7 +1097,7 @@ func TestPIDPressure(t *testing.T) { // remove PID pressure by simulating increased PID availability fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) // Simulate increased PID availability - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1061,7 +1111,7 @@ func TestPIDPressure(t *testing.T) { // re-induce PID pressure fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithoutGracePeriod, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = 
manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1093,7 +1143,7 @@ func TestPIDPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1117,7 +1167,7 @@ func TestPIDPressure(t *testing.T) { // move the clock past the transition period fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1191,6 +1241,7 @@ func TestDiskPressureNodeFs(t *testing.T) { writeableSeparateFromReadOnly bool thresholdToMonitor []evictionapi.Threshold podToMakes []podToMake + terminatedPodToMakes []podToMake dedicatedImageFs *bool expectErr string inducePressureOnWhichFs string @@ -1319,6 +1370,10 @@ func TestDiskPressureNodeFs(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -1356,7 +1411,7 @@ func TestDiskPressureNodeFs(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi", nil) // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1383,7 +1438,7 @@ func TestDiskPressureNodeFs(t *testing.T) { diskStatStart.containerFsAvailableBytes = tc.softDiskPressure } summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1402,7 +1457,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1431,7 +1486,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1452,7 +1507,7 @@ func TestDiskPressureNodeFs(t *testing.T) { diskStatStart.containerFsAvailableBytes = tc.hardDiskPressure } summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = 
manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1482,7 +1537,7 @@ func TestDiskPressureNodeFs(t *testing.T) { summaryProvider.result = summaryStatsMaker(diskStatConst) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1506,7 +1561,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1530,6 +1585,259 @@ func TestDiskPressureNodeFs(t *testing.T) { } } +// Test terminated pods eviction on disk pressure works as expected +func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { + testCases := map[string]struct { + nodeFsStats string + imageFsStats string + containerFsStats string + kubeletSeparateDiskFeature bool + writeableSeparateFromReadOnly bool + thresholdToMonitor evictionapi.Threshold + podToMakes []podToMake + terminatedPodToMakes []podToMake + dedicatedImageFs *bool + inducePressureOnWhichFs string + softDiskPressure string + hardDiskPressure string + expectContainerGcCall bool + expectImageGcCall bool + }{ + "eviction due to disk pressure; terminated pods": { + dedicatedImageFs: ptr.To(false), + nodeFsStats: "16Gi", + imageFsStats: "16Gi", + containerFsStats: "16Gi", + inducePressureOnWhichFs: "nodefs", + softDiskPressure: "1.5Gi", + hardDiskPressure: "750Mi", + expectImageGcCall: true, + expectContainerGcCall: true, + thresholdToMonitor: evictionapi.Threshold{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Gi"), + }, + }, + podToMakes: []podToMake{ + {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, + {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, + {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, + {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, + {name: "low-priority-low-usage", priority: lowPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, + }, + terminatedPodToMakes: []podToMake{ + {name: "terminated-pod-1", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "500Mi"}, + {name: "terminated-pod-2", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "500Mi"}, + }, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + featuregatetesting.SetFeatureGateDuringTest(t, 
utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature) + + podMaker := makePodWithDiskStats + summaryStatsMaker := makeDiskStats + podsToMake := tc.podToMakes + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil) + pods = append(pods, pod) + podStats[pod] = podStat + } + activePodsFunc := func() []*v1.Pod { + return pods + } + + terminatedPodsToMake := tc.terminatedPodToMakes + terminatedPods := []*v1.Pod{} + for _, terminatedPodToMake := range terminatedPodsToMake { + terminatedPod, _ := podMaker(terminatedPodToMake.name, terminatedPodToMake.priority, terminatedPodToMake.requests, terminatedPodToMake.limits, terminatedPodToMake.rootFsUsed, terminatedPodToMake.logsFsUsed, terminatedPodToMake.perLocalVolumeUsed, nil) + terminatedPods = append(terminatedPods, terminatedPod) + } + terminatedPodsFunc := func() []*v1.Pod { + return terminatedPods + } + + fakeClock := testingclock.NewFakeClock(time.Now()) + podKiller := &mockPodsKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} + nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []evictionapi.Threshold{tc.thresholdToMonitor}, + } + diskStatStart := diskStats{ + rootFsAvailableBytes: tc.nodeFsStats, + imageFsAvailableBytes: tc.imageFsStats, + containerFsAvailableBytes: tc.containerFsStats, + podStats: podStats, + } + // This is a constant that we use to test that disk pressure is over. Don't change! + diskStatConst := diskStatStart + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStatStart)} + diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodsNow, + imageGC: diskGC, + containerGC: diskGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // synchronize + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // induce hard threshold + fakeClock.Step(1 * time.Minute) + + newDiskAfterHardEviction := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) + summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) + // make GC successfully return disk usage to previous levels + diskGC.summaryAfterGC = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure since soft threshold was met") + } + + // verify image, container or both gc were called. 
+ // split filesystem can have container gc called without image. + // same filesystem should have both. + if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { + t.Fatalf("Manager should have invoked image gc") + } + + // compare expected killed pods with actual killed pods + checkIfSamePods := func(expectedPods, actualPods []*v1.Pod) bool { + expected := make(map[*v1.Pod]bool) + for _, pod := range expectedPods { + expected[pod] = true + } + + for _, pod := range actualPods { + if !expected[pod] { + return false + } + } + + return true + } + + // verify terminated pods were killed + if podKiller.pods == nil { + t.Fatalf("Manager should have killed terminated pods") + } + + // verify only terminated pods were killed because image gc was sufficient + if !checkIfSamePods(terminatedPods, podKiller.pods) { + t.Fatalf("Manager killed running pods") + } + + // reset state + diskGC.imageGCInvoked = false + diskGC.containerGCInvoked = false + podKiller.pods = nil + + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + + // induce disk pressure! + fakeClock.Step(1 * time.Minute) + softDiskPressure := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) + summaryProvider.result = summaryStatsMaker(softDiskPressure) + // Don't reclaim any disk + diskGC.summaryAfterGC = summaryStatsMaker(softDiskPressure) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Fatalf("Manager should report disk pressure") + } + + // verify image, container or both gc were called. + // split filesystem can have container gc called without image. + // same filesystem should have both. + if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { + t.Fatalf("Manager should have invoked image gc") + } + + // Make array with only the pods names + podsNames := func(pods []*v1.Pod) []string { + var podsNames []string + for _, pod := range pods { + podsNames = append(podsNames, pod.Name) + } + return podsNames + } + + expectedKilledPods := append(terminatedPods, pods[0]) + // verify only terminated pods were killed because image gc was sufficient + if !checkIfSamePods(expectedKilledPods, podKiller.pods) { + t.Fatalf("Manager killed running pods. Expected: %v, actual: %v", podsNames(expectedKilledPods), podsNames(podKiller.pods)) + } + + // reset state + diskGC.imageGCInvoked = false + diskGC.containerGCInvoked = false + podKiller.pods = nil + + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(diskStatConst) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + + if err != nil { + t.Fatalf("Manager should not have an error %v", err) + } + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Fatalf("Manager should not report disk pressure") + } + }) + } +} + // TestMinReclaim verifies that min-reclaim works as desired. 
func TestMinReclaim(t *testing.T) { podMaker := makePodWithMemoryStats @@ -1553,6 +1861,10 @@ func TestMinReclaim(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -1590,7 +1902,7 @@ func TestMinReclaim(t *testing.T) { } // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Errorf("Manager should not report any errors") } @@ -1602,7 +1914,7 @@ func TestMinReclaim(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1626,7 +1938,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1650,7 +1962,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1670,7 +1982,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1841,6 +2153,10 @@ func TestNodeReclaimFuncs(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -1875,7 +2191,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1889,21 +2205,11 @@ func TestNodeReclaimFuncs(t *testing.T) { // induce hard threshold fakeClock.Step(1 * time.Minute) - setDiskStatsBasedOnFs := func(whichFs string, diskPressure string, diskStat diskStats) diskStats { - if tc.inducePressureOnWhichFs == "nodefs" { - diskStat.rootFsAvailableBytes = diskPressure - } else if tc.inducePressureOnWhichFs == "imagefs" { - diskStat.imageFsAvailableBytes = diskPressure - } else if tc.inducePressureOnWhichFs == "containerfs" { - diskStat.containerFsAvailableBytes = diskPressure - } - return diskStat - } newDiskAfterHardEviction := 
setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) // make GC successfully return disk usage to previous levels diskGC.summaryAfterGC = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1933,7 +2239,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1945,7 +2251,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1963,7 +2269,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // make GC return disk usage bellow the threshold, but not satisfying minReclaim gcBelowThreshold := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, "1.1G", newDiskAfterHardEviction) diskGC.summaryAfterGC = summaryStatsMaker(gcBelowThreshold) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1994,7 +2300,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2011,7 +2317,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker(softDiskPressure) // Don't reclaim any disk diskGC.summaryAfterGC = summaryStatsMaker(softDiskPressure) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2044,7 +2350,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2070,7 +2376,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2304,6 +2610,10 @@ func TestInodePressureFsInodes(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := 
testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -2335,7 +2645,7 @@ func TestInodePressureFsInodes(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0", "0", "0") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2354,7 +2664,7 @@ func TestInodePressureFsInodes(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.softINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2373,7 +2683,7 @@ func TestInodePressureFsInodes(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.softINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2402,7 +2712,7 @@ func TestInodePressureFsInodes(t *testing.T) { // remove inode pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = startingStatsConst - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2416,7 +2726,7 @@ func TestInodePressureFsInodes(t *testing.T) { // induce inode pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.hardINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2445,7 +2755,7 @@ func TestInodePressureFsInodes(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = startingStatsConst podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2470,7 +2780,7 @@ func TestInodePressureFsInodes(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = startingStatsConst podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2522,6 +2832,10 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -2567,7 +2881,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2586,7 +2900,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2608,7 +2922,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2628,7 +2942,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2685,6 +2999,10 @@ func TestStorageLimitEvictions(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -2727,7 +3045,7 @@ func TestStorageLimitEvictions(t *testing.T) { localStorageCapacityIsolation: true, } - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) } @@ -2768,6 +3086,10 @@ func TestAllocatableMemoryPressure(t *testing.T) { return pods } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -2806,7 +3128,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2830,7 +3152,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newResourceList("100m", "1Gi", ""), newResourceList("100m", "1Gi", ""), "1Gi") podStats[pod] = podStat summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2870,7 +3192,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { } summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2898,7 +3220,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2928,6 +3250,10 @@ func TestUpdateMemcgThreshold(t *testing.T) { return []*v1.Pod{} } + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -2968,14 +3294,14 @@ func TestUpdateMemcgThreshold(t *testing.T) { } // The UpdateThreshold method should have been called once, since this is the first run. 
- _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) } // The UpdateThreshold method should not have been called again, since not enough time has passed - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2983,7 +3309,7 @@ func TestUpdateMemcgThreshold(t *testing.T) { // The UpdateThreshold method should be called again since enough time has passed fakeClock.Step(2 * notifierRefreshInterval) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2998,7 +3324,7 @@ func TestUpdateMemcgThreshold(t *testing.T) { // The UpdateThreshold method should be called because at least notifierRefreshInterval time has passed. // The Description method should be called because UpdateThreshold returned an error fakeClock.Step(2 * notifierRefreshInterval) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3066,7 +3392,11 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) { return pods } - evictedPods, err := mgr.synchronize(diskInfoProvider, activePodsFunc) + terminatedPodsFunc := func() []*v1.Pod { + return []*v1.Pod{} + } + + evictedPods, err := mgr.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) if err != nil { t.Fatalf("Manager should not have error but got %v", err) From a59ce54d79fc68a499263882f020645ea9e6d4c4 Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Sat, 11 Jan 2025 00:45:09 +0000 Subject: [PATCH 3/8] TerminatedPodsEvictionOnDiskPressure e2e node test --- test/e2e_node/eviction_test.go | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index 383d3a8ed5d..86988e80bbf 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -66,6 +66,7 @@ const ( lotsOfFiles = 1000000000 // 1 billion resourceInodes = v1.ResourceName("inodes") noStarvedResource = v1.ResourceName("none") + waitPodName = "wait-pod" ) // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods. 
@@ -103,6 +104,53 @@ var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial( }) }) +// TerminatedPodsEvictionOnDiskPressure tests that the node does not evict pods +// when there are terminated pods and the image garbage collection can remove +// their resources without evicting running pods +var _ = SIGDescribe("TerminatedPodsEvictionOnDiskPressure", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { + f := framework.NewDefaultFramework("terminated-pods-eviction-on-disk-pressure-test") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + pressureTimeout := 10 * time.Minute + expectedNodeCondition := v1.NodeDiskPressure + expectedStarvedResource := v1.ResourceEphemeralStorage + + ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + // Calculate eviction thresholds based on available disk space + summary := eventuallyGetSummary(ctx) + availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes) + + diskConsumedByTest := resource.MustParse("4Gi") + evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10) + + if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) { + e2eskipper.Skipf("Not enough disk space to run the test, available: %d", availableBytesOnSystem) + } + + initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold} + initialConfig.EvictionMinimumReclaim = map[string]string{} + }) + + runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{ + { + evictionPriority: 0, + pod: innocentPod(), + }, + { + evictionPriority: 0, + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + }, + // This is the terminated pod, runEvictionTest will wait for it to finish + { + evictionPriority: 1, + // This is probably working because empty dir is being cleared when the + // pod finishes + pod: diskConsumingPod(fmt.Sprintf("disk-hog-%s", waitPodName), lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + }, + }) + }) +}) + // ImageGCNoEviction tests that the eviction manager is able to prevent eviction // by reclaiming resources(inodes) through image garbage collection. // Disk pressure is induced by consuming a lot of inodes on the node. 
@@ -720,6 +768,28 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe pods = append(pods, spec.pod) } e2epod.NewPodClient(f).CreateBatch(ctx, pods) + + // Wait for the expected terminated pod to get to terminated state + for _, spec := range testSpecs { + if strings.Contains(spec.pod.Name, waitPodName) { + ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", spec.pod.Name)) + podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name) + gomega.Eventually(ctx, func() error { + pod, err := podClient.Get(ctx, spec.pod.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to retrieve %s: %v", spec.pod.Name, err) + } + + if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed { + return fmt.Errorf("%s pod not yet completed or failed: %v", spec.pod.Name, pod.Status.Phase) + } + framework.Logf("%s in phase: %v", spec.pod.Name, pod.Status.Phase) + return nil + }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil()) + + break + } + } }) ginkgo.It("should eventually evict all of the correct pods", func(ctx context.Context) { From 2cad51f6c0da9f24aec9a95678afbecdf815de6f Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Fri, 24 Jan 2025 07:56:16 +0000 Subject: [PATCH 4/8] Add ImageGCTerminatedPodsEviction e2e node test Removed TerminatedPodsEvictionOnDiskPressure, since it did not test the expected functionality gofmt ./pkg/kubelet/eviction/eviction_manager_test.go --- pkg/kubelet/eviction/eviction_manager.go | 28 ++- pkg/kubelet/eviction/eviction_manager_test.go | 116 +++++------ test/e2e_node/eviction_test.go | 194 +++++++++--------- test/e2e_node/split_disk_test.go | 2 +- 4 files changed, 179 insertions(+), 161 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 99ba3ae0f56..513c5d61c0a 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -204,7 +204,7 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod go func() { for { evictedPods, err := m.synchronize(diskInfoProvider, podFunc, terminatedPodFunc) - if evictedPods != nil && err == nil { + if evictedPods != nil && len(evictedPods) > 0 && err == nil { klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods)) m.waitForPodsCleanup(podCleanedUpFunc, evictedPods) } else { @@ -378,15 +378,16 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) + var evictedPods []*v1.Pod // Evict terminated pods, so their resources can be reclaimed - if evictedPods := m.evictTerminatedPods(thresholds, statsFunc, resourceToReclaim, terminatedPodFunc, observations, thresholdToReclaim); len(evictedPods) != 0 { + if evictedPods = m.evictTerminatedPods(thresholds, statsFunc, resourceToReclaim, terminatedPodFunc, observations, thresholdToReclaim); len(evictedPods) != 0 { klog.InfoS("Eviction manager: evicted", "terminated pods", klog.KObjSlice(evictedPods)) } // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. 
if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) { - klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim) - return nil, nil + klog.InfoS("Eviction manager: able to reduce resource pressure without evicting running pods.", "resourceName", resourceToReclaim) + return evictedPods, nil } klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim) @@ -395,13 +396,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal] if !ok { klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal) - return nil, nil + return evictedPods, nil } // the only candidates viable for eviction are those pods that had anything running. if len(activePods) == 0 { klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict") - return nil, nil + return evictedPods, nil } // rank the running pods for eviction for the specified resource @@ -438,16 +439,23 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) { metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc() - return []*v1.Pod{pod}, nil + + return append(evictedPods, pod), nil } } - klog.InfoS("Eviction manager: unable to evict any pods from the node") - return nil, nil + + if len(evictedPods) > 0 { + klog.InfoS("Eviction manager: only evicted terminated pods from the node") + } else { + klog.InfoS("Eviction manager: unable to evict any pods from the node") + } + + return evictedPods, nil } func (m *managerImpl) evictTerminatedPods(thresholds []evictionapi.Threshold, statsFunc statsFunc, resourceToReclaim v1.ResourceName, terminatedPodFunc TerminatedPodsFunc, observations signalObservations, thresholdToReclaim evictionapi.Threshold) []*v1.Pod { evictedPods := []*v1.Pod{} - if resourceToReclaim == v1.ResourceName(v1.NodeDiskPressure) { + if !hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure) { return evictedPods } diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 4c8f02879bf..ed37768677b 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -77,7 +77,7 @@ type mockPodsKiller struct { // killPodsNow records the pods that were killed func (m *mockPodsKiller) killPodsNow(pod *v1.Pod, evict bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error { - m.pods = append(m.pods, pod) + m.pods = append(m.pods, pod) m.statusFn = statusFn m.evict = evict m.gracePeriodOverride = gracePeriodOverride @@ -85,14 +85,14 @@ func (m *mockPodsKiller) killPodsNow(pod *v1.Pod, evict bool, gracePeriodOverrid } func setDiskStatsBasedOnFs(whichFs string, diskPressure string, diskStat diskStats) diskStats { - if whichFs == "nodefs" { - diskStat.rootFsAvailableBytes = diskPressure - } else if whichFs == "imagefs" { - diskStat.imageFsAvailableBytes = diskPressure - } else if whichFs == "containerfs" { - diskStat.containerFsAvailableBytes = diskPressure - } - return diskStat + if whichFs == "nodefs" { + diskStat.rootFsAvailableBytes = diskPressure + } else if whichFs == "imagefs" { + diskStat.imageFsAvailableBytes = diskPressure + } else if whichFs == "containerfs" { + diskStat.containerFsAvailableBytes = diskPressure + } 
+ return diskStat } // mockDiskInfoProvider is used to simulate testing. @@ -431,8 +431,8 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { } terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } + return []*v1.Pod{} + } fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} @@ -1636,7 +1636,7 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { for name, tc := range testCases { t.Run(name, func(t *testing.T) { - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature) podMaker := makePodWithDiskStats summaryStatsMaker := makeDiskStats @@ -1652,12 +1652,12 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { return pods } - terminatedPodsToMake := tc.terminatedPodToMakes - terminatedPods := []*v1.Pod{} - for _, terminatedPodToMake := range terminatedPodsToMake { - terminatedPod, _ := podMaker(terminatedPodToMake.name, terminatedPodToMake.priority, terminatedPodToMake.requests, terminatedPodToMake.limits, terminatedPodToMake.rootFsUsed, terminatedPodToMake.logsFsUsed, terminatedPodToMake.perLocalVolumeUsed, nil) - terminatedPods = append(terminatedPods, terminatedPod) - } + terminatedPodsToMake := tc.terminatedPodToMakes + terminatedPods := []*v1.Pod{} + for _, terminatedPodToMake := range terminatedPodsToMake { + terminatedPod, _ := podMaker(terminatedPodToMake.name, terminatedPodToMake.priority, terminatedPodToMake.requests, terminatedPodToMake.limits, terminatedPodToMake.rootFsUsed, terminatedPodToMake.logsFsUsed, terminatedPodToMake.perLocalVolumeUsed, nil) + terminatedPods = append(terminatedPods, terminatedPod) + } terminatedPodsFunc := func() []*v1.Pod { return terminatedPods } @@ -1702,7 +1702,7 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { t.Fatalf("Manager should not have an error %v", err) } - // we should not have disk pressure + // we should not have disk pressure if manager.IsUnderDiskPressure() { t.Errorf("Manager should not report disk pressure") } @@ -1729,39 +1729,39 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { // split filesystem can have container gc called without image. // same filesystem should have both. 
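 // Note on the check below: it only fails when neither the image GC call nor the
 // container GC call matched its expectation; a single match is tolerated because,
 // as described above, a split filesystem may trigger container GC without image GC.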
if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { - t.Fatalf("Manager should have invoked image gc") + t.Fatalf("Manager should have invoked image gc") } - // compare expected killed pods with actual killed pods + // compare expected killed pods with actual killed pods checkIfSamePods := func(expectedPods, actualPods []*v1.Pod) bool { - expected := make(map[*v1.Pod]bool) - for _, pod := range expectedPods { - expected[pod] = true - } + expected := make(map[*v1.Pod]bool) + for _, pod := range expectedPods { + expected[pod] = true + } - for _, pod := range actualPods { - if !expected[pod] { - return false - } - } + for _, pod := range actualPods { + if !expected[pod] { + return false + } + } - return true - } + return true + } - // verify terminated pods were killed - if podKiller.pods == nil { - t.Fatalf("Manager should have killed terminated pods") - } + // verify terminated pods were killed + if podKiller.pods == nil { + t.Fatalf("Manager should have killed terminated pods") + } - // verify only terminated pods were killed because image gc was sufficient - if !checkIfSamePods(terminatedPods, podKiller.pods) { - t.Fatalf("Manager killed running pods") - } + // verify only terminated pods were killed because image gc was sufficient + if !checkIfSamePods(terminatedPods, podKiller.pods) { + t.Fatalf("Manager killed running pods") + } - // reset state + // reset state diskGC.imageGCInvoked = false diskGC.containerGCInvoked = false - podKiller.pods = nil + podKiller.pods = nil // remove disk pressure fakeClock.Step(20 * time.Minute) @@ -1777,7 +1777,7 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { t.Fatalf("Manager should not report disk pressure") } - // induce disk pressure! + // induce disk pressure! fakeClock.Step(1 * time.Minute) softDiskPressure := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) summaryProvider.result = summaryStatsMaker(softDiskPressure) @@ -1801,25 +1801,25 @@ func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { t.Fatalf("Manager should have invoked image gc") } - // Make array with only the pods names - podsNames := func(pods []*v1.Pod) []string { - var podsNames []string - for _, pod := range pods { - podsNames = append(podsNames, pod.Name) - } - return podsNames - } + // Make array with only the pods names + podsNames := func(pods []*v1.Pod) []string { + var podsNames []string + for _, pod := range pods { + podsNames = append(podsNames, pod.Name) + } + return podsNames + } - expectedKilledPods := append(terminatedPods, pods[0]) - // verify only terminated pods were killed because image gc was sufficient - if !checkIfSamePods(expectedKilledPods, podKiller.pods) { - t.Fatalf("Manager killed running pods. Expected: %v, actual: %v", podsNames(expectedKilledPods), podsNames(podKiller.pods)) - } + expectedKilledPods := append(terminatedPods, pods[0]) + // verify only terminated pods were killed because image gc was sufficient + if !checkIfSamePods(expectedKilledPods, podKiller.pods) { + t.Fatalf("Manager killed running pods. 
Expected: %v, actual: %v", podsNames(expectedKilledPods), podsNames(podKiller.pods)) + } - // reset state + // reset state diskGC.imageGCInvoked = false diskGC.containerGCInvoked = false - podKiller.pods = nil + podKiller.pods = nil // remove disk pressure fakeClock.Step(20 * time.Minute) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index 86988e80bbf..93dd161930c 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -66,7 +66,7 @@ const ( lotsOfFiles = 1000000000 // 1 billion resourceInodes = v1.ResourceName("inodes") noStarvedResource = v1.ResourceName("none") - waitPodName = "wait-pod" + waitPodSufix = "wait-pod" ) // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods. @@ -94,7 +94,7 @@ var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial( evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: inodeConsumingPod("volume-inode-hog", lotsOfFiles, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}), + pod: inodeConsumingPod("volume-inode-hog", lotsOfFiles, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, true), }, { evictionPriority: 0, @@ -104,53 +104,6 @@ var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial( }) }) -// TerminatedPodsEvictionOnDiskPressure tests that the node does not evict pods -// when there are terminated pods and the image garbage collection can remove -// their resources without evicting running pods -var _ = SIGDescribe("TerminatedPodsEvictionOnDiskPressure", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { - f := framework.NewDefaultFramework("terminated-pods-eviction-on-disk-pressure-test") - f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := v1.ResourceEphemeralStorage - - ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { - tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { - // Calculate eviction thresholds based on available disk space - summary := eventuallyGetSummary(ctx) - availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes) - - diskConsumedByTest := resource.MustParse("4Gi") - evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10) - - if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) { - e2eskipper.Skipf("Not enough disk space to run the test, available: %d", availableBytesOnSystem) - } - - initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold} - initialConfig.EvictionMinimumReclaim = map[string]string{} - }) - - runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{ - { - evictionPriority: 0, - pod: innocentPod(), - }, - { - evictionPriority: 0, - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), - }, - // This is the terminated pod, runEvictionTest will wait for it to finish - { - evictionPriority: 1, - // This is probably working because empty dir is being cleared when the - // pod finishes 
- pod: diskConsumingPod(fmt.Sprintf("disk-hog-%s", waitPodName), lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), - }, - }) - }) -}) - // ImageGCNoEviction tests that the eviction manager is able to prevent eviction // by reclaiming resources(inodes) through image garbage collection. // Disk pressure is induced by consuming a lot of inodes on the node. @@ -163,6 +116,41 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer expectedNodeCondition := v1.NodeDiskPressure expectedStarvedResource := resourceInodes inodesConsumed := uint64(100000) + ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + // Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction. + summary := eventuallyGetSummary(ctx) + inodesFree := *summary.Node.Fs.InodesFree + if inodesFree <= inodesConsumed { + e2eskipper.Skipf("Too few inodes free on the host for the InodeEviction test to run") + } + framework.Logf("Setting eviction threshold to %d inodes", inodesFree-inodesConsumed) + initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)} + initialConfig.EvictionMinimumReclaim = map[string]string{} + }) + // Consume enough inodes to induce disk pressure, + // but expect that image garbage collection can reduce it enough to avoid an eviction + runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{ + { + evictionPriority: 0, + pod: inodeConsumingPod("container-inode", 110000, nil, true), + }, + }) + }) +}) + +// ImageGCTerminatedPodsEviction tests that the node responds to disk pressure +// by only evicting terminated pods, when reclaiming their resources freed +// enough space to eliminate the disk pressure +// Disk pressure is induced by pulling large images +var _ = SIGDescribe("ImageGCTerminatedPodsEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { + f := framework.NewDefaultFramework("image-gc-terminated-pods-eviction-test") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + pressureTimeout := 10 * time.Minute + expectedNodeCondition := v1.NodeDiskPressure + expectedStarvedResource := resourceInodes + inodesConsumed := uint64(100000) + ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { prepull := func(ctx context.Context) { // Prepull images for image garbage collector to remove them @@ -186,12 +174,16 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)} initialConfig.EvictionMinimumReclaim = map[string]string{} }) - // Consume enough inodes to induce disk pressure, - // but expect that image garbage collection can reduce it enough to avoid an eviction - runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{ + runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{ + // Setting the volume as nil, will make the file to be written to the + // container writable layer, which defaults to using the node's root fs. 
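+ // The first spec below (evictionPriority 1) is the wait-pod: putToSleep is false, so it
+ // finishes its writes, exits, and reaches a terminal phase (runEvictionTest waits for it
+ // to complete); it is the pod expected to be reclaimed. The second spec (evictionPriority 0)
+ // keeps running against an emptyDir volume and must survive the disk pressure.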
+ { + evictionPriority: 1, + pod: inodeConsumingPod(makeWaitPodName("disk-hog"), 30000, nil, false), + }, { evictionPriority: 0, - pod: inodeConsumingPod("container-inode", 110000, nil), + pod: inodeConsumingPod("container-inode", 80000, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, true), }, }) }) @@ -295,7 +287,7 @@ var _ = SIGDescribe("LocalStorageEviction", framework.WithSlow(), framework.With evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), }, { evictionPriority: 0, @@ -336,7 +328,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction", framework.WithSlow(), framework. evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), }, { evictionPriority: 0, @@ -378,7 +370,7 @@ var _ = SIGDescribe("LocalStorageSoftEvictionNotOverwriteTerminationGracePeriodS evictionMaxPodGracePeriod: evictionSoftGracePeriod, evictionSoftGracePeriod: evictionMaxPodGracePeriod, evictionPriority: 1, - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}, true), }, }) }) @@ -404,32 +396,32 @@ var _ = SIGDescribe("LocalStorageCapacityIsolationEviction", framework.WithSlow( evictionPriority: 1, // This pod should be evicted because emptyDir (default storage type) usage violation pod: diskConsumingPod("emptydir-disk-sizelimit", useOverLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}), + }, v1.ResourceRequirements{}, true), }, { evictionPriority: 1, // This pod should cross the container limit by writing to its writable layer. 
- pod: diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}), + pod: diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}, true), }, { evictionPriority: 1, // This pod should hit the container limit by writing to an emptydir pod: diskConsumingPod("container-emptydir-disk-limit", useOverLimit, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, - v1.ResourceRequirements{Limits: containerLimit}), + v1.ResourceRequirements{Limits: containerLimit}, true), }, { evictionPriority: 0, // This pod should not be evicted because MemoryBackedVolumes cannot use more space than is allocated to them since SizeMemoryBackedVolumes was enabled pod: diskConsumingPod("emptydir-memory-sizelimit", useOverLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}), + }, v1.ResourceRequirements{}, true), }, { evictionPriority: 0, // This pod should not be evicted because it uses less than its limit pod: diskConsumingPod("emptydir-disk-below-sizelimit", useUnderLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}), + }, v1.ResourceRequirements{}, true), }, { evictionPriority: 0, // This pod should not be evicted because it uses less than its limit - pod: diskConsumingPod("container-disk-below-sizelimit", useUnderLimit, nil, v1.ResourceRequirements{Limits: containerLimit}), + pod: diskConsumingPod("container-disk-below-sizelimit", useUnderLimit, nil, v1.ResourceRequirements{Limits: containerLimit}, true), }, }) }) @@ -619,13 +611,13 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow() evictionPriority: 2, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: diskConsumingPod("best-effort-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + pod: diskConsumingPod("best-effort-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), }, { evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. 
- pod: diskConsumingPod("high-priority-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), + pod: diskConsumingPod("high-priority-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), }, { evictionPriority: 0, @@ -637,7 +629,7 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow() Limits: v1.ResourceList{ v1.ResourceEphemeralStorage: resource.MustParse("300Mi"), }, - }), + }, true), }, } specs[1].pod.Spec.PriorityClassName = highPriorityClassName @@ -771,24 +763,26 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe // Wait for the expected terminated pod to get to terminated state for _, spec := range testSpecs { - if strings.Contains(spec.pod.Name, waitPodName) { - ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", spec.pod.Name)) - podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name) - gomega.Eventually(ctx, func() error { - pod, err := podClient.Get(ctx, spec.pod.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to retrieve %s: %v", spec.pod.Name, err) - } - - if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed { - return fmt.Errorf("%s pod not yet completed or failed: %v", spec.pod.Name, pod.Status.Phase) - } - framework.Logf("%s in phase: %v", spec.pod.Name, pod.Status.Phase) - return nil - }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil()) - - break + if !isPodExpectedToSucceed(spec.pod) { + continue } + + ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", spec.pod.Name)) + podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name) + gomega.Eventually(ctx, func() error { + pod, err := podClient.Get(ctx, spec.pod.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to retrieve %s: %v", spec.pod.Name, err) + } + + if pod.Status.Phase != v1.PodSucceeded { + return fmt.Errorf("%s pod not yet completed or failed: %v", spec.pod.Name, pod.Status.Phase) + } + + return nil + }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil()) + + break } }) @@ -925,8 +919,10 @@ func verifyEvictionOrdering(ctx context.Context, f *framework.Framework, testSpe } } gomega.Expect(priorityPod).NotTo(gomega.BeNil()) - gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodSucceeded), - fmt.Sprintf("pod: %s succeeded unexpectedly", priorityPod.Name)) + if !isPodExpectedToSucceed(&priorityPod) { + gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodSucceeded), + fmt.Sprintf("pod: %s succeeded unexpectedly", priorityPod.Name)) + } // Check eviction ordering. 
// Note: it is alright for a priority 1 and priority 2 pod (for example) to fail in the same round, @@ -958,7 +954,7 @@ func verifyEvictionOrdering(ctx context.Context, f *framework.Framework, testSpe } // If a pod that is not evictionPriority 0 has not been evicted, we are not done - if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed { + if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed && !isPodExpectedToSucceed(&priorityPod) { pendingPods = append(pendingPods, priorityPod.ObjectMeta.Name) done = false } @@ -1022,6 +1018,12 @@ func verifyEvictionEvents(ctx context.Context, f *framework.Framework, testSpecs event := podEvictEvents.Items[0] if expectedStarvedResource != noStarvedResource { + // Terminated pods are not considered for stats collections, + // so they do not have annotations to be verified + if isPodExpectedToSucceed(pod) { + return + } + // Check the eviction.StarvedResourceKey starved, found := event.Annotations[eviction.StarvedResourceKey] if !found { @@ -1255,32 +1257,32 @@ const ( volumeName = "test-volume" ) -func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource) *v1.Pod { +func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource, putToSleep bool) *v1.Pod { path := "" if volumeSource != nil { path = volumeMountPath } // Each iteration creates an empty file - return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file"))) + return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")), putToSleep) } -func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements) *v1.Pod { +func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, putToSleep bool) *v1.Pod { path := "" if volumeSource != nil { path = volumeMountPath } // Each iteration writes 1 Mb, so do diskConsumedMB iterations. - return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file"))) + return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")), putToSleep) } func pidConsumingPod(name string, numProcesses int) *v1.Pod { // Slowing down the iteration speed to prevent a race condition where eviction may occur // before the correct number of processes is captured in the stats during a sudden surge in processes. - return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses, name, "/bin/sleep 0.01; (/bin/sleep 3600)&") + return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses, name, "/bin/sleep 0.01; (/bin/sleep 3600)&", true) } // podWithCommand returns a pod with the provided volumeSource and resourceRequirements. 
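// The added putToSleep parameter controls whether the pod keeps running once the
// consuming loop finishes: it is rendered into the trailing shell loop via %v, so
// putToSleep=true produces "...; while true; do sleep 5; done" (the container never
// exits) and putToSleep=false produces "...; while false; do sleep 5; done" (the loop
// body never runs, the container exits 0, and the pod can reach the Succeeded phase,
// which is what the wait-pod specs rely on). Illustrative rendering only, with
// hypothetical arguments:
//
//	fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while %v; do sleep 5; done",
//		3, "touch file${i}.txt;", false)
//	// => "i=0; while [ $i -lt 3 ]; do touch file${i}.txt; i=$(($i+1)); done; while false; do sleep 5; done"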
-func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string) *v1.Pod { +func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string, putToSleep bool) *v1.Pod { // Due to https://github.com/kubernetes/kubernetes/issues/115819, // When evictionHard to used, we were setting grace period to 0 which meant the default setting (30 seconds) // This could help with flakiness as we should send sigterm right away. @@ -1303,7 +1305,7 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem Command: []string{ "sh", "-c", - fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while true; do sleep 5; done", iterations, command), + fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while %v; do sleep 5; done", iterations, command, putToSleep), }, Resources: resources, VolumeMounts: volumeMounts, @@ -1372,3 +1374,11 @@ func getMemhogPodWithPodLevelResources(podName string, podLevelRes v1.ResourceRe return pod } + +func makeWaitPodName(podName string) string { + return fmt.Sprintf("%s-%s", podName, waitPodSufix) +} + +func isPodExpectedToSucceed(pod *v1.Pod) bool { + return strings.Contains(pod.Name, waitPodSufix) +} diff --git a/test/e2e_node/split_disk_test.go b/test/e2e_node/split_disk_test.go index 3796361c077..cf840364c14 100644 --- a/test/e2e_node/split_disk_test.go +++ b/test/e2e_node/split_disk_test.go @@ -104,7 +104,7 @@ var _ = SIGDescribe("KubeletSeparateDiskGC", feature.KubeletSeparateDiskGC, func // This pod should exceed disk capacity on nodeFs since it writes a lot to writeable layer. evictionPriority: 1, pod: diskConsumingPod("container-emptydir-disk-limit", diskTestInMb, nil, - v1.ResourceRequirements{}), + v1.ResourceRequirements{}, true), }, }) }) From c9ccbae0d9e179138f1164dc0d6e7f4e4ce8129e Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Fri, 7 Feb 2025 18:10:29 +0000 Subject: [PATCH 5/8] Remove terminated pods eviction code --- pkg/kubelet/eviction/eviction_manager.go | 67 +-- pkg/kubelet/eviction/eviction_manager_test.go | 468 +++--------------- pkg/kubelet/eviction/types.go | 5 +- pkg/kubelet/kubelet.go | 2 +- pkg/kubelet/kubelet_pods.go | 23 - test/e2e_node/eviction_test.go | 53 +- 6 files changed, 108 insertions(+), 510 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 513c5d61c0a..cb981dbb7e8 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -181,10 +181,10 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd } // Start starts the control loop to observe and response to low compute resources. 
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) { +func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) { thresholdHandler := func(message string) { klog.InfoS(message) - m.synchronize(diskInfoProvider, podFunc, terminatedPodFunc) + m.synchronize(diskInfoProvider, podFunc) } klog.InfoS("Eviction manager: starting control loop") if m.config.KernelMemcgNotification || runtime.GOOS == "windows" { @@ -203,8 +203,8 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod // start the eviction manager monitoring go func() { for { - evictedPods, err := m.synchronize(diskInfoProvider, podFunc, terminatedPodFunc) - if evictedPods != nil && len(evictedPods) > 0 && err == nil { + evictedPods, err := m.synchronize(diskInfoProvider, podFunc) + if evictedPods != nil && err == nil { klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods)) m.waitForPodsCleanup(podCleanedUpFunc, evictedPods) } else { @@ -240,7 +240,7 @@ func (m *managerImpl) IsUnderPIDPressure() bool { // synchronize is the main control loop that enforces eviction thresholds. // Returns the pod that was killed, or nil if no pod was killed. -func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc) ([]*v1.Pod, error) { +func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) { ctx := context.Background() // if we have nothing to do, just return thresholds := m.config.Thresholds @@ -378,16 +378,10 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) - var evictedPods []*v1.Pod - // Evict terminated pods, so their resources can be reclaimed - if evictedPods = m.evictTerminatedPods(thresholds, statsFunc, resourceToReclaim, terminatedPodFunc, observations, thresholdToReclaim); len(evictedPods) != 0 { - klog.InfoS("Eviction manager: evicted", "terminated pods", klog.KObjSlice(evictedPods)) - } - // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) { - klog.InfoS("Eviction manager: able to reduce resource pressure without evicting running pods.", "resourceName", resourceToReclaim) - return evictedPods, nil + klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim) + return nil, nil } klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim) @@ -396,13 +390,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal] if !ok { klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal) - return evictedPods, nil + return nil, nil } // the only candidates viable for eviction are those pods that had anything running. 
if len(activePods) == 0 { klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict") - return evictedPods, nil + return nil, nil } // rank the running pods for eviction for the specified resource @@ -439,48 +433,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) { metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc() - - return append(evictedPods, pod), nil + return []*v1.Pod{pod}, nil } } - - if len(evictedPods) > 0 { - klog.InfoS("Eviction manager: only evicted terminated pods from the node") - } else { - klog.InfoS("Eviction manager: unable to evict any pods from the node") - } - - return evictedPods, nil -} - -func (m *managerImpl) evictTerminatedPods(thresholds []evictionapi.Threshold, statsFunc statsFunc, resourceToReclaim v1.ResourceName, terminatedPodFunc TerminatedPodsFunc, observations signalObservations, thresholdToReclaim evictionapi.Threshold) []*v1.Pod { - evictedPods := []*v1.Pod{} - if !hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure) { - return evictedPods - } - - terminatedPods := terminatedPodFunc() - - gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds) - - for i := range terminatedPods { - pod := terminatedPods[i] - klog.InfoS("Eviction manager: evicted terminated pod", "pod", pod.Name) - - message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations) - condition := &v1.PodCondition{ - Type: v1.DisruptionTarget, - Status: v1.ConditionTrue, - Reason: v1.PodReasonTerminationByKubelet, - Message: message, - } - if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) { - metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc() - evictedPods = append(evictedPods, pod) - } - } - - return evictedPods + klog.InfoS("Eviction manager: unable to evict any pods from the node") + return nil, nil } func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) { diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index ed37768677b..bbc5c677eba 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -65,36 +65,6 @@ func (m *mockPodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride return nil } -// While for running pods only one pod is killed per eviction interval, -// when there is disk pressure, all terminated pods are killed in the same cycle -// mockPodsKiller is used to test which pods are killed -type mockPodsKiller struct { - pods []*v1.Pod - evict bool - statusFn func(*v1.PodStatus) - gracePeriodOverride *int64 -} - -// killPodsNow records the pods that were killed -func (m *mockPodsKiller) killPodsNow(pod *v1.Pod, evict bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error { - m.pods = append(m.pods, pod) - m.statusFn = statusFn - m.evict = evict - m.gracePeriodOverride = gracePeriodOverride - return nil -} - -func setDiskStatsBasedOnFs(whichFs string, diskPressure string, diskStat diskStats) diskStats { - if whichFs == "nodefs" { - diskStat.rootFsAvailableBytes = diskPressure - } else if whichFs == "imagefs" { - diskStat.imageFsAvailableBytes = diskPressure - } else if whichFs == "containerfs" { - diskStat.containerFsAvailableBytes = diskPressure - } - return diskStat -} - // mockDiskInfoProvider is used to simulate testing. 
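// The mock answers whether the node should be treated as having a dedicated image
// filesystem and a dedicated writeable container filesystem (dedicatedImageFs and
// dedicatedContainerFs), letting the tests exercise the split-disk code paths
// without real mounts.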
type mockDiskInfoProvider struct { dedicatedImageFs *bool @@ -326,10 +296,6 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -363,7 +329,7 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) { } // synchronize to detect the memory pressure - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -430,10 +396,6 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false), dedicatedContainerFs: ptr.To(false)} @@ -467,7 +429,7 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { } // synchronize to detect the PID pressure - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -612,10 +574,6 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -647,7 +605,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) { } // synchronize - pods, synchErr := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + pods, synchErr := manager.synchronize(diskInfoProvider, activePodsFunc) if synchErr == nil && tc.expectErr != "" { t.Fatalf("Manager should report error but did not") @@ -705,10 +663,6 @@ func TestMemoryPressure(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -755,7 +709,7 @@ func TestMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -777,7 +731,7 @@ func TestMemoryPressure(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -796,7 +750,7 @@ func TestMemoryPressure(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, 
activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -825,7 +779,7 @@ func TestMemoryPressure(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -839,7 +793,7 @@ func TestMemoryPressure(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -871,7 +825,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -899,7 +853,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -979,10 +933,6 @@ func TestPIDPressure(t *testing.T) { podToEvict := pods[tc.evictPodIndex] activePodsFunc := func() []*v1.Pod { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -1029,7 +979,7 @@ func TestPIDPressure(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, 50) // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1048,7 +998,7 @@ func TestPIDPressure(t *testing.T) { // induce soft threshold for PID pressure fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithGracePeriod, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1067,7 +1017,7 @@ func TestPIDPressure(t *testing.T) { // step forward in time past the grace period fakeClock.Step(3 * time.Minute) // no change in PID stats to simulate continued pressure - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1097,7 +1047,7 @@ func TestPIDPressure(t *testing.T) { // remove PID pressure by simulating increased PID availability fakeClock.Step(20 * time.Minute) summaryProvider.result 
= summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) // Simulate increased PID availability - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1111,7 +1061,7 @@ func TestPIDPressure(t *testing.T) { // re-induce PID pressure fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithoutGracePeriod, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1143,7 +1093,7 @@ func TestPIDPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1167,7 +1117,7 @@ func TestPIDPressure(t *testing.T) { // move the clock past the transition period fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1241,7 +1191,6 @@ func TestDiskPressureNodeFs(t *testing.T) { writeableSeparateFromReadOnly bool thresholdToMonitor []evictionapi.Threshold podToMakes []podToMake - terminatedPodToMakes []podToMake dedicatedImageFs *bool expectErr string inducePressureOnWhichFs string @@ -1370,10 +1319,6 @@ func TestDiskPressureNodeFs(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -1411,7 +1356,7 @@ func TestDiskPressureNodeFs(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi", nil) // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1438,7 +1383,7 @@ func TestDiskPressureNodeFs(t *testing.T) { diskStatStart.containerFsAvailableBytes = tc.softDiskPressure } summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1457,7 +1402,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", 
err) @@ -1486,7 +1431,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1507,7 +1452,7 @@ func TestDiskPressureNodeFs(t *testing.T) { diskStatStart.containerFsAvailableBytes = tc.hardDiskPressure } summaryProvider.result = summaryStatsMaker(diskStatStart) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) @@ -1537,7 +1482,7 @@ func TestDiskPressureNodeFs(t *testing.T) { summaryProvider.result = summaryStatsMaker(diskStatConst) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1561,7 +1506,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1585,259 +1530,6 @@ func TestDiskPressureNodeFs(t *testing.T) { } } -// Test terminated pods eviction on disk pressure works as expected -func TestTerminatedPodsEvictionOnDiskPressure(t *testing.T) { - testCases := map[string]struct { - nodeFsStats string - imageFsStats string - containerFsStats string - kubeletSeparateDiskFeature bool - writeableSeparateFromReadOnly bool - thresholdToMonitor evictionapi.Threshold - podToMakes []podToMake - terminatedPodToMakes []podToMake - dedicatedImageFs *bool - inducePressureOnWhichFs string - softDiskPressure string - hardDiskPressure string - expectContainerGcCall bool - expectImageGcCall bool - }{ - "eviction due to disk pressure; terminated pods": { - dedicatedImageFs: ptr.To(false), - nodeFsStats: "16Gi", - imageFsStats: "16Gi", - containerFsStats: "16Gi", - inducePressureOnWhichFs: "nodefs", - softDiskPressure: "1.5Gi", - hardDiskPressure: "750Mi", - expectImageGcCall: true, - expectContainerGcCall: true, - thresholdToMonitor: evictionapi.Threshold{ - Signal: evictionapi.SignalNodeFsAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1Gi"), - }, - }, - podToMakes: []podToMake{ - {name: "low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "900Mi"}, - {name: "below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), logsFsUsed: "50Mi"}, - {name: "above-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), rootFsUsed: "400Mi"}, - {name: "high-priority-high-usage", priority: highPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "400Mi"}, - {name: "low-priority-low-usage", priority: lowPriority, requests: 
newResourceList("", "", ""), limits: newResourceList("", "", ""), rootFsUsed: "100Mi"}, - }, - terminatedPodToMakes: []podToMake{ - {name: "terminated-pod-1", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "500Mi"}, - {name: "terminated-pod-2", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), rootFsUsed: "500Mi"}, - }, - }, - } - - for name, tc := range testCases { - t.Run(name, func(t *testing.T) { - featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.KubeletSeparateDiskGC, tc.kubeletSeparateDiskFeature) - - podMaker := makePodWithDiskStats - summaryStatsMaker := makeDiskStats - podsToMake := tc.podToMakes - pods := []*v1.Pod{} - podStats := map[*v1.Pod]statsapi.PodStats{} - for _, podToMake := range podsToMake { - pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil) - pods = append(pods, pod) - podStats[pod] = podStat - } - activePodsFunc := func() []*v1.Pod { - return pods - } - - terminatedPodsToMake := tc.terminatedPodToMakes - terminatedPods := []*v1.Pod{} - for _, terminatedPodToMake := range terminatedPodsToMake { - terminatedPod, _ := podMaker(terminatedPodToMake.name, terminatedPodToMake.priority, terminatedPodToMake.requests, terminatedPodToMake.limits, terminatedPodToMake.rootFsUsed, terminatedPodToMake.logsFsUsed, terminatedPodToMake.perLocalVolumeUsed, nil) - terminatedPods = append(terminatedPods, terminatedPod) - } - terminatedPodsFunc := func() []*v1.Pod { - return terminatedPods - } - - fakeClock := testingclock.NewFakeClock(time.Now()) - podKiller := &mockPodsKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} - nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - - config := Config{ - MaxPodGracePeriodSeconds: 5, - PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{tc.thresholdToMonitor}, - } - diskStatStart := diskStats{ - rootFsAvailableBytes: tc.nodeFsStats, - imageFsAvailableBytes: tc.imageFsStats, - containerFsAvailableBytes: tc.containerFsStats, - podStats: podStats, - } - // This is a constant that we use to test that disk pressure is over. Don't change! 
- diskStatConst := diskStatStart - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStatStart)} - diskGC := &mockDiskGC{fakeSummaryProvider: summaryProvider, err: nil} - manager := &managerImpl{ - clock: fakeClock, - killPodFunc: podKiller.killPodsNow, - imageGC: diskGC, - containerGC: diskGC, - config: config, - recorder: &record.FakeRecorder{}, - summaryProvider: summaryProvider, - nodeRef: nodeRef, - nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, - thresholdsFirstObservedAt: thresholdsObservedAt{}, - } - - // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) - - if err != nil { - t.Fatalf("Manager should not have an error %v", err) - } - - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") - } - - // induce hard threshold - fakeClock.Step(1 * time.Minute) - - newDiskAfterHardEviction := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) - summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) - // make GC successfully return disk usage to previous levels - diskGC.summaryAfterGC = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) - - if err != nil { - t.Fatalf("Manager should not have an error %v", err) - } - - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Fatalf("Manager should report disk pressure since soft threshold was met") - } - - // verify image, container or both gc were called. - // split filesystem can have container gc called without image. - // same filesystem should have both. - if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { - t.Fatalf("Manager should have invoked image gc") - } - - // compare expected killed pods with actual killed pods - checkIfSamePods := func(expectedPods, actualPods []*v1.Pod) bool { - expected := make(map[*v1.Pod]bool) - for _, pod := range expectedPods { - expected[pod] = true - } - - for _, pod := range actualPods { - if !expected[pod] { - return false - } - } - - return true - } - - // verify terminated pods were killed - if podKiller.pods == nil { - t.Fatalf("Manager should have killed terminated pods") - } - - // verify only terminated pods were killed because image gc was sufficient - if !checkIfSamePods(terminatedPods, podKiller.pods) { - t.Fatalf("Manager killed running pods") - } - - // reset state - diskGC.imageGCInvoked = false - diskGC.containerGCInvoked = false - podKiller.pods = nil - - // remove disk pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) - - if err != nil { - t.Fatalf("Manager should not have an error %v", err) - } - - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Fatalf("Manager should not report disk pressure") - } - - // induce disk pressure! 
- fakeClock.Step(1 * time.Minute) - softDiskPressure := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) - summaryProvider.result = summaryStatsMaker(softDiskPressure) - // Don't reclaim any disk - diskGC.summaryAfterGC = summaryStatsMaker(softDiskPressure) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) - - if err != nil { - t.Fatalf("Manager should not have an error %v", err) - } - - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Fatalf("Manager should report disk pressure") - } - - // verify image, container or both gc were called. - // split filesystem can have container gc called without image. - // same filesystem should have both. - if diskGC.imageGCInvoked != tc.expectImageGcCall && diskGC.containerGCInvoked != tc.expectContainerGcCall { - t.Fatalf("Manager should have invoked image gc") - } - - // Make array with only the pods names - podsNames := func(pods []*v1.Pod) []string { - var podsNames []string - for _, pod := range pods { - podsNames = append(podsNames, pod.Name) - } - return podsNames - } - - expectedKilledPods := append(terminatedPods, pods[0]) - // verify only terminated pods were killed because image gc was sufficient - if !checkIfSamePods(expectedKilledPods, podKiller.pods) { - t.Fatalf("Manager killed running pods. Expected: %v, actual: %v", podsNames(expectedKilledPods), podsNames(podKiller.pods)) - } - - // reset state - diskGC.imageGCInvoked = false - diskGC.containerGCInvoked = false - podKiller.pods = nil - - // remove disk pressure - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) - - if err != nil { - t.Fatalf("Manager should not have an error %v", err) - } - - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Fatalf("Manager should not report disk pressure") - } - }) - } -} - // TestMinReclaim verifies that min-reclaim works as desired. func TestMinReclaim(t *testing.T) { podMaker := makePodWithMemoryStats @@ -1861,10 +1553,6 @@ func TestMinReclaim(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -1902,7 +1590,7 @@ func TestMinReclaim(t *testing.T) { } // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Errorf("Manager should not report any errors") } @@ -1914,7 +1602,7 @@ func TestMinReclaim(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1938,7 +1626,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1962,7 +1650,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -1982,7 +1670,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2153,10 +1841,6 @@ func TestNodeReclaimFuncs(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -2191,7 +1875,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2205,11 +1889,21 @@ func TestNodeReclaimFuncs(t *testing.T) { // induce hard threshold fakeClock.Step(1 * time.Minute) + setDiskStatsBasedOnFs := func(whichFs string, diskPressure string, diskStat diskStats) diskStats { + if whichFs == "nodefs" { + diskStat.rootFsAvailableBytes = diskPressure + } else if whichFs == "imagefs" { + diskStat.imageFsAvailableBytes = diskPressure + } else if whichFs == "containerfs" { + diskStat.containerFsAvailableBytes = diskPressure + } + return diskStat + } newDiskAfterHardEviction := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, tc.hardDiskPressure, diskStatStart) summaryProvider.result = summaryStatsMaker(newDiskAfterHardEviction) // make GC successfully return disk usage to previous levels diskGC.summaryAfterGC = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2239,7 +1933,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != 
nil { t.Fatalf("Manager should not have an error %v", err) @@ -2251,7 +1945,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2269,7 +1963,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // make GC return disk usage bellow the threshold, but not satisfying minReclaim gcBelowThreshold := setDiskStatsBasedOnFs(tc.inducePressureOnWhichFs, "1.1G", newDiskAfterHardEviction) diskGC.summaryAfterGC = summaryStatsMaker(gcBelowThreshold) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2300,7 +1994,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker(diskStatConst) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2317,7 +2011,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker(softDiskPressure) // Don't reclaim any disk diskGC.summaryAfterGC = summaryStatsMaker(softDiskPressure) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2350,7 +2044,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2376,7 +2070,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2610,10 +2304,6 @@ func TestInodePressureFsInodes(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: tc.dedicatedImageFs, dedicatedContainerFs: &tc.writeableSeparateFromReadOnly} @@ -2645,7 +2335,7 @@ func TestInodePressureFsInodes(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0", "0", "0") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2664,7 +2354,7 @@ func TestInodePressureFsInodes(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, 
tc.softINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2683,7 +2373,7 @@ func TestInodePressureFsInodes(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.softINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2712,7 +2402,7 @@ func TestInodePressureFsInodes(t *testing.T) { // remove inode pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = startingStatsConst - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2726,7 +2416,7 @@ func TestInodePressureFsInodes(t *testing.T) { // induce inode pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = setINodesFreeBasedOnFs(tc.inducePressureOnWhichFs, tc.hardINodePressure, startingStatsModified) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2755,7 +2445,7 @@ func TestInodePressureFsInodes(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = startingStatsConst podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2780,7 +2470,7 @@ func TestInodePressureFsInodes(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = startingStatsConst podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2832,10 +2522,6 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -2881,7 +2567,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2900,7 +2586,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", 
err) @@ -2922,7 +2608,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2942,7 +2628,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -2999,10 +2685,6 @@ func TestStorageLimitEvictions(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -3045,7 +2727,7 @@ func TestStorageLimitEvictions(t *testing.T) { localStorageCapacityIsolation: true, } - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager expects no error but got %v", err) } @@ -3086,10 +2768,6 @@ func TestAllocatableMemoryPressure(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -3128,7 +2806,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3152,7 +2830,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newResourceList("100m", "1Gi", ""), newResourceList("100m", "1Gi", ""), "1Gi") podStats[pod] = podStat summaryProvider.result = summaryStatsMaker("500Mi", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3192,7 +2870,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { } summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3220,7 +2898,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should 
not have an error %v", err) @@ -3250,10 +2928,6 @@ func TestUpdateMemcgThreshold(t *testing.T) { return []*v1.Pod{} } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - fakeClock := testingclock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} @@ -3294,14 +2968,14 @@ func TestUpdateMemcgThreshold(t *testing.T) { } // The UpdateThreshold method should have been called once, since this is the first run. - _, err := manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) } // The UpdateThreshold method should not have been called again, since not enough time has passed - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3309,7 +2983,7 @@ func TestUpdateMemcgThreshold(t *testing.T) { // The UpdateThreshold method should be called again since enough time has passed fakeClock.Step(2 * notifierRefreshInterval) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3324,7 +2998,7 @@ func TestUpdateMemcgThreshold(t *testing.T) { // The UpdateThreshold method should be called because at least notifierRefreshInterval time has passed. // The Description method should be called because UpdateThreshold returned an error fakeClock.Step(2 * notifierRefreshInterval) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have an error %v", err) @@ -3392,11 +3066,7 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) { return pods } - terminatedPodsFunc := func() []*v1.Pod { - return []*v1.Pod{} - } - - evictedPods, err := mgr.synchronize(diskInfoProvider, activePodsFunc, terminatedPodsFunc) + evictedPods, err := mgr.synchronize(diskInfoProvider, activePodsFunc) if err != nil { t.Fatalf("Manager should not have error but got %v", err) diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index da8a6062ba7..83cdfcdf853 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -59,7 +59,7 @@ type Config struct { // Manager evaluates when an eviction threshold for node stability has been met on the node. type Manager interface { // Start starts the control loop to monitor eviction thresholds at specified interval. - Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, terminatedPodFunc TerminatedPodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) + Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) // IsUnderMemoryPressure returns true if the node is under memory pressure. IsUnderMemoryPressure() bool @@ -107,9 +107,6 @@ type MirrorPodFunc func(*v1.Pod) (*v1.Pod, bool) // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state) type ActivePodsFunc func() []*v1.Pod -// TerminatedPodsFunc return pods bound to the kubelet that are inactive (i.e. 
terminal state) -type TerminatedPodsFunc func() []*v1.Pod - // PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed. type PodCleanedUpFunc func(*v1.Pod) bool diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 3d581feaa77..600a006f054 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1705,7 +1705,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() { } // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs // Eviction decisions are based on the allocated (rather than desired) pod resources. - kl.evictionManager.Start(kl.StatsProvider, kl.getAllocatedPods, kl.getTerminatedPods, kl.PodIsFinished, evictionMonitoringPeriod) + kl.evictionManager.Start(kl.StatsProvider, kl.getAllocatedPods, kl.PodIsFinished, evictionMonitoringPeriod) // container log manager must start after container runtime is up to retrieve information from container runtime // and inform container to reopen log file after log rotation. diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index dacfa0282f6..86eeb0bd26d 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -219,14 +219,6 @@ func (kl *Kubelet) getAllocatedPods() []*v1.Pod { return allocatedPods } -// Returns all terminated pods. -func (kl *Kubelet) getTerminatedPods() []*v1.Pod { - allPods := kl.podManager.GetPods() - - terminatedPods := kl.filterOutActivePods(allPods) - return terminatedPods -} - // makeBlockVolumes maps the raw block devices specified in the path of the container // Experimental func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) { @@ -1152,21 +1144,6 @@ func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod { return filteredPods } -// filterOutActivePods returns pods that are in a terminal phase -func (kl *Kubelet) filterOutActivePods(pods []*v1.Pod) []*v1.Pod { - filteredPods := make([]*v1.Pod, 0, len(pods)) - for _, p := range pods { - // skip pod if it is not in a terminal phase - if !(p.Status.Phase == v1.PodFailed || p.Status.Phase == v1.PodSucceeded) { - continue - } - - klog.InfoS("Eviction manager - Test: filterOutActivePods", "terminated pod", p) - filteredPods = append(filteredPods, p) - } - return filteredPods -} - // isAdmittedPodTerminal returns true if the provided config source pod is in // a terminal phase, or if the Kubelet has already indicated the pod has reached // a terminal phase but the config source has not accepted it yet. This method diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index 93dd161930c..a5b6de74b5c 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -66,7 +66,7 @@ const ( lotsOfFiles = 1000000000 // 1 billion resourceInodes = v1.ResourceName("inodes") noStarvedResource = v1.ResourceName("none") - waitPodSufix = "wait-pod" + waitPodSufix = "wait-pod" ) // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods. 
@@ -139,12 +139,13 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer }) }) -// ImageGCTerminatedPodsEviction tests that the node responds to disk pressure -// by only evicting terminated pods, when reclaiming their resources freed -// enough space to eliminate the disk pressure +// ImageGCTerminatedPodsContainersCleanup tests that the node responds to disk +// pressure by cleaning up containers from terminated pods without evicting +// running pods, when reclaiming their resources frees enough space to eliminate +// the disk pressure // Disk pressure is induced by pulling large images -var _ = SIGDescribe("ImageGCTerminatedPodsEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { - f := framework.NewDefaultFramework("image-gc-terminated-pods-eviction-test") +var _ = SIGDescribe("ImageGCTerminatedPodsContainersCleanup", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { + f := framework.NewDefaultFramework("image-gc-terminated-pods-containers-cleanup-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged pressureTimeout := 10 * time.Minute expectedNodeCondition := v1.NodeDiskPressure @@ -178,7 +179,7 @@ var _ = SIGDescribe("ImageGCTerminatedPodsEviction", framework.WithSlow(), frame // Setting the volume as nil, will make the file to be written to the // container writable layer, which defaults to using the node's root fs. { - evictionPriority: 1, + evictionPriority: 0, pod: inodeConsumingPod(makeWaitPodName("disk-hog"), 30000, nil, false), }, { @@ -753,37 +754,39 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe time.Sleep(30 * time.Second) ginkgo.By("setting up pods to be used by tests") pods := []*v1.Pod{} + podsToSucceed := []*v1.Pod{} for _, spec := range testSpecs { if spec.prePodCreationModificationFunc != nil { spec.prePodCreationModificationFunc(ctx, spec.pod) } - pods = append(pods, spec.pod) - } - e2epod.NewPodClient(f).CreateBatch(ctx, pods) - - // Wait for the expected terminated pod to get to terminated state - for _, spec := range testSpecs { - if !isPodExpectedToSucceed(spec.pod) { + if isPodExpectedToSucceed(spec.pod) { + podsToSucceed = append(podsToSucceed, spec.pod) continue } + pods = append(pods, spec.pod) + } + e2ePodClient := e2epod.NewPodClient(f) + e2ePodClient.CreateBatch(ctx, podsToSucceed) - ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", spec.pod.Name)) - podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name) + // Wait for the expected succeeded pods to get to that state + for _, podToSucceed := range podsToSucceed { + ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", podToSucceed.Name)) gomega.Eventually(ctx, func() error { - pod, err := podClient.Get(ctx, spec.pod.Name, metav1.GetOptions{}) + pod, err := e2ePodClient.PodInterface.Get(ctx, podToSucceed.Name, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to retrieve %s: %v", spec.pod.Name, err) + return fmt.Errorf("failed to retrieve %s: %w", podToSucceed.Name, err) } if pod.Status.Phase != v1.PodSucceeded { - return fmt.Errorf("%s pod not yet completed or failed: %v", spec.pod.Name, pod.Status.Phase) + return fmt.Errorf("%s pod not yet completed or failed: %v", pod.Name, pod.Status.Phase) } return nil }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil()) - - break } + + // Create all other pods after the pods expected to succeed have terminated + e2ePodClient.CreateBatch(ctx, pods) 
}) ginkgo.It("should eventually evict all of the correct pods", func(ctx context.Context) { @@ -954,7 +957,7 @@ func verifyEvictionOrdering(ctx context.Context, f *framework.Framework, testSpe } // If a pod that is not evictionPriority 0 has not been evicted, we are not done - if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed && !isPodExpectedToSucceed(&priorityPod) { + if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed { pendingPods = append(pendingPods, priorityPod.ObjectMeta.Name) done = false } @@ -1018,12 +1021,6 @@ func verifyEvictionEvents(ctx context.Context, f *framework.Framework, testSpecs event := podEvictEvents.Items[0] if expectedStarvedResource != noStarvedResource { - // Terminated pods are not considered for stats collections, - // so they do not have annotations to be verified - if isPodExpectedToSucceed(pod) { - return - } - // Check the eviction.StarvedResourceKey starved, found := event.Annotations[eviction.StarvedResourceKey] if !found { From 388046c3ea9e65121782610ffc40da6ccca0b179 Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Wed, 26 Mar 2025 18:08:59 +0000 Subject: [PATCH 6/8] ImageGCTerminatedPodsContainersCleanup e2e node test Updated ImageGCTerminatedPodsEviction to ImageGCTerminatedPodsContainersCleanup to test that terminated containers are being cleaned up, instead of testing if terminated pods were being evicted --- test/e2e_node/eviction_test.go | 181 ++++++++++++++++----------------- 1 file changed, 88 insertions(+), 93 deletions(-) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index a5b6de74b5c..8a90c53d652 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -66,7 +66,6 @@ const ( lotsOfFiles = 1000000000 // 1 billion resourceInodes = v1.ResourceName("inodes") noStarvedResource = v1.ResourceName("none") - waitPodSufix = "wait-pod" ) // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods. @@ -74,19 +73,19 @@ const ( var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("inode-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := resourceInodes - pressureTimeout := 15 * time.Minute - inodesConsumed := uint64(200000) + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = resourceInodes + const pressureTimeout = 15 * time.Minute + const inodesToThreshold = uint64(200000) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { - // Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction. + // Set the eviction threshold to inodesFree - inodesToThreshold, so that using inodesToThreshold causes an eviction. 
summary := eventuallyGetSummary(ctx) inodesFree := *summary.Node.Fs.InodesFree - if inodesFree <= inodesConsumed { + if inodesFree <= inodesToThreshold { e2eskipper.Skipf("Too few inodes free on the host for the InodeEviction test to run") } - initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)} + initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesToThreshold)} initialConfig.EvictionMinimumReclaim = map[string]string{} }) runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{ @@ -108,24 +107,24 @@ var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial( // by reclaiming resources(inodes) through image garbage collection. // Disk pressure is induced by consuming a lot of inodes on the node. // Images are pre-pulled before running the test workload to ensure -// that the image garbage collerctor can remove them to avoid eviction. +// that the image garbage collector can remove them to avoid eviction. var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("image-gc-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := resourceInodes - inodesConsumed := uint64(100000) + const pressureTimeout = 10 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = resourceInodes + const inodesToThreshold = uint64(100000) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { - // Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction. + // Set the eviction threshold to inodesFree - inodesToThreshold, so that using inodesToThreshold causes an eviction. 
summary := eventuallyGetSummary(ctx) inodesFree := *summary.Node.Fs.InodesFree - if inodesFree <= inodesConsumed { + if inodesFree <= inodesToThreshold { e2eskipper.Skipf("Too few inodes free on the host for the InodeEviction test to run") } - framework.Logf("Setting eviction threshold to %d inodes", inodesFree-inodesConsumed) - initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)} + framework.Logf("Setting eviction threshold to %d inodes", inodesFree-inodesToThreshold) + initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesToThreshold)} initialConfig.EvictionMinimumReclaim = map[string]string{} }) // Consume enough inodes to induce disk pressure, @@ -147,10 +146,10 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer var _ = SIGDescribe("ImageGCTerminatedPodsContainersCleanup", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("image-gc-terminated-pods-containers-cleanup-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := resourceInodes - inodesConsumed := uint64(100000) + const pressureTimeout = 10 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = resourceInodes + const inodesToThreshold = uint64(50000) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { prepull := func(ctx context.Context) { @@ -165,14 +164,14 @@ var _ = SIGDescribe("ImageGCTerminatedPodsContainersCleanup", framework.WithSlow } tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { - // Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction. + // Set the eviction threshold to inodesFree - inodesToThreshold, so that using inodesToThreshold causes an eviction. summary := eventuallyGetSummary(ctx) inodesFree := *summary.Node.Fs.InodesFree - if inodesFree <= inodesConsumed { + if inodesFree <= inodesToThreshold { e2eskipper.Skipf("Too few inodes free on the host for the InodeEviction test to run") } - framework.Logf("Setting eviction threshold to %d inodes", inodesFree-inodesConsumed) - initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)} + framework.Logf("Setting eviction threshold to %d inodes", inodesFree-inodesToThreshold) + initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesToThreshold)} initialConfig.EvictionMinimumReclaim = map[string]string{} }) runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{ @@ -180,11 +179,12 @@ var _ = SIGDescribe("ImageGCTerminatedPodsContainersCleanup", framework.WithSlow // container writable layer, which defaults to using the node's root fs. 
{ evictionPriority: 0, - pod: inodeConsumingPod(makeWaitPodName("disk-hog"), 30000, nil, false), + pod: inodeConsumingPod("disk-hog", 30000, nil, false), + expectToSucceed: true, }, { evictionPriority: 0, - pod: inodeConsumingPod("container-inode", 80000, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, true), + pod: inodeConsumingPod("container-inode", 30000, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, true), }, }) }) @@ -195,9 +195,9 @@ var _ = SIGDescribe("ImageGCTerminatedPodsContainersCleanup", framework.WithSlow var _ = SIGDescribe("MemoryAllocatableEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("memory-allocatable-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeMemoryPressure - expectedStarvedResource := v1.ResourceMemory - pressureTimeout := 10 * time.Minute + const expectedNodeCondition = v1.NodeMemoryPressure + const expectedStarvedResource = v1.ResourceMemory + const pressureTimeout = 10 * time.Minute ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { // Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds. @@ -229,9 +229,9 @@ var _ = SIGDescribe("MemoryAllocatableEviction", framework.WithSlow(), framework var _ = SIGDescribe("MemoryAllocatableEvictionPodLevelResources", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.PodLevelResources, func() { f := framework.NewDefaultFramework("memory-allocatable-eviction-test-pod-level-resources") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeMemoryPressure - expectedStarvedResource := v1.ResourceMemory - pressureTimeout := 10 * time.Minute + const expectedNodeCondition = v1.NodeMemoryPressure + const expectedStarvedResource = v1.ResourceMemory + const pressureTimeout = 10 * time.Minute ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { // Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds. 
@@ -263,9 +263,9 @@ var _ = SIGDescribe("MemoryAllocatableEvictionPodLevelResources", framework.With var _ = SIGDescribe("LocalStorageEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("localstorage-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 15 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := v1.ResourceEphemeralStorage + const pressureTimeout = 15 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = v1.ResourceEphemeralStorage ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { @@ -304,9 +304,9 @@ var _ = SIGDescribe("LocalStorageEviction", framework.WithSlow(), framework.With var _ = SIGDescribe("LocalStorageSoftEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("localstorage-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := v1.ResourceEphemeralStorage + const pressureTimeout = 10 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = v1.ResourceEphemeralStorage ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { diskConsumed := resource.MustParse("4Gi") @@ -342,12 +342,12 @@ var _ = SIGDescribe("LocalStorageSoftEviction", framework.WithSlow(), framework. 
var _ = SIGDescribe("LocalStorageSoftEvictionNotOverwriteTerminationGracePeriodSeconds", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("localstorage-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := v1.ResourceEphemeralStorage + const pressureTimeout = 10 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = v1.ResourceEphemeralStorage - evictionMaxPodGracePeriod := 30 - evictionSoftGracePeriod := 30 + const evictionMaxPodGracePeriod = 30 + const evictionSoftGracePeriod = 30 ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { diskConsumed := resource.MustParse("4Gi") @@ -381,7 +381,7 @@ var _ = SIGDescribe("LocalStorageSoftEvictionNotOverwriteTerminationGracePeriodS var _ = SIGDescribe("LocalStorageCapacityIsolationEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.LocalStorageCapacityIsolationQuota, feature.Eviction, func() { f := framework.NewDefaultFramework("localstorage-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - evictionTestTimeout := 10 * time.Minute + const evictionTestTimeout = 10 * time.Minute ginkgo.Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { // setting a threshold to 0% disables; non-empty map overrides default value (necessary due to omitempty) @@ -434,12 +434,12 @@ var _ = SIGDescribe("LocalStorageCapacityIsolationEviction", framework.WithSlow( var _ = SIGDescribe("PriorityMemoryEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("priority-memory-eviction-ordering-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeMemoryPressure - expectedStarvedResource := v1.ResourceMemory - pressureTimeout := 10 * time.Minute + const expectedNodeCondition = v1.NodeMemoryPressure + const expectedStarvedResource = v1.ResourceMemory + const pressureTimeout = 10 * time.Minute highPriorityClassName := f.BaseName + "-high-priority" - highPriority := int32(999999999) + const highPriority = int32(999999999) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { @@ -512,12 +512,12 @@ var _ = SIGDescribe("PriorityMemoryEvictionOrdering", framework.WithSlow(), fram var _ = SIGDescribe("PriorityMemoryEvictionOrderingPodLevelResources", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.PodLevelResources, func() { f := framework.NewDefaultFramework("priority-memory-eviction-ordering-test-pod-level-resources") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeMemoryPressure - expectedStarvedResource := v1.ResourceMemory - pressureTimeout := 10 * time.Minute + const expectedNodeCondition = v1.NodeMemoryPressure + const expectedStarvedResource = v1.ResourceMemory + const pressureTimeout = 10 * time.Minute 
highPriorityClassName := f.BaseName + "-high-priority" - highPriority := int32(999999999) + const highPriority = int32(999999999) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { @@ -579,12 +579,12 @@ var _ = SIGDescribe("PriorityMemoryEvictionOrderingPodLevelResources", framework var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - expectedNodeCondition := v1.NodeDiskPressure - expectedStarvedResource := v1.ResourceEphemeralStorage - pressureTimeout := 15 * time.Minute + const expectedNodeCondition = v1.NodeDiskPressure + const expectedStarvedResource = v1.ResourceEphemeralStorage + const pressureTimeout = 15 * time.Minute highPriorityClassName := f.BaseName + "-high-priority" - highPriority := int32(999999999) + const highPriority = int32(999999999) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { @@ -642,16 +642,16 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow() var _ = SIGDescribe("PriorityPidEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.Eviction, func() { f := framework.NewDefaultFramework("pidpressure-eviction-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged - pressureTimeout := 10 * time.Minute - expectedNodeCondition := v1.NodePIDPressure - expectedStarvedResource := noStarvedResource + const pressureTimeout = 10 * time.Minute + const expectedNodeCondition = v1.NodePIDPressure + const expectedStarvedResource = noStarvedResource highPriorityClassName := f.BaseName + "-high-priority" - highPriority := int32(999999999) + const highPriority = int32(999999999) // Apparently there is a threshold at around 10,000+. If it's over the threshold, // the processes are likely to be oom-killed instead of evicted. // One test can have at most two pidConsumingPods at a time not to cause oom-kill. 
- processes := 5000 + const processes = 5000 // if criStats is true, PodAndContainerStatsFromCRI will use data from cri instead of cadvisor for kubelet to get pid count of pods for _, criStats := range []bool{true, false} { @@ -727,6 +727,7 @@ type podEvictSpec struct { evictionPriority int pod *v1.Pod wantPodDisruptionCondition *v1.PodConditionType + expectToSucceed bool evictionMaxPodGracePeriod int evictionSoftGracePeriod int @@ -759,30 +760,26 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe if spec.prePodCreationModificationFunc != nil { spec.prePodCreationModificationFunc(ctx, spec.pod) } - if isPodExpectedToSucceed(spec.pod) { + if spec.expectToSucceed { podsToSucceed = append(podsToSucceed, spec.pod) continue } pods = append(pods, spec.pod) } + e2ePodClient := e2epod.NewPodClient(f) - e2ePodClient.CreateBatch(ctx, podsToSucceed) - // Wait for the expected succeeded pods to get to that state - for _, podToSucceed := range podsToSucceed { - ginkgo.By(fmt.Sprintf("Waiting for the %s pod to complete", podToSucceed.Name)) - gomega.Eventually(ctx, func() error { - pod, err := e2ePodClient.PodInterface.Get(ctx, podToSucceed.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to retrieve %s: %w", podToSucceed.Name, err) - } + // Create all expected pods and wait for them to succeed. + // CreateBatch is not used here to avoid a race condition in the + // underlying call to CreateSync, which may fail if the wait function + // does not observe the pod's readiness before the pod terminates. + for _, pod := range podsToSucceed { + *pod = *e2ePodClient.Create(ctx, pod) + } - if pod.Status.Phase != v1.PodSucceeded { - return fmt.Errorf("%s pod not yet completed or failed: %v", pod.Name, pod.Status.Phase) - } - - return nil - }, pressureTimeout, evictionPollInterval).Should(gomega.BeNil()) + ginkgo.By(fmt.Sprintf("Waiting for the expected pods to complete: %v", podsToSucceed)) + for _, pod := range podsToSucceed { + framework.ExpectNoError(e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace)) } // Create all other pods after the pods expected to succeed have terminated @@ -922,7 +919,7 @@ func verifyEvictionOrdering(ctx context.Context, f *framework.Framework, testSpe } } gomega.Expect(priorityPod).NotTo(gomega.BeNil()) - if !isPodExpectedToSucceed(&priorityPod) { + if !priorityPodSpec.expectToSucceed { gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodSucceeded), fmt.Sprintf("pod: %s succeeded unexpectedly", priorityPod.Name)) } @@ -1254,22 +1251,22 @@ const ( volumeName = "test-volume" ) -func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource, putToSleep bool) *v1.Pod { +func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource, sleepAfterExecuting bool) *v1.Pod { path := "" if volumeSource != nil { path = volumeMountPath } // Each iteration creates an empty file - return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")), putToSleep) + return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")), sleepAfterExecuting) } -func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, putToSleep bool) *v1.Pod { +func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, 
sleepAfterExecuting bool) *v1.Pod { path := "" if volumeSource != nil { path = volumeMountPath } // Each iteration writes 1 Mb, so do diskConsumedMB iterations. - return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")), putToSleep) + return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")), sleepAfterExecuting) } func pidConsumingPod(name string, numProcesses int) *v1.Pod { @@ -1279,7 +1276,7 @@ func pidConsumingPod(name string, numProcesses int) *v1.Pod { } // podWithCommand returns a pod with the provided volumeSource and resourceRequirements. -func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string, putToSleep bool) *v1.Pod { +func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string, sleepAfterExecuting bool) *v1.Pod { // Due to https://github.com/kubernetes/kubernetes/issues/115819, // When evictionHard to used, we were setting grace period to 0 which meant the default setting (30 seconds) // This could help with flakiness as we should send sigterm right away. @@ -1290,6 +1287,12 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem volumeMounts = []v1.VolumeMount{{MountPath: volumeMountPath, Name: volumeName}} volumes = []v1.Volume{{Name: volumeName, VolumeSource: *volumeSource}} } + + cmd := fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done", iterations, command) + if sleepAfterExecuting { + cmd += "; " + e2epod.InfiniteSleepCommand + } + return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod", name)}, Spec: v1.PodSpec{ @@ -1302,7 +1305,7 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem Command: []string{ "sh", "-c", - fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while %v; do sleep 5; done", iterations, command, putToSleep), + cmd, }, Resources: resources, VolumeMounts: volumeMounts, @@ -1371,11 +1374,3 @@ func getMemhogPodWithPodLevelResources(podName string, podLevelRes v1.ResourceRe return pod } - -func makeWaitPodName(podName string) string { - return fmt.Sprintf("%s-%s", podName, waitPodSufix) -} - -func isPodExpectedToSucceed(pod *v1.Pod) bool { - return strings.Contains(pod.Name, waitPodSufix) -} From 7cf39066b34853b270342fcccf7368f23e01760b Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Thu, 1 May 2025 19:25:02 +0000 Subject: [PATCH 7/8] Remove sleepAfterExecuting param from diskConsumingPod --- test/e2e_node/eviction_test.go | 28 ++++++++++++++-------------- test/e2e_node/split_disk_test.go | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index 8a90c53d652..9d822156e69 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -288,7 +288,7 @@ var _ = SIGDescribe("LocalStorageEviction", framework.WithSlow(), framework.With evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. 
- pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), }, { evictionPriority: 0, @@ -329,7 +329,7 @@ var _ = SIGDescribe("LocalStorageSoftEviction", framework.WithSlow(), framework. evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), }, { evictionPriority: 0, @@ -371,7 +371,7 @@ var _ = SIGDescribe("LocalStorageSoftEvictionNotOverwriteTerminationGracePeriodS evictionMaxPodGracePeriod: evictionSoftGracePeriod, evictionSoftGracePeriod: evictionMaxPodGracePeriod, evictionPriority: 1, - pod: diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}, true), + pod: diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}), }, }) }) @@ -397,32 +397,32 @@ var _ = SIGDescribe("LocalStorageCapacityIsolationEviction", framework.WithSlow( evictionPriority: 1, // This pod should be evicted because emptyDir (default storage type) usage violation pod: diskConsumingPod("emptydir-disk-sizelimit", useOverLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}, true), + }, v1.ResourceRequirements{}), }, { evictionPriority: 1, // This pod should cross the container limit by writing to its writable layer. 
- pod: diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}, true), + pod: diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}), }, { evictionPriority: 1, // This pod should hit the container limit by writing to an emptydir pod: diskConsumingPod("container-emptydir-disk-limit", useOverLimit, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, - v1.ResourceRequirements{Limits: containerLimit}, true), + v1.ResourceRequirements{Limits: containerLimit}), }, { evictionPriority: 0, // This pod should not be evicted because MemoryBackedVolumes cannot use more space than is allocated to them since SizeMemoryBackedVolumes was enabled pod: diskConsumingPod("emptydir-memory-sizelimit", useOverLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}, true), + }, v1.ResourceRequirements{}), }, { evictionPriority: 0, // This pod should not be evicted because it uses less than its limit pod: diskConsumingPod("emptydir-disk-below-sizelimit", useUnderLimit, &v1.VolumeSource{ EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit}, - }, v1.ResourceRequirements{}, true), + }, v1.ResourceRequirements{}), }, { evictionPriority: 0, // This pod should not be evicted because it uses less than its limit - pod: diskConsumingPod("container-disk-below-sizelimit", useUnderLimit, nil, v1.ResourceRequirements{Limits: containerLimit}, true), + pod: diskConsumingPod("container-disk-below-sizelimit", useUnderLimit, nil, v1.ResourceRequirements{Limits: containerLimit}), }, }) }) @@ -612,13 +612,13 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow() evictionPriority: 2, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. - pod: diskConsumingPod("best-effort-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), + pod: diskConsumingPod("best-effort-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), }, { evictionPriority: 1, // TODO(#127864): Container runtime may not immediate free up the resources after the pod eviction, // causing the test to fail. We provision an emptyDir volume to avoid relying on the runtime behavior. 
- pod: diskConsumingPod("high-priority-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}, true), + pod: diskConsumingPod("high-priority-disk", lotsOfDisk, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}, v1.ResourceRequirements{}), }, { evictionPriority: 0, @@ -630,7 +630,7 @@ var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow() Limits: v1.ResourceList{ v1.ResourceEphemeralStorage: resource.MustParse("300Mi"), }, - }, true), + }), }, } specs[1].pod.Spec.PriorityClassName = highPriorityClassName @@ -1260,13 +1260,13 @@ func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource, return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")), sleepAfterExecuting) } -func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, sleepAfterExecuting bool) *v1.Pod { +func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements) *v1.Pod { path := "" if volumeSource != nil { path = volumeMountPath } // Each iteration writes 1 Mb, so do diskConsumedMB iterations. - return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")), sleepAfterExecuting) + return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")), true) } func pidConsumingPod(name string, numProcesses int) *v1.Pod { diff --git a/test/e2e_node/split_disk_test.go b/test/e2e_node/split_disk_test.go index cf840364c14..3796361c077 100644 --- a/test/e2e_node/split_disk_test.go +++ b/test/e2e_node/split_disk_test.go @@ -104,7 +104,7 @@ var _ = SIGDescribe("KubeletSeparateDiskGC", feature.KubeletSeparateDiskGC, func // This pod should exceed disk capacity on nodeFs since it writes a lot to writeable layer. 
evictionPriority: 1, pod: diskConsumingPod("container-emptydir-disk-limit", diskTestInMb, nil, - v1.ResourceRequirements{}, true), + v1.ResourceRequirements{}), }, }) }) From 86e3ad233f723bee5da220eef2fbf22eec2695a8 Mon Sep 17 00:00:00 2001 From: Kevin Torres Date: Thu, 1 May 2025 23:28:19 +0000 Subject: [PATCH 8/8] Revert trapping TERM for podWithCommand --- pkg/kubelet/eviction/eviction_manager_test.go | 6 ++--- test/e2e_node/eviction_test.go | 22 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index bbc5c677eba..a72c145409b 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -1890,11 +1890,11 @@ func TestNodeReclaimFuncs(t *testing.T) { fakeClock.Step(1 * time.Minute) setDiskStatsBasedOnFs := func(whichFs string, diskPressure string, diskStat diskStats) diskStats { - if whichFs == "nodefs" { + if tc.inducePressureOnWhichFs == "nodefs" { diskStat.rootFsAvailableBytes = diskPressure - } else if whichFs == "imagefs" { + } else if tc.inducePressureOnWhichFs == "imagefs" { diskStat.imageFsAvailableBytes = diskPressure - } else if whichFs == "containerfs" { + } else if tc.inducePressureOnWhichFs == "containerfs" { diskStat.containerFsAvailableBytes = diskPressure } return diskStat diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go index 9d822156e69..f965abfe74e 100644 --- a/test/e2e_node/eviction_test.go +++ b/test/e2e_node/eviction_test.go @@ -76,7 +76,7 @@ var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial( const expectedNodeCondition = v1.NodeDiskPressure const expectedStarvedResource = resourceInodes const pressureTimeout = 15 * time.Minute - const inodesToThreshold = uint64(200000) + const inodesToThreshold = uint64(100000) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { // Set the eviction threshold to inodesFree - inodesToThreshold, so that using inodesToThreshold causes an eviction. @@ -116,6 +116,17 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer const expectedStarvedResource = resourceInodes const inodesToThreshold = uint64(100000) ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() { + prepull := func(ctx context.Context) { + // Prepull images for image garbage collector to remove them + // when reclaiming resources + err := PrePullAllImages(ctx) + gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) + } + ginkgo.BeforeEach(prepull) + if framework.TestContext.PrepullImages { + ginkgo.AfterEach(prepull) + } + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { // Set the eviction threshold to inodesFree - inodesToThreshold, so that using inodesToThreshold causes an eviction. 
summary := eventuallyGetSummary(ctx) @@ -129,7 +140,7 @@ var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSer }) // Consume enough inodes to induce disk pressure, // but expect that image garbage collection can reduce it enough to avoid an eviction - runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{ + runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{ { evictionPriority: 0, pod: inodeConsumingPod("container-inode", 110000, nil, true), @@ -1288,11 +1299,6 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem volumes = []v1.Volume{{Name: volumeName, VolumeSource: *volumeSource}} } - cmd := fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done", iterations, command) - if sleepAfterExecuting { - cmd += "; " + e2epod.InfiniteSleepCommand - } - return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod", name)}, Spec: v1.PodSpec{ @@ -1305,7 +1311,7 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem Command: []string{ "sh", "-c", - cmd, + fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while %v; do sleep 5; done", iterations, command, sleepAfterExecuting), }, Resources: resources, VolumeMounts: volumeMounts,