From 81c14b7dd5dd1b39cec9799532fac94b493bbe33 Mon Sep 17 00:00:00 2001 From: Alima Azamat Date: Fri, 23 May 2025 16:24:43 -0700 Subject: [PATCH 1/5] initContainerDuration calculated and subtracted from podStartSLOduration --- pkg/kubelet/kubelet.go | 7 +- pkg/kubelet/kubelet_test.go | 7 +- .../kuberuntime/kuberuntime_manager.go | 84 ++++++++++++++----- .../util/pod_startup_latency_tracker.go | 41 ++++++++- 4 files changed, 109 insertions(+), 30 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index fc83e5449eb..369bab828c5 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -640,6 +640,7 @@ func NewMainKubelet(ctx context.Context, nodeStatusMaxImages: nodeStatusMaxImages, tracer: tracer, nodeStartupLatencyTracker: kubeDeps.NodeStartupLatencyTracker, + podStartupLatencyTracker: kubeDeps.PodStartupLatencyTracker, healthChecker: kubeDeps.HealthChecker, flagz: kubeDeps.Flagz, } @@ -795,10 +796,11 @@ func NewMainKubelet(ctx context.Context, kubeCfg.MemorySwap.SwapBehavior, kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, *kubeCfg.MemoryThrottlingFactor, - kubeDeps.PodStartupLatencyTracker, + klet.podStartupLatencyTracker, kubeDeps.TracerProvider, tokenManager, getServiceAccount, + klet.podStartupLatencyTracker, ) if err != nil { return nil, err @@ -1519,6 +1521,9 @@ type Kubelet struct { // Track node startup latencies nodeStartupLatencyTracker util.NodeStartupLatencyTracker + // Track pod startup latencies + podStartupLatencyTracker util.PodStartupLatencyTracker + // Health check kubelet healthChecker watchdog.HealthChecker diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 38890c487c7..8c59b52244f 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -302,8 +302,8 @@ func newTestKubeletWithImageList( kubelet.configMapManager = configMapManager kubelet.mirrorPodClient = fakeMirrorClient kubelet.podManager = kubepod.NewBasicPodManager() - podStartupLatencyTracker := kubeletutil.NewPodStartupLatencyTracker() - kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker) + kubelet.podStartupLatencyTracker = kubeletutil.NewPodStartupLatencyTracker() + kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, kubelet.podStartupLatencyTracker) kubelet.nodeStartupLatencyTracker = kubeletutil.NewNodeStartupLatencyTracker() kubelet.podCertificateManager = &podcertificate.NoOpManager{} @@ -3474,10 +3474,11 @@ func TestSyncPodSpans(t *testing.T) { kubeCfg.MemorySwap.SwapBehavior, kubelet.containerManager.GetNodeAllocatableAbsolute, *kubeCfg.MemoryThrottlingFactor, - kubeletutil.NewPodStartupLatencyTracker(), + kubelet.podStartupLatencyTracker, tp, token.NewManager(kubelet.kubeClient), func(string, string) (*v1.ServiceAccount, error) { return nil, nil }, + kubelet.podStartupLatencyTracker, ) assert.NoError(t, err) kubelet.allocationManager.SetContainerRuntime(kubelet.containerRuntime) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 46eec1c2abc..3e64dd6179c 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -106,6 +106,11 @@ type podStateProvider interface { ShouldPodRuntimeBeRemoved(kubetypes.UID) bool } +type PodInitContainerTimeRecorder interface { + RecordInitContainerStarted(podUID kubetypes.UID, startedAt time.Time) + 
RecordInitContainerFinished(podUID kubetypes.UID, finishedAt time.Time) +} + type kubeGenericRuntimeManager struct { runtimeName string recorder record.EventRecorder @@ -191,6 +196,9 @@ type kubeGenericRuntimeManager struct { // Swap controller availability check function (Linux only) // Uses sync.OnceValue for lazy initialization getSwapControllerAvailable func() bool + + // Records first initContainer start time and last initContainer finish time + podInitContainerTimeRecorder PodInitContainerTimeRecorder } // KubeGenericRuntime is a interface contains interfaces for container runtime and command. @@ -240,6 +248,7 @@ func NewKubeGenericRuntimeManager( tracerProvider trace.TracerProvider, tokenManager *token.Manager, getServiceAccount plugin.GetServiceAccountFunc, + podInitContainerTimeRecorder PodInitContainerTimeRecorder, ) (KubeGenericRuntime, []images.PostImageGCHook, error) { logger := klog.FromContext(ctx) @@ -247,29 +256,31 @@ func NewKubeGenericRuntimeManager( imageService = newInstrumentedImageManagerService(imageService) tracer := tracerProvider.Tracer(instrumentationScope) kubeRuntimeManager := &kubeGenericRuntimeManager{ - recorder: recorder, - singleProcessOOMKill: singleProcessOOMKill, - cpuCFSQuota: cpuCFSQuota, - cpuCFSQuotaPeriod: cpuCFSQuotaPeriod, - seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"), - livenessManager: livenessManager, - readinessManager: readinessManager, - startupManager: startupManager, - machineInfo: machineInfo, - osInterface: osInterface, - runtimeHelper: runtimeHelper, - runtimeService: runtimeService, - imageService: imageService, - containerManager: containerManager, - internalLifecycle: containerManager.InternalContainerLifecycle(), - logManager: logManager, - runtimeClassManager: runtimeClassManager, - logReduction: logreduction.NewLogReduction(identicalErrorDelay), - seccompDefault: seccompDefault, - memorySwapBehavior: memorySwapBehavior, - getNodeAllocatable: getNodeAllocatable, - memoryThrottlingFactor: memoryThrottlingFactor, - podLogsDirectory: podLogsDirectory, + recorder: recorder, + singleProcessOOMKill: singleProcessOOMKill, + cpuCFSQuota: cpuCFSQuota, + cpuCFSQuotaPeriod: cpuCFSQuotaPeriod, + seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"), + livenessManager: livenessManager, + readinessManager: readinessManager, + startupManager: startupManager, + machineInfo: machineInfo, + osInterface: osInterface, + runtimeHelper: runtimeHelper, + runtimeService: runtimeService, + imageService: imageService, + containerManager: containerManager, + internalLifecycle: containerManager.InternalContainerLifecycle(), + logManager: logManager, + runtimeClassManager: runtimeClassManager, + allocationManager: allocationManager, + logReduction: logreduction.NewLogReduction(identicalErrorDelay), + seccompDefault: seccompDefault, + memorySwapBehavior: memorySwapBehavior, + getNodeAllocatable: getNodeAllocatable, + memoryThrottlingFactor: memoryThrottlingFactor, + podLogsDirectory: podLogsDirectory, + podInitContainerTimeRecorder: podInitContainerTimeRecorder, } // Initialize swap controller availability check with lazy evaluation @@ -1665,6 +1676,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po } return err } + if typeName == "init container" { + if m.podInitContainerTimeRecorder != nil { + m.podInitContainerTimeRecorder.RecordInitContainerStarted(pod.UID, time.Now()) + } + } return nil } @@ -1694,6 +1710,28 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po 
logger.V(4).Info("Completed init container for pod", "containerName", container.Name, "pod", klog.KObj(pod)) } + for _, cs := range podStatus.ContainerStatuses { + // Check if this is an init container + for _, init := range pod.Spec.InitContainers { + if cs.Name == init.Name && cs.State == kubecontainer.ContainerStateExited && !cs.FinishedAt.IsZero() { + if m.podInitContainerTimeRecorder != nil { + m.podInitContainerTimeRecorder.RecordInitContainerFinished(pod.UID, cs.FinishedAt) + } + } + } + } + + for _, cs := range podStatus.ContainerStatuses { + // Check if this is an init container + for _, init := range pod.Spec.InitContainers { + if cs.Name == init.Name && cs.State == kubecontainer.ContainerStateExited && !cs.FinishedAt.IsZero() { + if m.podInitContainerTimeRecorder != nil { + m.podInitContainerTimeRecorder.RecordInitContainerFinished(pod.UID, cs.FinishedAt) + } + } + } + } + // Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources if resizable, _, _ := allocation.IsInPlacePodVerticalScalingAllowed(pod); resizable { if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources || podContainerChanges.UpdatePodLevelResources { diff --git a/pkg/kubelet/util/pod_startup_latency_tracker.go b/pkg/kubelet/util/pod_startup_latency_tracker.go index d972815a781..60b4b6ebe9c 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker.go @@ -34,6 +34,8 @@ type PodStartupLatencyTracker interface { ObservedPodOnWatch(pod *v1.Pod, when time.Time) RecordImageStartedPulling(podUID types.UID) RecordImageFinishedPulling(podUID types.UID) + RecordInitContainerStarted(podUID types.UID, startedAt time.Time) + RecordInitContainerFinished(podUID types.UID, finishedAt time.Time) RecordStatusUpdated(pod *v1.Pod) DeletePodStartupState(podUID types.UID) } @@ -49,8 +51,10 @@ type basicPodStartupLatencyTracker struct { } type perPodState struct { - firstStartedPulling time.Time - lastFinishedPulling time.Time + firstStartedPulling time.Time + lastFinishedPulling time.Time + firstInitContainerStart time.Time + lastInitContainerFinish time.Time // first time, when pod status changed into Running observedRunningTime time.Time // log, if pod latency was already Observed @@ -103,7 +107,8 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim logger := klog.FromContext(ctx) podStartingDuration := when.Sub(pod.CreationTimestamp.Time) imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling) - podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds() + initContainerDuration := state.lastInitContainerFinish.Sub(state.firstInitContainerStart) + podStartSLOduration := (podStartingDuration - imagePullingDuration - initContainerDuration).Seconds() logger.Info("Observed pod startup duration", "pod", klog.KObj(pod), @@ -112,6 +117,8 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim "podCreationTimestamp", pod.CreationTimestamp.Time, "firstStartedPulling", state.firstStartedPulling, "lastFinishedPulling", state.lastFinishedPulling, + "firstInitContainerStart", state.firstInitContainerStart, + "lastInitContainerFinish", state.lastInitContainerFinish, "observedRunningTime", state.observedRunningTime, "watchObservedRunningTime", when) @@ -155,6 +162,34 @@ func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types. 
} } +func (p *basicPodStartupLatencyTracker) RecordInitContainerStarted(podUID types.UID, startedAt time.Time) { + p.lock.Lock() + defer p.lock.Unlock() + + state := p.pods[podUID] + if state == nil { + return + } + + if state.firstInitContainerStart.IsZero() || startedAt.Before(state.firstInitContainerStart) { + state.firstInitContainerStart = startedAt + } +} + +func (p *basicPodStartupLatencyTracker) RecordInitContainerFinished(podUID types.UID, finishedAt time.Time) { + p.lock.Lock() + defer p.lock.Unlock() + + state := p.pods[podUID] + if state == nil { + return + } + + if finishedAt.After(state.lastInitContainerFinish) { + state.lastInitContainerFinish = finishedAt + } +} + func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) { p.lock.Lock() defer p.lock.Unlock() From 871f6e1ebe50f4445d8ecae07a3278e4a3900677 Mon Sep 17 00:00:00 2001 From: alimaazamat Date: Thu, 24 Jul 2025 17:02:29 -0700 Subject: [PATCH 2/5] fix: exclude init container + image pulling time and check pod is stateless --- .../util/pod_startup_latency_tracker.go | 100 ++++++++++++++++-- 1 file changed, 89 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/util/pod_startup_latency_tracker.go b/pkg/kubelet/util/pod_startup_latency_tracker.go index 60b4b6ebe9c..7c3b7b63dd3 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker.go @@ -106,14 +106,67 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim ctx := context.TODO() logger := klog.FromContext(ctx) podStartingDuration := when.Sub(pod.CreationTimestamp.Time) - imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling) - initContainerDuration := state.lastInitContainerFinish.Sub(state.firstInitContainerStart) - podStartSLOduration := (podStartingDuration - imagePullingDuration - initContainerDuration).Seconds() + var podStartSLOduration time.Duration + var excludedTimeStart time.Time + var excludedTimeEnd time.Time + + // Add time from pod creation to first excluded activity starts (either image pulling or init containers starting) + if !state.firstStartedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { + if state.firstStartedPulling.Before(state.firstInitContainerStart) { + excludedTimeStart = state.firstStartedPulling + } else { + excludedTimeStart = state.firstInitContainerStart + } + } else if !state.firstStartedPulling.IsZero() { + excludedTimeStart = state.firstStartedPulling + } else if !state.firstInitContainerStart.IsZero() { + excludedTimeStart = state.firstInitContainerStart + } + + if !excludedTimeStart.IsZero() { + preExcludedDuration := excludedTimeStart.Sub(pod.CreationTimestamp.Time) + if preExcludedDuration > 0 { + podStartSLOduration += preExcludedDuration + } + } + + // Add gap between image pulling end and init container start if there is any + if !state.lastFinishedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { + // Only count gap if init container starts after image pulling ends (no overlap) + if state.firstInitContainerStart.After(state.lastFinishedPulling) { + gapDuration := state.firstInitContainerStart.Sub(state.lastFinishedPulling) + if gapDuration > 0 { + podStartSLOduration += gapDuration + } + } + } + + // Add time from last dependency completion to containers running + if state.lastFinishedPulling.After(state.lastInitContainerFinish) { + excludedTimeEnd = state.lastFinishedPulling + } else if !state.lastInitContainerFinish.IsZero() { + excludedTimeEnd = 
state.lastInitContainerFinish + } else if !state.lastFinishedPulling.IsZero() { + excludedTimeEnd = state.lastFinishedPulling + } + + if !excludedTimeEnd.IsZero() { + postExcludedDuration := when.Sub(excludedTimeEnd) + if postExcludedDuration > 0 { + podStartSLOduration += postExcludedDuration + } + } else if excludedTimeStart.IsZero() { + // No dependencies at all, count entire duration + podStartSLOduration = podStartingDuration + } + + isStatefulPod := isStatefulPod(pod) logger.Info("Observed pod startup duration", "pod", klog.KObj(pod), - "podStartSLOduration", podStartSLOduration, + "podStartSLOduration", podStartSLOduration.Seconds(), "podStartE2EDuration", podStartingDuration, + "isStatefulPod", isStatefulPod, "podCreationTimestamp", pod.CreationTimestamp.Time, "firstStartedPulling", state.firstStartedPulling, "lastFinishedPulling", state.lastFinishedPulling, @@ -122,15 +175,17 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim "observedRunningTime", state.observedRunningTime, "watchObservedRunningTime", when) - metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration) metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds()) - state.metricRecorded = true - // if is the first Pod with network track the start values - // these metrics will help to identify problems with the CNI plugin - if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen { - metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration) - p.firstNetworkPodSeen = true + if !isStatefulPod { + metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration.Seconds()) + // if is the first Pod with network track the start values + // these metrics will help to identify problems with the CNI plugin + if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen { + metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration.Seconds()) + p.firstNetworkPodSeen = true + } } + state.metricRecorded = true } } @@ -232,6 +287,29 @@ func hasPodStartedSLO(pod *v1.Pod) bool { return true } +// isStatefulPod determines if a pod is stateful according to the SLI documentation: +// "A stateful pod is defined as a pod that mounts at least one volume with sources +// other than secrets, config maps, downward API and empty dir." 
+// This should reflect the "stateful pod" definition +// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md +func isStatefulPod(pod *v1.Pod) bool { + for _, volume := range pod.Spec.Volumes { + // Check if this volume is NOT a stateless/ephemeral type + if volume.Secret == nil && + volume.ConfigMap == nil && + volume.DownwardAPI == nil && + volume.EmptyDir == nil && + volume.Projected == nil && + volume.GitRepo == nil && + volume.Image == nil && + volume.Ephemeral == nil && + (volume.CSI == nil || volume.CSI.VolumeAttributes["csi.storage.k8s.io/ephemeral"] != "true") { + return true + } + } + return false +} + func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) { p.lock.Lock() defer p.lock.Unlock() From faac0ea4caa7835e37d8e1af82b04f7e671489dc Mon Sep 17 00:00:00 2001 From: alimaazamat Date: Thu, 4 Sep 2025 17:24:00 -0700 Subject: [PATCH 3/5] fix: only immediately schedulable pods + sequential init container pulling/running --- .../kuberuntime/kuberuntime_manager.go | 20 +-- .../util/pod_startup_latency_tracker.go | 164 +++++++++--------- .../util/pod_startup_latency_tracker_test.go | 159 +++++++++++++++-- 3 files changed, 233 insertions(+), 110 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 3e64dd6179c..9df6cd21e56 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -1677,8 +1677,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po return err } if typeName == "init container" { - if m.podInitContainerTimeRecorder != nil { - m.podInitContainerTimeRecorder.RecordInitContainerStarted(pod.UID, time.Now()) + // Don't measure restartable init containers (sidecars) + if !podutil.IsRestartableInitContainer(spec.container) { + if m.podInitContainerTimeRecorder != nil { + m.podInitContainerTimeRecorder.RecordInitContainerStarted(pod.UID, time.Now()) + } } } @@ -1713,17 +1716,10 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po for _, cs := range podStatus.ContainerStatuses { // Check if this is an init container for _, init := range pod.Spec.InitContainers { - if cs.Name == init.Name && cs.State == kubecontainer.ContainerStateExited && !cs.FinishedAt.IsZero() { - if m.podInitContainerTimeRecorder != nil { - m.podInitContainerTimeRecorder.RecordInitContainerFinished(pod.UID, cs.FinishedAt) - } + // Don't measure restartable init containers (sidecars) + if podutil.IsRestartableInitContainer(&init) { + continue } - } - } - - for _, cs := range podStatus.ContainerStatuses { - // Check if this is an init container - for _, init := range pod.Spec.InitContainers { if cs.Name == init.Name && cs.State == kubecontainer.ContainerStateExited && !cs.FinishedAt.IsZero() { if m.podInitContainerTimeRecorder != nil { m.podInitContainerTimeRecorder.RecordInitContainerFinished(pod.UID, cs.FinishedAt) diff --git a/pkg/kubelet/util/pod_startup_latency_tracker.go b/pkg/kubelet/util/pod_startup_latency_tracker.go index 7c3b7b63dd3..57c4d51fa48 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker.go @@ -50,15 +50,20 @@ type basicPodStartupLatencyTracker struct { clock clock.Clock } +type imagePullSession struct { + start time.Time + end time.Time +} type perPodState struct { - firstStartedPulling time.Time - lastFinishedPulling time.Time - firstInitContainerStart time.Time - 
lastInitContainerFinish time.Time - // first time, when pod status changed into Running + // Session-based image pulling tracking for accurate overlap handling + imagePullSessions []imagePullSession + imagePullSessionsStarts []time.Time // Track multiple concurrent pull starts + // Init container tracking + totalInitContainerRuntime time.Duration // Accumulated runtime of all init containers + currentInitContainerStart time.Time // Track current init container start + // Pod lifecycle tracking observedRunningTime time.Time - // log, if pod latency was already Observed - metricRecorded bool + metricRecorded bool } // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker @@ -79,6 +84,14 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim return } + // if the pod is not immediately schedulable, we do not have to track it anymore for startup + for _, condition := range pod.Status.Conditions { + if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionFalse { + delete(p.pods, pod.UID) + return + } + } + state := p.pods[pod.UID] if state == nil { // create a new record for pod, only if it was not yet acknowledged by the Kubelet @@ -106,77 +119,34 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim ctx := context.TODO() logger := klog.FromContext(ctx) podStartingDuration := when.Sub(pod.CreationTimestamp.Time) - var podStartSLOduration time.Duration - var excludedTimeStart time.Time - var excludedTimeEnd time.Time + podStartSLOduration := podStartingDuration - // Add time from pod creation to first excluded activity starts (either image pulling or init containers starting) - if !state.firstStartedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { - if state.firstStartedPulling.Before(state.firstInitContainerStart) { - excludedTimeStart = state.firstStartedPulling - } else { - excludedTimeStart = state.firstInitContainerStart - } - } else if !state.firstStartedPulling.IsZero() { - excludedTimeStart = state.firstStartedPulling - } else if !state.firstInitContainerStart.IsZero() { - excludedTimeStart = state.firstInitContainerStart + totalImagesPullingTime := calculateImagePullingTime(state.imagePullSessions) + if totalImagesPullingTime > 0 { + podStartSLOduration -= totalImagesPullingTime } - if !excludedTimeStart.IsZero() { - preExcludedDuration := excludedTimeStart.Sub(pod.CreationTimestamp.Time) - if preExcludedDuration > 0 { - podStartSLOduration += preExcludedDuration - } + if state.totalInitContainerRuntime > 0 { + podStartSLOduration -= state.totalInitContainerRuntime } - // Add gap between image pulling end and init container start if there is any - if !state.lastFinishedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { - // Only count gap if init container starts after image pulling ends (no overlap) - if state.firstInitContainerStart.After(state.lastFinishedPulling) { - gapDuration := state.firstInitContainerStart.Sub(state.lastFinishedPulling) - if gapDuration > 0 { - podStartSLOduration += gapDuration - } - } - } - - // Add time from last dependency completion to containers running - if state.lastFinishedPulling.After(state.lastInitContainerFinish) { - excludedTimeEnd = state.lastFinishedPulling - } else if !state.lastInitContainerFinish.IsZero() { - excludedTimeEnd = state.lastInitContainerFinish - } else if !state.lastFinishedPulling.IsZero() { - excludedTimeEnd = state.lastFinishedPulling - } - - if !excludedTimeEnd.IsZero() { - postExcludedDuration := 
when.Sub(excludedTimeEnd) - if postExcludedDuration > 0 { - podStartSLOduration += postExcludedDuration - } - } else if excludedTimeStart.IsZero() { - // No dependencies at all, count entire duration - podStartSLOduration = podStartingDuration - } - - isStatefulPod := isStatefulPod(pod) + podIsStateful := isStatefulPod(pod) logger.Info("Observed pod startup duration", "pod", klog.KObj(pod), "podStartSLOduration", podStartSLOduration.Seconds(), "podStartE2EDuration", podStartingDuration, - "isStatefulPod", isStatefulPod, + "totalImagesPullingTime", totalImagesPullingTime, + "totalInitContainerRuntime", state.totalInitContainerRuntime, + "isStatefulPod", podIsStateful, "podCreationTimestamp", pod.CreationTimestamp.Time, - "firstStartedPulling", state.firstStartedPulling, - "lastFinishedPulling", state.lastFinishedPulling, - "firstInitContainerStart", state.firstInitContainerStart, - "lastInitContainerFinish", state.lastInitContainerFinish, + "imagePullSessionsCount", len(state.imagePullSessions), + "imagePullSessionsStartsCount", len(state.imagePullSessionsStarts), "observedRunningTime", state.observedRunningTime, "watchObservedRunningTime", when) metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds()) - if !isStatefulPod { + if !podIsStateful { metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration.Seconds()) // if is the first Pod with network track the start values // these metrics will help to identify problems with the CNI plugin @@ -189,6 +159,36 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim } } +// calculateImagePullingTime computes the total time spent pulling images, +// accounting for overlapping pull sessions properly +func calculateImagePullingTime(sessions []imagePullSession) time.Duration { + if len(sessions) == 0 { + return 0 + } + + var totalTime time.Duration + var currentEnd time.Time + + for i, session := range sessions { + if session.end.IsZero() { + continue + } + + if i == 0 || session.start.After(currentEnd) { + // First session or no overlap with previous session + totalTime += session.end.Sub(session.start) + currentEnd = session.end + } else if session.end.After(currentEnd) { + // Partial overlap - add only the non-overlapping part + totalTime += session.end.Sub(currentEnd) + currentEnd = session.end + } + // If session.end <= currentEnd, it's completely overlapped + } + + return totalTime +} + func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) { p.lock.Lock() defer p.lock.Unlock() @@ -198,9 +198,8 @@ func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.U return } - if state.firstStartedPulling.IsZero() { - state.firstStartedPulling = p.clock.Now() - } + now := p.clock.Now() + state.imagePullSessionsStarts = append(state.imagePullSessionsStarts, now) } func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) { @@ -212,8 +211,18 @@ func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types. return } - if !state.firstStartedPulling.IsZero() { - state.lastFinishedPulling = p.clock.Now() // Now is always grater than values from the past. 
+ now := p.clock.Now() + + // Complete the oldest pull session if we have active starts + if len(state.imagePullSessionsStarts) > 0 { + // Take the first (oldest) start and create a session + startTime := state.imagePullSessionsStarts[0] + session := imagePullSession{ + start: startTime, + end: now, + } + state.imagePullSessions = append(state.imagePullSessions, session) + state.imagePullSessionsStarts = state.imagePullSessionsStarts[1:] } } @@ -226,9 +235,7 @@ func (p *basicPodStartupLatencyTracker) RecordInitContainerStarted(podUID types. return } - if state.firstInitContainerStart.IsZero() || startedAt.Before(state.firstInitContainerStart) { - state.firstInitContainerStart = startedAt - } + state.currentInitContainerStart = startedAt } func (p *basicPodStartupLatencyTracker) RecordInitContainerFinished(podUID types.UID, finishedAt time.Time) { @@ -240,8 +247,12 @@ func (p *basicPodStartupLatencyTracker) RecordInitContainerFinished(podUID types return } - if finishedAt.After(state.lastInitContainerFinish) { - state.lastInitContainerFinish = finishedAt + if !state.currentInitContainerStart.IsZero() { + initDuration := finishedAt.Sub(state.currentInitContainerStart) + if initDuration > 0 { + state.totalInitContainerRuntime += initDuration + } + state.currentInitContainerStart = time.Time{} } } @@ -298,12 +309,7 @@ func isStatefulPod(pod *v1.Pod) bool { if volume.Secret == nil && volume.ConfigMap == nil && volume.DownwardAPI == nil && - volume.EmptyDir == nil && - volume.Projected == nil && - volume.GitRepo == nil && - volume.Image == nil && - volume.Ephemeral == nil && - (volume.CSI == nil || volume.CSI.VolumeAttributes["csi.storage.k8s.io/ephemeral"] != "true") { + volume.EmptyDir == nil { return true } } diff --git a/pkg/kubelet/util/pod_startup_latency_tracker_test.go b/pkg/kubelet/util/pod_startup_latency_tracker_test.go index 5679f0568dd..f46d9c54896 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker_test.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker_test.go @@ -145,9 +145,9 @@ kubelet_pod_start_sli_duration_seconds_count 1 if !ok { t.Errorf("expected to track pod: %s, but pod not found", podInit.UID) } - if !podState.lastFinishedPulling.Equal(podState.firstStartedPulling.Add(time.Millisecond * 100)) { - t.Errorf("expected pod firstStartedPulling: %s and lastFinishedPulling: %s but got firstStartedPulling: %s and lastFinishedPulling: %s", - podState.firstStartedPulling, podState.firstStartedPulling.Add(time.Millisecond*100), podState.firstStartedPulling, podState.lastFinishedPulling) + + if len(podState.imagePullSessions) != 1 { + t.Errorf("expected one image pull session to be recorded") } podStarted := buildRunningPod() @@ -170,7 +170,7 @@ kubelet_pod_start_sli_duration_seconds_count 1 func TestSinglePodMultipleDownloadsAndRestartsRecorded(t *testing.T) { - t.Run("single pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) { + t.Run("single pod; started in 30s, overlapping image pulling between 10th and 20th seconds", func(t *testing.T) { wants := ` # HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch @@ -216,21 +216,19 @@ kubelet_pod_start_sli_duration_seconds_count 1 podInitializing := buildInitializingPod() tracker.ObservedPodOnWatch(podInitializing, frozenTime) - - // image pulling started at 10s and the last one finished at 30s - 
// first image starts pulling at 10s + // Image 1: 10-16s fakeClock.SetTime(frozenTime.Add(time.Second * 10)) tracker.RecordImageStartedPulling(podInitializing.UID) - // second image starts pulling at 11s + // Image 2: 11-18s fakeClock.SetTime(frozenTime.Add(time.Second * 11)) tracker.RecordImageStartedPulling(podInitializing.UID) - // third image starts pulling at 14s + // Image 3: 14-20s fakeClock.SetTime(frozenTime.Add(time.Second * 14)) tracker.RecordImageStartedPulling(podInitializing.UID) - // first image finished pulling at 18s + fakeClock.SetTime(frozenTime.Add(time.Second * 16)) + tracker.RecordImageFinishedPulling(podInitializing.UID) fakeClock.SetTime(frozenTime.Add(time.Second * 18)) tracker.RecordImageFinishedPulling(podInitializing.UID) - // second and third finished pulling at 20s fakeClock.SetTime(frozenTime.Add(time.Second * 20)) tracker.RecordImageFinishedPulling(podInitializing.UID) @@ -238,13 +236,13 @@ kubelet_pod_start_sli_duration_seconds_count 1 if !ok { t.Errorf("expected to track pod: %s, but pod not found", podInitializing.UID) } - if !podState.firstStartedPulling.Equal(frozenTime.Add(time.Second * 10)) { // second and third image start pulling should not affect pod firstStartedPulling - t.Errorf("expected pod firstStartedPulling: %s but got firstStartedPulling: %s", - podState.firstStartedPulling.Add(time.Second*10), podState.firstStartedPulling) + if len(podState.imagePullSessions) != 3 { + t.Errorf("expected 3 image pull sessions but got %d", len(podState.imagePullSessions)) } - if !podState.lastFinishedPulling.Equal(frozenTime.Add(time.Second * 20)) { // should be updated when the pod's last image finished pulling - t.Errorf("expected pod lastFinishedPulling: %s but got lastFinishedPulling: %s", - podState.lastFinishedPulling.Add(time.Second*20), podState.lastFinishedPulling) + totalTime := calculateImagePullingTime(podState.imagePullSessions) + expectedTime := time.Second * 10 + if totalTime != expectedTime { + t.Errorf("expected total pulling time: %v but got %v", expectedTime, totalTime) } // pod started @@ -275,6 +273,119 @@ kubelet_pod_start_sli_duration_seconds_count 1 }) } +func TestPodWithInitContainersAndMainContainers(t *testing.T) { + + t.Run("single pod with multiple init containers and main containers", func(t *testing.T) { + + wants := ` +# HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch +# TYPE kubelet_pod_start_sli_duration_seconds histogram +kubelet_pod_start_sli_duration_seconds_bucket{le="0.5"} 0 +kubelet_pod_start_sli_duration_seconds_bucket{le="1"} 0 +kubelet_pod_start_sli_duration_seconds_bucket{le="2"} 0 +kubelet_pod_start_sli_duration_seconds_bucket{le="3"} 0 +kubelet_pod_start_sli_duration_seconds_bucket{le="4"} 0 +kubelet_pod_start_sli_duration_seconds_bucket{le="5"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="6"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="8"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="10"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="20"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="30"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="45"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="60"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="120"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="180"} 1 
+kubelet_pod_start_sli_duration_seconds_bucket{le="240"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="300"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="360"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="480"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="600"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="900"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="1200"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="1800"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="2700"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="3600"} 1 +kubelet_pod_start_sli_duration_seconds_bucket{le="+Inf"} 1 +kubelet_pod_start_sli_duration_seconds_sum 4.2 +kubelet_pod_start_sli_duration_seconds_count 1 + ` + + fakeClock := testingclock.NewFakeClock(frozenTime) + + metrics.Register() + + tracker := &basicPodStartupLatencyTracker{ + pods: map[types.UID]*perPodState{}, + clock: fakeClock, + } + + podInit := buildInitializingPod("init-1", "init-2") + tracker.ObservedPodOnWatch(podInit, frozenTime) + + // Init container 1 image pull: 0.5s-1.5s + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 500)) + tracker.RecordImageStartedPulling(podInit.UID) + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 1500)) + tracker.RecordImageFinishedPulling(podInit.UID) + // Init container 1 runtime: 2s-4s + fakeClock.SetTime(frozenTime.Add(time.Second * 2)) + tracker.RecordInitContainerStarted(types.UID(uid), fakeClock.Now()) + fakeClock.SetTime(frozenTime.Add(time.Second * 4)) + tracker.RecordInitContainerFinished(types.UID(uid), fakeClock.Now()) + + // Init container 2 image pull: 4.2s-5.2s (sequential - after init-1 completes) + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 4200)) + tracker.RecordImageStartedPulling(podInit.UID) + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 5200)) + tracker.RecordImageFinishedPulling(podInit.UID) + // Init container 2 runtime: 5.5s-7s + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 5500)) + tracker.RecordInitContainerStarted(types.UID(uid), fakeClock.Now()) + fakeClock.SetTime(frozenTime.Add(time.Second * 7)) + tracker.RecordInitContainerFinished(types.UID(uid), fakeClock.Now()) + + // Main container 1 image pull: 7.2s-8.2s + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 7200)) + tracker.RecordImageStartedPulling(podInit.UID) + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 8200)) + tracker.RecordImageFinishedPulling(podInit.UID) + // Main container 2 image pull: 7.3s-8.5s + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 7300)) + tracker.RecordImageStartedPulling(podInit.UID) + fakeClock.SetTime(frozenTime.Add(time.Millisecond * 8500)) + tracker.RecordImageFinishedPulling(podInit.UID) + + // Pod becomes running at 11s + podStarted := buildRunningPod() + tracker.RecordStatusUpdated(podStarted) + tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*11)) + + if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { + t.Fatal(err) + } + + state := tracker.pods[types.UID(uid)] + assert.NotNil(t, state, "Pod state should exist") + + expectedInitRuntime := 2*time.Second + 1500*time.Millisecond + assert.Equal(t, expectedInitRuntime, state.totalInitContainerRuntime, + "Total init container runtime should be 3.5s (2s + 1.5s)") + + totalImageTime := calculateImagePullingTime(state.imagePullSessions) + expectedImageTime := 3300 * time.Millisecond // 1s + 1s + 1.3s concurrent overlap + assert.Equal(t, expectedImageTime, totalImageTime, + "Image 
pulling time should be 3.3s: init-1(1s) + init-2(1s) + concurrent-main(1.3s)") + + assert.Len(t, state.imagePullSessions, 4, "Should have 4 image pull sessions") + + // cleanup + tracker.DeletePodStartupState(podStarted.UID) + + assert.Empty(t, tracker.pods) + metrics.PodStartSLIDuration.Reset() + }) +} + func TestFirstNetworkPodMetrics(t *testing.T) { t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) { @@ -338,10 +449,20 @@ kubelet_first_network_pod_start_sli_duration_seconds 30 }) } -func buildInitializingPod() *corev1.Pod { - return buildPodWithStatus([]corev1.ContainerStatus{ +func buildInitializingPod(initContainerNames ...string) *corev1.Pod { + pod := buildPodWithStatus([]corev1.ContainerStatus{ {State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "PodInitializing"}}}, }) + + // Add init containers if specified + if len(initContainerNames) > 0 { + pod.Spec.InitContainers = make([]corev1.Container, len(initContainerNames)) + for i, name := range initContainerNames { + pod.Spec.InitContainers[i] = corev1.Container{Name: name} + } + } + + return pod } func buildRunningPod() *corev1.Pod { From 9b30ec8e5710ddb2cd71052af6d93e2e77377b45 Mon Sep 17 00:00:00 2001 From: alimaazamat Date: Tue, 9 Sep 2025 16:56:18 -0700 Subject: [PATCH 4/5] fix: exclude cases when pod becomes schedulable later --- .../kuberuntime/kuberuntime_manager.go | 1 - .../util/pod_startup_latency_tracker.go | 66 ++++-- .../util/pod_startup_latency_tracker_test.go | 221 +++++++++++++++++- 3 files changed, 261 insertions(+), 27 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 9df6cd21e56..56d98487bc5 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -273,7 +273,6 @@ func NewKubeGenericRuntimeManager( internalLifecycle: containerManager.InternalContainerLifecycle(), logManager: logManager, runtimeClassManager: runtimeClassManager, - allocationManager: allocationManager, logReduction: logreduction.NewLogReduction(identicalErrorDelay), seccompDefault: seccompDefault, memorySwapBehavior: memorySwapBehavior, diff --git a/pkg/kubelet/util/pod_startup_latency_tracker.go b/pkg/kubelet/util/pod_startup_latency_tracker.go index 57c4d51fa48..f0c116cc757 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker.go @@ -44,6 +44,9 @@ type basicPodStartupLatencyTracker struct { // protect against concurrent read and write on pods map lock sync.Mutex pods map[types.UID]*perPodState + // Track pods that were excluded from SLI due to unschedulability + // These pods should never be re-added even if they later become schedulable + excludedPods map[types.UID]bool // metrics for the first network pod only firstNetworkPodSeen bool // For testability @@ -59,18 +62,20 @@ type perPodState struct { imagePullSessions []imagePullSession imagePullSessionsStarts []time.Time // Track multiple concurrent pull starts // Init container tracking - totalInitContainerRuntime time.Duration // Accumulated runtime of all init containers - currentInitContainerStart time.Time // Track current init container start - // Pod lifecycle tracking + totalInitContainerRuntime time.Duration + currentInitContainerStart time.Time + // first time, when pod status changed into Running observedRunningTime time.Time - metricRecorded bool + // log, if pod latency was already Observed + metricRecorded bool 
} // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker func NewPodStartupLatencyTracker() PodStartupLatencyTracker { return &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, - clock: clock.RealClock{}, + pods: map[types.UID]*perPodState{}, + excludedPods: map[types.UID]bool{}, + clock: clock.RealClock{}, } } @@ -84,23 +89,31 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim return } - // if the pod is not immediately schedulable, we do not have to track it anymore for startup - for _, condition := range pod.Status.Conditions { - if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionFalse { - delete(p.pods, pod.UID) - return - } - } - state := p.pods[pod.UID] if state == nil { - // create a new record for pod, only if it was not yet acknowledged by the Kubelet - // this is required, as we want to log metric only for those pods, that where scheduled - // after Kubelet started - if pod.Status.StartTime.IsZero() { - p.pods[pod.UID] = &perPodState{} + // if pod was previously unschedulable, don't track it again + if p.excludedPods[pod.UID] { + return } + // create a new record for pod + if pod.Status.StartTime.IsZero() { + if isPodUnschedulable(pod) { + p.excludedPods[pod.UID] = true + return + } + + // if pod is schedulable then track it + state = &perPodState{} + p.pods[pod.UID] = state + } + return + } + + // remove existing pods from tracking (this handles cases where scheduling state becomes known later) + if isPodUnschedulable(pod) { + delete(p.pods, pod.UID) + p.excludedPods[pod.UID] = true return } @@ -316,9 +329,24 @@ func isStatefulPod(pod *v1.Pod) bool { return false } +// isPodUnschedulable determines if a pod should be excluded from SLI tracking +// according to the SLI definition: "By schedulable pod we mean a pod that has to be +// immediately (without actions from any other components) schedulable in the cluster +// without causing any preemption." +// Any pod with PodScheduled=False is not immediately schedulable and should be excluded. 
+func isPodUnschedulable(pod *v1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionFalse { + return true + } + } + return false +} + func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) { p.lock.Lock() defer p.lock.Unlock() delete(p.pods, podUID) + delete(p.excludedPods, podUID) } diff --git a/pkg/kubelet/util/pod_startup_latency_tracker_test.go b/pkg/kubelet/util/pod_startup_latency_tracker_test.go index f46d9c54896..8c5ad2a539c 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker_test.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker_test.go @@ -47,7 +47,8 @@ func TestNoEvents(t *testing.T) { metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, + excludedPods: map[types.UID]bool{}, } if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { @@ -68,7 +69,8 @@ func TestPodsRunningBeforeKubeletStarted(t *testing.T) { metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, + excludedPods: map[types.UID]bool{}, } if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { @@ -129,7 +131,7 @@ kubelet_pod_start_sli_duration_seconds_count 1 metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, clock: fakeClock, } @@ -210,7 +212,7 @@ kubelet_pod_start_sli_duration_seconds_count 1 metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, clock: fakeClock, } @@ -315,7 +317,7 @@ kubelet_pod_start_sli_duration_seconds_count 1 metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, clock: fakeClock, } @@ -333,7 +335,7 @@ kubelet_pod_start_sli_duration_seconds_count 1 fakeClock.SetTime(frozenTime.Add(time.Second * 4)) tracker.RecordInitContainerFinished(types.UID(uid), fakeClock.Now()) - // Init container 2 image pull: 4.2s-5.2s (sequential - after init-1 completes) + // Init container 2 image pull: 4.2s-5.2s fakeClock.SetTime(frozenTime.Add(time.Millisecond * 4200)) tracker.RecordImageStartedPulling(podInit.UID) fakeClock.SetTime(frozenTime.Add(time.Millisecond * 5200)) @@ -386,6 +388,211 @@ kubelet_pod_start_sli_duration_seconds_count 1 }) } +func TestImmediatelySchedulablePods(t *testing.T) { + t.Run("pods not immediately schedulable should be excluded from tracking", func(t *testing.T) { + wants := "" + + fakeClock := testingclock.NewFakeClock(frozenTime) + metrics.Register() + + tracker := &basicPodStartupLatencyTracker{ + pods: map[types.UID]*perPodState{}, + excludedPods: map[types.UID]bool{}, + clock: fakeClock, + } + + // pod that is not immediately schedulable (PodScheduled=False) + podNotSchedulable := buildInitializingPod() + podNotSchedulable.UID = "not-schedulable-pod" + podNotSchedulable.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Reason: "Unschedulable", + }, + } + tracker.ObservedPodOnWatch(podNotSchedulable, frozenTime) + assert.Empty(t, tracker.pods, "Pod with PodScheduled=False should 
not be tracked") + + // pod that is immediately schedulable (PodScheduled=True) + podSchedulable := buildInitializingPod() + podSchedulable.UID = "schedulable-pod" + podSchedulable.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionTrue, + Reason: "Scheduled", + }, + } + tracker.ObservedPodOnWatch(podSchedulable, frozenTime) + assert.Len(t, tracker.pods, 1, "Pod with PodScheduled=True should be tracked") + + // pod without PodScheduled condition + podNoCondition := buildInitializingPod() + podNoCondition.UID = "no-condition-pod" + tracker.ObservedPodOnWatch(podNoCondition, frozenTime) + assert.Len(t, tracker.pods, 2, "Pod without PodScheduled condition should be tracked by default") + + // Verify metrics are empty + if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { + t.Fatal(err) + } + + // cleanup + tracker.DeletePodStartupState(podSchedulable.UID) + tracker.DeletePodStartupState(podNoCondition.UID) + assert.Empty(t, tracker.pods) + metrics.PodStartSLIDuration.Reset() + }) + + t.Run("pod observed as schedulable first, then becomes unschedulable should not be tracked", func(t *testing.T) { + wants := "" + + fakeClock := testingclock.NewFakeClock(frozenTime) + metrics.Register() + + tracker := &basicPodStartupLatencyTracker{ + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, + clock: fakeClock, + } + + // First observe pod as schedulable + podSchedulable := buildInitializingPod() + podSchedulable.UID = "becomes-unschedulable" + tracker.ObservedPodOnWatch(podSchedulable, frozenTime) + assert.Len(t, tracker.pods, 1, "Pod should be tracked initially") + + // Later observe the same pod as unschedulable + podUnschedulable := buildInitializingPod() + podUnschedulable.UID = "becomes-unschedulable" + podUnschedulable.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Reason: "Unschedulable", + }, + } + + tracker.ObservedPodOnWatch(podUnschedulable, frozenTime.Add(time.Second)) + assert.Empty(t, tracker.pods, "Pod should be removed when it becomes unschedulable") + + // Verify no metrics + if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { + t.Fatal(err) + } + + metrics.PodStartSLIDuration.Reset() + }) + + t.Run("pod observed as unschedulable first, then becomes schedulable should remain not tracked", func(t *testing.T) { + wants := "" + + fakeClock := testingclock.NewFakeClock(frozenTime) + metrics.Register() + + tracker := &basicPodStartupLatencyTracker{ + pods: map[types.UID]*perPodState{}, + excludedPods: map[types.UID]bool{}, + clock: fakeClock, + } + + // First observe pod as unschedulable + podUnschedulable := buildInitializingPod() + podUnschedulable.UID = "unschedulable-first" + podUnschedulable.Status.Conditions = []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Reason: "Unschedulable", + }, + } + + tracker.ObservedPodOnWatch(podUnschedulable, frozenTime) + assert.Empty(t, tracker.pods, "Pod should not be tracked when first observed as unschedulable") + assert.True(t, tracker.excludedPods[podUnschedulable.UID], "Pod should be in excludedPods map") + + // Later observe the same pod as schedulable (e.g., after cluster autoscaling) + podSchedulable := buildInitializingPod() + podSchedulable.UID = "unschedulable-first" + podSchedulable.Status.Conditions = []corev1.PodCondition{ + { + Type: 
corev1.PodScheduled, + Status: corev1.ConditionTrue, + Reason: "Scheduled", + }, + } + + tracker.ObservedPodOnWatch(podSchedulable, frozenTime.Add(time.Second*5)) + assert.Empty(t, tracker.pods, "Pod should remain excluded even after becoming schedulable") + assert.True(t, tracker.excludedPods[podSchedulable.UID], "Pod should remain in excludedPods map") + + // Complete the startup process - should not record metrics + podRunning := buildRunningPod() + podRunning.UID = "unschedulable-first" + tracker.RecordStatusUpdated(podRunning) + tracker.ObservedPodOnWatch(podRunning, frozenTime.Add(time.Second*10)) + + // Verify no SLI metrics recorded + if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil { + t.Fatal(err) + } + + // cleanup + tracker.DeletePodStartupState(podRunning.UID) + assert.Empty(t, tracker.pods) + assert.False(t, tracker.excludedPods[podRunning.UID], "Pod should be removed from excludedPods on cleanup") + metrics.PodStartSLIDuration.Reset() + }) +} + +func TestStatefulPodExclusion(t *testing.T) { + t.Run("stateful pods should be excluded from SLI metrics", func(t *testing.T) { + wantsSLI := "" + + fakeClock := testingclock.NewFakeClock(frozenTime) + metrics.Register() + + tracker := &basicPodStartupLatencyTracker{ + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, + clock: fakeClock, + } + + // Create a stateful pod (with PVC volume) + statefulPod := buildInitializingPod() + statefulPod.UID = "stateful-pod" + statefulPod.Spec.Volumes = []corev1.Volume{ + { + Name: "persistent-storage", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: "test-pvc", + }, + }, + }, + } + + tracker.ObservedPodOnWatch(statefulPod, frozenTime) + + statefulPodRunning := buildRunningPod() + statefulPodRunning.UID = "stateful-pod" + statefulPodRunning.Spec.Volumes = statefulPod.Spec.Volumes + tracker.RecordStatusUpdated(statefulPodRunning) + + // Observe pod as running after 3 seconds + tracker.ObservedPodOnWatch(statefulPodRunning, frozenTime.Add(time.Second*3)) + + // Verify no SLI metrics for stateful pod ( + if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wantsSLI), metricsName); err != nil { + t.Fatal(err) + } + + // cleanup + tracker.DeletePodStartupState(statefulPod.UID) + assert.Empty(t, tracker.pods) + metrics.PodStartSLIDuration.Reset() + }) +} + func TestFirstNetworkPodMetrics(t *testing.T) { t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) { @@ -401,7 +608,7 @@ kubelet_first_network_pod_start_sli_duration_seconds 30 metrics.Register() tracker := &basicPodStartupLatencyTracker{ - pods: map[types.UID]*perPodState{}, + pods: map[types.UID]*perPodState{}, excludedPods: map[types.UID]bool{}, clock: fakeClock, } From 914230c9c909883cdc62541d0f94a3cc0f851602 Mon Sep 17 00:00:00 2001 From: alimaazamat Date: Wed, 8 Oct 2025 15:46:44 -0700 Subject: [PATCH 5/5] unit tests for calculateImagePullingTime --- .../util/pod_startup_latency_tracker_test.go | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/pkg/kubelet/util/pod_startup_latency_tracker_test.go b/pkg/kubelet/util/pod_startup_latency_tracker_test.go index 8c5ad2a539c..4c73093495b 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker_test.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker_test.go @@ -689,3 +689,78 @@ func buildPodWithStatus(cs []corev1.ContainerStatus) *corev1.Pod { }, } } + 
+func TestCalculateImagePullingTime(t *testing.T) { + baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + + t.Run("empty sessions should return zero", func(t *testing.T) { + sessions := []imagePullSession{} + result := calculateImagePullingTime(sessions) + assert.Equal(t, time.Duration(0), result) + }) + + t.Run("incomplete sessions should be ignored (for example: kubelet restart)", func(t *testing.T) { + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(3 * time.Second)}, // valid: 3s + {start: baseTime.Add(5 * time.Second), end: time.Time{}}, // incomplete, ignored + {start: baseTime.Add(10 * time.Second), end: baseTime.Add(12 * time.Second)}, // valid: 2s + } + result := calculateImagePullingTime(sessions) + assert.Equal(t, 5*time.Second, result) // 3s + 2s + }) + + t.Run("non-overlapping sessions", func(t *testing.T) { + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(2 * time.Second)}, // 2s + {start: baseTime.Add(5 * time.Second), end: baseTime.Add(8 * time.Second)}, // 3s + {start: baseTime.Add(10 * time.Second), end: baseTime.Add(15 * time.Second)}, // 5s + } + result := calculateImagePullingTime(sessions) + assert.Equal(t, 10*time.Second, result) // 2s + 3s + 5s + }) + + t.Run("partially overlapping sessions", func(t *testing.T) { + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(5 * time.Second)}, // 0-5s + {start: baseTime.Add(3 * time.Second), end: baseTime.Add(8 * time.Second)}, // 3-8s (overlap 3-5s) + {start: baseTime.Add(7 * time.Second), end: baseTime.Add(12 * time.Second)}, // 7-12s (overlap 7-8s) + } + result := calculateImagePullingTime(sessions) + assert.Equal(t, 12*time.Second, result) // 12s + }) + + t.Run("completely overlapped sessions", func(t *testing.T) { + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(10 * time.Second)}, // 0-10s + {start: baseTime.Add(2 * time.Second), end: baseTime.Add(8 * time.Second)}, // 2-8s (completely inside) + {start: baseTime.Add(3 * time.Second), end: baseTime.Add(5 * time.Second)}, // 3-5s (completely inside) + } + result := calculateImagePullingTime(sessions) + assert.Equal(t, 10*time.Second, result) // Only outer session: 10s + }) + + t.Run("partially and completely overlapping sessions", func(t *testing.T) { + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(10 * time.Second)}, // 0-10s + {start: baseTime.Add(2 * time.Second), end: baseTime.Add(5 * time.Second)}, // 2-5s (completely inside first) + {start: baseTime.Add(3 * time.Second), end: baseTime.Add(7 * time.Second)}, // 3-7s (completely inside first and partially overlapping second) + } + result := calculateImagePullingTime(sessions) + assert.Equal(t, 10*time.Second, result) // 10s + }) + + t.Run("completely overlapped and partially overlapped sessions with gap", func(t *testing.T) { + // Test mixing completely overlapped sessions with separate sessions + sessions := []imagePullSession{ + {start: baseTime, end: baseTime.Add(8 * time.Second)}, // 0-8s + {start: baseTime.Add(2 * time.Second), end: baseTime.Add(6 * time.Second)}, // 2-6s (completely inside first) + {start: baseTime.Add(3 * time.Second), end: baseTime.Add(5 * time.Second)}, // 3-5s (completely inside first) + {start: baseTime.Add(15 * time.Second), end: baseTime.Add(20 * time.Second)}, // 15-20s (separate) + {start: baseTime.Add(16 * time.Second), end: baseTime.Add(18 * time.Second)}, // 16-18s (completely inside fourth) + {start: baseTime.Add(17 * time.Second), end: baseTime.Add(21 * 
time.Second)}, // 17-21s (partially overlapping fourth and fifth) + } + result := calculateImagePullingTime(sessions) + // 8s (first group) + 6s (second group) = 14s + assert.Equal(t, 14*time.Second, result) + }) +}
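
A minimal standalone sketch of how the reworked SLI arithmetic in this series composes: pod start E2E duration, minus merged image-pull time, minus accumulated init-container runtime. This is illustrative only; the session type and mergedPullTime helper are hypothetical stand-ins for the unexported imagePullSession and calculateImagePullingTime introduced in patch 3, and the timeline constants are lifted from TestPodWithInitContainersAndMainContainers, reproducing its expected 4.2s SLI sum.

package main

import (
	"fmt"
	"time"
)

// session mirrors the patch's imagePullSession: one completed image pull interval.
type session struct{ start, end time.Time }

// mergedPullTime restates the interval-merge idea of calculateImagePullingTime:
// sum pull time while collapsing overlapping sessions so concurrent pulls are
// not double-counted.
func mergedPullTime(sessions []session) time.Duration {
	var total time.Duration
	var currentEnd time.Time
	for i, s := range sessions {
		if s.end.IsZero() {
			// incomplete session (e.g. pull still in flight), ignore
			continue
		}
		switch {
		case i == 0 || s.start.After(currentEnd):
			total += s.end.Sub(s.start)
			currentEnd = s.end
		case s.end.After(currentEnd):
			total += s.end.Sub(currentEnd)
			currentEnd = s.end
		}
	}
	return total
}

func main() {
	base := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	at := func(ms int) time.Time { return base.Add(time.Duration(ms) * time.Millisecond) }

	// Timeline taken from TestPodWithInitContainersAndMainContainers.
	pulls := []session{
		{start: at(500), end: at(1500)},  // init-1 image: 1s
		{start: at(4200), end: at(5200)}, // init-2 image: 1s
		{start: at(7200), end: at(8200)}, // main-1 image
		{start: at(7300), end: at(8500)}, // main-2 image, overlaps main-1; merged 1.3s
	}
	initRuntime := 2*time.Second + 1500*time.Millisecond // init-1 ran 2s, init-2 ran 1.5s
	e2e := 11 * time.Second                              // pod creation to observed Running

	sli := e2e - mergedPullTime(pulls) - initRuntime
	fmt.Println(mergedPullTime(pulls), initRuntime, sli) // prints 3.3s 3.5s 4.2s
}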