diff --git a/pkg/kubelet/util/pod_startup_latency_tracker.go b/pkg/kubelet/util/pod_startup_latency_tracker.go index 60b4b6ebe9c..7c3b7b63dd3 100644 --- a/pkg/kubelet/util/pod_startup_latency_tracker.go +++ b/pkg/kubelet/util/pod_startup_latency_tracker.go @@ -106,14 +106,67 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim ctx := context.TODO() logger := klog.FromContext(ctx) podStartingDuration := when.Sub(pod.CreationTimestamp.Time) - imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling) - initContainerDuration := state.lastInitContainerFinish.Sub(state.firstInitContainerStart) - podStartSLOduration := (podStartingDuration - imagePullingDuration - initContainerDuration).Seconds() + var podStartSLOduration time.Duration + var excludedTimeStart time.Time + var excludedTimeEnd time.Time + + // Add time from pod creation to first excluded activity starts (either image pulling or init containers starting) + if !state.firstStartedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { + if state.firstStartedPulling.Before(state.firstInitContainerStart) { + excludedTimeStart = state.firstStartedPulling + } else { + excludedTimeStart = state.firstInitContainerStart + } + } else if !state.firstStartedPulling.IsZero() { + excludedTimeStart = state.firstStartedPulling + } else if !state.firstInitContainerStart.IsZero() { + excludedTimeStart = state.firstInitContainerStart + } + + if !excludedTimeStart.IsZero() { + preExcludedDuration := excludedTimeStart.Sub(pod.CreationTimestamp.Time) + if preExcludedDuration > 0 { + podStartSLOduration += preExcludedDuration + } + } + + // Add gap between image pulling end and init container start if there is any + if !state.lastFinishedPulling.IsZero() && !state.firstInitContainerStart.IsZero() { + // Only count gap if init container starts after image pulling ends (no overlap) + if state.firstInitContainerStart.After(state.lastFinishedPulling) { + gapDuration := state.firstInitContainerStart.Sub(state.lastFinishedPulling) + if gapDuration > 0 { + podStartSLOduration += gapDuration + } + } + } + + // Add time from last dependency completion to containers running + if state.lastFinishedPulling.After(state.lastInitContainerFinish) { + excludedTimeEnd = state.lastFinishedPulling + } else if !state.lastInitContainerFinish.IsZero() { + excludedTimeEnd = state.lastInitContainerFinish + } else if !state.lastFinishedPulling.IsZero() { + excludedTimeEnd = state.lastFinishedPulling + } + + if !excludedTimeEnd.IsZero() { + postExcludedDuration := when.Sub(excludedTimeEnd) + if postExcludedDuration > 0 { + podStartSLOduration += postExcludedDuration + } + } else if excludedTimeStart.IsZero() { + // No dependencies at all, count entire duration + podStartSLOduration = podStartingDuration + } + + isStatefulPod := isStatefulPod(pod) logger.Info("Observed pod startup duration", "pod", klog.KObj(pod), - "podStartSLOduration", podStartSLOduration, + "podStartSLOduration", podStartSLOduration.Seconds(), "podStartE2EDuration", podStartingDuration, + "isStatefulPod", isStatefulPod, "podCreationTimestamp", pod.CreationTimestamp.Time, "firstStartedPulling", state.firstStartedPulling, "lastFinishedPulling", state.lastFinishedPulling, @@ -122,15 +175,17 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim "observedRunningTime", state.observedRunningTime, "watchObservedRunningTime", when) - metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration) metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds()) - state.metricRecorded = true - // if is the first Pod with network track the start values - // these metrics will help to identify problems with the CNI plugin - if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen { - metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration) - p.firstNetworkPodSeen = true + if !isStatefulPod { + metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration.Seconds()) + // if is the first Pod with network track the start values + // these metrics will help to identify problems with the CNI plugin + if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen { + metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration.Seconds()) + p.firstNetworkPodSeen = true + } } + state.metricRecorded = true } } @@ -232,6 +287,29 @@ func hasPodStartedSLO(pod *v1.Pod) bool { return true } +// isStatefulPod determines if a pod is stateful according to the SLI documentation: +// "A stateful pod is defined as a pod that mounts at least one volume with sources +// other than secrets, config maps, downward API and empty dir." +// This should reflect the "stateful pod" definition +// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md +func isStatefulPod(pod *v1.Pod) bool { + for _, volume := range pod.Spec.Volumes { + // Check if this volume is NOT a stateless/ephemeral type + if volume.Secret == nil && + volume.ConfigMap == nil && + volume.DownwardAPI == nil && + volume.EmptyDir == nil && + volume.Projected == nil && + volume.GitRepo == nil && + volume.Image == nil && + volume.Ephemeral == nil && + (volume.CSI == nil || volume.CSI.VolumeAttributes["csi.storage.k8s.io/ephemeral"] != "true") { + return true + } + } + return false +} + func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) { p.lock.Lock() defer p.lock.Unlock()