fix: exclude init container + image pulling time and check pod is stateless

This commit is contained in:
alimaazamat 2025-07-24 17:02:29 -07:00
parent 81c14b7dd5
commit 871f6e1ebe

View file

@ -106,14 +106,67 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
ctx := context.TODO()
logger := klog.FromContext(ctx)
podStartingDuration := when.Sub(pod.CreationTimestamp.Time)
imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling)
initContainerDuration := state.lastInitContainerFinish.Sub(state.firstInitContainerStart)
podStartSLOduration := (podStartingDuration - imagePullingDuration - initContainerDuration).Seconds()
var podStartSLOduration time.Duration
var excludedTimeStart time.Time
var excludedTimeEnd time.Time
// Add time from pod creation to first excluded activity starts (either image pulling or init containers starting)
if !state.firstStartedPulling.IsZero() && !state.firstInitContainerStart.IsZero() {
if state.firstStartedPulling.Before(state.firstInitContainerStart) {
excludedTimeStart = state.firstStartedPulling
} else {
excludedTimeStart = state.firstInitContainerStart
}
} else if !state.firstStartedPulling.IsZero() {
excludedTimeStart = state.firstStartedPulling
} else if !state.firstInitContainerStart.IsZero() {
excludedTimeStart = state.firstInitContainerStart
}
if !excludedTimeStart.IsZero() {
preExcludedDuration := excludedTimeStart.Sub(pod.CreationTimestamp.Time)
if preExcludedDuration > 0 {
podStartSLOduration += preExcludedDuration
}
}
// Add gap between image pulling end and init container start if there is any
if !state.lastFinishedPulling.IsZero() && !state.firstInitContainerStart.IsZero() {
// Only count gap if init container starts after image pulling ends (no overlap)
if state.firstInitContainerStart.After(state.lastFinishedPulling) {
gapDuration := state.firstInitContainerStart.Sub(state.lastFinishedPulling)
if gapDuration > 0 {
podStartSLOduration += gapDuration
}
}
}
// Add time from last dependency completion to containers running
if state.lastFinishedPulling.After(state.lastInitContainerFinish) {
excludedTimeEnd = state.lastFinishedPulling
} else if !state.lastInitContainerFinish.IsZero() {
excludedTimeEnd = state.lastInitContainerFinish
} else if !state.lastFinishedPulling.IsZero() {
excludedTimeEnd = state.lastFinishedPulling
}
if !excludedTimeEnd.IsZero() {
postExcludedDuration := when.Sub(excludedTimeEnd)
if postExcludedDuration > 0 {
podStartSLOduration += postExcludedDuration
}
} else if excludedTimeStart.IsZero() {
// No dependencies at all, count entire duration
podStartSLOduration = podStartingDuration
}
isStatefulPod := isStatefulPod(pod)
logger.Info("Observed pod startup duration",
"pod", klog.KObj(pod),
"podStartSLOduration", podStartSLOduration,
"podStartSLOduration", podStartSLOduration.Seconds(),
"podStartE2EDuration", podStartingDuration,
"isStatefulPod", isStatefulPod,
"podCreationTimestamp", pod.CreationTimestamp.Time,
"firstStartedPulling", state.firstStartedPulling,
"lastFinishedPulling", state.lastFinishedPulling,
@ -122,15 +175,17 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
"observedRunningTime", state.observedRunningTime,
"watchObservedRunningTime", when)
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
state.metricRecorded = true
// if is the first Pod with network track the start values
// these metrics will help to identify problems with the CNI plugin
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
p.firstNetworkPodSeen = true
if !isStatefulPod {
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration.Seconds())
// if is the first Pod with network track the start values
// these metrics will help to identify problems with the CNI plugin
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration.Seconds())
p.firstNetworkPodSeen = true
}
}
state.metricRecorded = true
}
}
@ -232,6 +287,29 @@ func hasPodStartedSLO(pod *v1.Pod) bool {
return true
}
// isStatefulPod determines if a pod is stateful according to the SLI documentation:
// "A stateful pod is defined as a pod that mounts at least one volume with sources
// other than secrets, config maps, downward API and empty dir."
// This should reflect the "stateful pod" definition
// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
func isStatefulPod(pod *v1.Pod) bool {
for _, volume := range pod.Spec.Volumes {
// Check if this volume is NOT a stateless/ephemeral type
if volume.Secret == nil &&
volume.ConfigMap == nil &&
volume.DownwardAPI == nil &&
volume.EmptyDir == nil &&
volume.Projected == nil &&
volume.GitRepo == nil &&
volume.Image == nil &&
volume.Ephemeral == nil &&
(volume.CSI == nil || volume.CSI.VolumeAttributes["csi.storage.k8s.io/ephemeral"] != "true") {
return true
}
}
return false
}
func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) {
p.lock.Lock()
defer p.lock.Unlock()