Alima Azamat 2026-02-04 02:09:08 +01:00 committed by GitHub
commit e7b7d72d51
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 666 additions and 77 deletions

View file

@@ -640,6 +640,7 @@ func NewMainKubelet(ctx context.Context,
nodeStatusMaxImages: nodeStatusMaxImages,
tracer: tracer,
nodeStartupLatencyTracker: kubeDeps.NodeStartupLatencyTracker,
podStartupLatencyTracker: kubeDeps.PodStartupLatencyTracker,
healthChecker: kubeDeps.HealthChecker,
flagz: kubeDeps.Flagz,
}
@@ -795,10 +796,11 @@ func NewMainKubelet(ctx context.Context,
kubeCfg.MemorySwap.SwapBehavior,
kubeDeps.ContainerManager.GetNodeAllocatableAbsolute,
*kubeCfg.MemoryThrottlingFactor,
kubeDeps.PodStartupLatencyTracker,
klet.podStartupLatencyTracker,
kubeDeps.TracerProvider,
tokenManager,
getServiceAccount,
klet.podStartupLatencyTracker,
)
if err != nil {
return nil, err
@@ -1519,6 +1521,9 @@ type Kubelet struct {
// Track node startup latencies
nodeStartupLatencyTracker util.NodeStartupLatencyTracker
// Track pod startup latencies
podStartupLatencyTracker util.PodStartupLatencyTracker
// Health checker for the kubelet
healthChecker watchdog.HealthChecker

View file

@@ -302,8 +302,8 @@ func newTestKubeletWithImageList(
kubelet.configMapManager = configMapManager
kubelet.mirrorPodClient = fakeMirrorClient
kubelet.podManager = kubepod.NewBasicPodManager()
podStartupLatencyTracker := kubeletutil.NewPodStartupLatencyTracker()
kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, podStartupLatencyTracker)
kubelet.podStartupLatencyTracker = kubeletutil.NewPodStartupLatencyTracker()
kubelet.statusManager = status.NewManager(fakeKubeClient, kubelet.podManager, &statustest.FakePodDeletionSafetyProvider{}, kubelet.podStartupLatencyTracker)
kubelet.nodeStartupLatencyTracker = kubeletutil.NewNodeStartupLatencyTracker()
kubelet.podCertificateManager = &podcertificate.NoOpManager{}
@@ -3474,10 +3474,11 @@ func TestSyncPodSpans(t *testing.T) {
kubeCfg.MemorySwap.SwapBehavior,
kubelet.containerManager.GetNodeAllocatableAbsolute,
*kubeCfg.MemoryThrottlingFactor,
kubeletutil.NewPodStartupLatencyTracker(),
kubelet.podStartupLatencyTracker,
tp,
token.NewManager(kubelet.kubeClient),
func(string, string) (*v1.ServiceAccount, error) { return nil, nil },
kubelet.podStartupLatencyTracker,
)
assert.NoError(t, err)
kubelet.allocationManager.SetContainerRuntime(kubelet.containerRuntime)

View file

@@ -106,6 +106,11 @@ type podStateProvider interface {
ShouldPodRuntimeBeRemoved(kubetypes.UID) bool
}
type PodInitContainerTimeRecorder interface {
RecordInitContainerStarted(podUID kubetypes.UID, startedAt time.Time)
RecordInitContainerFinished(podUID kubetypes.UID, finishedAt time.Time)
}
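The runtime manager guards every use of this recorder with a nil check (see SyncPod below), so wiring one in is optional. A minimal no-op sketch that satisfies the interface (the type name is illustrative, not part of this change):
// noopInitContainerTimeRecorder discards all observations; useful in
// tests or in configurations without pod startup latency tracking.
type noopInitContainerTimeRecorder struct{}

func (noopInitContainerTimeRecorder) RecordInitContainerStarted(podUID kubetypes.UID, startedAt time.Time) {
}

func (noopInitContainerTimeRecorder) RecordInitContainerFinished(podUID kubetypes.UID, finishedAt time.Time) {
}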
type kubeGenericRuntimeManager struct {
runtimeName string
recorder record.EventRecorder
@@ -191,6 +196,9 @@ type kubeGenericRuntimeManager struct {
// Swap controller availability check function (Linux only)
// Uses sync.OnceValue for lazy initialization
getSwapControllerAvailable func() bool
// Records first initContainer start time and last initContainer finish time
podInitContainerTimeRecorder PodInitContainerTimeRecorder
}
// KubeGenericRuntime is an interface that contains the interfaces for the container runtime and command.
@@ -240,6 +248,7 @@ func NewKubeGenericRuntimeManager(
tracerProvider trace.TracerProvider,
tokenManager *token.Manager,
getServiceAccount plugin.GetServiceAccountFunc,
podInitContainerTimeRecorder PodInitContainerTimeRecorder,
) (KubeGenericRuntime, []images.PostImageGCHook, error) {
logger := klog.FromContext(ctx)
@@ -247,29 +256,30 @@ func NewKubeGenericRuntimeManager(
imageService = newInstrumentedImageManagerService(imageService)
tracer := tracerProvider.Tracer(instrumentationScope)
kubeRuntimeManager := &kubeGenericRuntimeManager{
recorder: recorder,
singleProcessOOMKill: singleProcessOOMKill,
cpuCFSQuota: cpuCFSQuota,
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"),
livenessManager: livenessManager,
readinessManager: readinessManager,
startupManager: startupManager,
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: runtimeService,
imageService: imageService,
containerManager: containerManager,
internalLifecycle: containerManager.InternalContainerLifecycle(),
logManager: logManager,
runtimeClassManager: runtimeClassManager,
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
seccompDefault: seccompDefault,
memorySwapBehavior: memorySwapBehavior,
getNodeAllocatable: getNodeAllocatable,
memoryThrottlingFactor: memoryThrottlingFactor,
podLogsDirectory: podLogsDirectory,
recorder: recorder,
singleProcessOOMKill: singleProcessOOMKill,
cpuCFSQuota: cpuCFSQuota,
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"),
livenessManager: livenessManager,
readinessManager: readinessManager,
startupManager: startupManager,
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: runtimeService,
imageService: imageService,
containerManager: containerManager,
internalLifecycle: containerManager.InternalContainerLifecycle(),
logManager: logManager,
runtimeClassManager: runtimeClassManager,
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
seccompDefault: seccompDefault,
memorySwapBehavior: memorySwapBehavior,
getNodeAllocatable: getNodeAllocatable,
memoryThrottlingFactor: memoryThrottlingFactor,
podLogsDirectory: podLogsDirectory,
podInitContainerTimeRecorder: podInitContainerTimeRecorder,
}
// Initialize swap controller availability check with lazy evaluation
@@ -1665,6 +1675,14 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
}
return err
}
if typeName == "init container" {
// Don't measure restartable init containers (sidecars)
if !podutil.IsRestartableInitContainer(spec.container) {
if m.podInitContainerTimeRecorder != nil {
m.podInitContainerTimeRecorder.RecordInitContainerStarted(pod.UID, time.Now())
}
}
}
return nil
}
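For context, podutil.IsRestartableInitContainer treats an init container as a sidecar when it declares RestartPolicy: Always; a simplified sketch of the check relied on above:
// A sidecar is an init container whose restart policy is Always; it keeps
// running alongside the main containers, so its lifetime is not part of
// the init phase and must not be subtracted from pod startup.
func isRestartableInitContainer(c *v1.Container) bool {
	return c != nil && c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways
}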
@@ -1694,6 +1712,21 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
logger.V(4).Info("Completed init container for pod", "containerName", container.Name, "pod", klog.KObj(pod))
}
for _, cs := range podStatus.ContainerStatuses {
// Check if this is an init container
for _, init := range pod.Spec.InitContainers {
// Don't measure restartable init containers (sidecars)
if podutil.IsRestartableInitContainer(&init) {
continue
}
if cs.Name == init.Name && cs.State == kubecontainer.ContainerStateExited && !cs.FinishedAt.IsZero() {
if m.podInitContainerTimeRecorder != nil {
m.podInitContainerTimeRecorder.RecordInitContainerFinished(pod.UID, cs.FinishedAt)
}
}
}
}
// Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources
if resizable, _, _ := allocation.IsInPlacePodVerticalScalingAllowed(pod); resizable {
if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources || podContainerChanges.UpdatePodLevelResources {

View file

@@ -34,6 +34,8 @@ type PodStartupLatencyTracker interface {
ObservedPodOnWatch(pod *v1.Pod, when time.Time)
RecordImageStartedPulling(podUID types.UID)
RecordImageFinishedPulling(podUID types.UID)
RecordInitContainerStarted(podUID types.UID, startedAt time.Time)
RecordInitContainerFinished(podUID types.UID, finishedAt time.Time)
RecordStatusUpdated(pod *v1.Pod)
DeletePodStartupState(podUID types.UID)
}
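Read as a whole, the interface implies an ordering of callbacks across one pod's startup. A hedged walkthrough (tracker, pod, and the timestamps t0..t3 are assumed for illustration):
tracker.ObservedPodOnWatch(pod, t0)              // first watch event; tracking state created
tracker.RecordImageStartedPulling(pod.UID)       // each pull start opens a session
tracker.RecordImageFinishedPulling(pod.UID)      // each finish closes the oldest open session
tracker.RecordInitContainerStarted(pod.UID, t1)  // a non-sidecar init container begins
tracker.RecordInitContainerFinished(pod.UID, t2) // its runtime joins the exclusion total
tracker.RecordStatusUpdated(pod)                 // kubelet reports the pod Running
tracker.ObservedPodOnWatch(pod, t3)              // Running observed on watch; metric emitted once
tracker.DeletePodStartupState(pod.UID)           // cleanup when the pod is deleted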
@@ -42,15 +44,26 @@ type basicPodStartupLatencyTracker struct {
// protect against concurrent read and write on pods map
lock sync.Mutex
pods map[types.UID]*perPodState
// Track pods that were excluded from SLI due to unschedulability
// These pods should never be re-added even if they later become schedulable
excludedPods map[types.UID]bool
// metrics for the first network pod only
firstNetworkPodSeen bool
// For testability
clock clock.Clock
}
type imagePullSession struct {
start time.Time
end time.Time
}
type perPodState struct {
firstStartedPulling time.Time
lastFinishedPulling time.Time
// Session-based image pulling tracking for accurate overlap handling
imagePullSessions []imagePullSession
imagePullSessionsStarts []time.Time // Track multiple concurrent pull starts
// Init container tracking
totalInitContainerRuntime time.Duration
currentInitContainerStart time.Time
// first time the pod status changed to Running
observedRunningTime time.Time
// records whether the pod startup latency was already observed
@@ -60,8 +73,9 @@ type perPodState struct {
// NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker
func NewPodStartupLatencyTracker() PodStartupLatencyTracker {
return &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
clock: clock.RealClock{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: clock.RealClock{},
}
}
@@ -77,13 +91,29 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
state := p.pods[pod.UID]
if state == nil {
// create a new record for pod, only if it was not yet acknowledged by the Kubelet
// this is required, as we want to log the metric only for those pods that were scheduled
// after Kubelet started
if pod.Status.StartTime.IsZero() {
p.pods[pod.UID] = &perPodState{}
// if pod was previously unschedulable, don't track it again
if p.excludedPods[pod.UID] {
return
}
// create a new record for pod
if pod.Status.StartTime.IsZero() {
if isPodUnschedulable(pod) {
p.excludedPods[pod.UID] = true
return
}
// if pod is schedulable then track it
state = &perPodState{}
p.pods[pod.UID] = state
}
return
}
// remove an already-tracked pod if it turns out to be unschedulable (its scheduling state may only become known on a later watch event)
if isPodUnschedulable(pod) {
delete(p.pods, pod.UID)
p.excludedPods[pod.UID] = true
return
}
@@ -102,31 +132,76 @@ func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when tim
ctx := context.TODO()
logger := klog.FromContext(ctx)
podStartingDuration := when.Sub(pod.CreationTimestamp.Time)
imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling)
podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds()
podStartSLOduration := podStartingDuration
totalImagesPullingTime := calculateImagePullingTime(state.imagePullSessions)
if totalImagesPullingTime > 0 {
podStartSLOduration -= totalImagesPullingTime
}
if state.totalInitContainerRuntime > 0 {
podStartSLOduration -= state.totalInitContainerRuntime
}
podIsStateful := isStatefulPod(pod)
logger.Info("Observed pod startup duration",
"pod", klog.KObj(pod),
"podStartSLOduration", podStartSLOduration,
"podStartSLOduration", podStartSLOduration.Seconds(),
"podStartE2EDuration", podStartingDuration,
"totalImagesPullingTime", totalImagesPullingTime,
"totalInitContainerRuntime", state.totalInitContainerRuntime,
"isStatefulPod", podIsStateful,
"podCreationTimestamp", pod.CreationTimestamp.Time,
"firstStartedPulling", state.firstStartedPulling,
"lastFinishedPulling", state.lastFinishedPulling,
"imagePullSessionsCount", len(state.imagePullSessions),
"imagePullSessionsStartsCount", len(state.imagePullSessionsStarts),
"observedRunningTime", state.observedRunningTime,
"watchObservedRunningTime", when)
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
state.metricRecorded = true
// if this is the first Pod with network, track the start values
// these metrics will help to identify problems with the CNI plugin
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
p.firstNetworkPodSeen = true
if !podIsStateful {
metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration.Seconds())
// if this is the first Pod with network, track the start values
// these metrics will help to identify problems with the CNI plugin
if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration.Seconds())
p.firstNetworkPodSeen = true
}
}
state.metricRecorded = true
}
}
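As a worked example of the arithmetic above (illustrative numbers): a pod created at t=0 and observed running on the watch at t=30s, with 10s of merged image pulling and 5s of non-sidecar init container runtime, gives podStartE2EDuration = 30s and podStartSLOduration = 30s - 10s - 5s = 15s; if the pod is stateful, the SLI histogram is not observed at all.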
// calculateImagePullingTime computes the total time spent pulling images,
// accounting for overlapping pull sessions properly
func calculateImagePullingTime(sessions []imagePullSession) time.Duration {
if len(sessions) == 0 {
return 0
}
var totalTime time.Duration
var currentEnd time.Time
for i, session := range sessions {
if session.end.IsZero() {
continue
}
if i == 0 || session.start.After(currentEnd) {
// First session or no overlap with previous session
totalTime += session.end.Sub(session.start)
currentEnd = session.end
} else if session.end.After(currentEnd) {
// Partial overlap - add only the non-overlapping part
totalTime += session.end.Sub(currentEnd)
currentEnd = session.end
}
// If session.end <= currentEnd, it's completely overlapped
}
return totalTime
}
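Tracing the sweep on three sessions [0s,5s], [3s,8s], [12s,14s] (illustrative values): the first contributes 5s and sets currentEnd to 5s; the second overlaps partially and adds only 8s-5s = 3s; the third starts after currentEnd and adds 2s. The total is 10s of wall-clock pulling, rather than the naive per-session sum of 5s+5s+2s = 12s.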
func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) {
p.lock.Lock()
defer p.lock.Unlock()
@@ -136,9 +211,8 @@ func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.U
return
}
if state.firstStartedPulling.IsZero() {
state.firstStartedPulling = p.clock.Now()
}
now := p.clock.Now()
state.imagePullSessionsStarts = append(state.imagePullSessionsStarts, now)
}
func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) {
@@ -150,8 +224,48 @@ func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.
return
}
if !state.firstStartedPulling.IsZero() {
state.lastFinishedPulling = p.clock.Now() // Now is always greater than values from the past.
now := p.clock.Now()
// Complete the oldest pull session if we have active starts
if len(state.imagePullSessionsStarts) > 0 {
// Take the first (oldest) start and create a session
startTime := state.imagePullSessionsStarts[0]
session := imagePullSession{
start: startTime,
end: now,
}
state.imagePullSessions = append(state.imagePullSessions, session)
state.imagePullSessionsStarts = state.imagePullSessionsStarts[1:]
}
}
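Note that under concurrent pulls this pairs each finish with the oldest outstanding start, so individual sessions may be attributed to the wrong image (starts at 10s and 11s with finishes at 16s and 18s become sessions [10s,16s] and [11s,18s]). The merged total from calculateImagePullingTime is unaffected, since only the union of the covered intervals matters for the metric.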
func (p *basicPodStartupLatencyTracker) RecordInitContainerStarted(podUID types.UID, startedAt time.Time) {
p.lock.Lock()
defer p.lock.Unlock()
state := p.pods[podUID]
if state == nil {
return
}
state.currentInitContainerStart = startedAt
}
func (p *basicPodStartupLatencyTracker) RecordInitContainerFinished(podUID types.UID, finishedAt time.Time) {
p.lock.Lock()
defer p.lock.Unlock()
state := p.pods[podUID]
if state == nil {
return
}
if !state.currentInitContainerStart.IsZero() {
initDuration := finishedAt.Sub(state.currentInitContainerStart)
if initDuration > 0 {
state.totalInitContainerRuntime += initDuration
}
state.currentInitContainerStart = time.Time{}
}
}
@@ -197,9 +311,42 @@ func hasPodStartedSLO(pod *v1.Pod) bool {
return true
}
// isStatefulPod determines if a pod is stateful according to the SLI documentation:
// "A stateful pod is defined as a pod that mounts at least one volume with sources
// other than secrets, config maps, downward API and empty dir."
// This should reflect the "stateful pod" definition
// ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
func isStatefulPod(pod *v1.Pod) bool {
for _, volume := range pod.Spec.Volumes {
// Check if this volume is NOT a stateless/ephemeral type
if volume.Secret == nil &&
volume.ConfigMap == nil &&
volume.DownwardAPI == nil &&
volume.EmptyDir == nil {
return true
}
}
return false
}
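A minimal sketch of pod specs on each side of this predicate (volume and claim names are illustrative):
// Stateful: mounts a PersistentVolumeClaim, so it is excluded from the SLI.
stateful := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{{
	Name: "data",
	VolumeSource: v1.VolumeSource{
		PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "data-pvc"},
	},
}}}}

// Stateless: only ephemeral volume sources, so it stays in the SLI.
stateless := &v1.Pod{Spec: v1.PodSpec{Volumes: []v1.Volume{{
	Name:         "scratch",
	VolumeSource: v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}},
}}}}

// isStatefulPod(stateful) == true, isStatefulPod(stateless) == false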
// isPodUnschedulable determines if a pod should be excluded from SLI tracking
// according to the SLI definition: "By schedulable pod we mean a pod that has to be
// immediately (without actions from any other components) schedulable in the cluster
// without causing any preemption."
// Any pod with PodScheduled=False is not immediately schedulable and should be excluded.
func isPodUnschedulable(pod *v1.Pod) bool {
for _, condition := range pod.Status.Conditions {
if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionFalse {
return true
}
}
return false
}
func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) {
p.lock.Lock()
defer p.lock.Unlock()
delete(p.pods, podUID)
delete(p.excludedPods, podUID)
}

View file

@@ -47,7 +47,8 @@ func TestNoEvents(t *testing.T) {
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
}
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
@@ -68,7 +69,8 @@ func TestPodsRunningBeforeKubeletStarted(t *testing.T) {
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
}
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
@@ -129,7 +131,7 @@ kubelet_pod_start_sli_duration_seconds_count 1
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
@@ -145,9 +147,9 @@ kubelet_pod_start_sli_duration_seconds_count 1
if !ok {
t.Errorf("expected to track pod: %s, but pod not found", podInit.UID)
}
if !podState.lastFinishedPulling.Equal(podState.firstStartedPulling.Add(time.Millisecond * 100)) {
t.Errorf("expected pod firstStartedPulling: %s and lastFinishedPulling: %s but got firstStartedPulling: %s and lastFinishedPulling: %s",
podState.firstStartedPulling, podState.firstStartedPulling.Add(time.Millisecond*100), podState.firstStartedPulling, podState.lastFinishedPulling)
if len(podState.imagePullSessions) != 1 {
t.Errorf("expected one image pull session to be recorded")
}
podStarted := buildRunningPod()
@@ -170,7 +172,7 @@ kubelet_pod_start_sli_duration_seconds_count 1
func TestSinglePodMultipleDownloadsAndRestartsRecorded(t *testing.T) {
t.Run("single pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
t.Run("single pod; started in 30s, overlapping image pulling between 10th and 20th seconds", func(t *testing.T) {
wants := `
# HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
@@ -210,27 +212,25 @@ kubelet_pod_start_sli_duration_seconds_count 1
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
podInitializing := buildInitializingPod()
tracker.ObservedPodOnWatch(podInitializing, frozenTime)
// image pulling started at 10s and the last one finished at 30s
// first image starts pulling at 10s
// Image 1: 10-16s
fakeClock.SetTime(frozenTime.Add(time.Second * 10))
tracker.RecordImageStartedPulling(podInitializing.UID)
// second image starts pulling at 11s
// Image 2: 11-18s
fakeClock.SetTime(frozenTime.Add(time.Second * 11))
tracker.RecordImageStartedPulling(podInitializing.UID)
// third image starts pulling at 14s
// Image 3: 14-20s
fakeClock.SetTime(frozenTime.Add(time.Second * 14))
tracker.RecordImageStartedPulling(podInitializing.UID)
// first image finished pulling at 18s
fakeClock.SetTime(frozenTime.Add(time.Second * 16))
tracker.RecordImageFinishedPulling(podInitializing.UID)
fakeClock.SetTime(frozenTime.Add(time.Second * 18))
tracker.RecordImageFinishedPulling(podInitializing.UID)
// second and third finished pulling at 20s
fakeClock.SetTime(frozenTime.Add(time.Second * 20))
tracker.RecordImageFinishedPulling(podInitializing.UID)
@@ -238,13 +238,13 @@ kubelet_pod_start_sli_duration_seconds_count 1
if !ok {
t.Errorf("expected to track pod: %s, but pod not found", podInitializing.UID)
}
if !podState.firstStartedPulling.Equal(frozenTime.Add(time.Second * 10)) { // second and third image start pulling should not affect pod firstStartedPulling
t.Errorf("expected pod firstStartedPulling: %s but got firstStartedPulling: %s",
podState.firstStartedPulling.Add(time.Second*10), podState.firstStartedPulling)
if len(podState.imagePullSessions) != 3 {
t.Errorf("expected 3 image pull sessions but got %d", len(podState.imagePullSessions))
}
if !podState.lastFinishedPulling.Equal(frozenTime.Add(time.Second * 20)) { // should be updated when the pod's last image finished pulling
t.Errorf("expected pod lastFinishedPulling: %s but got lastFinishedPulling: %s",
podState.lastFinishedPulling.Add(time.Second*20), podState.lastFinishedPulling)
totalTime := calculateImagePullingTime(podState.imagePullSessions)
expectedTime := time.Second * 10
if totalTime != expectedTime {
t.Errorf("expected total pulling time: %v but got %v", expectedTime, totalTime)
}
// pod started
@@ -275,6 +275,324 @@ kubelet_pod_start_sli_duration_seconds_count 1
})
}
func TestPodWithInitContainersAndMainContainers(t *testing.T) {
t.Run("single pod with multiple init containers and main containers", func(t *testing.T) {
wants := `
# HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
# TYPE kubelet_pod_start_sli_duration_seconds histogram
kubelet_pod_start_sli_duration_seconds_bucket{le="0.5"} 0
kubelet_pod_start_sli_duration_seconds_bucket{le="1"} 0
kubelet_pod_start_sli_duration_seconds_bucket{le="2"} 0
kubelet_pod_start_sli_duration_seconds_bucket{le="3"} 0
kubelet_pod_start_sli_duration_seconds_bucket{le="4"} 0
kubelet_pod_start_sli_duration_seconds_bucket{le="5"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="6"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="8"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="10"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="20"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="30"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="45"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="60"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="120"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="180"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="240"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="300"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="360"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="480"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="600"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="900"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="1200"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="1800"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="2700"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="3600"} 1
kubelet_pod_start_sli_duration_seconds_bucket{le="+Inf"} 1
kubelet_pod_start_sli_duration_seconds_sum 4.2
kubelet_pod_start_sli_duration_seconds_count 1
`
fakeClock := testingclock.NewFakeClock(frozenTime)
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
podInit := buildInitializingPod("init-1", "init-2")
tracker.ObservedPodOnWatch(podInit, frozenTime)
// Init container 1 image pull: 0.5s-1.5s
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 500))
tracker.RecordImageStartedPulling(podInit.UID)
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 1500))
tracker.RecordImageFinishedPulling(podInit.UID)
// Init container 1 runtime: 2s-4s
fakeClock.SetTime(frozenTime.Add(time.Second * 2))
tracker.RecordInitContainerStarted(types.UID(uid), fakeClock.Now())
fakeClock.SetTime(frozenTime.Add(time.Second * 4))
tracker.RecordInitContainerFinished(types.UID(uid), fakeClock.Now())
// Init container 2 image pull: 4.2s-5.2s
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 4200))
tracker.RecordImageStartedPulling(podInit.UID)
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 5200))
tracker.RecordImageFinishedPulling(podInit.UID)
// Init container 2 runtime: 5.5s-7s
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 5500))
tracker.RecordInitContainerStarted(types.UID(uid), fakeClock.Now())
fakeClock.SetTime(frozenTime.Add(time.Second * 7))
tracker.RecordInitContainerFinished(types.UID(uid), fakeClock.Now())
// Main container 1 image pull: 7.2s-8.2s
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 7200))
tracker.RecordImageStartedPulling(podInit.UID)
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 8200))
tracker.RecordImageFinishedPulling(podInit.UID)
// Main container 2 image pull: 7.3s-8.5s
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 7300))
tracker.RecordImageStartedPulling(podInit.UID)
fakeClock.SetTime(frozenTime.Add(time.Millisecond * 8500))
tracker.RecordImageFinishedPulling(podInit.UID)
// Pod becomes running at 11s
podStarted := buildRunningPod()
tracker.RecordStatusUpdated(podStarted)
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*11))
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
t.Fatal(err)
}
state := tracker.pods[types.UID(uid)]
assert.NotNil(t, state, "Pod state should exist")
expectedInitRuntime := 2*time.Second + 1500*time.Millisecond
assert.Equal(t, expectedInitRuntime, state.totalInitContainerRuntime,
"Total init container runtime should be 3.5s (2s + 1.5s)")
totalImageTime := calculateImagePullingTime(state.imagePullSessions)
expectedImageTime := 3300 * time.Millisecond // 1s + 1s + 1.3s concurrent overlap
assert.Equal(t, expectedImageTime, totalImageTime,
"Image pulling time should be 3.3s: init-1(1s) + init-2(1s) + concurrent-main(1.3s)")
assert.Len(t, state.imagePullSessions, 4, "Should have 4 image pull sessions")
// cleanup
tracker.DeletePodStartupState(podStarted.UID)
assert.Empty(t, tracker.pods)
metrics.PodStartSLIDuration.Reset()
})
}
func TestImmediatelySchedulablePods(t *testing.T) {
t.Run("pods not immediately schedulable should be excluded from tracking", func(t *testing.T) {
wants := ""
fakeClock := testingclock.NewFakeClock(frozenTime)
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
// pod that is not immediately schedulable (PodScheduled=False)
podNotSchedulable := buildInitializingPod()
podNotSchedulable.UID = "not-schedulable-pod"
podNotSchedulable.Status.Conditions = []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionFalse,
Reason: "Unschedulable",
},
}
tracker.ObservedPodOnWatch(podNotSchedulable, frozenTime)
assert.Empty(t, tracker.pods, "Pod with PodScheduled=False should not be tracked")
// pod that is immediately schedulable (PodScheduled=True)
podSchedulable := buildInitializingPod()
podSchedulable.UID = "schedulable-pod"
podSchedulable.Status.Conditions = []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionTrue,
Reason: "Scheduled",
},
}
tracker.ObservedPodOnWatch(podSchedulable, frozenTime)
assert.Len(t, tracker.pods, 1, "Pod with PodScheduled=True should be tracked")
// pod without PodScheduled condition
podNoCondition := buildInitializingPod()
podNoCondition.UID = "no-condition-pod"
tracker.ObservedPodOnWatch(podNoCondition, frozenTime)
assert.Len(t, tracker.pods, 2, "Pod without PodScheduled condition should be tracked by default")
// Verify metrics are empty
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
t.Fatal(err)
}
// cleanup
tracker.DeletePodStartupState(podSchedulable.UID)
tracker.DeletePodStartupState(podNoCondition.UID)
assert.Empty(t, tracker.pods)
metrics.PodStartSLIDuration.Reset()
})
t.Run("pod observed as schedulable first, then becomes unschedulable should not be tracked", func(t *testing.T) {
wants := ""
fakeClock := testingclock.NewFakeClock(frozenTime)
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
// First observe pod as schedulable
podSchedulable := buildInitializingPod()
podSchedulable.UID = "becomes-unschedulable"
tracker.ObservedPodOnWatch(podSchedulable, frozenTime)
assert.Len(t, tracker.pods, 1, "Pod should be tracked initially")
// Later observe the same pod as unschedulable
podUnschedulable := buildInitializingPod()
podUnschedulable.UID = "becomes-unschedulable"
podUnschedulable.Status.Conditions = []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionFalse,
Reason: "Unschedulable",
},
}
tracker.ObservedPodOnWatch(podUnschedulable, frozenTime.Add(time.Second))
assert.Empty(t, tracker.pods, "Pod should be removed when it becomes unschedulable")
// Verify no metrics
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
t.Fatal(err)
}
metrics.PodStartSLIDuration.Reset()
})
t.Run("pod observed as unschedulable first, then becomes schedulable should remain not tracked", func(t *testing.T) {
wants := ""
fakeClock := testingclock.NewFakeClock(frozenTime)
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
// First observe pod as unschedulable
podUnschedulable := buildInitializingPod()
podUnschedulable.UID = "unschedulable-first"
podUnschedulable.Status.Conditions = []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionFalse,
Reason: "Unschedulable",
},
}
tracker.ObservedPodOnWatch(podUnschedulable, frozenTime)
assert.Empty(t, tracker.pods, "Pod should not be tracked when first observed as unschedulable")
assert.True(t, tracker.excludedPods[podUnschedulable.UID], "Pod should be in excludedPods map")
// Later observe the same pod as schedulable (e.g., after cluster autoscaling)
podSchedulable := buildInitializingPod()
podSchedulable.UID = "unschedulable-first"
podSchedulable.Status.Conditions = []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionTrue,
Reason: "Scheduled",
},
}
tracker.ObservedPodOnWatch(podSchedulable, frozenTime.Add(time.Second*5))
assert.Empty(t, tracker.pods, "Pod should remain excluded even after becoming schedulable")
assert.True(t, tracker.excludedPods[podSchedulable.UID], "Pod should remain in excludedPods map")
// Complete the startup process - should not record metrics
podRunning := buildRunningPod()
podRunning.UID = "unschedulable-first"
tracker.RecordStatusUpdated(podRunning)
tracker.ObservedPodOnWatch(podRunning, frozenTime.Add(time.Second*10))
// Verify no SLI metrics recorded
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
t.Fatal(err)
}
// cleanup
tracker.DeletePodStartupState(podRunning.UID)
assert.Empty(t, tracker.pods)
assert.False(t, tracker.excludedPods[podRunning.UID], "Pod should be removed from excludedPods on cleanup")
metrics.PodStartSLIDuration.Reset()
})
}
func TestStatefulPodExclusion(t *testing.T) {
t.Run("stateful pods should be excluded from SLI metrics", func(t *testing.T) {
wantsSLI := ""
fakeClock := testingclock.NewFakeClock(frozenTime)
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
// Create a stateful pod (with PVC volume)
statefulPod := buildInitializingPod()
statefulPod.UID = "stateful-pod"
statefulPod.Spec.Volumes = []corev1.Volume{
{
Name: "persistent-storage",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "test-pvc",
},
},
},
}
tracker.ObservedPodOnWatch(statefulPod, frozenTime)
statefulPodRunning := buildRunningPod()
statefulPodRunning.UID = "stateful-pod"
statefulPodRunning.Spec.Volumes = statefulPod.Spec.Volumes
tracker.RecordStatusUpdated(statefulPodRunning)
// Observe pod as running after 3 seconds
tracker.ObservedPodOnWatch(statefulPodRunning, frozenTime.Add(time.Second*3))
// Verify no SLI metrics for the stateful pod
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wantsSLI), metricsName); err != nil {
t.Fatal(err)
}
// cleanup
tracker.DeletePodStartupState(statefulPod.UID)
assert.Empty(t, tracker.pods)
metrics.PodStartSLIDuration.Reset()
})
}
func TestFirstNetworkPodMetrics(t *testing.T) {
t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
@@ -290,7 +608,7 @@ kubelet_first_network_pod_start_sli_duration_seconds 30
metrics.Register()
tracker := &basicPodStartupLatencyTracker{
pods: map[types.UID]*perPodState{},
pods: map[types.UID]*perPodState{},
excludedPods: map[types.UID]bool{},
clock: fakeClock,
}
@@ -338,10 +656,20 @@ kubelet_first_network_pod_start_sli_duration_seconds 30
})
}
func buildInitializingPod() *corev1.Pod {
return buildPodWithStatus([]corev1.ContainerStatus{
func buildInitializingPod(initContainerNames ...string) *corev1.Pod {
pod := buildPodWithStatus([]corev1.ContainerStatus{
{State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "PodInitializing"}}},
})
// Add init containers if specified
if len(initContainerNames) > 0 {
pod.Spec.InitContainers = make([]corev1.Container, len(initContainerNames))
for i, name := range initContainerNames {
pod.Spec.InitContainers[i] = corev1.Container{Name: name}
}
}
return pod
}
func buildRunningPod() *corev1.Pod {
@@ -361,3 +689,78 @@ func buildPodWithStatus(cs []corev1.ContainerStatus) *corev1.Pod {
},
}
}
func TestCalculateImagePullingTime(t *testing.T) {
baseTime := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
t.Run("empty sessions should return zero", func(t *testing.T) {
sessions := []imagePullSession{}
result := calculateImagePullingTime(sessions)
assert.Equal(t, time.Duration(0), result)
})
t.Run("incomplete sessions should be ignored (for example: kubelet restart)", func(t *testing.T) {
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(3 * time.Second)}, // valid: 3s
{start: baseTime.Add(5 * time.Second), end: time.Time{}}, // incomplete, ignored
{start: baseTime.Add(10 * time.Second), end: baseTime.Add(12 * time.Second)}, // valid: 2s
}
result := calculateImagePullingTime(sessions)
assert.Equal(t, 5*time.Second, result) // 3s + 2s
})
t.Run("non-overlapping sessions", func(t *testing.T) {
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(2 * time.Second)}, // 2s
{start: baseTime.Add(5 * time.Second), end: baseTime.Add(8 * time.Second)}, // 3s
{start: baseTime.Add(10 * time.Second), end: baseTime.Add(15 * time.Second)}, // 5s
}
result := calculateImagePullingTime(sessions)
assert.Equal(t, 10*time.Second, result) // 2s + 3s + 5s
})
t.Run("partially overlapping sessions", func(t *testing.T) {
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(5 * time.Second)}, // 0-5s
{start: baseTime.Add(3 * time.Second), end: baseTime.Add(8 * time.Second)}, // 3-8s (overlap 3-5s)
{start: baseTime.Add(7 * time.Second), end: baseTime.Add(12 * time.Second)}, // 7-12s (overlap 7-8s)
}
result := calculateImagePullingTime(sessions)
assert.Equal(t, 12*time.Second, result) // 12s
})
t.Run("completely overlapped sessions", func(t *testing.T) {
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(10 * time.Second)}, // 0-10s
{start: baseTime.Add(2 * time.Second), end: baseTime.Add(8 * time.Second)}, // 2-8s (completely inside)
{start: baseTime.Add(3 * time.Second), end: baseTime.Add(5 * time.Second)}, // 3-5s (completely inside)
}
result := calculateImagePullingTime(sessions)
assert.Equal(t, 10*time.Second, result) // Only outer session: 10s
})
t.Run("partially and completely overlapping sessions", func(t *testing.T) {
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(10 * time.Second)}, // 0-10s
{start: baseTime.Add(2 * time.Second), end: baseTime.Add(5 * time.Second)}, // 2-5s (completely inside first)
{start: baseTime.Add(3 * time.Second), end: baseTime.Add(7 * time.Second)}, // 3-7s (completely inside first and partially overlapping second)
}
result := calculateImagePullingTime(sessions)
assert.Equal(t, 10*time.Second, result) // 10s
})
t.Run("completely overlapped and partially overlapped sessions with gap", func(t *testing.T) {
// Test mixing completely overlapped sessions with separate sessions
sessions := []imagePullSession{
{start: baseTime, end: baseTime.Add(8 * time.Second)}, // 0-8s
{start: baseTime.Add(2 * time.Second), end: baseTime.Add(6 * time.Second)}, // 2-6s (completely inside first)
{start: baseTime.Add(3 * time.Second), end: baseTime.Add(5 * time.Second)}, // 3-5s (completely inside first)
{start: baseTime.Add(15 * time.Second), end: baseTime.Add(20 * time.Second)}, // 15-20s (separate)
{start: baseTime.Add(16 * time.Second), end: baseTime.Add(18 * time.Second)}, // 16-18s (completely inside fourth)
{start: baseTime.Add(17 * time.Second), end: baseTime.Add(21 * time.Second)}, // 17-21s (partially overlapping fourth and fifth)
}
result := calculateImagePullingTime(sessions)
// 8s (first group) + 6s (second group) = 14s
assert.Equal(t, 14*time.Second, result)
})
}