diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index aef191f07df..66529a48658 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -884,6 +884,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend CPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, MemoryManagerPolicy: s.MemoryManagerPolicy, MemoryManagerReservedMemory: s.ReservedMemory, + MemoryReservationPolicy: s.MemoryReservationPolicy, PodPidsLimit: s.PodPidsLimit, EnforceCPULimits: s.CPUCFSQuota, CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 95edd8b539b..2459b26b1db 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -71891,6 +71891,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen Format: "double", }, }, + "memoryReservationPolicy": { + SchemaProps: spec.SchemaProps{ + Description: "MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection. \"None\" (default): The kubelet does not set memory.min for containers and pods, ensuring no memory is hard-locked by the kernel. \"HardReservation\": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests. This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied. See https://kep.k8s.io/2570 for more details. Default: None", + Type: []string{"string"}, + Format: "", + }, + }, "registerWithTaints": { SchemaProps: spec.SchemaProps{ Description: "registerWithTaints are an array of taints to add to a node object when the kubelet registers itself. This only takes effect when registerNode is true and upon the initial registration of the node. 
Default: nil", diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index bc2f9b07007..beb8183f975 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -122,6 +122,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { } obj.EnableSystemLogHandler = true obj.MemoryThrottlingFactor = ptr.To(rand.Float64()) + obj.MemoryReservationPolicy = kubeletconfig.NoneMemoryReservationPolicy obj.LocalStorageCapacityIsolation = true obj.FeatureGates = map[string]bool{ "AllAlpha": false, diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 5a5a3f0f473..7cda6b706fd 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -300,6 +300,7 @@ var ( "ShutdownGracePeriod.Duration", "ShutdownGracePeriodCriticalPods.Duration", "MemoryThrottlingFactor", + "MemoryReservationPolicy", "ContainerRuntimeEndpoint", "ImageServiceEndpoint", "Tracing.Endpoint", diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml index 3b0ea0fc8d9..7134ebcb64e 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml @@ -70,6 +70,7 @@ makeIPTablesUtilChains: true maxOpenFiles: 1000000 maxPods: 110 memoryManagerPolicy: None +memoryReservationPolicy: None memorySwap: {} memoryThrottlingFactor: 0.9 mergeDefaultEvictionSettings: false diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml index 1ceb95dc24e..dc9d85d9a64 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml @@ -70,6 +70,7 @@ makeIPTablesUtilChains: true maxOpenFiles: 1000000 maxPods: 110 memoryManagerPolicy: None +memoryReservationPolicy: None memorySwap: {} memoryThrottlingFactor: 0.9 mergeDefaultEvictionSettings: false diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 9914cc103f4..e66b8ff37a3 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -503,6 +503,16 @@ type KubeletConfiguration struct { // +featureGate=MemoryQoS // +optional MemoryThrottlingFactor *float64 + // MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection. + // "None" (default): The kubelet does not set memory.min for containers and pods, + // ensuring no hard memory is locked by the kernel. + // "HardReservation": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests. + // This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied. + // See https://kep.k8s.io/2570 for more details. + // Default: None + // +featureGate=MemoryQoS + // +optional + MemoryReservationPolicy MemoryReservationPolicy // registerWithTaints are an array of taints to add to a node object when // the kubelet registers itself. This only takes effect when registerNode // is true and upon the initial registration of the node. 
@@ -852,6 +862,17 @@ const ( AlwaysVerify ImagePullCredentialsVerificationPolicy = "AlwaysVerify" ) +// MemoryReservationPolicy defines how the kubelet applies cgroup v2 memory protection. +type MemoryReservationPolicy string + +const ( + // NoneMemoryReservationPolicy disables memory.min protection for containers and pods. + // This is the default to maintain node stability by preventing "locked" memory. + NoneMemoryReservationPolicy MemoryReservationPolicy = "None" + // HardReservationMemoryReservationPolicy enables memory.min for containers and pods. + HardReservationMemoryReservationPolicy MemoryReservationPolicy = "HardReservation" +) + // ImagePullIntent is a record of the kubelet attempting to pull an image. // // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/pkg/kubelet/apis/config/v1beta1/defaults.go b/pkg/kubelet/apis/config/v1beta1/defaults.go index ba5e7aeb600..2d30c50a38e 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults.go @@ -41,7 +41,8 @@ const ( DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/" DefaultPodLogsDir = "/var/log/pods" // See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos - DefaultMemoryThrottlingFactor = 0.9 + DefaultMemoryThrottlingFactor = 0.9 + DefaultMemoryReservationPolicy = kubeletconfigv1beta1.NoneMemoryReservationPolicy // MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test MaxContainerBackOff = 300 * time.Second ) @@ -295,6 +296,9 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura if obj.MemoryThrottlingFactor == nil { obj.MemoryThrottlingFactor = ptr.To(DefaultMemoryThrottlingFactor) } + if obj.MemoryReservationPolicy == "" { + obj.MemoryReservationPolicy = DefaultMemoryReservationPolicy + } if obj.RegisterNode == nil { obj.RegisterNode = ptr.To(true) } diff --git a/pkg/kubelet/apis/config/v1beta1/defaults_test.go b/pkg/kubelet/apis/config/v1beta1/defaults_test.go index 471e74924db..abc6926f7b5 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults_test.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults_test.go @@ -128,6 +128,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, @@ -265,6 +266,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To[float64](0), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: "", @@ -370,6 +372,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To[float64](0), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: DefaultPodLogsDir, @@ -531,6 +534,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(true), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To[float64](1), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), 
LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", @@ -689,6 +693,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(true), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To[float64](1), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", @@ -788,6 +793,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, @@ -887,6 +893,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, @@ -986,6 +993,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, @@ -1085,6 +1093,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { SeccompDefault: ptr.To(false), FailCgroupV1: ptr.To(true), MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor), + MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy, RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 43a651df576..05bf134e67d 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -750,6 +750,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in return err } out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor)) + out.MemoryReservationPolicy = config.MemoryReservationPolicy(in.MemoryReservationPolicy) out.RegisterWithTaints = *(*[]corev1.Taint)(unsafe.Pointer(&in.RegisterWithTaints)) if err := v1.Convert_Pointer_bool_To_bool(&in.RegisterNode, &out.RegisterNode, s); err != nil { return err @@ -954,6 +955,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in return err } out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor)) + out.MemoryReservationPolicy = configv1beta1.MemoryReservationPolicy(in.MemoryReservationPolicy) out.RegisterWithTaints = *(*[]corev1.Taint)(unsafe.Pointer(&in.RegisterWithTaints)) if err := v1.Convert_bool_To_Pointer_bool(&in.RegisterNode, &out.RegisterNode, s); err != nil { return err diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index 02429683a78..1f8aff79db2 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -351,6 +351,17 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration, featur allErrors = append(allErrors, 
fmt.Errorf("invalid configuration: memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *kc.MemoryThrottlingFactor)) } + if !localFeatureGate.Enabled(features.MemoryQoS) && + kc.MemoryReservationPolicy == kubeletconfig.HardReservationMemoryReservationPolicy { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryReservationPolicy %q requires MemoryQoS feature gate to be enabled", + kc.MemoryReservationPolicy)) + } + switch kc.MemoryReservationPolicy { + case kubeletconfig.NoneMemoryReservationPolicy, kubeletconfig.HardReservationMemoryReservationPolicy: + default: + allErrors = append(allErrors, fmt.Errorf("invalid configuration: option %q specified for memoryReservationPolicy. Valid options are %q or %q", kc.MemoryReservationPolicy, kubeletconfig.NoneMemoryReservationPolicy, kubeletconfig.HardReservationMemoryReservationPolicy)) + } + if kc.ContainerRuntimeEndpoint == "" { allErrors = append(allErrors, fmt.Errorf("invalid configuration: the containerRuntimeEndpoint was not specified or empty")) } diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index b6b2e3f947a..233bbd732b5 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -71,6 +71,7 @@ var ( ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, MemoryThrottlingFactor: ptr.To(0.9), + MemoryReservationPolicy: kubeletconfig.NoneMemoryReservationPolicy, FeatureGates: map[string]bool{ "GracefulNodeShutdown": true, "MemoryQoS": true, @@ -563,6 +564,21 @@ func TestValidateKubeletConfiguration(t *testing.T) { return conf }, errMsg: "invalid configuration: memoryThrottlingFactor 1.1 must be greater than 0 and less than or equal to 1.0", + }, { + name: "MemoryReservationPolicy requires MemoryQoS", + configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration { + conf.FeatureGates = map[string]bool{"MemoryQoS": false} + conf.MemoryReservationPolicy = kubeletconfig.HardReservationMemoryReservationPolicy + return conf + }, + errMsg: "invalid configuration: memoryReservationPolicy \"HardReservation\" requires MemoryQoS feature gate to be enabled", + }, { + name: "invalid MemoryReservationPolicy", + configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration { + conf.MemoryReservationPolicy = "invalid" + return conf + }, + errMsg: "invalid configuration: option \"invalid\" specified for memoryReservationPolicy. 
Valid options are \"None\" or \"HardReservation\"", }, { name: "invalid Taint.TimeAdded", configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration { diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index 0e4357f7fb7..b761f6d9520 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -193,6 +193,7 @@ type NodeConfig struct { CPUManagerReconcilePeriod time.Duration MemoryManagerPolicy string MemoryManagerReservedMemory []kubeletconfig.MemoryReservation + MemoryReservationPolicy kubeletconfig.MemoryReservationPolicy PodPidsLimit int64 EnforceCPULimits bool CPUCFSQuotaPeriod time.Duration diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index 8ec4e78635f..ace30752d8f 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -406,8 +406,9 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager { enforceCPULimits: cm.EnforceCPULimits, // cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is time.Duration (measured in nano seconds). // Convert (cm.CPUCFSQuotaPeriod) [nanoseconds] / time.Microsecond (1000) to get cpuCFSQuotaPeriod in microseconds. - cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond), - podContainerManager: cm, + cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond), + podContainerManager: cm, + memoryReservationPolicy: cm.MemoryReservationPolicy, } } return &podContainerManagerNoop{ diff --git a/pkg/kubelet/cm/helpers.go b/pkg/kubelet/cm/helpers.go index 1e7a5ca0c6d..9efdfe82871 100644 --- a/pkg/kubelet/cm/helpers.go +++ b/pkg/kubelet/cm/helpers.go @@ -25,6 +25,7 @@ import ( internalapi "k8s.io/cri-api/pkg/apis" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm/containermap" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" ) @@ -32,7 +33,7 @@ import ( // for typecheck across platforms var _ func(int64, int64) int64 = MilliCPUToQuota var _ func(int64) uint64 = MilliCPUToShares -var _ func(*v1.Pod, bool, uint64, bool) *ResourceConfig = ResourceConfigForPod +var _ func(*v1.Pod, bool, uint64, bool, kubeletconfig.MemoryReservationPolicy) *ResourceConfig = ResourceConfigForPod var _ func() (*CgroupSubsystems, error) = GetCgroupSubsystems var _ func(string) ([]int, error) = getCgroupProcs var _ func(types.UID) string = GetPodCgroupNameSuffix diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 02a4ce0f675..6eff274b497 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -34,6 +34,7 @@ import ( v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm/util" ) @@ -123,7 +124,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 { } // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. 
-func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig { +func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool, memoryReservationPolicy kubeletconfig.MemoryReservationPolicy) *ResourceConfig { podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodLevelResources) // sum requests and limits. reqs := resourcehelper.PodRequests(allocatedPod, resourcehelper.PodResourcesOptions{ @@ -207,7 +208,7 @@ func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod } result.HugePageLimit = hugePageLimits - if enforceMemoryQoS { + if enforceMemoryQoS && memoryReservationPolicy == kubeletconfig.HardReservationMemoryReservationPolicy { memoryMin := int64(0) if request, found := reqs[v1.ResourceMemory]; found { memoryMin = request.Value() diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go index 7122b984cbf..f6378fd5d04 100644 --- a/pkg/kubelet/cm/helpers_linux_test.go +++ b/pkg/kubelet/cm/helpers_linux_test.go @@ -29,6 +29,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" pkgfeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) // getResourceList returns a ResourceList with the @@ -482,7 +483,7 @@ func TestResourceConfigForPod(t *testing.T) { for _, testCase := range testCases { t.Run(testCase.description, func(t *testing.T) { featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.PodLevelResources, testCase.podLevelResourcesEnabled) - actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false) + actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false, kubeletconfig.NoneMemoryReservationPolicy) if !reflect.DeepEqual(actual.CPUPeriod, testCase.expected.CPUPeriod) { t.Errorf("cpu period not as expected. 
Expected: %v, Actual:%v", *testCase.expected.CPUPeriod, *actual.CPUPeriod) } @@ -641,7 +642,7 @@ func TestHugePageLimits(t *testing.T) { }, }, } - resultValuePod := ResourceConfigForPod(&p, false, 0, false) + resultValuePod := ResourceConfigForPod(&p, false, 0, false, kubeletconfig.NoneMemoryReservationPolicy) if !reflect.DeepEqual(testcase.expected, resultValuePod.HugePageLimit) { t.Errorf("unexpected result for ResourceConfigForPod(), expected: %v, actual: %v", testcase.expected, resultValuePod) } @@ -849,7 +850,7 @@ func TestResourceConfigForPodWithEnforceMemoryQoS(t *testing.T) { for testName, testCase := range testCases { - actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true) + actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true, kubeletconfig.HardReservationMemoryReservationPolicy) if !reflect.DeepEqual(actual.Unified, testCase.expected.Unified) { t.Errorf("unexpected result, test: %v, unified not as expected", testName) diff --git a/pkg/kubelet/cm/helpers_unsupported.go b/pkg/kubelet/cm/helpers_unsupported.go index d86722aa9f2..148e45bc02b 100644 --- a/pkg/kubelet/cm/helpers_unsupported.go +++ b/pkg/kubelet/cm/helpers_unsupported.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) const ( @@ -48,7 +49,7 @@ func MilliCPUToShares(milliCPU int64) uint64 { } // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. -func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig { +func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool, memoryReservationPolicy kubeletconfig.MemoryReservationPolicy) *ResourceConfig { return nil } diff --git a/pkg/kubelet/cm/pod_container_manager_linux.go b/pkg/kubelet/cm/pod_container_manager_linux.go index 0407841245b..327e612c421 100644 --- a/pkg/kubelet/cm/pod_container_manager_linux.go +++ b/pkg/kubelet/cm/pod_container_manager_linux.go @@ -31,6 +31,7 @@ import ( "k8s.io/klog/v2" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) const ( @@ -57,6 +58,8 @@ type podContainerManagerImpl struct { cpuCFSQuotaPeriod uint64 // podContainerManager is the ContainerManager running on the machine podContainerManager ContainerManager + // memoryReservationPolicy controls memory reservation protection behavior + memoryReservationPolicy kubeletconfig.MemoryReservationPolicy } // Make sure that podContainerManagerImpl implements the PodContainerManager interface @@ -89,7 +92,7 @@ func (m *podContainerManagerImpl) EnsureExists(logger klog.Logger, pod *v1.Pod) podContainerName, _ := m.GetPodContainerName(pod) containerConfig := &CgroupConfig{ Name: podContainerName, - ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS), + ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS, m.memoryReservationPolicy), } if m.podPidsLimit > 0 { containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 71379e35dd4..10af0685b7e 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ 
b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -37,6 +37,7 @@ import ( "k8s.io/component-helpers/resource" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) const ( @@ -53,13 +54,14 @@ type QOSContainerManager interface { type qosContainerManagerImpl struct { sync.Mutex - qosContainersInfo QOSContainersInfo - subsystems *CgroupSubsystems - cgroupManager CgroupManager - activePods ActivePodsFunc - getNodeAllocatable func() v1.ResourceList - cgroupRoot CgroupName - qosReserved map[v1.ResourceName]int64 + qosContainersInfo QOSContainersInfo + subsystems *CgroupSubsystems + cgroupManager CgroupManager + activePods ActivePodsFunc + getNodeAllocatable func() v1.ResourceList + cgroupRoot CgroupName + qosReserved map[v1.ResourceName]int64 + memoryReservationPolicy kubeletconfig.MemoryReservationPolicy } func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) { @@ -70,10 +72,11 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, } return &qosContainerManagerImpl{ - subsystems: subsystems, - cgroupManager: cgroupManager, - cgroupRoot: cgroupRoot, - qosReserved: nodeConfig.QOSReserved, + subsystems: subsystems, + cgroupManager: cgroupManager, + cgroupRoot: cgroupRoot, + qosReserved: nodeConfig.QOSReserved, + memoryReservationPolicy: nodeConfig.MemoryReservationPolicy, }, nil } @@ -296,6 +299,12 @@ func (m *qosContainerManagerImpl) setMemoryQoS(logger klog.Logger, configs map[v logger.V(4).Info("MemoryQoS config for qos", "qos", qos, "memoryMin", memoryMin) } + if m.memoryReservationPolicy != kubeletconfig.HardReservationMemoryReservationPolicy { + setMemoryMin(v1.PodQOSGuaranteed, 0) + setMemoryMin(v1.PodQOSBurstable, 0) + return + } + qosMemoryRequests := m.getQoSMemoryRequests() // Calculate the memory.min: diff --git a/pkg/kubelet/cm/qos_container_manager_linux_test.go b/pkg/kubelet/cm/qos_container_manager_linux_test.go index 20620b65cfb..99a552c2f7a 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux_test.go +++ b/pkg/kubelet/cm/qos_container_manager_linux_test.go @@ -36,6 +36,7 @@ import ( "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/klog/v2" "k8s.io/klog/v2/ktesting" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) func activeTestPods() []*v1.Pod { @@ -124,10 +125,11 @@ func createTestQOSContainerManager(logger klog.Logger) (*qosContainerManagerImpl cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName) qosContainerManager := &qosContainerManagerImpl{ - subsystems: subsystems, - cgroupManager: NewCgroupManager(logger, subsystems, "cgroupfs"), - cgroupRoot: cgroupRoot, - qosReserved: nil, + subsystems: subsystems, + cgroupManager: NewCgroupManager(logger, subsystems, "cgroupfs"), + cgroupRoot: cgroupRoot, + qosReserved: nil, + memoryReservationPolicy: kubeletconfig.NoneMemoryReservationPolicy, } qosContainerManager.activePods = activeTestPods @@ -206,6 +208,8 @@ func TestQoSContainerCgroup(t *testing.T) { logger, _ := ktesting.NewTestContext(t) m, err := createTestQOSContainerManager(logger) require.NoError(t, err) + // Set memory reservation policy to HardReservation to enable memory.min + m.memoryReservationPolicy = kubeletconfig.HardReservationMemoryReservationPolicy m.activePods = func() []*v1.Pod { return tc.pods } guaranteedUnified := map[string]string{} @@ -244,6 +248,62 @@ func TestQoSContainerCgroup(t 
*testing.T) { } } +func TestQoSContainerCgroupWithMemoryReservationPolicyNone(t *testing.T) { + tests := []struct { + name string + initialGuaranteed map[string]string + initialBurstable map[string]string + }{ + { + name: "explicitly resets memory.min to 0 when unset", + initialGuaranteed: nil, + initialBurstable: nil, + }, + { + name: "resets preexisting memory.min to zero when MemoryReservationPolicy is None", + initialGuaranteed: map[string]string{ + Cgroup2MemoryMin: "1234", + }, + initialBurstable: map[string]string{ + Cgroup2MemoryMin: "5678", + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + logger, _ := ktesting.NewTestContext(t) + m, err := createTestQOSContainerManager(logger) + require.NoError(t, err) + + qosConfigs := map[v1.PodQOSClass]*CgroupConfig{ + v1.PodQOSGuaranteed: { + Name: m.qosContainersInfo.Guaranteed, + ResourceParameters: &ResourceConfig{ + Unified: tc.initialGuaranteed, + }, + }, + v1.PodQOSBurstable: { + Name: m.qosContainersInfo.Burstable, + ResourceParameters: &ResourceConfig{ + Unified: tc.initialBurstable, + }, + }, + v1.PodQOSBestEffort: { + Name: m.qosContainersInfo.BestEffort, + ResourceParameters: &ResourceConfig{}, + }, + } + + m.setMemoryQoS(logger, qosConfigs) + + assert.Equal(t, "0", qosConfigs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin]) + assert.Equal(t, "0", qosConfigs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin]) + assert.NotContains(t, qosConfigs[v1.PodQOSBestEffort].ResourceParameters.Unified, Cgroup2MemoryMin) + }) + } +} + // fakeCgroupManager is used because Start() requires a functional // CgroupManager. All methods are stubbed so that Start() can // complete successfully without using real cgroups. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index bb2cc695513..24d3be228f2 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -796,6 +796,7 @@ func NewMainKubelet(ctx context.Context, kubeCfg.MemorySwap.SwapBehavior, kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, *kubeCfg.MemoryThrottlingFactor, + kubeCfg.MemoryReservationPolicy, klet.podStartupLatencyTracker, kubeDeps.TracerProvider, tokenManager, diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 5e1a8b0de1d..4eae3c6b34c 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -3638,6 +3638,7 @@ func TestSyncPodSpans(t *testing.T) { kubeCfg.MemorySwap.SwapBehavior, kubelet.containerManager.GetNodeAllocatableAbsolute, *kubeCfg.MemoryThrottlingFactor, + kubeCfg.MemoryReservationPolicy, kubelet.podStartupLatencyTracker, tp, token.NewManager(kubelet.kubeClient), diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 9b9cb8c1148..6bc4544f84e 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -42,6 +42,7 @@ import ( v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" + kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/qos" @@ -154,8 +155,10 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(ctx context. 
unified := map[string]string{} memoryRequest := container.Resources.Requests.Memory().Value() memoryLimit := container.Resources.Limits.Memory().Value() - if memoryRequest != 0 { + if memoryRequest != 0 && m.memoryReservationPolicy == kubeletconfiginternal.HardReservationMemoryReservationPolicy { unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) + } else { + unified[cm.Cgroup2MemoryMin] = "0" } // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 35d54dedce5..419085e3547 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -41,6 +41,7 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/features" + kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/types" @@ -558,6 +559,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { tCtx := ktesting.Init(t) _, _, m, err := createTestRuntimeManager(tCtx) assert.NoError(t, err) + m.memoryReservationPolicy = kubeletconfiginternal.HardReservationMemoryReservationPolicy podRequestMemory := resource.MustParse("128Mi") pod1LimitMemory := resource.MustParse("256Mi") @@ -661,6 +663,41 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { } } +func TestGenerateContainerConfigMemoryQoSPolicyNone(t *testing.T) { + tCtx := ktesting.Init(t) + _, _, m, err := createTestRuntimeManager(tCtx) + require.NoError(t, err) + m.memoryReservationPolicy = kubeletconfiginternal.NoneMemoryReservationPolicy + + podRequestMemory := resource.MustParse("128Mi") + podLimitMemory := resource.MustParse("256Mi") + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "foo", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: podRequestMemory}, + Limits: v1.ResourceList{v1.ResourceMemory: podLimitMemory}, + }, + }, + }, + }, + } + + linuxConfig, err := m.generateLinuxContainerConfig(tCtx, &pod.Spec.Containers[0], pod, new(int64), "", nil, true) + require.NoError(t, err) + assert.Equal(t, "0", linuxConfig.GetResources().GetUnified()["memory.min"]) + assert.NotEmpty(t, linuxConfig.GetResources().GetUnified()["memory.high"]) +} + func TestGetHugepageLimitsFromResources(t *testing.T) { tCtx := ktesting.Init(t) var baseHugepage []*runtimeapi.HugepageLimit @@ -2012,7 +2049,7 @@ func TestGenerateUpdatePodSandboxResourcesRequest(t *testing.T) { expectedLcr := m.calculateSandboxResources(tCtx, tc.pod) expectedLcrOverhead := m.convertOverheadToLinuxResources(tc.pod) - podResourcesCfg := cm.ResourceConfigForPod(tc.pod, tc.enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false) + podResourcesCfg := cm.ResourceConfigForPod(tc.pod, tc.enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false, kubeletconfiginternal.NoneMemoryReservationPolicy) assert.NotNil(t, podResourcesCfg, "podResourcesCfg is expected to be not nil") if podResourcesCfg.CPUPeriod == nil { diff --git 
a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index ae83bfaff9c..bcaf5abd776 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -189,6 +189,8 @@ type kubeGenericRuntimeManager struct { // Memory throttling factor for MemoryQoS memoryThrottlingFactor float64 + // Memory reservation policy for MemoryQoS memory.min behavior + memoryReservationPolicy kubeletconfiginternal.MemoryReservationPolicy // Root directory used to store pod logs podLogsDirectory string @@ -244,6 +246,7 @@ func NewKubeGenericRuntimeManager( memorySwapBehavior string, getNodeAllocatable func() v1.ResourceList, memoryThrottlingFactor float64, + memoryReservationPolicy kubeletconfiginternal.MemoryReservationPolicy, podPullingTimeRecorder images.ImagePodPullingTimeRecorder, tracerProvider trace.TracerProvider, tokenManager *token.Manager, @@ -278,6 +281,7 @@ func NewKubeGenericRuntimeManager( memorySwapBehavior: memorySwapBehavior, getNodeAllocatable: getNodeAllocatable, memoryThrottlingFactor: memoryThrottlingFactor, + memoryReservationPolicy: memoryReservationPolicy, podLogsDirectory: podLogsDirectory, podInitContainerTimeRecorder: podInitContainerTimeRecorder, } @@ -824,7 +828,7 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod * enforceCPULimits = false logger.V(2).Info("Disabled CFS quota", "pod", klog.KObj(pod)) } - podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false) + podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false, kubeletconfiginternal.NoneMemoryReservationPolicy) if podResources == nil { logger.Error(nil, "Unable to get resource configuration", "pod", klog.KObj(pod)) resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get resource configuration processing resize for pod %q", format.Pod(pod))) diff --git a/pkg/volume/emptydir/empty_dir.go b/pkg/volume/emptydir/empty_dir.go index d5c29b182fe..622704495d7 100644 --- a/pkg/volume/emptydir/empty_dir.go +++ b/pkg/volume/emptydir/empty_dir.go @@ -35,6 +35,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" resourcehelper "k8s.io/component-helpers/resource" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" + "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm" usernamespacefeature "k8s.io/kubernetes/pkg/kubelet/userns" "k8s.io/kubernetes/pkg/volume" @@ -119,7 +120,7 @@ func calculateEmptyDirMemorySize(nodeAllocatableMemory *resource.Quantity, spec // determine pod resource allocation // we use the same function for pod cgroup assignment to maintain consistent behavior // NOTE: this could be nil on systems that do not support pod memory containment (i.e. 
windows) - podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false) + podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false, config.NoneMemoryReservationPolicy) if podResourceConfig != nil && podResourceConfig.Memory != nil { podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI) // ensure 0 < value < size diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index dae3fe95c2a..108c94bafc1 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -102,6 +102,17 @@ const ( AlwaysVerify ImagePullCredentialsVerificationPolicy = "AlwaysVerify" ) +// MemoryReservationPolicy defines how the kubelet applies cgroup v2 memory protection. +type MemoryReservationPolicy string + +const ( + // NoneMemoryReservationPolicy disables memory.min protection for containers and pods. + // This is the default to maintain node stability by preventing "locked" memory. + NoneMemoryReservationPolicy MemoryReservationPolicy = "None" + // HardReservationMemoryReservationPolicy enables memory.min for containers and pods. + HardReservationMemoryReservationPolicy MemoryReservationPolicy = "HardReservation" +) + // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // KubeletConfiguration contains the configuration for the Kubelet @@ -888,6 +899,16 @@ type KubeletConfiguration struct { // +featureGate=MemoryQoS // +optional MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"` + // MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection. + // "None" (default): The kubelet does not set memory.min for containers and pods, + // ensuring no memory is hard-locked by the kernel. + // "HardReservation": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests. + // This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied. + // See https://kep.k8s.io/2570 for more details. + // Default: None + // +featureGate=MemoryQoS + // +optional + MemoryReservationPolicy MemoryReservationPolicy `json:"memoryReservationPolicy,omitempty"` // registerWithTaints are an array of taints to add to a node object when // the kubelet registers itself. This only takes effect when registerNode // is true and upon the initial registration of the node.
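
For illustration, a minimal KubeletConfiguration sketch of how the new field would be set; the field name and values come from this diff, while the surrounding fragment is a generic kubelet config, not part of the change. Per the validation added in pkg/kubelet/apis/config/validation/validation.go, "HardReservation" is rejected when the MemoryQoS feature gate is off, and an empty value defaults to "None":

apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  MemoryQoS: true # required: validation rejects HardReservation when this gate is off
memoryThrottlingFactor: 0.9 # existing MemoryQoS knob, unchanged by this PR
memoryReservationPolicy: HardReservation

Under the default "None" policy, the container- and QoS-level paths now write memory.min explicitly as "0" rather than leaving it unset, so a preexisting reservation is cleared when an operator switches back; this is what TestQoSContainerCgroupWithMemoryReservationPolicyNone exercises.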
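
The rule this PR threads through ResourceConfigForPod, setMemoryQoS, and generateLinuxContainerResources condenses to a few lines. The standalone Go sketch below is illustrative only: it assumes the MemoryQoS feature gate is on (with the gate off the kubelet does not manage memory.min at all), and the helper name memoryMinValue is invented for the example. One caveat: the container- and QoS-level paths write "0" under None, while the pod-level path in ResourceConfigForPod simply skips the field.

package main

import (
	"fmt"
	"strconv"
)

// MemoryReservationPolicy mirrors the type added in this diff.
type MemoryReservationPolicy string

const (
	NoneMemoryReservationPolicy            MemoryReservationPolicy = "None"
	HardReservationMemoryReservationPolicy MemoryReservationPolicy = "HardReservation"
)

// memoryMinValue returns the string written to the cgroup v2 memory.min
// interface file while MemoryQoS is being enforced. Under HardReservation a
// non-zero memory request is protected from kernel reclaim; otherwise "0" is
// written so no reservation is left behind.
func memoryMinValue(memoryRequestBytes int64, policy MemoryReservationPolicy) string {
	if policy == HardReservationMemoryReservationPolicy && memoryRequestBytes != 0 {
		return strconv.FormatInt(memoryRequestBytes, 10)
	}
	return "0"
}

func main() {
	fmt.Println(memoryMinValue(128<<20, HardReservationMemoryReservationPolicy)) // 134217728
	fmt.Println(memoryMinValue(128<<20, NoneMemoryReservationPolicy))            // 0
}

Because memory.min is a hard guarantee, the kernel OOM-kills rather than reclaims below it, which is why the field's doc comment warns that HardReservation "may trigger an OOM if the reservation cannot be satisfied".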