Add MemoryReservationPolicy to kubeletconfig for MemoryQoS feature

Add MemoryReservationPolicy (None/HardReservation) to control whether the
kubelet sets memory.min. This allows the MemoryQoS feature to be enabled
independently of memory.min protection, giving operators more granular
control over MemoryQoS behavior.

Signed-off-by: Qi Wang <qiwan@redhat.com>
Qi Wang 2026-03-09 19:57:43 -04:00
parent 9c7e57bb7c
commit f1ccd2d8aa
28 changed files with 251 additions and 30 deletions
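
For orientation, a minimal, hedged sketch of how an operator-facing configuration might use the new field, assuming the v1beta1 types added in this commit are available through the k8s.io/kubelet module; the YAML in the comment mirrors the field as it appears in the kubelet config file.

package main

import (
	"fmt"

	kubeletv1beta1 "k8s.io/kubelet/config/v1beta1"
)

func main() {
	// Equivalent kubelet config file snippet (illustrative):
	//
	//   featureGates:
	//     MemoryQoS: true
	//   memoryReservationPolicy: HardReservation
	//
	// HardReservation is only accepted when the MemoryQoS feature gate is on;
	// the default (and the value written by defaulting) is None.
	cfg := kubeletv1beta1.KubeletConfiguration{
		FeatureGates:            map[string]bool{"MemoryQoS": true},
		MemoryReservationPolicy: kubeletv1beta1.HardReservationMemoryReservationPolicy,
	}
	fmt.Println("memoryReservationPolicy:", cfg.MemoryReservationPolicy)
}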

View file

@ -884,6 +884,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
CPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration,
MemoryManagerPolicy: s.MemoryManagerPolicy,
MemoryManagerReservedMemory: s.ReservedMemory,
MemoryReservationPolicy: s.MemoryReservationPolicy,
PodPidsLimit: s.PodPidsLimit,
EnforceCPULimits: s.CPUCFSQuota,
CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration,

View file

@ -71891,6 +71891,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
Format: "double",
},
},
"memoryReservationPolicy": {
SchemaProps: spec.SchemaProps{
Description: "MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection. \"None\" (default): The kubelet does not set memory.min for containers and pods, ensuring no hard memory is locked by the kernel. \"HardReservation\": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests. This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied. See https://kep.k8s.io/2570 for more details. Default: None",
Type: []string{"string"},
Format: "",
},
},
"registerWithTaints": {
SchemaProps: spec.SchemaProps{
Description: "registerWithTaints are an array of taints to add to a node object when the kubelet registers itself. This only takes effect when registerNode is true and upon the initial registration of the node. Default: nil",

View file

@ -122,6 +122,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
}
obj.EnableSystemLogHandler = true
obj.MemoryThrottlingFactor = ptr.To(rand.Float64())
obj.MemoryReservationPolicy = kubeletconfig.NoneMemoryReservationPolicy
obj.LocalStorageCapacityIsolation = true
obj.FeatureGates = map[string]bool{
"AllAlpha": false,

View file

@ -300,6 +300,7 @@ var (
"ShutdownGracePeriod.Duration",
"ShutdownGracePeriodCriticalPods.Duration",
"MemoryThrottlingFactor",
"MemoryReservationPolicy",
"ContainerRuntimeEndpoint",
"ImageServiceEndpoint",
"Tracing.Endpoint",

View file

@ -70,6 +70,7 @@ makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memoryReservationPolicy: None
memorySwap: {}
memoryThrottlingFactor: 0.9
mergeDefaultEvictionSettings: false

View file

@ -70,6 +70,7 @@ makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memoryReservationPolicy: None
memorySwap: {}
memoryThrottlingFactor: 0.9
mergeDefaultEvictionSettings: false

View file

@ -503,6 +503,16 @@ type KubeletConfiguration struct {
// +featureGate=MemoryQoS
// +optional
MemoryThrottlingFactor *float64
// MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection.
// "None" (default): The kubelet does not set memory.min for containers and pods,
// ensuring no hard memory is locked by the kernel.
// "HardReservation": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests.
// This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied.
// See https://kep.k8s.io/2570 for more details.
// Default: None
// +featureGate=MemoryQoS
// +optional
MemoryReservationPolicy MemoryReservationPolicy
// registerWithTaints are an array of taints to add to a node object when
// the kubelet registers itself. This only takes effect when registerNode
// is true and upon the initial registration of the node.
@ -852,6 +862,17 @@ const (
AlwaysVerify ImagePullCredentialsVerificationPolicy = "AlwaysVerify"
)
// MemoryReservationPolicy defines how the kubelet applies cgroup v2 memory protection.
type MemoryReservationPolicy string
const (
// NoneMemoryReservationPolicy disables memory.min protection for containers and pods.
// This is the default to maintain node stability by preventing "locked" memory.
NoneMemoryReservationPolicy MemoryReservationPolicy = "None"
// HardReservationMemoryReservationPolicy enables memory.min for containers and pods.
HardReservationMemoryReservationPolicy MemoryReservationPolicy = "HardReservation"
)
// ImagePullIntent is a record of the kubelet attempting to pull an image.
//
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

View file

@ -41,7 +41,8 @@ const (
DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
DefaultPodLogsDir = "/var/log/pods"
// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
DefaultMemoryThrottlingFactor = 0.9
DefaultMemoryThrottlingFactor = 0.9
DefaultMemoryReservationPolicy = kubeletconfigv1beta1.NoneMemoryReservationPolicy
// MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
MaxContainerBackOff = 300 * time.Second
)
@ -295,6 +296,9 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura
if obj.MemoryThrottlingFactor == nil {
obj.MemoryThrottlingFactor = ptr.To(DefaultMemoryThrottlingFactor)
}
if obj.MemoryReservationPolicy == "" {
obj.MemoryReservationPolicy = DefaultMemoryReservationPolicy
}
if obj.RegisterNode == nil {
obj.RegisterNode = ptr.To(true)
}

View file

@ -128,6 +128,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
@ -265,6 +266,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To[float64](0),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(false),
LocalStorageCapacityIsolation: ptr.To(false),
PodLogsDir: "",
@ -370,6 +372,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To[float64](0),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(false),
LocalStorageCapacityIsolation: ptr.To(false),
PodLogsDir: DefaultPodLogsDir,
@ -531,6 +534,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(true),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To[float64](1),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: "/custom/path",
@ -689,6 +693,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(true),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To[float64](1),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: "/custom/path",
@ -788,6 +793,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
@ -887,6 +893,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
@ -986,6 +993,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
@ -1085,6 +1093,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
SeccompDefault: ptr.To(false),
FailCgroupV1: ptr.To(true),
MemoryThrottlingFactor: ptr.To(DefaultMemoryThrottlingFactor),
MemoryReservationPolicy: v1beta1.NoneMemoryReservationPolicy,
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,

View file

@ -750,6 +750,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
return err
}
out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
out.MemoryReservationPolicy = config.MemoryReservationPolicy(in.MemoryReservationPolicy)
out.RegisterWithTaints = *(*[]corev1.Taint)(unsafe.Pointer(&in.RegisterWithTaints))
if err := v1.Convert_Pointer_bool_To_bool(&in.RegisterNode, &out.RegisterNode, s); err != nil {
return err
@ -954,6 +955,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
return err
}
out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
out.MemoryReservationPolicy = configv1beta1.MemoryReservationPolicy(in.MemoryReservationPolicy)
out.RegisterWithTaints = *(*[]corev1.Taint)(unsafe.Pointer(&in.RegisterWithTaints))
if err := v1.Convert_bool_To_Pointer_bool(&in.RegisterNode, &out.RegisterNode, s); err != nil {
return err

View file

@ -351,6 +351,17 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration, featur
allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *kc.MemoryThrottlingFactor))
}
if !localFeatureGate.Enabled(features.MemoryQoS) &&
kc.MemoryReservationPolicy == kubeletconfig.HardReservationMemoryReservationPolicy {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryReservationPolicy %q requires MemoryQoS feature gate to be enabled",
kc.MemoryReservationPolicy))
}
switch kc.MemoryReservationPolicy {
case kubeletconfig.NoneMemoryReservationPolicy, kubeletconfig.HardReservationMemoryReservationPolicy:
default:
allErrors = append(allErrors, fmt.Errorf("invalid configuration: option %q specified for memoryReservationPolicy. Valid options are %q or %q", kc.MemoryReservationPolicy, kubeletconfig.NoneMemoryReservationPolicy, kubeletconfig.HardReservationMemoryReservationPolicy))
}
if kc.ContainerRuntimeEndpoint == "" {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: the containerRuntimeEndpoint was not specified or empty"))
}

View file

@ -71,6 +71,7 @@ var (
ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
MemoryThrottlingFactor: ptr.To(0.9),
MemoryReservationPolicy: kubeletconfig.NoneMemoryReservationPolicy,
FeatureGates: map[string]bool{
"GracefulNodeShutdown": true,
"MemoryQoS": true,
@ -563,6 +564,21 @@ func TestValidateKubeletConfiguration(t *testing.T) {
return conf
},
errMsg: "invalid configuration: memoryThrottlingFactor 1.1 must be greater than 0 and less than or equal to 1.0",
}, {
name: "MemoryReservationPolicy requires MemoryQoS",
configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration {
conf.FeatureGates = map[string]bool{"MemoryQoS": false}
conf.MemoryReservationPolicy = kubeletconfig.HardReservationMemoryReservationPolicy
return conf
},
errMsg: "invalid configuration: memoryReservationPolicy \"HardReservation\" requires MemoryQoS feature gate to be enabled",
}, {
name: "invalid MemoryReservationPolicy",
configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration {
conf.MemoryReservationPolicy = "invalid"
return conf
},
errMsg: "invalid configuration: option \"invalid\" specified for memoryReservationPolicy. Valid options are \"None\" or \"HardReservation\"",
}, {
name: "invalid Taint.TimeAdded",
configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration {

View file

@ -193,6 +193,7 @@ type NodeConfig struct {
CPUManagerReconcilePeriod time.Duration
MemoryManagerPolicy string
MemoryManagerReservedMemory []kubeletconfig.MemoryReservation
MemoryReservationPolicy kubeletconfig.MemoryReservationPolicy
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration

View file

@ -406,8 +406,9 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
enforceCPULimits: cm.EnforceCPULimits,
// cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is time.Duration (measured in nano seconds).
// Convert (cm.CPUCFSQuotaPeriod) [nanoseconds] / time.Microsecond (1000) to get cpuCFSQuotaPeriod in microseconds.
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
podContainerManager: cm,
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
podContainerManager: cm,
memoryReservationPolicy: cm.MemoryReservationPolicy,
}
}
return &podContainerManagerNoop{

View file

@ -25,6 +25,7 @@ import (
internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
@ -32,7 +33,7 @@ import (
// for typecheck across platforms
var _ func(int64, int64) int64 = MilliCPUToQuota
var _ func(int64) uint64 = MilliCPUToShares
var _ func(*v1.Pod, bool, uint64, bool) *ResourceConfig = ResourceConfigForPod
var _ func(*v1.Pod, bool, uint64, bool, kubeletconfig.MemoryReservationPolicy) *ResourceConfig = ResourceConfigForPod
var _ func() (*CgroupSubsystems, error) = GetCgroupSubsystems
var _ func(string) ([]int, error) = getCgroupProcs
var _ func(types.UID) string = GetPodCgroupNameSuffix

View file

@ -34,6 +34,7 @@ import (
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm/util"
)
@ -123,7 +124,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool, memoryReservationPolicy kubeletconfig.MemoryReservationPolicy) *ResourceConfig {
podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodLevelResources)
// sum requests and limits.
reqs := resourcehelper.PodRequests(allocatedPod, resourcehelper.PodResourcesOptions{
@ -207,7 +208,7 @@ func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod
}
result.HugePageLimit = hugePageLimits
if enforceMemoryQoS {
if enforceMemoryQoS && memoryReservationPolicy == kubeletconfig.HardReservationMemoryReservationPolicy {
memoryMin := int64(0)
if request, found := reqs[v1.ResourceMemory]; found {
memoryMin = request.Value()
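
To make the effect of the new parameter concrete, a self-contained sketch with simplified types (not the kubelet's real signatures): the pod cgroup only receives memory.min when MemoryQoS enforcement is on and the policy is HardReservation, in which case it is derived from the aggregated pod memory requests.

package main

import "fmt"

// podMemoryMin is an illustrative stand-in for the memory.min portion of
// ResourceConfigForPod: it returns the value to write and whether to write it.
func podMemoryMin(enforceMemoryQoS, hardReservation bool, containerRequestBytes []int64) (int64, bool) {
	if !enforceMemoryQoS || !hardReservation {
		return 0, false // memory.min is not written to the pod cgroup
	}
	var sum int64
	for _, r := range containerRequestBytes {
		sum += r
	}
	return sum, sum > 0
}

func main() {
	reqs := []int64{128 << 20, 64 << 20} // 128Mi + 64Mi
	fmt.Println(podMemoryMin(true, true, reqs))  // 201326592 true
	fmt.Println(podMemoryMin(true, false, reqs)) // 0 false
}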

View file

@ -29,6 +29,7 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
featuregatetesting "k8s.io/component-base/featuregate/testing"
pkgfeatures "k8s.io/kubernetes/pkg/features"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
// getResourceList returns a ResourceList with the
@ -482,7 +483,7 @@ func TestResourceConfigForPod(t *testing.T) {
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.PodLevelResources, testCase.podLevelResourcesEnabled)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false, kubeletconfig.NoneMemoryReservationPolicy)
if !reflect.DeepEqual(actual.CPUPeriod, testCase.expected.CPUPeriod) {
t.Errorf("cpu period not as expected. Expected: %v, Actual:%v", *testCase.expected.CPUPeriod, *actual.CPUPeriod)
}
@ -641,7 +642,7 @@ func TestHugePageLimits(t *testing.T) {
},
},
}
resultValuePod := ResourceConfigForPod(&p, false, 0, false)
resultValuePod := ResourceConfigForPod(&p, false, 0, false, kubeletconfig.NoneMemoryReservationPolicy)
if !reflect.DeepEqual(testcase.expected, resultValuePod.HugePageLimit) {
t.Errorf("unexpected result for ResourceConfigForPod(), expected: %v, actual: %v", testcase.expected, resultValuePod)
}
@ -849,7 +850,7 @@ func TestResourceConfigForPodWithEnforceMemoryQoS(t *testing.T) {
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true, kubeletconfig.HardReservationMemoryReservationPolicy)
if !reflect.DeepEqual(actual.Unified, testCase.expected.Unified) {
t.Errorf("unexpected result, test: %v, unified not as expected", testName)

View file

@ -23,6 +23,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
const (
@ -48,7 +49,7 @@ func MilliCPUToShares(milliCPU int64) uint64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool, memoryReservationPolicy kubeletconfig.MemoryReservationPolicy) *ResourceConfig {
return nil
}

View file

@ -31,6 +31,7 @@ import (
"k8s.io/klog/v2"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
const (
@ -57,6 +58,8 @@ type podContainerManagerImpl struct {
cpuCFSQuotaPeriod uint64
// podContainerManager is the ContainerManager running on the machine
podContainerManager ContainerManager
// memoryReservationPolicy controls memory reservation protection behavior
memoryReservationPolicy kubeletconfig.MemoryReservationPolicy
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
@ -89,7 +92,7 @@ func (m *podContainerManagerImpl) EnsureExists(logger klog.Logger, pod *v1.Pod)
podContainerName, _ := m.GetPodContainerName(pod)
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS, m.memoryReservationPolicy),
}
if m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit

View file

@ -37,6 +37,7 @@ import (
"k8s.io/component-helpers/resource"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
const (
@ -53,13 +54,14 @@ type QOSContainerManager interface {
type qosContainerManagerImpl struct {
sync.Mutex
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
getNodeAllocatable func() v1.ResourceList
cgroupRoot CgroupName
qosReserved map[v1.ResourceName]int64
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
getNodeAllocatable func() v1.ResourceList
cgroupRoot CgroupName
qosReserved map[v1.ResourceName]int64
memoryReservationPolicy kubeletconfig.MemoryReservationPolicy
}
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
@ -70,10 +72,11 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName,
}
return &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.QOSReserved,
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.QOSReserved,
memoryReservationPolicy: nodeConfig.MemoryReservationPolicy,
}, nil
}
@ -296,6 +299,12 @@ func (m *qosContainerManagerImpl) setMemoryQoS(logger klog.Logger, configs map[v
logger.V(4).Info("MemoryQoS config for qos", "qos", qos, "memoryMin", memoryMin)
}
if m.memoryReservationPolicy != kubeletconfig.HardReservationMemoryReservationPolicy {
setMemoryMin(v1.PodQOSGuaranteed, 0)
setMemoryMin(v1.PodQOSBurstable, 0)
return
}
qosMemoryRequests := m.getQoSMemoryRequests()
// Calculate the memory.min:
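
A minimal sketch of the QoS-level reset added above, using plain maps in place of the kubelet's CgroupConfig types: under any policy other than HardReservation, the Guaranteed and Burstable QoS cgroups have memory.min written back to "0", so a value left over from an earlier HardReservation run is cleared rather than silently kept.

package main

import "fmt"

// resetQoSMemoryMin is an illustrative stand-in for the early-return branch
// in setMemoryQoS: configs maps a QoS class name to its cgroup v2 "unified"
// key/value settings.
func resetQoSMemoryMin(configs map[string]map[string]string) {
	for _, qos := range []string{"guaranteed", "burstable"} {
		if configs[qos] == nil {
			configs[qos] = map[string]string{}
		}
		configs[qos]["memory.min"] = "0"
	}
}

func main() {
	configs := map[string]map[string]string{
		"guaranteed": {"memory.min": "1234"}, // stale value from a prior HardReservation run
		"burstable":  nil,
	}
	resetQoSMemoryMin(configs)
	fmt.Println(configs["guaranteed"]["memory.min"], configs["burstable"]["memory.min"]) // 0 0
}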

View file

@ -36,6 +36,7 @@ import (
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/klog/v2"
"k8s.io/klog/v2/ktesting"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
func activeTestPods() []*v1.Pod {
@ -124,10 +125,11 @@ func createTestQOSContainerManager(logger klog.Logger) (*qosContainerManagerImpl
cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
qosContainerManager := &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: NewCgroupManager(logger, subsystems, "cgroupfs"),
cgroupRoot: cgroupRoot,
qosReserved: nil,
subsystems: subsystems,
cgroupManager: NewCgroupManager(logger, subsystems, "cgroupfs"),
cgroupRoot: cgroupRoot,
qosReserved: nil,
memoryReservationPolicy: kubeletconfig.NoneMemoryReservationPolicy,
}
qosContainerManager.activePods = activeTestPods
@ -206,6 +208,8 @@ func TestQoSContainerCgroup(t *testing.T) {
logger, _ := ktesting.NewTestContext(t)
m, err := createTestQOSContainerManager(logger)
require.NoError(t, err)
// Set memory reservation policy to HardReservation to enable memory.min
m.memoryReservationPolicy = kubeletconfig.HardReservationMemoryReservationPolicy
m.activePods = func() []*v1.Pod { return tc.pods }
guaranteedUnified := map[string]string{}
@ -244,6 +248,62 @@ func TestQoSContainerCgroup(t *testing.T) {
}
}
func TestQoSContainerCgroupWithMemoryReservationPolicyNone(t *testing.T) {
tests := []struct {
name string
initialGuaranteed map[string]string
initialBurstable map[string]string
}{
{
name: "explicitly resets memory.min to 0 when unset",
initialGuaranteed: nil,
initialBurstable: nil,
},
{
name: "sets memory.min to zero when MemoryReservationPolicyNone ",
initialGuaranteed: map[string]string{
Cgroup2MemoryMin: "1234",
},
initialBurstable: map[string]string{
Cgroup2MemoryMin: "5678",
},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
logger, _ := ktesting.NewTestContext(t)
m, err := createTestQOSContainerManager(logger)
require.NoError(t, err)
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSGuaranteed: {
Name: m.qosContainersInfo.Guaranteed,
ResourceParameters: &ResourceConfig{
Unified: tc.initialGuaranteed,
},
},
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{
Unified: tc.initialBurstable,
},
},
v1.PodQOSBestEffort: {
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
m.setMemoryQoS(logger, qosConfigs)
assert.Equal(t, "0", qosConfigs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin])
assert.Equal(t, "0", qosConfigs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin])
assert.NotContains(t, qosConfigs[v1.PodQOSBestEffort].ResourceParameters.Unified, Cgroup2MemoryMin)
})
}
}
// fakeCgroupManager is used because Start() requires a functional
// CgroupManager. All methods are stubbed so that Start() can
// complete successfully without using real cgroups.

View file

@ -796,6 +796,7 @@ func NewMainKubelet(ctx context.Context,
kubeCfg.MemorySwap.SwapBehavior,
kubeDeps.ContainerManager.GetNodeAllocatableAbsolute,
*kubeCfg.MemoryThrottlingFactor,
kubeCfg.MemoryReservationPolicy,
klet.podStartupLatencyTracker,
kubeDeps.TracerProvider,
tokenManager,

View file

@ -3638,6 +3638,7 @@ func TestSyncPodSpans(t *testing.T) {
kubeCfg.MemorySwap.SwapBehavior,
kubelet.containerManager.GetNodeAllocatableAbsolute,
*kubeCfg.MemoryThrottlingFactor,
kubeCfg.MemoryReservationPolicy,
kubelet.podStartupLatencyTracker,
tp,
token.NewManager(kubelet.kubeClient),

View file

@ -42,6 +42,7 @@ import (
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/qos"
@ -154,8 +155,10 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(ctx context.
unified := map[string]string{}
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
if memoryRequest != 0 {
if memoryRequest != 0 && m.memoryReservationPolicy == kubeletconfiginternal.HardReservationMemoryReservationPolicy {
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
} else {
unified[cm.Cgroup2MemoryMin] = "0"
}
// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
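
And a sketch of the container-level branch above in isolation (illustrative only, not the kubelet's real signature): under HardReservation a nonzero memory request is propagated into the CRI unified field memory.min, while any other case pins it to "0"; memory.high is still computed separately by MemoryQoS, which is why the None policy only drops the reservation, not the throttling.

package main

import (
	"fmt"
	"strconv"
)

// containerMemoryMin mirrors the added branch in simplified form: it returns
// the string written to the container's cgroup v2 memory.min via the CRI
// unified resources map.
func containerMemoryMin(hardReservation bool, memoryRequestBytes int64) string {
	if hardReservation && memoryRequestBytes != 0 {
		return strconv.FormatInt(memoryRequestBytes, 10)
	}
	return "0"
}

func main() {
	fmt.Println(containerMemoryMin(true, 128<<20))  // "134217728"
	fmt.Println(containerMemoryMin(false, 128<<20)) // "0"
}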

View file

@ -41,6 +41,7 @@ import (
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/kubernetes/pkg/apis/scheduling"
"k8s.io/kubernetes/pkg/features"
kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/types"
@ -558,6 +559,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
tCtx := ktesting.Init(t)
_, _, m, err := createTestRuntimeManager(tCtx)
assert.NoError(t, err)
m.memoryReservationPolicy = kubeletconfiginternal.HardReservationMemoryReservationPolicy
podRequestMemory := resource.MustParse("128Mi")
pod1LimitMemory := resource.MustParse("256Mi")
@ -661,6 +663,41 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
}
}
func TestGenerateContainerConfigMemoryQoSPolicyNone(t *testing.T) {
tCtx := ktesting.Init(t)
_, _, m, err := createTestRuntimeManager(tCtx)
require.NoError(t, err)
m.memoryReservationPolicy = kubeletconfiginternal.NoneMemoryReservationPolicy
podRequestMemory := resource.MustParse("128Mi")
podLimitMemory := resource.MustParse("256Mi")
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{v1.ResourceMemory: podRequestMemory},
Limits: v1.ResourceList{v1.ResourceMemory: podLimitMemory},
},
},
},
},
}
linuxConfig, err := m.generateLinuxContainerConfig(tCtx, &pod.Spec.Containers[0], pod, new(int64), "", nil, true)
require.NoError(t, err)
assert.Equal(t, "0", linuxConfig.GetResources().GetUnified()["memory.min"])
assert.NotEmpty(t, linuxConfig.GetResources().GetUnified()["memory.high"])
}
func TestGetHugepageLimitsFromResources(t *testing.T) {
tCtx := ktesting.Init(t)
var baseHugepage []*runtimeapi.HugepageLimit
@ -2012,7 +2049,7 @@ func TestGenerateUpdatePodSandboxResourcesRequest(t *testing.T) {
expectedLcr := m.calculateSandboxResources(tCtx, tc.pod)
expectedLcrOverhead := m.convertOverheadToLinuxResources(tc.pod)
podResourcesCfg := cm.ResourceConfigForPod(tc.pod, tc.enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false)
podResourcesCfg := cm.ResourceConfigForPod(tc.pod, tc.enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false, kubeletconfiginternal.NoneMemoryReservationPolicy)
assert.NotNil(t, podResourcesCfg, "podResourcesCfg is expected to be not nil")
if podResourcesCfg.CPUPeriod == nil {

View file

@ -189,6 +189,8 @@ type kubeGenericRuntimeManager struct {
// Memory throttling factor for MemoryQoS
memoryThrottlingFactor float64
// Memory reservation policy for MemoryQoS memory.min behavior
memoryReservationPolicy kubeletconfiginternal.MemoryReservationPolicy
// Root directory used to store pod logs
podLogsDirectory string
@ -244,6 +246,7 @@ func NewKubeGenericRuntimeManager(
memorySwapBehavior string,
getNodeAllocatable func() v1.ResourceList,
memoryThrottlingFactor float64,
memoryReservationPolicy kubeletconfiginternal.MemoryReservationPolicy,
podPullingTimeRecorder images.ImagePodPullingTimeRecorder,
tracerProvider trace.TracerProvider,
tokenManager *token.Manager,
@ -278,6 +281,7 @@ func NewKubeGenericRuntimeManager(
memorySwapBehavior: memorySwapBehavior,
getNodeAllocatable: getNodeAllocatable,
memoryThrottlingFactor: memoryThrottlingFactor,
memoryReservationPolicy: memoryReservationPolicy,
podLogsDirectory: podLogsDirectory,
podInitContainerTimeRecorder: podInitContainerTimeRecorder,
}
@ -824,7 +828,7 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
enforceCPULimits = false
logger.V(2).Info("Disabled CFS quota", "pod", klog.KObj(pod))
}
podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false)
podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false, kubeletconfiginternal.NoneMemoryReservationPolicy)
if podResources == nil {
logger.Error(nil, "Unable to get resource configuration", "pod", klog.KObj(pod))
resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get resource configuration processing resize for pod %q", format.Pod(pod)))

View file

@ -35,6 +35,7 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
resourcehelper "k8s.io/component-helpers/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm"
usernamespacefeature "k8s.io/kubernetes/pkg/kubelet/userns"
"k8s.io/kubernetes/pkg/volume"
@ -119,7 +120,7 @@ func calculateEmptyDirMemorySize(nodeAllocatableMemory *resource.Quantity, spec
// determine pod resource allocation
// we use the same function for pod cgroup assignment to maintain consistent behavior
// NOTE: this could be nil on systems that do not support pod memory containment (i.e. windows)
podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false)
podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false, config.NoneMemoryReservationPolicy)
if podResourceConfig != nil && podResourceConfig.Memory != nil {
podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI)
// ensure 0 < value < size

View file

@ -102,6 +102,17 @@ const (
AlwaysVerify ImagePullCredentialsVerificationPolicy = "AlwaysVerify"
)
// MemoryReservationPolicy defines how the kubelet applies cgroup v2 memory protection.
type MemoryReservationPolicy string
const (
// NoneMemoryReservationPolicy disables memory.min protection for containers and pods.
// This is the default to maintain node stability by preventing "locked" memory.
NoneMemoryReservationPolicy MemoryReservationPolicy = "None"
// HardReservationMemoryReservationPolicy enables memory.min for containers and pods.
HardReservationMemoryReservationPolicy MemoryReservationPolicy = "HardReservation"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// KubeletConfiguration contains the configuration for the Kubelet
@ -888,6 +899,16 @@ type KubeletConfiguration struct {
// +featureGate=MemoryQoS
// +optional
MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`
// MemoryReservationPolicy controls how the kubelet applies cgroup v2 memory protection.
// "None" (default): The kubelet does not set memory.min for containers and pods,
// ensuring no hard memory is locked by the kernel.
// "HardReservation": The kubelet sets the cgroup v2 memory.min value based on pod and container memory requests.
// This ensures the requested memory is never reclaimed by the kernel, but may trigger an OOM if the reservation cannot be satisfied.
// See https://kep.k8s.io/2570 for more details.
// Default: None
// +featureGate=MemoryQoS
// +optional
MemoryReservationPolicy MemoryReservationPolicy `json:"memoryReservationPolicy,omitempty"`
// registerWithTaints are an array of taints to add to a node object when
// the kubelet registers itself. This only takes effect when registerNode
// is true and upon the initial registration of the node.