/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package noderesources

import (
	"context"
	"strings"
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/dynamic-resource-allocation/cel"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"

	resourceapi "k8s.io/api/resource/v1"
	resourcehelper "k8s.io/component-helpers/resource"
	"k8s.io/dynamic-resource-allocation/structured"
	fwk "k8s.io/kube-scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)

// scorer is a decorator for resourceAllocationScorer.
type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer

// DRACaches holds various caches used for DRA-related computations.
type DRACaches struct {
	// celCache is a cache for compiled CEL expressions used in device class selectors.
	celCache *cel.Cache
	// Cache for DeviceMatches results to avoid expensive repeated evaluations.
	deviceMatchCache sync.Map // map[deviceMatchCacheKey]bool
	// Cache for NodeMatches results to avoid expensive repeated node selector evaluations.
	nodeMatchCache sync.Map // map[nodeMatchCacheKey]bool
}

// resourceAllocationScorer contains information to calculate resource allocation score.
type resourceAllocationScorer struct {
	Name                                          string
	enableInPlacePodVerticalScaling               bool
	enablePodLevelResources                       bool
	enableDRAExtendedResource                     bool
	enableInPlacePodLevelResourcesVerticalScaling bool
	// used to decide whether to use Requested or NonZeroRequested for
	// cpu and memory.
	useRequested bool
	scorer       func(requested, allocable []int64) int64
	resources    []config.ResourceSpec
	draFeatures  structured.Features
	draManager   fwk.SharedDRAManager
	// Caches for DRA-related computations.
	DRACaches
}

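// The scoring strategies themselves are defined elsewhere in this package. As an
// illustration only (an assumed sketch, not the actual implementation), a
// least-allocated style function matching the scorer field above could look like:
//
//	func exampleLeastAllocated(requested, allocable []int64) int64 {
//		var sum, count int64
//		for i := range allocable {
//			if allocable[i] == 0 {
//				continue
//			}
//			free := allocable[i] - requested[i]
//			if free < 0 {
//				free = 0
//			}
//			sum += free * 100 / allocable[i] // scale each resource to 0-100
//			count++
//		}
//		if count == 0 {
//			return 0
//		}
//		return sum / count
//	}
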
// buildNodeMatchCacheKey creates a string cache key for node matching results.
// Using a string key is significantly faster than struct keys with sync.Map.
func buildNodeMatchCacheKey(nodeName string, nodeNameToMatch string, allNodesMatch bool, nodeSelectorHash string) string {
	// Pre-allocate sufficient capacity to avoid reallocation.
	var b strings.Builder
	b.Grow(len(nodeName) + len(nodeNameToMatch) + len(nodeSelectorHash) + 4)

	b.WriteString(nodeName)
	b.WriteByte('|')
	b.WriteString(nodeNameToMatch)
	b.WriteByte('|')
	if allNodesMatch {
		b.WriteByte('1')
	} else {
		b.WriteByte('0')
	}
	b.WriteByte('|')
	b.WriteString(nodeSelectorHash)

	return b.String()
}

// buildDeviceMatchCacheKey creates a string cache key for device matching results.
// Using a string key is significantly faster than struct keys with sync.Map.
// This concatenates expression|driver|poolName|deviceName with pipe separators.
func buildDeviceMatchCacheKey(expression string, driver string, poolName string, deviceName string) string {
	// Pre-allocate sufficient capacity to avoid reallocation.
	var b strings.Builder
	b.Grow(len(expression) + len(driver) + len(poolName) + len(deviceName) + 3)

	b.WriteString(expression)
	b.WriteByte('|')
	b.WriteString(driver)
	b.WriteByte('|')
	b.WriteString(poolName)
	b.WriteByte('|')
	b.WriteString(deviceName)

	return b.String()
}

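// For illustration only (hypothetical driver, pool, and device names):
//
//	buildDeviceMatchCacheKey(`device.driver == "gpu.example.com"`, "gpu.example.com", "pool-a", "dev-0")
//
// produces the key
//
//	device.driver == "gpu.example.com"|gpu.example.com|pool-a|dev-0
//
// Keys are only compared for equality by the sync.Map lookups below; they are
// never parsed back into their components.
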
// nodeMatches is a cached wrapper around structured.NodeMatches.
func (r *resourceAllocationScorer) nodeMatches(node *v1.Node, nodeNameToMatch string, allNodesMatch bool, nodeSelector *v1.NodeSelector) (bool, error) {

	var nodeName string
	if node != nil {
		nodeName = node.Name
	}

	nodeSelectorStr := nodeSelector.String()
	key := buildNodeMatchCacheKey(nodeName, nodeNameToMatch, allNodesMatch, nodeSelectorStr)

	// Check cache first.
	if matches, ok := r.nodeMatchCache.Load(key); ok {
		return matches.(bool), nil
	}

	// Call the original function.
	matches, err := structured.NodeMatches(r.draFeatures, node, nodeNameToMatch, allNodesMatch, nodeSelector)

	// Cache the result only when evaluation succeeded, so transient errors can be retried.
	if err == nil {
		r.nodeMatchCache.Store(key, matches)
	}

	return matches, err
}

// score will use the `scorer` function to calculate the score.
func (r *resourceAllocationScorer) score(
	ctx context.Context,
	pod *v1.Pod,
	nodeInfo fwk.NodeInfo,
	podRequests []int64,
	draPreScoreState *draPreScoreState,
) (int64, *fwk.Status) {
	logger := klog.FromContext(ctx)
	node := nodeInfo.Node()

	// No resources configured to score; treat this as a configuration error.
	if len(r.resources) == 0 {
		return 0, fwk.NewStatus(fwk.Error, "resources not found")
	}

	requested := make([]int64, len(r.resources))
	allocatable := make([]int64, len(r.resources))
	for i := range r.resources {
		alloc, req := r.calculateResourceAllocatableRequest(ctx, nodeInfo, v1.ResourceName(r.resources[i].Name), podRequests[i], draPreScoreState)
		// Only fill the extended resource entry when it's non-zero.
		if alloc == 0 {
			continue
		}
		allocatable[i] = alloc
		requested[i] = req
	}

	score := r.scorer(requested, allocatable)

	if loggerV := logger.V(10); loggerV.Enabled() { // Serializing these maps is costly.
		loggerV.Info("Listed internal info for allocatable resources, requested resources and score", "pod",
			klog.KObj(pod), "node", klog.KObj(node), "resourceAllocationScorer", r.Name,
			"allocatableResource", allocatable, "requestedResource", requested, "resourceScore", score,
		)
	}

	return score, nil
}

// calculateResourceAllocatableRequest returns two values:
// - 1st: the quantity of the allocatable resource on the node.
// - 2nd: the aggregated quantity of the requested resource on the node.
// Note: if it's an extended resource, and the pod doesn't request it, (0, 0) is returned.
func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(
	ctx context.Context,
	nodeInfo fwk.NodeInfo,
	resource v1.ResourceName,
	podRequest int64,
	draPreScoreState *draPreScoreState,
) (int64, int64) {
	requested := nodeInfo.GetNonZeroRequested()
	if r.useRequested {
		requested = nodeInfo.GetRequested()
	}

	// If it's an extended resource and the pod doesn't request it, return (0, 0)
	// to signal that scoring should be bypassed for this resource.
	if podRequest == 0 && schedutil.IsScalarResourceName(resource) {
		return 0, 0
	}
	switch resource {
	case v1.ResourceCPU:
		return nodeInfo.GetAllocatable().GetMilliCPU(), (requested.GetMilliCPU() + podRequest)
	case v1.ResourceMemory:
		return nodeInfo.GetAllocatable().GetMemory(), (requested.GetMemory() + podRequest)
	case v1.ResourceEphemeralStorage:
		return nodeInfo.GetAllocatable().GetEphemeralStorage(), (nodeInfo.GetRequested().GetEphemeralStorage() + podRequest)
	default:
		allocatable, exists := nodeInfo.GetAllocatable().GetScalarResources()[resource]
		if allocatable == 0 && r.enableDRAExtendedResource && draPreScoreState != nil {
			// Allocatable 0 means that this resource is not handled by a device plugin.
			// Calculate allocatable and requested for resources backed by DRA.
			allocatable, allocated := r.calculateDRAExtendedResourceAllocatableRequest(ctx, nodeInfo.Node(), resource, draPreScoreState)
			if allocatable > 0 {
				return allocatable, allocated + podRequest
			}
		}
		if exists {
			return allocatable, (nodeInfo.GetRequested().GetScalarResources()[resource] + podRequest)
		}
	}
	klog.FromContext(ctx).V(10).Info("Requested resource is omitted for node score calculation", "resourceName", resource)
	return 0, 0
}

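// For illustration only (arbitrary numbers): for v1.ResourceCPU on a node with
// 4000m allocatable, 1500m already requested, and a pod requesting 500m, the
// function above returns (4000, 2000); the scorer then compares requested against
// allocatable per resource.
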
// calculatePodResourceRequest returns the total non-zero requests. If Overhead is defined for the pod,
// it is added to the result.
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {

	opts := resourcehelper.PodResourcesOptions{
		UseStatusResources: r.enableInPlacePodVerticalScaling,
		InPlacePodLevelResourcesVerticalScalingEnabled: r.enableInPlacePodLevelResourcesVerticalScaling,
		// SkipPodLevelResources is set to false when the PodLevelResources feature is enabled.
		SkipPodLevelResources: !r.enablePodLevelResources,
	}

	if !r.useRequested {
		opts.NonMissingContainerRequests = v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(schedutil.DefaultMilliCPURequest, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(schedutil.DefaultMemoryRequest, resource.DecimalSI),
		}
	}

	requests := resourcehelper.PodRequests(pod, opts)

	quantity := requests[resourceName]
	if resourceName == v1.ResourceCPU {
		return quantity.MilliValue()
	}
	return quantity.Value()
}

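// Note on defaulting: when useRequested is false, containers that omit a CPU or
// memory request are treated as requesting schedutil.DefaultMilliCPURequest and
// schedutil.DefaultMemoryRequest via NonMissingContainerRequests above. As an
// illustration (hypothetical pod), a single container with no requests at all
// still contributes those default CPU and memory amounts to its pod's totals.
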
// calculatePodResourceRequestList returns the pod's request for each resource in
// resources, in the same order.
func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, resources []config.ResourceSpec) []int64 {
	podRequests := make([]int64, len(resources))
	for i := range resources {
		podRequests[i] = r.calculatePodResourceRequest(pod, v1.ResourceName(resources[i].Name))
	}
	return podRequests
}

// isBestEffortPod reports whether the pod requests none of the scored resources.
func (r *resourceAllocationScorer) isBestEffortPod(podRequests []int64) bool {
	for _, request := range podRequests {
		if request != 0 {
			return false
		}
	}
	return true
}

// getDRAPreScoredParams returns the DRA allocated state and resource slices for DRA extended resource scoring.
func getDRAPreScoredParams(draManager fwk.SharedDRAManager, resources []config.ResourceSpec) (*draPreScoreState, *fwk.Status) {
	anyBackedByDRA := false
	for _, resource := range resources {
		resourceName := v1.ResourceName(resource.Name)
		if !schedutil.IsDRAExtendedResourceName(resourceName) {
			continue
		}
		deviceClass := draManager.DeviceClassResolver().GetDeviceClass(resourceName)
		if deviceClass != nil {
			anyBackedByDRA = true
			break
		}
	}
	// There's no point in returning DRA data as there are no resources backed by DRA.
	if !anyBackedByDRA {
		return nil, nil
	}

	allocatedState, err := draManager.ResourceClaims().GatherAllocatedState()
	if err != nil {
		return nil, fwk.AsStatus(err)
	}
	resourceSlices, err := draManager.ResourceSlices().ListWithDeviceTaintRules()
	if err != nil {
		return nil, fwk.AsStatus(err)
	}

	return &draPreScoreState{
		allocatedState: allocatedState,
		resourceSlices: resourceSlices,
	}, nil
}

// calculateDRAExtendedResourceAllocatableRequest calculates allocatable and allocated
// quantities for extended resources backed by DRA.
func (r *resourceAllocationScorer) calculateDRAExtendedResourceAllocatableRequest(
	ctx context.Context,
	node *v1.Node,
	resource v1.ResourceName,
	draPreScoreState *draPreScoreState,
) (int64, int64) {
	logger := klog.FromContext(ctx)
	deviceClass := r.draManager.DeviceClassResolver().GetDeviceClass(resource)
	if deviceClass == nil {
		// This resource is not backed by DRA.
		logger.V(7).Info("Extended resource not found in device class mapping", "resource", resource)
		return 0, 0
	}

	capacity, allocated, err := r.calculateDRAResourceTotals(ctx, node, deviceClass, draPreScoreState.allocatedState, draPreScoreState.resourceSlices)
	if err != nil {
		logger.Error(err, "Failed to calculate DRA resource capacity and allocated", "node", node.Name, "resource", resource, "deviceClass", deviceClass.Name)
		return 0, 0
	}

	logger.V(7).Info("DRA extended resource calculation", "node", node.Name, "resource", resource, "deviceClass", deviceClass.Name, "capacity", capacity, "allocated", allocated)
	return capacity, allocated
}

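// For illustration only (hypothetical names): an extended resource such as
// example.com/gpu resolves, via the DeviceClassResolver, to the DeviceClass that
// advertises that extended resource name. Its capacity on a node is the number of
// devices matching that class, and "allocated" is how many of them are already
// claimed, as computed below.
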
// calculateDRAResourceTotals computes the total capacity and total allocated count of devices
// matching the specified DeviceClass on the given node. The caller supplies the gathered
// allocated state and the resource slices; this function filters devices by class and driver
// and returns the counts. It returns an error if node matching or selector evaluation fails.
//
// Parameters:
//
//	ctx - context for cancellation and deadlines
//	node - the node to evaluate device resources on
//	deviceClass - the device class to filter devices by
//	allocatedState - the allocated-device state gathered from the DRA manager
//	resourceSlices - the resource slices to scan for matching devices
//
// Returns:
//
//	totalCapacity - total number of devices matching the device class on the node
//	totalAllocated - number of devices currently allocated from the matching set
//	error - any error encountered during processing
func (r *resourceAllocationScorer) calculateDRAResourceTotals(ctx context.Context, node *v1.Node, deviceClass *resourceapi.DeviceClass, allocatedState *structured.AllocatedState, resourceSlices []*resourceapi.ResourceSlice,
) (int64, int64, error) {
	var totalCapacity, totalAllocated int64
	nodeName := node.Name

	for _, slice := range resourceSlices {
		// Early filtering: check if the slice applies to this node.
		perDeviceNodeSelection := ptr.Deref(slice.Spec.PerDeviceNodeSelection, false)

		var devices []resourceapi.Device

		if perDeviceNodeSelection {
			// Per-device node selection: filter devices individually.
			devices = make([]resourceapi.Device, 0, len(slice.Spec.Devices))
			for _, device := range slice.Spec.Devices {
				deviceNodeName := ptr.Deref(device.NodeName, "")
				deviceAllNodes := ptr.Deref(device.AllNodes, false)

				// Fast path: check AllNodes or an exact name match first.
				if deviceAllNodes || (deviceNodeName != "" && deviceNodeName == nodeName) {
					devices = append(devices, device)
					continue
				}

				// Slow path: only if we have a node selector.
				if device.NodeSelector != nil {
					deviceMatches, err := r.nodeMatches(node, deviceNodeName, deviceAllNodes, device.NodeSelector)
					if err != nil {
						return 0, 0, err
					}
					if deviceMatches {
						devices = append(devices, device)
					}
				}
			}
		} else {
			// Slice-level node selection.
			sliceNodeName := ptr.Deref(slice.Spec.NodeName, "")
			sliceAllNodes := ptr.Deref(slice.Spec.AllNodes, false)

			// Fast path: check AllNodes or an exact name match first.
			if !sliceAllNodes && sliceNodeName != nodeName && slice.Spec.NodeSelector != nil {
				// Need to check the node selector.
				matches, err := r.nodeMatches(node, sliceNodeName, sliceAllNodes, slice.Spec.NodeSelector)
				if err != nil {
					return 0, 0, err
				}
				if !matches {
					continue // Skip this slice.
				}
			} else if !sliceAllNodes && sliceNodeName != "" && sliceNodeName != nodeName {
				// Node name specified but doesn't match.
				continue
			}

			devices = slice.Spec.Devices
		}

		// Fast path for a device class with no selectors.
		if len(deviceClass.Spec.Selectors) == 0 {
			driver := slice.Spec.Driver
			pool := slice.Spec.Pool.Name
			for _, device := range devices {
				totalCapacity++
				deviceID := structured.MakeDeviceID(driver, pool, device.Name)
				if structured.IsDeviceAllocated(deviceID, allocatedState) {
					totalAllocated++
				}
			}
		} else {
			// Slow path: check the device class selectors.
			driver := slice.Spec.Driver
			pool := slice.Spec.Pool.Name
			for _, device := range devices {
				matches, err := r.deviceMatchesClass(ctx, device, deviceClass, driver, pool)
				if err != nil {
					return 0, 0, err
				}
				if matches {
					totalCapacity++
					deviceID := structured.MakeDeviceID(driver, pool, device.Name)
					if structured.IsDeviceAllocated(deviceID, allocatedState) {
						totalAllocated++
					}
				}
			}
		}
	}

	return totalCapacity, totalAllocated, nil
}

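// For illustration only (hypothetical driver and attribute names): a DeviceClass
// used for extended resources might carry a CEL selector such as
//
//	device.driver == "gpu.example.com" && device.attributes["gpu.example.com"].model == "a100"
//
// deviceMatchesClass evaluates each such expression against a device's driver,
// attributes, and capacity, and caches match results in deviceMatchCache so the
// same (expression, driver, pool, device) tuple is not re-evaluated on every
// scoring cycle.
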
// deviceMatchesClass checks if a device matches the selectors of a device class.
// Note: This method assumes the device class has ExtendedResourceName set, as filtering
// should be done by the caller to ensure we only process DRA resources meant for extended
// resource scoring.
func (r *resourceAllocationScorer) deviceMatchesClass(ctx context.Context, device resourceapi.Device, deviceClass *resourceapi.DeviceClass, driver string, poolName string) (bool, error) {
	// If no selectors are defined, all devices match.
	if len(deviceClass.Spec.Selectors) == 0 {
		return true, nil
	}

	// Lazily create the CEL device only when needed (first CEL selector that's not cached).
	var celDevice cel.Device
	celDeviceCreated := false

	// All selectors must match for the device to be considered a match.
	for _, selector := range deviceClass.Spec.Selectors {
		if selector.CEL == nil {
			continue
		}

		key := buildDeviceMatchCacheKey(selector.CEL.Expression, driver, poolName, device.Name)

		// Check if the result is already cached.
		if matches, ok := r.deviceMatchCache.Load(key); ok {
			if !matches.(bool) {
				return false, nil
			}
			continue // This selector matches, check the next one.
		}

		// Cache miss - need to evaluate the CEL expression.
		// Create the CEL device if we haven't already.
		if !celDeviceCreated {
			celDevice = cel.Device{
				Driver:     driver,
				Attributes: device.Attributes,
				Capacity:   device.Capacity,
			}
			celDeviceCreated = true
		}

		// Use cached CEL compilation for performance.
		result := r.celCache.GetOrCompile(selector.CEL.Expression)
		if result.Error != nil {
			return false, result.Error
		}

		matches, _, err := result.DeviceMatches(ctx, celDevice)
		if err != nil || !matches {
			return false, err
		}

		// Cache the result for future use.
		r.deviceMatchCache.Store(key, matches)
	}

	return true, nil
}