kubernetes/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go

492 lines
17 KiB
Go

/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
"strings"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/dynamic-resource-allocation/cel"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
resourceapi "k8s.io/api/resource/v1"
resourcehelper "k8s.io/component-helpers/resource"
"k8s.io/dynamic-resource-allocation/structured"
fwk "k8s.io/kube-scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
// scorer is a factory that builds a resourceAllocationScorer from the
// plugin's NodeResourcesFitArgs configuration.
type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
// DRACaches holds various caches used for DRA-related computations.
type DRACaches struct {
	// celCache is a cache for compiled CEL expressions used in device class selectors.
	celCache *cel.Cache
	// deviceMatchCache memoizes DeviceMatches results to avoid expensive
	// repeated CEL evaluations. Keys are strings produced by
	// buildDeviceMatchCacheKey; values are bool.
	deviceMatchCache sync.Map // map[string]bool
	// nodeMatchCache memoizes NodeMatches results to avoid expensive repeated
	// node selector evaluations. Keys are strings produced by
	// buildNodeMatchCacheKey; values are bool.
	nodeMatchCache sync.Map // map[string]bool
}
// resourceAllocationScorer contains information to calculate resource allocation score.
type resourceAllocationScorer struct {
	// Name identifies this scorer in log output.
	Name string
	// Feature gates that influence how pod resource requests are computed.
	enableInPlacePodVerticalScaling               bool
	enablePodLevelResources                       bool
	enableDRAExtendedResource                     bool
	enableInPlacePodLevelResourcesVerticalScaling bool
	// used to decide whether to use Requested or NonZeroRequested for
	// cpu and memory.
	useRequested bool
	// scorer maps parallel requested/allocatable vectors to a node score.
	scorer func(requested, allocable []int64) int64
	// resources is the list of resources to score on.
	resources []config.ResourceSpec
	// draFeatures and draManager support scoring extended resources that are
	// backed by DRA device classes.
	draFeatures structured.Features
	draManager  fwk.SharedDRAManager
	// Caches for DRA-related computations.
	DRACaches
}
// buildNodeMatchCacheKey creates a string cache key for node matching results.
// The key has the form "<nodeName>|<nodeNameToMatch>|<0 or 1>|<nodeSelectorHash>".
// Using a string key is significantly faster than struct keys with sync.Map.
func buildNodeMatchCacheKey(nodeName string, nodeNameToMatch string, allNodesMatch bool, nodeSelectorHash string) string {
	flag := "0"
	if allNodesMatch {
		flag = "1"
	}
	// A single concatenation expression is compiled to one allocation.
	return nodeName + "|" + nodeNameToMatch + "|" + flag + "|" + nodeSelectorHash
}
// buildDeviceMatchCacheKey creates a string cache key for device matching results.
// It concatenates expression|driver|poolName|deviceName with pipe separators.
// Using a string key is significantly faster than struct keys with sync.Map.
func buildDeviceMatchCacheKey(expression string, driver string, poolName string, deviceName string) string {
	// strings.Join pre-computes the exact length, so this is one allocation.
	return strings.Join([]string{expression, driver, poolName, deviceName}, "|")
}
// nodeMatches is a cached wrapper around structured.NodeMatches. Results are
// memoized in nodeMatchCache, keyed by the node name, the match criteria, and
// the string form of the node selector.
func (r *resourceAllocationScorer) nodeMatches(node *v1.Node, nodeNameToMatch string, allNodesMatch bool, nodeSelector *v1.NodeSelector) (bool, error) {
	var nodeName string
	if node != nil {
		nodeName = node.Name
	}
	// NOTE(review): assumes the generated String() is safe on a nil
	// *NodeSelector receiver — confirm.
	nodeSelectorStr := nodeSelector.String()
	key := buildNodeMatchCacheKey(nodeName, nodeNameToMatch, allNodesMatch, nodeSelectorStr)
	// Check cache first.
	if matches, ok := r.nodeMatchCache.Load(key); ok {
		return matches.(bool), nil
	}
	// Cache miss: evaluate via the underlying implementation.
	matches, err := structured.NodeMatches(r.draFeatures, node, nodeNameToMatch, allNodesMatch, nodeSelector)
	// Only successful evaluations are cached; on error the result is returned
	// uncached so the next call re-evaluates.
	if err == nil {
		r.nodeMatchCache.Store(key, matches)
	}
	return matches, err
}
// score will use `scorer` function to calculate the score. podRequests must be
// parallel to r.resources (one entry per configured resource, same order).
func (r *resourceAllocationScorer) score(
	ctx context.Context,
	pod *v1.Pod,
	nodeInfo fwk.NodeInfo,
	podRequests []int64,
	draPreScoreState *draPreScoreState,
) (int64, *fwk.Status) {
	logger := klog.FromContext(ctx)
	node := nodeInfo.Node()
	// resources not set, nothing scheduled,
	if len(r.resources) == 0 {
		return 0, fwk.NewStatus(fwk.Error, "resources not found")
	}
	// Build parallel requested/allocatable vectors for the scorer function.
	requested := make([]int64, len(r.resources))
	allocatable := make([]int64, len(r.resources))
	for i := range r.resources {
		alloc, req := r.calculateResourceAllocatableRequest(ctx, nodeInfo, v1.ResourceName(r.resources[i].Name), podRequests[i], draPreScoreState)
		// Only fill the extended resource entry when it's non-zero; entries
		// with zero allocatable are left at (0, 0).
		if alloc == 0 {
			continue
		}
		allocatable[i] = alloc
		requested[i] = req
	}
	score := r.scorer(requested, allocatable)
	if loggerV := logger.V(10); loggerV.Enabled() { // Serializing these maps is costly.
		loggerV.Info("Listed internal info for allocatable resources, requested resources and score", "pod",
			klog.KObj(pod), "node", klog.KObj(node), "resourceAllocationScorer", r.Name,
			"allocatableResource", allocatable, "requestedResource", requested, "resourceScore", score,
		)
	}
	return score, nil
}
// calculateResourceAllocatableRequest returns 2 parameters:
// - 1st param: quantity of allocatable resource on the node.
// - 2nd param: aggregated quantity of requested resource on the node.
// Note: if it's an extended resource, and the pod doesn't request it, (0, 0) is returned.
func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(
	ctx context.Context,
	nodeInfo fwk.NodeInfo,
	resource v1.ResourceName,
	podRequest int64,
	draPreScoreState *draPreScoreState,
) (int64, int64) {
	// Choose between literal requests and non-zero-defaulted requests for
	// cpu/memory, per the useRequested setting.
	requested := nodeInfo.GetNonZeroRequested()
	if r.useRequested {
		requested = nodeInfo.GetRequested()
	}
	// If it's an extended resource, and the pod doesn't request it. We return (0, 0)
	// as an implication to bypass scoring on this resource.
	if podRequest == 0 && schedutil.IsScalarResourceName(resource) {
		return 0, 0
	}
	switch resource {
	case v1.ResourceCPU:
		return nodeInfo.GetAllocatable().GetMilliCPU(), (requested.GetMilliCPU() + podRequest)
	case v1.ResourceMemory:
		return nodeInfo.GetAllocatable().GetMemory(), (requested.GetMemory() + podRequest)
	case v1.ResourceEphemeralStorage:
		// Ephemeral storage always uses the literal requested value.
		return nodeInfo.GetAllocatable().GetEphemeralStorage(), (nodeInfo.GetRequested().GetEphemeralStorage() + podRequest)
	default:
		allocatable, exists := nodeInfo.GetAllocatable().GetScalarResources()[resource]
		if allocatable == 0 && r.enableDRAExtendedResource && draPreScoreState != nil {
			// Allocatable 0 means that this resource is not handled by device plugin.
			// Calculate allocatable and requested for resources backed by DRA.
			// Note: this inner "allocatable" shadows the outer one; if DRA does
			// not back the resource either, we fall through below with the
			// outer (zero) value.
			allocatable, allocated := r.calculateDRAExtendedResourceAllocatableRequest(ctx, nodeInfo.Node(), resource, draPreScoreState)
			if allocatable > 0 {
				return allocatable, allocated + podRequest
			}
		}
		if exists {
			return allocatable, (nodeInfo.GetRequested().GetScalarResources()[resource] + podRequest)
		}
	}
	klog.FromContext(ctx).V(10).Info("Requested resource is omitted for node score calculation", "resourceName", resource)
	return 0, 0
}
// calculatePodResourceRequest returns the total non-zero requests. If Overhead is defined for the pod
// the Overhead is added to the result. For CPU the value is in millicores; for
// all other resources it is the plain quantity value.
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
	opts := resourcehelper.PodResourcesOptions{
		UseStatusResources: r.enableInPlacePodVerticalScaling,
		InPlacePodLevelResourcesVerticalScalingEnabled: r.enableInPlacePodLevelResourcesVerticalScaling,
		// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
		SkipPodLevelResources: !r.enablePodLevelResources,
	}
	if !r.useRequested {
		// Substitute scheduler defaults for containers that omit cpu/memory
		// requests, mirroring the NonZeroRequested accounting.
		opts.NonMissingContainerRequests = v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(schedutil.DefaultMilliCPURequest, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(schedutil.DefaultMemoryRequest, resource.DecimalSI),
		}
	}
	requests := resourcehelper.PodRequests(pod, opts)
	quantity := requests[resourceName]
	if resourceName == v1.ResourceCPU {
		return quantity.MilliValue()
	}
	return quantity.Value()
}
// calculatePodResourceRequestList computes the pod's request for each resource
// in resources, returning a slice parallel to the input.
func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, resources []config.ResourceSpec) []int64 {
	requests := make([]int64, 0, len(resources))
	for _, spec := range resources {
		requests = append(requests, r.calculatePodResourceRequest(pod, v1.ResourceName(spec.Name)))
	}
	return requests
}
// isBestEffortPod reports whether every entry in podRequests is zero, i.e. the
// pod requests none of the scored resources.
func (r *resourceAllocationScorer) isBestEffortPod(podRequests []int64) bool {
	for i := range podRequests {
		if podRequests[i] != 0 {
			return false
		}
	}
	return true
}
// getDRAPreScoredParams returns the DRA allocated state and resource slices for
// DRA extended resource scoring, or (nil, nil) when none of the configured
// resources is backed by a DRA device class.
func getDRAPreScoredParams(draManager fwk.SharedDRAManager, resources []config.ResourceSpec) (*draPreScoreState, *fwk.Status) {
	// Determine whether any scored resource maps to a DRA device class.
	backedByDRA := false
	for _, spec := range resources {
		name := v1.ResourceName(spec.Name)
		if !schedutil.IsDRAExtendedResourceName(name) {
			continue
		}
		if draManager.DeviceClassResolver().GetDeviceClass(name) != nil {
			backedByDRA = true
			break
		}
	}
	// There's no point in returning DRA data as there are no resources backed by DRA.
	if !backedByDRA {
		return nil, nil
	}
	allocatedState, err := draManager.ResourceClaims().GatherAllocatedState()
	if err != nil {
		return nil, fwk.AsStatus(err)
	}
	resourceSlices, err := draManager.ResourceSlices().ListWithDeviceTaintRules()
	if err != nil {
		return nil, fwk.AsStatus(err)
	}
	return &draPreScoreState{
		allocatedState: allocatedState,
		resourceSlices: resourceSlices,
	}, nil
}
// calculateDRAExtendedResourceAllocatableRequest calculates allocatable and allocated
// quantities for extended resources backed by DRA. It returns (0, 0) when the
// resource has no device class mapping or when the totals cannot be computed
// (errors are logged, not propagated).
func (r *resourceAllocationScorer) calculateDRAExtendedResourceAllocatableRequest(
	ctx context.Context,
	node *v1.Node,
	resource v1.ResourceName,
	draPreScoreState *draPreScoreState,
) (int64, int64) {
	logger := klog.FromContext(ctx)
	deviceClass := r.draManager.DeviceClassResolver().GetDeviceClass(resource)
	if deviceClass == nil {
		// This resource is not backed by DRA.
		logger.V(7).Info("Extended resource not found in device class mapping", "resource", resource)
		return 0, 0
	}
	capacity, allocated, err := r.calculateDRAResourceTotals(ctx, node, deviceClass, draPreScoreState.allocatedState, draPreScoreState.resourceSlices)
	if err != nil {
		// Scoring degrades gracefully: treat the resource as absent on error.
		logger.Error(err, "Failed to calculate DRA resource capacity and allocated", "node", node.Name, "resource", resource, "deviceClass", deviceClass.Name)
		return 0, 0
	}
	logger.V(7).Info("DRA extended resource calculation", "node", node.Name, "resource", resource, "deviceClass", deviceClass.Name, "capacity", capacity, "allocated", allocated)
	return capacity, allocated
}
// calculateDRAResourceTotals computes the total capacity and total allocated count of devices
// matching the specified Device Class on the given node. It filters the provided
// resource slices by node, filters devices by the device class selectors, and
// returns the counts. Returns an error if node or device matching fails.
//
// Parameters:
//
//	ctx - context for cancellation and deadlines
//	node - the node to evaluate device resources on
//	deviceClass - the device class to filter devices by
//	allocatedState - pre-gathered allocation state used to count allocated devices
//	resourceSlices - the resource slices to scan
//
// Returns:
//
//	totalCapacity - total number of devices matching the device class on the node
//	totalAllocated - number of devices currently allocated from the matching set
//	error - any error encountered during processing
func (r *resourceAllocationScorer) calculateDRAResourceTotals(ctx context.Context, node *v1.Node, deviceClass *resourceapi.DeviceClass, allocatedState *structured.AllocatedState, resourceSlices []*resourceapi.ResourceSlice,
) (int64, int64, error) {
	var totalCapacity, totalAllocated int64
	nodeName := node.Name
	for _, slice := range resourceSlices {
		// Early filtering: check if slice applies to this node.
		perDeviceNodeSelection := ptr.Deref(slice.Spec.PerDeviceNodeSelection, false)
		var devices []resourceapi.Device
		if perDeviceNodeSelection {
			// Per-device node selection: filter devices individually.
			devices = make([]resourceapi.Device, 0, len(slice.Spec.Devices))
			for _, device := range slice.Spec.Devices {
				deviceNodeName := ptr.Deref(device.NodeName, "")
				deviceAllNodes := ptr.Deref(device.AllNodes, false)
				// Fast path: check AllNodes or exact name match first.
				if deviceAllNodes || (deviceNodeName != "" && deviceNodeName == nodeName) {
					devices = append(devices, device)
					continue
				}
				// Slow path: only if we have a node selector.
				if device.NodeSelector != nil {
					deviceMatches, err := r.nodeMatches(node, deviceNodeName, deviceAllNodes, device.NodeSelector)
					if err != nil {
						return 0, 0, err
					}
					if deviceMatches {
						devices = append(devices, device)
					}
				}
				// Devices with no node criteria at all are excluded.
			}
		} else {
			// Slice-level node selection: the whole slice either applies to
			// this node or is skipped.
			sliceNodeName := ptr.Deref(slice.Spec.NodeName, "")
			sliceAllNodes := ptr.Deref(slice.Spec.AllNodes, false)
			// Fast path: check AllNodes or exact name match first.
			if !sliceAllNodes && sliceNodeName != nodeName && slice.Spec.NodeSelector != nil {
				// Need to check node selector.
				matches, err := r.nodeMatches(node, sliceNodeName, sliceAllNodes, slice.Spec.NodeSelector)
				if err != nil {
					return 0, 0, err
				}
				if !matches {
					continue // Skip this slice
				}
			} else if !sliceAllNodes && sliceNodeName != "" && sliceNodeName != nodeName {
				// Node name specified but doesn't match.
				continue
			}
			devices = slice.Spec.Devices
		}
		// Fast path for device class with no selectors: every device on the
		// slice counts toward capacity.
		if len(deviceClass.Spec.Selectors) == 0 {
			driver := slice.Spec.Driver
			pool := slice.Spec.Pool.Name
			for _, device := range devices {
				totalCapacity++
				deviceID := structured.MakeDeviceID(driver, pool, device.Name)
				if structured.IsDeviceAllocated(deviceID, allocatedState) {
					totalAllocated++
				}
			}
		} else {
			// Slow path: check device class selectors per device.
			driver := slice.Spec.Driver
			pool := slice.Spec.Pool.Name
			for _, device := range devices {
				matches, err := r.deviceMatchesClass(ctx, device, deviceClass, driver, pool)
				if err != nil {
					return 0, 0, err
				}
				if matches {
					totalCapacity++
					deviceID := structured.MakeDeviceID(driver, pool, device.Name)
					if structured.IsDeviceAllocated(deviceID, allocatedState) {
						totalAllocated++
					}
				}
			}
		}
	}
	return totalCapacity, totalAllocated, nil
}
// deviceMatchesClass checks if a device matches the selectors of a device class.
// Only CEL selectors are evaluated; selectors without a CEL expression are
// skipped. All evaluated selectors must match for the device to match.
// Results are memoized per (expression, driver, pool, device) in
// deviceMatchCache; errors are returned uncached.
// Note: This method assumes the device class has ExtendedResourceName set, as filtering
// should be done by the caller to ensure we only process DRA resources meant for extended
// resource scoring.
func (r *resourceAllocationScorer) deviceMatchesClass(ctx context.Context, device resourceapi.Device, deviceClass *resourceapi.DeviceClass, driver string, poolName string) (bool, error) {
	// If no selectors are defined, all devices match.
	if len(deviceClass.Spec.Selectors) == 0 {
		return true, nil
	}
	// Lazily create the CEL device only when needed (first CEL selector that's not cached).
	var celDevice cel.Device
	celDeviceCreated := false
	for _, selector := range deviceClass.Spec.Selectors {
		if selector.CEL == nil {
			continue
		}
		key := buildDeviceMatchCacheKey(selector.CEL.Expression, driver, poolName, device.Name)
		// Check if result is already cached.
		if matches, ok := r.deviceMatchCache.Load(key); ok {
			if !matches.(bool) {
				return false, nil
			}
			continue // This selector matches, check the next one.
		}
		// Cache miss - need to evaluate the CEL expression.
		if !celDeviceCreated {
			celDevice = cel.Device{
				Driver:     driver,
				Attributes: device.Attributes,
				Capacity:   device.Capacity,
			}
			celDeviceCreated = true
		}
		// Use cached CEL compilation for performance.
		result := r.celCache.GetOrCompile(selector.CEL.Expression)
		if result.Error != nil {
			return false, result.Error
		}
		matches, _, err := result.DeviceMatches(ctx, celDevice)
		if err != nil {
			// Don't cache errors; the next call re-evaluates.
			return false, err
		}
		// Cache both positive and negative results. Previously only positive
		// results were stored (the function returned before reaching Store on
		// a non-match), so the cached-false branch above could never fire and
		// non-matching devices were re-evaluated on every call.
		r.deviceMatchCache.Store(key, matches)
		if !matches {
			return false, nil
		}
	}
	return true, nil
}