mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-02-15 08:47:59 -05:00
* First version of batching w/out signatures. * First version of pod signatures. * Integrate batching with signatures. * Fix merge conflicts. * Fixes from self-review. * Test fixes. * Fix a bug that limited batches to size 2 Also add some new high-level logging and simplify the pod affinity signature. * Re-enable batching on perf tests for now. * fwk.NewStatus(fwk.Success) * Review feedback. * Review feedback. * Comment fix. * Two plugin specific unit tests.: * Add cycle state to the sign call, apply to topo spread. Also add unit tests for several plugi signature calls. * Review feedback. * Switch to distinct stats for hint and store calls. * Switch signature from string to []byte * Revert cyclestate in signs. Update node affinity. Node affinity now sorts all of the various nested arrays in the structure. CycleState no longer in signature; revert to signing fewer cases for pod spread. * hack/update-vendor.sh * Disable signatures when extenders are configured. * Update pkg/scheduler/framework/runtime/batch.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Update staging/src/k8s.io/kube-scheduler/framework/interface.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Review feedback. * Disable node resource signatures when extended DRA enabled. * Review feedback. * Update pkg/scheduler/framework/plugins/imagelocality/image_locality.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Update pkg/scheduler/framework/interface.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Update pkg/scheduler/framework/plugins/nodedeclaredfeatures/nodedeclaredfeatures.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Update pkg/scheduler/framework/runtime/batch.go Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com> * Review feedback. * Fixes for review suggestions. * Add integration tests. * Linter fixes, test fix. * Whitespace fix. * Remove broken test. * Unschedulable test. * Remove go.mod changes. --------- Co-authored-by: Maciej Skoczeń <87243939+macsko@users.noreply.github.com>
228 lines
8.6 KiB
Go
228 lines
8.6 KiB
Go
/*
|
|
Copyright 2019 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package noderesources
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math"
|
|
|
|
v1 "k8s.io/api/core/v1"
|
|
resourceapi "k8s.io/api/resource/v1"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
"k8s.io/dynamic-resource-allocation/structured"
|
|
fwk "k8s.io/kube-scheduler/framework"
|
|
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
|
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
|
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
|
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
|
)
|
|
|
|
// BalancedAllocation is a score plugin that calculates the difference between the cpu and memory fraction
|
|
// of capacity, and prioritizes the host based on how close the two metrics are to each other.
|
|
type BalancedAllocation struct {
|
|
handle fwk.Handle
|
|
resourceAllocationScorer
|
|
}
|
|
|
|
var _ fwk.PreScorePlugin = &BalancedAllocation{}
|
|
var _ fwk.ScorePlugin = &BalancedAllocation{}
|
|
var _ fwk.SignPlugin = &BalancedAllocation{}
|
|
|
|
// BalancedAllocationName is the name of the plugin used in the plugin registry and configurations.
|
|
const (
|
|
BalancedAllocationName = names.NodeResourcesBalancedAllocation
|
|
|
|
// balancedAllocationPreScoreStateKey is the key in CycleState to NodeResourcesBalancedAllocation pre-computed data for Scoring.
|
|
balancedAllocationPreScoreStateKey = "PreScore" + BalancedAllocationName
|
|
)
|
|
|
|
// draPreScoreState holds the pre-computed data for DRA extended resources scoring.
|
|
type draPreScoreState struct {
|
|
// allocatedState holds the DRA allocated state for DRA extended resources scoring.
|
|
allocatedState *structured.AllocatedState
|
|
// resourceSlices holds the list of resource slices for DRA extended resource scoring.
|
|
resourceSlices []*resourceapi.ResourceSlice
|
|
}
|
|
|
|
// balancedAllocationPreScoreState computed at PreScore and used at Score.
|
|
type balancedAllocationPreScoreState struct {
|
|
// podRequests have the same order of the resources defined in NodeResourcesFitArgs.Resources,
|
|
// same for other place we store a list like that.
|
|
podRequests []int64
|
|
// DRA extended resource scoring state.
|
|
*draPreScoreState
|
|
}
|
|
|
|
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
|
// there is no need for that.
|
|
func (s *balancedAllocationPreScoreState) Clone() fwk.StateData {
|
|
return s
|
|
}
|
|
|
|
// PreScore calculates incoming pod's resource requests and writes them to the cycle state used.
|
|
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState fwk.CycleState, pod *v1.Pod, nodes []fwk.NodeInfo) *fwk.Status {
|
|
podRequests := ba.calculatePodResourceRequestList(pod, ba.resources)
|
|
if ba.isBestEffortPod(podRequests) {
|
|
// Skip BalancedAllocation scoring for best-effort pods to
|
|
// prevent a large number of pods from being scheduled to the same node.
|
|
// See https://github.com/kubernetes/kubernetes/issues/129138 for details.
|
|
return fwk.NewStatus(fwk.Skip)
|
|
}
|
|
state := &balancedAllocationPreScoreState{
|
|
podRequests: podRequests,
|
|
}
|
|
if ba.enableDRAExtendedResource {
|
|
draPreScoreState, status := getDRAPreScoredParams(ba.draManager, ba.resources)
|
|
if status != nil {
|
|
return status
|
|
}
|
|
if draPreScoreState != nil {
|
|
state.draPreScoreState = draPreScoreState
|
|
}
|
|
}
|
|
cycleState.Write(balancedAllocationPreScoreStateKey, state)
|
|
return nil
|
|
}
|
|
|
|
func getBalancedAllocationPreScoreState(cycleState fwk.CycleState) (*balancedAllocationPreScoreState, error) {
|
|
c, err := cycleState.Read(balancedAllocationPreScoreStateKey)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading %q from cycleState: %w", balancedAllocationPreScoreStateKey, err)
|
|
}
|
|
|
|
s, ok := c.(*balancedAllocationPreScoreState)
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
|
|
}
|
|
return s, nil
|
|
}
|
|
|
|
// Name returns name of the plugin. It is used in logs, etc.
|
|
func (ba *BalancedAllocation) Name() string {
|
|
return BalancedAllocationName
|
|
}
|
|
|
|
// Filtering and scoring based on the container resources and overheads.
|
|
func (pl *BalancedAllocation) SignPod(ctx context.Context, pod *v1.Pod) ([]fwk.SignFragment, *fwk.Status) {
|
|
opts := ResourceRequestsOptions{
|
|
EnablePodLevelResources: pl.enablePodLevelResources,
|
|
EnableDRAExtendedResource: pl.enableDRAExtendedResource,
|
|
}
|
|
|
|
if pl.enableDRAExtendedResource {
|
|
return nil, fwk.NewStatus(fwk.Unschedulable, "signature disabled when dra extended resources enabled")
|
|
}
|
|
|
|
return []fwk.SignFragment{
|
|
{Key: fwk.ResourcesSignerName, Value: computePodResourceRequest(pod, opts)},
|
|
}, nil
|
|
}
|
|
|
|
// Score invoked at the score extension point.
|
|
func (ba *BalancedAllocation) Score(ctx context.Context, state fwk.CycleState, pod *v1.Pod, nodeInfo fwk.NodeInfo) (int64, *fwk.Status) {
|
|
s, err := getBalancedAllocationPreScoreState(state)
|
|
if err != nil {
|
|
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
|
|
if ba.isBestEffortPod(s.podRequests) {
|
|
return 0, nil
|
|
}
|
|
if ba.enableDRAExtendedResource {
|
|
draPreScoreState, status := getDRAPreScoredParams(ba.draManager, ba.resources)
|
|
if status != nil {
|
|
return 0, status
|
|
}
|
|
if draPreScoreState != nil {
|
|
s.draPreScoreState = draPreScoreState
|
|
}
|
|
}
|
|
}
|
|
|
|
// ba.score favors nodes with balanced resource usage rate.
|
|
// It calculates the standard deviation for those resources and prioritizes the node based on how close the usage of those resources is to each other.
|
|
// Detail: score = (1 - std) * MaxNodeScore, where std is calculated by the root square of Σ((fraction(i)-mean)^2)/len(resources)
|
|
// The algorithm is partly inspired by:
|
|
// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
|
|
return ba.score(ctx, pod, nodeInfo, s.podRequests, s.draPreScoreState)
|
|
}
|
|
|
|
// ScoreExtensions of the Score plugin.
|
|
func (ba *BalancedAllocation) ScoreExtensions() fwk.ScoreExtensions {
|
|
return nil
|
|
}
|
|
|
|
// NewBalancedAllocation initializes a new plugin and returns it.
|
|
func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h fwk.Handle, fts feature.Features) (fwk.Plugin, error) {
|
|
args, ok := baArgs.(*config.NodeResourcesBalancedAllocationArgs)
|
|
if !ok {
|
|
return nil, fmt.Errorf("want args to be of type NodeResourcesBalancedAllocationArgs, got %T", baArgs)
|
|
}
|
|
|
|
if err := validation.ValidateNodeResourcesBalancedAllocationArgs(nil, args); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &BalancedAllocation{
|
|
handle: h,
|
|
resourceAllocationScorer: resourceAllocationScorer{
|
|
Name: BalancedAllocationName,
|
|
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
|
|
enablePodLevelResources: fts.EnablePodLevelResources,
|
|
enableDRAExtendedResource: fts.EnableDRAExtendedResource,
|
|
scorer: balancedResourceScorer,
|
|
useRequested: true,
|
|
resources: args.Resources,
|
|
enableInPlacePodLevelResourcesVerticalScaling: fts.EnableInPlacePodLevelResourcesVerticalScaling,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
func balancedResourceScorer(requested, allocable []int64) int64 {
|
|
var resourceToFractions []float64
|
|
var totalFraction float64
|
|
for i := range requested {
|
|
if allocable[i] == 0 {
|
|
continue
|
|
}
|
|
fraction := float64(requested[i]) / float64(allocable[i])
|
|
if fraction > 1 {
|
|
fraction = 1
|
|
}
|
|
totalFraction += fraction
|
|
resourceToFractions = append(resourceToFractions, fraction)
|
|
}
|
|
|
|
std := 0.0
|
|
|
|
// For most cases, resources are limited to cpu and memory, the std could be simplified to std := (fraction1-fraction2)/2
|
|
// len(fractions) > 2: calculate std based on the well-known formula - root square of Σ((fraction(i)-mean)^2)/len(fractions)
|
|
// Otherwise, set the std to zero is enough.
|
|
if len(resourceToFractions) == 2 {
|
|
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
|
|
} else if len(resourceToFractions) > 2 {
|
|
mean := totalFraction / float64(len(resourceToFractions))
|
|
var sum float64
|
|
for _, fraction := range resourceToFractions {
|
|
sum = sum + (fraction-mean)*(fraction-mean)
|
|
}
|
|
std = math.Sqrt(sum / float64(len(resourceToFractions)))
|
|
}
|
|
|
|
// STD (standard deviation) is always a positive value. 1-deviation lets the score to be higher for node which has least deviation and
|
|
// multiplying it with `MaxNodeScore` provides the scaling factor needed.
|
|
return int64((1 - std) * float64(fwk.MaxNodeScore))
|
|
}
|