2019-10-09 02:19:24 -04:00
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeaffinity
import (
2019-08-28 07:12:02 -04:00
"context"
2019-10-16 04:42:26 -04:00
"fmt"
2019-08-28 07:12:02 -04:00
2023-01-05 12:48:46 -05:00
v1 "k8s.io/api/core/v1"
2022-03-14 15:37:03 -04:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2019-10-09 02:19:24 -04:00
"k8s.io/apimachinery/pkg/runtime"
2022-03-14 15:37:03 -04:00
"k8s.io/apimachinery/pkg/util/sets"
2020-11-02 17:21:13 -05:00
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
2023-12-13 22:08:50 -05:00
"k8s.io/klog/v2"
2025-05-21 11:21:27 -04:00
fwk "k8s.io/kube-scheduler/framework"
2020-10-23 13:21:10 -04:00
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
2024-09-08 01:54:46 -04:00
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
2021-06-09 15:31:56 -04:00
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
2023-12-13 22:08:50 -05:00
"k8s.io/kubernetes/pkg/scheduler/util"
2019-10-09 02:19:24 -04:00
)
// NodeAffinity is a plugin that checks if a pod node selector matches the node label.
2019-10-16 04:42:26 -04:00
type NodeAffinity struct {
2025-07-24 07:48:07 -04:00
handle fwk . Handle
2024-09-08 01:54:46 -04:00
addedNodeSelector * nodeaffinity . NodeSelector
addedPrefSchedTerms * nodeaffinity . PreferredSchedulingTerms
enableSchedulingQueueHint bool
2019-10-16 04:42:26 -04:00
}
2019-10-09 02:19:24 -04:00
2025-07-24 07:48:07 -04:00
var _ fwk . PreFilterPlugin = & NodeAffinity { }
var _ fwk . FilterPlugin = & NodeAffinity { }
var _ fwk . PreScorePlugin = & NodeAffinity { }
var _ fwk . ScorePlugin = & NodeAffinity { }
var _ fwk . EnqueueExtensions = & NodeAffinity { }
2025-11-13 00:51:37 -05:00
var _ fwk . SignPlugin = & NodeAffinity { }
2019-10-09 02:19:24 -04:00
2020-01-13 13:02:19 -05:00
const (
// Name is the name of the plugin used in the plugin registry and configurations.
2021-06-09 15:31:56 -04:00
Name = names . NodeAffinity
2020-01-13 13:02:19 -05:00
2020-11-25 22:19:52 -05:00
// preScoreStateKey is the key in CycleState to NodeAffinity pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
2021-02-18 15:58:05 -05:00
// preFilterStateKey is the key in CycleState to NodeAffinity pre-compute data for Filtering.
preFilterStateKey = "PreFilter" + Name
2020-10-23 13:21:10 -04:00
// ErrReasonPod is the reason for Pod's node affinity/selector not matching.
2020-12-26 01:15:08 -05:00
ErrReasonPod = "node(s) didn't match Pod's node affinity/selector"
2020-10-23 13:21:10 -04:00
// errReasonEnforced is the reason for added node affinity not matching.
errReasonEnforced = "node(s) didn't match scheduler-enforced node affinity"
2022-03-14 15:37:03 -04:00
// errReasonConflict is the reason for pod's conflicting affinity rules.
errReasonConflict = "pod affinity terms conflict"
2020-01-13 13:02:19 -05:00
)
2019-10-09 02:19:24 -04:00
// Name reports the plugin's registered name; it shows up in logs and
// similar diagnostics.
func (pl *NodeAffinity) Name() string {
	return Name
}
2025-11-13 00:51:37 -05:00
// SignPod returns the signature fragments this plugin contributes for the pod.
// Node affinity filtering and scoring depend on the pod's NodeAffinity and
// NodeSelector, so both are included. An error from the affinity signer is
// returned as a status.
func (pl *NodeAffinity) SignPod(ctx context.Context, pod *v1.Pod) ([]fwk.SignFragment, *fwk.Status) {
	affinitySignature, err := fwk.NodeAffinitySigner(pod)
	if err != nil {
		return nil, fwk.AsStatus(err)
	}

	fragments := []fwk.SignFragment{
		{Key: fwk.NodeAffinitySignerName, Value: affinitySignature},
		{Key: fwk.NodeSelectorSignerName, Value: pod.Spec.NodeSelector},
	}
	return fragments, nil
}
2021-02-18 15:58:05 -05:00
type preFilterState struct {
2021-03-09 16:33:02 -05:00
requiredNodeSelectorAndAffinity nodeaffinity . RequiredNodeAffinity
2021-02-18 15:58:05 -05:00
}
// Clone just returns the same state because it is not affected by pod additions or deletions.
2025-05-21 11:21:27 -04:00
func ( s * preFilterState ) Clone ( ) fwk . StateData {
2021-02-18 15:58:05 -05:00
return s
}
2021-03-09 04:54:59 -05:00
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
2025-06-26 11:06:29 -04:00
func ( pl * NodeAffinity ) EventsToRegister ( _ context . Context ) ( [ ] fwk . ClusterEventWithHint , error ) {
2024-09-08 01:54:46 -04:00
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
2025-06-26 11:06:29 -04:00
nodeActionType := fwk . Add | fwk . UpdateNodeLabel | fwk . UpdateNodeTaint
2024-09-08 01:54:46 -04:00
if pl . enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence we can use UpdateNodeLabel instead of Update.
2025-06-26 11:06:29 -04:00
nodeActionType = fwk . Add | fwk . UpdateNodeLabel
2024-09-08 01:54:46 -04:00
}
2025-06-26 11:06:29 -04:00
return [ ] fwk . ClusterEventWithHint {
{ Event : fwk . ClusterEvent { Resource : fwk . Node , ActionType : nodeActionType } , QueueingHintFn : pl . isSchedulableAfterNodeChange } ,
2024-07-16 01:47:31 -04:00
} , nil
2021-03-09 04:54:59 -05:00
}
2023-12-13 22:08:50 -05:00
// isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether
// that change made a previously unschedulable pod schedulable.
2025-06-26 11:06:29 -04:00
func ( pl * NodeAffinity ) isSchedulableAfterNodeChange ( logger klog . Logger , pod * v1 . Pod , oldObj , newObj interface { } ) ( fwk . QueueingHint , error ) {
2024-09-18 09:36:52 -04:00
originalNode , modifiedNode , err := util . As [ * v1 . Node ] ( oldObj , newObj )
2023-12-13 22:08:50 -05:00
if err != nil {
2025-06-26 11:06:29 -04:00
return fwk . Queue , err
2023-12-13 22:08:50 -05:00
}
if pl . addedNodeSelector != nil && ! pl . addedNodeSelector . Match ( modifiedNode ) {
logger . V ( 4 ) . Info ( "added or modified node didn't match scheduler-enforced node affinity and this event won't make the Pod schedulable" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( modifiedNode ) )
2025-06-26 11:06:29 -04:00
return fwk . QueueSkip , nil
2023-12-13 22:08:50 -05:00
}
requiredNodeAffinity := nodeaffinity . GetRequiredNodeAffinity ( pod )
isMatched , err := requiredNodeAffinity . Match ( modifiedNode )
if err != nil {
2025-06-26 11:06:29 -04:00
return fwk . Queue , err
2023-12-13 22:08:50 -05:00
}
2024-09-18 09:36:52 -04:00
if ! isMatched {
logger . V ( 5 ) . Info ( "node was created or updated, but the pod's NodeAffinity doesn't match" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( modifiedNode ) )
2025-06-26 11:06:29 -04:00
return fwk . QueueSkip , nil
2024-09-18 09:36:52 -04:00
}
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
if originalNode == nil {
logger . V ( 5 ) . Info ( "node was created, and matches with the pod's NodeAffinity" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( modifiedNode ) )
2025-06-26 11:06:29 -04:00
return fwk . Queue , nil
2023-12-13 22:08:50 -05:00
}
2024-09-18 09:36:52 -04:00
// At this point we know the operation is update so we can narrow down the criteria to unmatch -> match changes only
// (necessary affinity label was added to the node in this case).
wasMatched , err := requiredNodeAffinity . Match ( originalNode )
if err != nil {
2025-06-26 11:06:29 -04:00
return fwk . Queue , err
2024-09-18 09:36:52 -04:00
}
if wasMatched {
logger . V ( 5 ) . Info ( "node updated, but the pod's NodeAffinity hasn't changed" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( modifiedNode ) )
2025-06-26 11:06:29 -04:00
return fwk . QueueSkip , nil
2024-09-18 09:36:52 -04:00
}
logger . V ( 5 ) . Info ( "node was updated and the pod's NodeAffinity changed to matched" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( modifiedNode ) )
2025-06-26 11:06:29 -04:00
return fwk . Queue , nil
2023-12-13 22:08:50 -05:00
}
2021-02-18 15:58:05 -05:00
// PreFilter builds and writes cycle state used by Filter.
2025-07-24 07:48:07 -04:00
func ( pl * NodeAffinity ) PreFilter ( ctx context . Context , cycleState fwk . CycleState , pod * v1 . Pod , nodes [ ] fwk . NodeInfo ) ( * fwk . PreFilterResult , * fwk . Status ) {
2022-03-14 15:37:03 -04:00
affinity := pod . Spec . Affinity
2023-01-05 12:48:46 -05:00
noNodeAffinity := ( affinity == nil ||
2022-03-14 15:37:03 -04:00
affinity . NodeAffinity == nil ||
2023-01-05 12:48:46 -05:00
affinity . NodeAffinity . RequiredDuringSchedulingIgnoredDuringExecution == nil )
if noNodeAffinity && pl . addedNodeSelector == nil && pod . Spec . NodeSelector == nil {
// NodeAffinity Filter has nothing to do with the Pod.
2025-06-03 18:59:50 -04:00
return nil , fwk . NewStatus ( fwk . Skip )
2023-01-05 12:48:46 -05:00
}
state := & preFilterState { requiredNodeSelectorAndAffinity : nodeaffinity . GetRequiredNodeAffinity ( pod ) }
cycleState . Write ( preFilterStateKey , state )
if noNodeAffinity || len ( affinity . NodeAffinity . RequiredDuringSchedulingIgnoredDuringExecution . NodeSelectorTerms ) == 0 {
2022-03-14 15:37:03 -04:00
return nil , nil
}
// Check if there is affinity to a specific node and return it.
terms := affinity . NodeAffinity . RequiredDuringSchedulingIgnoredDuringExecution . NodeSelectorTerms
2023-03-27 03:46:13 -04:00
var nodeNames sets . Set [ string ]
2022-03-14 15:37:03 -04:00
for _ , t := range terms {
2023-03-27 03:46:13 -04:00
var termNodeNames sets . Set [ string ]
2022-03-14 15:37:03 -04:00
for _ , r := range t . MatchFields {
if r . Key == metav1 . ObjectNameField && r . Operator == v1 . NodeSelectorOpIn {
// The requirements represent ANDed constraints, and so we need to
// find the intersection of nodes.
2023-03-27 03:46:13 -04:00
s := sets . New ( r . Values ... )
2022-03-14 15:37:03 -04:00
if termNodeNames == nil {
termNodeNames = s
} else {
termNodeNames = termNodeNames . Intersection ( s )
}
}
}
if termNodeNames == nil {
// If this term has no node.Name field affinity,
// then all nodes are eligible because the terms are ORed.
return nil , nil
}
nodeNames = nodeNames . Union ( termNodeNames )
}
2023-03-10 07:08:31 -05:00
// If nodeNames is not nil, but length is 0, it means each term have conflicting affinity to node.Name;
// therefore, pod will not match any node.
if nodeNames != nil && len ( nodeNames ) == 0 {
2025-06-03 18:59:50 -04:00
return nil , fwk . NewStatus ( fwk . UnschedulableAndUnresolvable , errReasonConflict )
2023-03-10 07:08:31 -05:00
} else if len ( nodeNames ) > 0 {
2025-07-24 07:48:07 -04:00
return & fwk . PreFilterResult { NodeNames : nodeNames } , nil
2022-03-14 15:37:03 -04:00
}
2022-03-10 17:48:33 -05:00
return nil , nil
2022-03-14 15:37:03 -04:00
2021-02-18 15:58:05 -05:00
}
// PreFilterExtensions not necessary for this plugin as state doesn't depend on pod additions or deletions.
2025-07-24 07:48:07 -04:00
func ( pl * NodeAffinity ) PreFilterExtensions ( ) fwk . PreFilterExtensions {
2021-02-18 15:58:05 -05:00
return nil
}
2020-10-23 13:21:10 -04:00
// Filter checks if the Node matches the Pod .spec.affinity.nodeAffinity and
// the plugin's added affinity.
2025-06-11 05:56:18 -04:00
func ( pl * NodeAffinity ) Filter ( ctx context . Context , state fwk . CycleState , pod * v1 . Pod , nodeInfo fwk . NodeInfo ) * fwk . Status {
2020-01-05 14:57:44 -05:00
node := nodeInfo . Node ( )
2023-05-05 22:59:23 -04:00
2020-10-23 13:21:10 -04:00
if pl . addedNodeSelector != nil && ! pl . addedNodeSelector . Match ( node ) {
2025-06-03 18:59:50 -04:00
return fwk . NewStatus ( fwk . UnschedulableAndUnresolvable , errReasonEnforced )
2020-10-23 13:21:10 -04:00
}
2021-02-18 15:58:05 -05:00
s , err := getPreFilterState ( state )
if err != nil {
// Fallback to calculate requiredNodeSelector and requiredNodeAffinity
// here when PreFilter is disabled.
2021-03-09 16:33:02 -05:00
s = & preFilterState { requiredNodeSelectorAndAffinity : nodeaffinity . GetRequiredNodeAffinity ( pod ) }
2021-02-18 15:58:05 -05:00
}
2021-03-09 16:33:02 -05:00
// Ignore parsing errors for backwards compatibility.
match , _ := s . requiredNodeSelectorAndAffinity . Match ( node )
if ! match {
2025-06-03 18:59:50 -04:00
return fwk . NewStatus ( fwk . UnschedulableAndUnresolvable , ErrReasonPod )
2020-01-05 14:57:44 -05:00
}
2021-03-09 16:33:02 -05:00
2020-01-05 14:57:44 -05:00
return nil
2019-10-09 02:19:24 -04:00
}
2020-11-25 22:19:52 -05:00
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
preferredNodeAffinity * nodeaffinity . PreferredSchedulingTerms
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
2025-05-21 11:21:27 -04:00
func ( s * preScoreState ) Clone ( ) fwk . StateData {
2020-11-25 22:19:52 -05:00
return s
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
2025-06-11 05:56:18 -04:00
func ( pl * NodeAffinity ) PreScore ( ctx context . Context , cycleState fwk . CycleState , pod * v1 . Pod , nodes [ ] fwk . NodeInfo ) * fwk . Status {
2020-11-25 22:19:52 -05:00
preferredNodeAffinity , err := getPodPreferredNodeAffinity ( pod )
if err != nil {
2025-06-03 18:59:50 -04:00
return fwk . AsStatus ( err )
2020-11-25 22:19:52 -05:00
}
2023-03-30 20:04:01 -04:00
if preferredNodeAffinity == nil && pl . addedPrefSchedTerms == nil {
// NodeAffinity Score has nothing to do with the Pod.
2025-06-03 18:59:50 -04:00
return fwk . NewStatus ( fwk . Skip )
2023-03-30 20:04:01 -04:00
}
2020-11-25 22:19:52 -05:00
state := & preScoreState {
preferredNodeAffinity : preferredNodeAffinity ,
}
cycleState . Write ( preScoreStateKey , state )
return nil
}
2020-10-23 13:21:10 -04:00
// Score returns the sum of the weights of the terms that match the Node.
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
// default affinity.
2025-06-11 05:56:18 -04:00
func ( pl * NodeAffinity ) Score ( ctx context . Context , state fwk . CycleState , pod * v1 . Pod , nodeInfo fwk . NodeInfo ) ( int64 , * fwk . Status ) {
2019-12-24 04:07:56 -05:00
node := nodeInfo . Node ( )
var count int64
2020-10-23 13:21:10 -04:00
if pl . addedPrefSchedTerms != nil {
count += pl . addedPrefSchedTerms . Score ( node )
}
2020-11-25 22:19:52 -05:00
s , err := getPreScoreState ( state )
if err != nil {
2021-02-18 15:58:05 -05:00
// Fallback to calculate preferredNodeAffinity here when PreScore is disabled.
2020-11-25 22:19:52 -05:00
preferredNodeAffinity , err := getPodPreferredNodeAffinity ( pod )
2020-11-02 17:21:13 -05:00
if err != nil {
2025-06-03 18:59:50 -04:00
return 0 , fwk . AsStatus ( err )
2019-12-24 04:07:56 -05:00
}
2020-11-25 22:19:52 -05:00
s = & preScoreState {
preferredNodeAffinity : preferredNodeAffinity ,
}
}
if s . preferredNodeAffinity != nil {
count += s . preferredNodeAffinity . Score ( node )
2019-12-24 04:07:56 -05:00
}
return count , nil
2019-10-16 04:42:26 -04:00
}
// NormalizeScore invoked after scoring all nodes.
2025-07-24 07:48:07 -04:00
func ( pl * NodeAffinity ) NormalizeScore ( ctx context . Context , state fwk . CycleState , pod * v1 . Pod , scores fwk . NodeScoreList ) * fwk . Status {
return helper . DefaultNormalizeScore ( fwk . MaxNodeScore , false , scores )
2019-10-16 04:42:26 -04:00
}
// ScoreExtensions of the Score plugin.
2025-07-24 07:48:07 -04:00
func ( pl * NodeAffinity ) ScoreExtensions ( ) fwk . ScoreExtensions {
2019-10-16 04:42:26 -04:00
return pl
}
2019-10-09 02:19:24 -04:00
// New initializes a new plugin and returns it.
2025-07-24 07:48:07 -04:00
func New ( _ context . Context , plArgs runtime . Object , h fwk . Handle , fts feature . Features ) ( fwk . Plugin , error ) {
2020-10-23 13:21:10 -04:00
args , err := getArgs ( plArgs )
if err != nil {
return nil , err
}
pl := & NodeAffinity {
2024-09-08 01:54:46 -04:00
handle : h ,
enableSchedulingQueueHint : fts . EnableSchedulingQueueHint ,
2020-10-23 13:21:10 -04:00
}
if args . AddedAffinity != nil {
if ns := args . AddedAffinity . RequiredDuringSchedulingIgnoredDuringExecution ; ns != nil {
pl . addedNodeSelector , err = nodeaffinity . NewNodeSelector ( ns )
if err != nil {
return nil , fmt . Errorf ( "parsing addedAffinity.requiredDuringSchedulingIgnoredDuringExecution: %w" , err )
}
}
// TODO: parse requiredDuringSchedulingRequiredDuringExecution when it gets added to the API.
if terms := args . AddedAffinity . PreferredDuringSchedulingIgnoredDuringExecution ; len ( terms ) != 0 {
pl . addedPrefSchedTerms , err = nodeaffinity . NewPreferredSchedulingTerms ( terms )
if err != nil {
return nil , fmt . Errorf ( "parsing addedAffinity.preferredDuringSchedulingIgnoredDuringExecution: %w" , err )
}
}
}
return pl , nil
}
func getArgs ( obj runtime . Object ) ( config . NodeAffinityArgs , error ) {
ptr , ok := obj . ( * config . NodeAffinityArgs )
if ! ok {
return config . NodeAffinityArgs { } , fmt . Errorf ( "args are not of type NodeAffinityArgs, got %T" , obj )
}
2021-03-25 06:18:21 -04:00
return * ptr , validation . ValidateNodeAffinityArgs ( nil , ptr )
2019-10-09 02:19:24 -04:00
}
2020-11-25 22:19:52 -05:00
// getPodPreferredNodeAffinity parses the pod's preferred node affinity terms.
// It returns (nil, nil) when the pod declares none.
func getPodPreferredNodeAffinity(pod *v1.Pod) (*nodeaffinity.PreferredSchedulingTerms, error) {
	podAffinity := pod.Spec.Affinity
	if podAffinity == nil || podAffinity.NodeAffinity == nil {
		return nil, nil
	}

	preferred := podAffinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
	if preferred == nil {
		return nil, nil
	}
	return nodeaffinity.NewPreferredSchedulingTerms(preferred)
}
2025-05-21 11:21:27 -04:00
func getPreScoreState ( cycleState fwk . CycleState ) ( * preScoreState , error ) {
2020-11-25 22:19:52 -05:00
c , err := cycleState . Read ( preScoreStateKey )
if err != nil {
2021-02-23 20:57:28 -05:00
return nil , fmt . Errorf ( "reading %q from cycleState: %w" , preScoreStateKey , err )
2020-11-25 22:19:52 -05:00
}
s , ok := c . ( * preScoreState )
if ! ok {
return nil , fmt . Errorf ( "invalid PreScore state, got type %T" , c )
}
return s , nil
}
2021-02-18 15:58:05 -05:00
2025-05-21 11:21:27 -04:00
func getPreFilterState ( cycleState fwk . CycleState ) ( * preFilterState , error ) {
2021-02-18 15:58:05 -05:00
c , err := cycleState . Read ( preFilterStateKey )
if err != nil {
return nil , fmt . Errorf ( "reading %q from cycleState: %v" , preFilterStateKey , err )
}
s , ok := c . ( * preFilterState )
if ! ok {
return nil , fmt . Errorf ( "invalid PreFilter state, got type %T" , c )
}
return s , nil
}