/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"context"
"errors"
"fmt"
"sort"
"sync"
2023-02-13 03:34:11 -05:00
"github.com/google/go-cmp/cmp"
2022-04-12 07:41:56 -04:00
v1 "k8s.io/api/core/v1"
2023-03-06 06:43:58 -05:00
resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
2023-02-13 03:34:11 -05:00
apiequality "k8s.io/apimachinery/pkg/api/equality"
2022-04-12 07:41:56 -04:00
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
2023-08-30 03:13:31 -04:00
"k8s.io/apimachinery/pkg/runtime/schema"
2022-04-12 07:41:56 -04:00
"k8s.io/apimachinery/pkg/util/sets"
2023-09-06 13:44:29 -04:00
resourcev1alpha2apply "k8s.io/client-go/applyconfigurations/resource/v1alpha2"
2022-04-12 07:41:56 -04:00
"k8s.io/client-go/kubernetes"
2023-03-06 06:43:58 -05:00
resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
2022-04-12 07:41:56 -04:00
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
2023-02-13 03:34:11 -05:00
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
2023-09-05 09:01:09 -04:00
"k8s.io/utils/ptr"
2022-04-12 07:41:56 -04:00
)
const (
	// Name is the name of the plugin used in Registry and configurations.
	Name = names.DynamicResources

	// stateKey is the key under which this plugin stores its stateData
	// in the framework.CycleState during a scheduling cycle.
	stateKey framework.StateKey = Name
)
// stateData is the per-pod state shared between the extension points of this
// plugin.
//
// The state is initialized in PreFilter phase. Because we save the pointer in
// framework.CycleState, in the later phases we don't need to call Write method
// to update the value.
type stateData struct {
	// preScored is true if PreScore was invoked.
	preScored bool

	// A copy of all claims for the Pod (i.e. 1:1 match with
	// pod.Spec.ResourceClaims), initially with the status from the start
	// of the scheduling cycle. Each claim instance is read-only because it
	// might come from the informer cache. The instances get replaced when
	// the plugin itself successfully does an Update.
	//
	// Empty if the Pod has no claims.
	claims []*resourcev1alpha2.ResourceClaim

	// The indices of all claims that:
	// - are allocated
	// - use delayed allocation
	// - were not available on at least one node
	//
	// Set in parallel during Filter, so write access there must be
	// protected by the mutex. Used by PostFilter.
	unavailableClaims sets.Set[int]

	// podSchedulingState keeps track of the PodSchedulingContext
	// (if one exists) and the changes made to it.
	podSchedulingState podSchedulingState

	// mutex guards the fields above that are written in parallel
	// during Filter (see unavailableClaims).
	mutex sync.Mutex

	// informationsForClaim has one entry per claim in claims, holding
	// data gathered by PreFilter for repeated use in later phases.
	informationsForClaim []informationForClaim
}
// informationForClaim bundles per-claim data that PreFilter computes once so
// that Filter can reuse it for every node.
type informationForClaim struct {
	// The availableOnNode node filter of the claim converted from the
	// v1 API to nodeaffinity.NodeSelector by PreFilter for repeated
	// evaluation in Filter. Nil for claim which don't have it.
	availableOnNode *nodeaffinity.NodeSelector

	// The status of the claim got from the
	// schedulingCtx by PreFilter for repeated
	// evaluation in Filter. Nil for claim which don't have it.
	status *resourcev1alpha2.ResourceClaimSchedulingStatus
}
// Clone implements framework.StateData. It deliberately returns the same
// pointer instead of a deep copy: the state is shared across phases, and
// parallel writes (during Filter) are guarded by d.mutex.
func (d *stateData) Clone() framework.StateData {
	return d
}
2023-03-06 06:43:58 -05:00
// updateClaimStatus writes the status of the given claim to the API server
// and, on success, remembers the returned instance in d.claims[index].
func (d *stateData) updateClaimStatus(ctx context.Context, clientset kubernetes.Interface, index int, claim *resourcev1alpha2.ResourceClaim) error {
	// TODO (#113700): replace with patch operation. Beware that patching must only succeed if the
	// object has not been modified in parallel by someone else.
	claim, err := clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
	// TODO: metric for update results, with the operation ("set selected
	// node", "set PotentialNodes", etc.) as one dimension.
	if err != nil {
		return fmt.Errorf("update resource claim: %w", err)
	}

	// Remember the new instance. This is relevant when the plugin must
	// update the same claim multiple times (for example, first reserve
	// the claim, then later remove the reservation), because otherwise the second
	// update would fail with a "was modified" error.
	d.claims[index] = claim

	return nil
}
2023-08-30 03:13:31 -04:00
// podSchedulingState tracks the pod's PodSchedulingContext object (if any)
// and the changes to it which still have to be published.
type podSchedulingState struct {
	// A pointer to the PodSchedulingContext object for the pod, if one exists
	// in the API server.
	//
	// Conceptually, this object belongs into the scheduler framework
	// where it might get shared by different plugins. But in practice,
	// it is currently only used by dynamic provisioning and thus
	// managed entirely here.
	schedulingCtx *resourcev1alpha2.PodSchedulingContext

	// selectedNode is set if (and only if) a node has been selected.
	selectedNode *string

	// potentialNodes is set if (and only if) the potential nodes field
	// needs to be updated or set.
	potentialNodes *[]string
}
2022-04-12 07:41:56 -04:00
2023-08-30 03:13:31 -04:00
func ( p * podSchedulingState ) isDirty ( ) bool {
return p . selectedNode != nil ||
p . potentialNodes != nil
}
// init checks whether there is already a PodSchedulingContext object.
// Must not be called concurrently.
func (p *podSchedulingState) init(ctx context.Context, pod *v1.Pod, podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister) error {
	schedulingCtx, err := podSchedulingContextLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
	switch {
	case apierrors.IsNotFound(err):
		// No object yet; p.schedulingCtx stays nil and publish will
		// create one when needed.
		return nil
	case err != nil:
		return err
	default:
		// We have an object, but it might be obsolete.
		if !metav1.IsControlledBy(schedulingCtx, pod) {
			// Same name but different owner: refuse to use it.
			return fmt.Errorf("PodSchedulingContext object with UID %s is not owned by Pod %s/%s", schedulingCtx.UID, pod.Namespace, pod.Name)
		}
	}
	p.schedulingCtx = schedulingCtx
	return nil
}
2023-08-30 03:13:31 -04:00
// publish creates or updates the PodSchedulingContext object, if necessary.
// Must not be called concurrently.
func (p *podSchedulingState) publish(ctx context.Context, pod *v1.Pod, clientset kubernetes.Interface) error {
	if !p.isDirty() {
		// Nothing changed since the last successful publish.
		return nil
	}

	var err error
	logger := klog.FromContext(ctx)
	if p.schedulingCtx != nil {
		// Update it.
		schedulingCtx := p.schedulingCtx.DeepCopy()
		if p.selectedNode != nil {
			schedulingCtx.Spec.SelectedNode = *p.selectedNode
		}
		if p.potentialNodes != nil {
			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
		}
		if loggerV := logger.V(6); loggerV.Enabled() {
			// At a high enough log level, dump the entire object.
			loggerV.Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
		} else {
			logger.V(5).Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
		}
		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
		if apierrors.IsConflict(err) {
			// We don't use SSA by default for performance reasons
			// (https://github.com/kubernetes/kubernetes/issues/113700#issuecomment-1698563918)
			// because most of the time an Update doesn't encounter
			// a conflict and is faster.
			//
			// We could return an error here and rely on
			// backoff+retry, but scheduling attempts are expensive
			// and the backoff delay would cause a (small)
			// slowdown. Therefore we fall back to SSA here if needed.
			//
			// Using SSA instead of Get+Update has the advantage that
			// there is no delay for the Get. SSA is safe because only
			// the scheduler updates these fields.
			spec := resourcev1alpha2apply.PodSchedulingContextSpec()
			spec.SelectedNode = p.selectedNode
			if p.potentialNodes != nil {
				spec.PotentialNodes = *p.potentialNodes
			} else {
				// Unchanged. Has to be set because the object that we send
				// must represent the "fully specified intent". Not sending
				// the list would clear it.
				spec.PotentialNodes = p.schedulingCtx.Spec.PotentialNodes
			}
			schedulingCtxApply := resourcev1alpha2apply.PodSchedulingContext(pod.Name, pod.Namespace).WithSpec(spec)

			if loggerV := logger.V(6); loggerV.Enabled() {
				// At a high enough log level, dump the entire object.
				loggerV.Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod), "podSchedulingCtxApply", klog.Format(schedulingCtxApply))
			} else {
				logger.V(5).Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod))
			}
			_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Apply(ctx, schedulingCtxApply, metav1.ApplyOptions{FieldManager: "kube-scheduler", Force: true})
		}
	} else {
		// Create it.
		schedulingCtx := &resourcev1alpha2.PodSchedulingContext{
			ObjectMeta: metav1.ObjectMeta{
				Name:            pod.Name,
				Namespace:       pod.Namespace,
				OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(pod, schema.GroupVersionKind{Version: "v1", Kind: "Pod"})},
			},
		}
		if p.selectedNode != nil {
			schedulingCtx.Spec.SelectedNode = *p.selectedNode
		}
		if p.potentialNodes != nil {
			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
		}
		if loggerV := logger.V(6); loggerV.Enabled() {
			// At a high enough log level, dump the entire object.
			loggerV.Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
		} else {
			logger.V(5).Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
		}
		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Create(ctx, schedulingCtx, metav1.CreateOptions{})
	}
	if err != nil {
		return err
	}
	// Published successfully: clear the dirty markers so that isDirty
	// reports false until the next modification.
	p.potentialNodes = nil
	p.selectedNode = nil
	return nil
}
2023-03-06 14:57:35 -05:00
func statusForClaim ( schedulingCtx * resourcev1alpha2 . PodSchedulingContext , podClaimName string ) * resourcev1alpha2 . ResourceClaimSchedulingStatus {
2023-08-30 03:13:31 -04:00
if schedulingCtx == nil {
return nil
}
2023-03-06 14:57:35 -05:00
for _ , status := range schedulingCtx . Status . ResourceClaims {
2022-04-12 07:41:56 -04:00
if status . Name == podClaimName {
return & status
}
}
return nil
}
// dynamicResources is a plugin that ensures that ResourceClaims are allocated.
type dynamicResources struct {
	// enabled is false when the DynamicResourceAllocation feature gate is
	// off; the plugin's hooks then return early without doing anything.
	enabled bool
	// fh gives access to the scheduler framework handle.
	fh framework.Handle
	// clientset is used for writes (claim status, PodSchedulingContext).
	clientset kubernetes.Interface
	// Listers backed by the shared informer factory.
	claimLister                resourcev1alpha2listers.ResourceClaimLister
	classLister                resourcev1alpha2listers.ResourceClassLister
	podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister
}
// New initializes a new plugin and returns it. When the
// DynamicResourceAllocation feature is disabled, a no-op plugin instance is
// returned. plArgs is currently unused.
func New(_ context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
	if !fts.EnableDynamicResourceAllocation {
		// Disabled, won't do anything.
		return &dynamicResources{}, nil
	}

	return &dynamicResources{
		enabled:                    true,
		fh:                         fh,
		clientset:                  fh.ClientSet(),
		claimLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Lister(),
		classLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClasses().Lister(),
		podSchedulingContextLister: fh.SharedInformerFactory().Resource().V1alpha2().PodSchedulingContexts().Lister(),
	}, nil
}
2023-02-13 03:34:11 -05:00
var _ framework . PreEnqueuePlugin = & dynamicResources { }
2022-04-12 07:41:56 -04:00
var _ framework . PreFilterPlugin = & dynamicResources { }
var _ framework . FilterPlugin = & dynamicResources { }
var _ framework . PostFilterPlugin = & dynamicResources { }
var _ framework . PreScorePlugin = & dynamicResources { }
var _ framework . ReservePlugin = & dynamicResources { }
var _ framework . EnqueueExtensions = & dynamicResources { }
var _ framework . PostBindPlugin = & dynamicResources { }
// Name returns name of the plugin. It is used in logs, etc.
func (pl *dynamicResources) Name() string {
	return Name
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
2023-06-08 00:54:30 -04:00
func ( pl * dynamicResources ) EventsToRegister ( ) [ ] framework . ClusterEventWithHint {
2022-04-12 07:41:56 -04:00
if ! pl . enabled {
return nil
}
2023-06-08 00:54:30 -04:00
events := [ ] framework . ClusterEventWithHint {
2022-04-12 07:41:56 -04:00
// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
2023-02-13 03:34:11 -05:00
{ Event : framework . ClusterEvent { Resource : framework . ResourceClaim , ActionType : framework . Add | framework . Update } , QueueingHintFn : pl . isSchedulableAfterClaimChange } ,
2022-04-12 07:41:56 -04:00
// When a driver has provided additional information, a pod waiting for that information
// may be schedulable.
2023-02-13 03:34:11 -05:00
{ Event : framework . ClusterEvent { Resource : framework . PodSchedulingContext , ActionType : framework . Add | framework . Update } , QueueingHintFn : pl . isSchedulableAfterPodSchedulingContextChange } ,
2022-04-12 07:41:56 -04:00
// A resource might depend on node labels for topology filtering.
// A new or updated node may make pods schedulable.
2023-06-08 00:54:30 -04:00
{ Event : framework . ClusterEvent { Resource : framework . Node , ActionType : framework . Add | framework . UpdateNodeLabel } } ,
2023-08-28 12:02:00 -04:00
// A pod might be waiting for a class to get created or modified.
{ Event : framework . ClusterEvent { Resource : framework . ResourceClass , ActionType : framework . Add | framework . Update } } ,
2022-04-12 07:41:56 -04:00
}
return events
}
2023-02-13 03:34:11 -05:00
// PreEnqueue checks if there are known reasons why a pod currently cannot be
// scheduled. When this fails, one of the registered events can trigger another
// attempt.
func (pl *dynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
	// A nil callback means "only validate the claims" (exist, not being
	// deleted, owned correctly); any error marks the pod unschedulable.
	if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
		return statusUnschedulable(klog.FromContext(ctx), err.Error())
	}
	return nil
}
// isSchedulableAfterClaimChange is invoked for all claim events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen.
func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	if newObj == nil {
		// Deletes don't make a pod schedulable.
		return framework.QueueSkip, nil
	}

	originalClaim, modifiedClaim, err := schedutil.As[*resourcev1alpha2.ResourceClaim](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
	}

	usesClaim := false
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		if claim.UID == modifiedClaim.UID {
			usesClaim = true
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "reason", err.Error())
		return framework.QueueSkip, nil
	}
	if !usesClaim {
		// This was not the claim the pod was waiting for.
		logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		return framework.QueueSkip, nil
	}

	if originalClaim == nil {
		// Add event for a claim the pod uses: worth another attempt.
		logger.V(4).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		return framework.Queue, nil
	}

	// Modifications may or may not be relevant. If the entire
	// status is as before, then something else must have changed
	// and we don't care. What happens in practice is that the
	// resource driver adds the finalizer.
	if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
		if loggerV := logger.V(7); loggerV.Enabled() {
			// Log more information.
			loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
		} else {
			logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		}
		return framework.QueueSkip, nil
	}

	logger.V(4).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
	return framework.Queue, nil
}
// isSchedulableAfterPodSchedulingContextChange is invoked for all
// PodSchedulingContext events reported by an informer. It checks whether that
// change made a previously unschedulable pod schedulable (updated) or a new
// attempt is needed to re-create the object (deleted). It errs on the side of
// letting a pod scheduling attempt happen.
func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	// Deleted? That can happen because we ourselves delete the PodSchedulingContext while
	// working on the pod. This can be ignored.
	if oldObj != nil && newObj == nil {
		logger.V(4).Info("PodSchedulingContext got deleted")
		return framework.QueueSkip, nil
	}

	oldPodScheduling, newPodScheduling, err := schedutil.As[*resourcev1alpha2.PodSchedulingContext](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodSchedulingContextChange: %w", err)
	}
	podScheduling := newPodScheduling // Never nil because deletes are handled above.

	if podScheduling.Name != pod.Name || podScheduling.Namespace != pod.Namespace {
		logger.V(7).Info("PodSchedulingContext for unrelated pod got modified", "pod", klog.KObj(pod), "podScheduling", klog.KObj(podScheduling))
		return framework.QueueSkip, nil
	}

	// If the drivers have provided information about all
	// unallocated claims with delayed allocation, then the next
	// scheduling attempt is able to pick a node, so we let it run
	// immediately if this occurred for the first time, otherwise
	// we allow backoff.
	pendingDelayedClaims := 0
	if err := pl.foreachPodResourceClaim(pod, func(podResourceName string, claim *resourcev1alpha2.ResourceClaim) {
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil &&
			!podSchedulingHasClaimInfo(podScheduling, podResourceName) {
			pendingDelayedClaims++
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable, keep waiting", "pod", klog.KObj(pod), "reason", err.Error())
		return framework.QueueSkip, nil
	}

	// Some driver responses missing?
	if pendingDelayedClaims > 0 {
		// We could start a pod scheduling attempt to refresh the
		// potential nodes list. But pod scheduling attempts are
		// expensive and doing them too often causes the pod to enter
		// backoff. Let's wait instead for all drivers to reply.
		if loggerV := logger.V(6); loggerV.Enabled() {
			loggerV.Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
		} else {
			logger.V(5).Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod))
		}
		return framework.QueueSkip, nil
	}

	if oldPodScheduling == nil /* create */ ||
		len(oldPodScheduling.Status.ResourceClaims) < len(podScheduling.Status.ResourceClaims) /* new information and not incomplete (checked above) */ {
		// This definitely is new information for the scheduler. Try again immediately.
		logger.V(4).Info("PodSchedulingContext for pod has all required information, schedule immediately", "pod", klog.KObj(pod))
		return framework.Queue, nil
	}

	// The other situation where the scheduler needs to do
	// something immediately is when the selected node doesn't
	// work: waiting in the backoff queue only helps eventually
	// resources on the selected node become available again. It's
	// much more likely, in particular when trying to fill up the
	// cluster, that the choice simply didn't work out. The risk
	// here is that in a situation where the cluster really is
	// full, backoff won't be used because the scheduler keeps
	// trying different nodes. This should not happen when it has
	// full knowledge about resource availability (=
	// PodSchedulingContext.*.UnsuitableNodes is complete) but may happen
	// when it doesn't (= PodSchedulingContext.*.UnsuitableNodes had to be
	// truncated).
	//
	// Truncation only happens for very large clusters and then may slow
	// down scheduling, but should not break it completely. This is
	// acceptable while DRA is alpha and will be investigated further
	// before moving DRA to beta.
	if podScheduling.Spec.SelectedNode != "" {
		for _, claimStatus := range podScheduling.Status.ResourceClaims {
			if sliceContains(claimStatus.UnsuitableNodes, podScheduling.Spec.SelectedNode) {
				logger.V(5).Info("PodSchedulingContext has unsuitable selected node, schedule immediately", "pod", klog.KObj(pod), "selectedNode", podScheduling.Spec.SelectedNode, "podResourceName", claimStatus.Name)
				return framework.Queue, nil
			}
		}
	}

	// Update with only the spec modified?
	if oldPodScheduling != nil &&
		!apiequality.Semantic.DeepEqual(&oldPodScheduling.Spec, &podScheduling.Spec) &&
		apiequality.Semantic.DeepEqual(&oldPodScheduling.Status, &podScheduling.Status) {
		logger.V(5).Info("PodSchedulingContext has only the scheduler spec changes, ignore the update", "pod", klog.KObj(pod))
		return framework.QueueSkip, nil
	}

	// Once we get here, all changes which are known to require special responses
	// have been checked for. Whatever the change was, we don't know exactly how
	// to handle it and thus return Queue. This will cause the
	// scheduler to treat the event as if no event hint callback had been provided.
	// Developers who want to investigate this can enable a diff at log level 6.
	if loggerV := logger.V(6); loggerV.Enabled() {
		loggerV.Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
	} else {
		logger.V(5).Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod))
	}
	return framework.Queue, nil
}
// podSchedulingHasClaimInfo reports whether the PodSchedulingContext status
// already contains an entry for the given pod resource claim name.
func podSchedulingHasClaimInfo(podScheduling *resourcev1alpha2.PodSchedulingContext, podResourceName string) bool {
	statuses := podScheduling.Status.ResourceClaims
	for i := range statuses {
		if statuses[i].Name == podResourceName {
			return true
		}
	}
	return false
}
// sliceContains reports whether needle occurs in hay.
func sliceContains(hay []string, needle string) bool {
	for i := range hay {
		if hay[i] == needle {
			return true
		}
	}
	return false
}
2022-04-12 07:41:56 -04:00
// podResourceClaims returns the ResourceClaims for all pod.Spec.PodResourceClaims.
func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.ResourceClaim, error) {
	claims := make([]*resourcev1alpha2.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		// We store the pointer as returned by the lister. The
		// assumption is that if a claim gets modified while our code
		// runs, the cache will store a new pointer, not mutate the
		// existing object that we point to here.
		claims = append(claims, claim)
	}); err != nil {
		return nil, err
	}
	return claims, nil
}
// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
// It calls an optional handler for those claims that it finds.
// It returns an error when a claim is missing, being deleted, or fails the
// ownership check; callers treat such errors as "pod not schedulable".
func (pl *dynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourcev1alpha2.ResourceClaim)) error {
	for _, resource := range pod.Spec.ResourceClaims {
		claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource)
		if err != nil {
			return err
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if err != nil {
			return err
		}

		if claim.DeletionTimestamp != nil {
			return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
		}

		if mustCheckOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				return err
			}
		}
		if cb != nil {
			cb(resource.Name, claim)
		}
	}
	return nil
}
// PreFilter invoked at the prefilter extension point to check if pod has all
// immediate claims bound. UnschedulableAndUnresolvable is returned if
// the pod cannot be scheduled at the moment on any node.
func ( pl * dynamicResources ) PreFilter ( ctx context . Context , state * framework . CycleState , pod * v1 . Pod ) ( * framework . PreFilterResult , * framework . Status ) {
if ! pl . enabled {
2023-05-10 06:07:00 -04:00
return nil , framework . NewStatus ( framework . Skip )
2022-04-12 07:41:56 -04:00
}
logger := klog . FromContext ( ctx )
// If the pod does not reference any claim, we don't need to do
// anything for it. We just initialize an empty state to record that
// observation for the other functions. This gets updated below
// if we get that far.
s := & stateData { }
state . Write ( stateKey , s )
claims , err := pl . podResourceClaims ( pod )
if err != nil {
return nil , statusUnschedulable ( logger , err . Error ( ) )
}
2023-01-16 09:04:01 -05:00
logger . V ( 5 ) . Info ( "pod resource claims" , "pod" , klog . KObj ( pod ) , "resourceclaims" , klog . KObjSlice ( claims ) )
2023-05-10 06:07:00 -04:00
// If the pod does not reference any claim,
// DynamicResources Filter has nothing to do with the Pod.
2022-04-12 07:41:56 -04:00
if len ( claims ) == 0 {
2023-05-10 06:07:00 -04:00
return nil , framework . NewStatus ( framework . Skip )
2022-04-12 07:41:56 -04:00
}
2023-08-30 03:13:31 -04:00
// Fetch s.podSchedulingState.schedulingCtx, it's going to be needed when checking claims.
if err := s . podSchedulingState . init ( ctx , pod , pl . podSchedulingContextLister ) ; err != nil {
return nil , statusError ( logger , err )
}
2023-08-13 05:34:16 -04:00
s . informationsForClaim = make ( [ ] informationForClaim , len ( claims ) )
2022-04-12 07:41:56 -04:00
for index , claim := range claims {
2023-03-06 06:43:58 -05:00
if claim . Spec . AllocationMode == resourcev1alpha2 . AllocationModeImmediate &&
2022-04-12 07:41:56 -04:00
claim . Status . Allocation == nil {
// This will get resolved by the resource driver.
return nil , statusUnschedulable ( logger , "unallocated immediate resourceclaim" , "pod" , klog . KObj ( pod ) , "resourceclaim" , klog . KObj ( claim ) )
}
if claim . Status . DeallocationRequested {
// This will get resolved by the resource driver.
return nil , statusUnschedulable ( logger , "resourceclaim must be reallocated" , "pod" , klog . KObj ( pod ) , "resourceclaim" , klog . KObj ( claim ) )
}
if claim . Status . Allocation != nil &&
! resourceclaim . CanBeReserved ( claim ) &&
! resourceclaim . IsReservedForPod ( pod , claim ) {
// Resource is in use. The pod has to wait.
return nil , statusUnschedulable ( logger , "resourceclaim in use" , "pod" , klog . KObj ( pod ) , "resourceclaim" , klog . KObj ( claim ) )
}
if claim . Status . Allocation != nil &&
claim . Status . Allocation . AvailableOnNodes != nil {
nodeSelector , err := nodeaffinity . NewNodeSelector ( claim . Status . Allocation . AvailableOnNodes )
if err != nil {
return nil , statusError ( logger , err )
}
2023-08-13 05:34:16 -04:00
s . informationsForClaim [ index ] . availableOnNode = nodeSelector
}
if claim . Status . Allocation == nil &&
claim . Spec . AllocationMode == resourcev1alpha2 . AllocationModeWaitForFirstConsumer {
// The ResourceClass might have a node filter. This is
// useful for trimming the initial set of potential
// nodes before we ask the driver(s) for information
// about the specific pod.
class , err := pl . classLister . Get ( claim . Spec . ResourceClassName )
if err != nil {
2023-08-28 12:02:00 -04:00
// If the class cannot be retrieved, allocation cannot proceed.
if apierrors . IsNotFound ( err ) {
// Here we mark the pod as "unschedulable", so it'll sleep in
// the unscheduleable queue until a ResourceClass event occurs.
return nil , statusUnschedulable ( logger , fmt . Sprintf ( "resource class %s does not exist" , claim . Spec . ResourceClassName ) )
}
// Other error, retry with backoff.
2023-08-13 05:34:16 -04:00
return nil , statusError ( logger , fmt . Errorf ( "look up resource class: %v" , err ) )
}
if class . SuitableNodes != nil {
selector , err := nodeaffinity . NewNodeSelector ( class . SuitableNodes )
if err != nil {
return nil , statusError ( logger , err )
}
s . informationsForClaim [ index ] . availableOnNode = selector
}
// Now we need information from drivers.
2023-08-30 03:13:31 -04:00
s . informationsForClaim [ index ] . status = statusForClaim ( s . podSchedulingState . schedulingCtx , pod . Spec . ResourceClaims [ index ] . Name )
2022-04-12 07:41:56 -04:00
}
}
s . claims = claims
state . Write ( stateKey , s )
return nil , nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
//
// Returning nil signals to the framework that this plugin does not react
// to incremental pod additions/removals on a nominated node, so Filter
// does not need to be re-run for those events.
func (pl *dynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
	return nil
}
// getStateData retrieves the plugin's per-cycle state that PreFilter stored
// under stateKey. It returns an error if the entry is missing or does not
// have the expected type.
func getStateData(cs *framework.CycleState) (*stateData, error) {
	entry, err := cs.Read(stateKey)
	if err != nil {
		return nil, err
	}
	if s, ok := entry.(*stateData); ok {
		return s, nil
	}
	return nil, errors.New("unable to convert state into stateData")
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the resources it requests,
// for both allocated and unallocated claims.
//
// For claims that are bound, then it checks that the node affinity is
// satisfied by the given node.
//
// For claims that are unbound, it checks whether the claim might get allocated
// for the node.
func ( pl * dynamicResources ) Filter ( ctx context . Context , cs * framework . CycleState , pod * v1 . Pod , nodeInfo * framework . NodeInfo ) * framework . Status {
if ! pl . enabled {
return nil
}
state , err := getStateData ( cs )
if err != nil {
return statusError ( klog . FromContext ( ctx ) , err )
}
if len ( state . claims ) == 0 {
return nil
}
logger := klog . FromContext ( ctx )
node := nodeInfo . Node ( )
var unavailableClaims [ ] int
for index , claim := range state . claims {
logger . V ( 10 ) . Info ( "filtering based on resource claims of the pod" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( node ) , "resourceclaim" , klog . KObj ( claim ) )
switch {
case claim . Status . Allocation != nil :
2023-08-13 05:34:16 -04:00
if nodeSelector := state . informationsForClaim [ index ] . availableOnNode ; nodeSelector != nil {
2022-04-12 07:41:56 -04:00
if ! nodeSelector . Match ( node ) {
logger . V ( 5 ) . Info ( "AvailableOnNodes does not match" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( node ) , "resourceclaim" , klog . KObj ( claim ) )
unavailableClaims = append ( unavailableClaims , index )
}
}
case claim . Status . DeallocationRequested :
// We shouldn't get here. PreFilter already checked this.
return statusUnschedulable ( logger , "resourceclaim must be reallocated" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( node ) , "resourceclaim" , klog . KObj ( claim ) )
2023-03-06 06:43:58 -05:00
case claim . Spec . AllocationMode == resourcev1alpha2 . AllocationModeWaitForFirstConsumer :
2023-08-13 05:34:16 -04:00
if selector := state . informationsForClaim [ index ] . availableOnNode ; selector != nil {
if matches := selector . Match ( node ) ; ! matches {
return statusUnschedulable ( logger , "excluded by resource class node filter" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( node ) , "resourceclassName" , claim . Spec . ResourceClassName )
2022-04-12 07:41:56 -04:00
}
}
2023-08-13 05:34:16 -04:00
if status := state . informationsForClaim [ index ] . status ; status != nil {
2022-04-12 07:41:56 -04:00
for _ , unsuitableNode := range status . UnsuitableNodes {
if node . Name == unsuitableNode {
return statusUnschedulable ( logger , "resourceclaim cannot be allocated for the node (unsuitable)" , "pod" , klog . KObj ( pod ) , "node" , klog . KObj ( node ) , "resourceclaim" , klog . KObj ( claim ) , "unsuitablenodes" , status . UnsuitableNodes )
}
}
}
default :
// This should have been delayed allocation. Immediate
// allocation was already checked for in PreFilter.
return statusError ( logger , fmt . Errorf ( "internal error, unexpected allocation mode %v" , claim . Spec . AllocationMode ) )
}
}
if len ( unavailableClaims ) > 0 {
state . mutex . Lock ( )
defer state . mutex . Unlock ( )
if state . unavailableClaims == nil {
2023-09-28 09:30:28 -04:00
state . unavailableClaims = sets . New [ int ] ( )
2022-04-12 07:41:56 -04:00
}
for index := range unavailableClaims {
claim := state . claims [ index ]
// Deallocation makes more sense for claims with
// delayed allocation. Claims with immediate allocation
// would just get allocated again for a random node,
// which is unlikely to help the pod.
2023-03-06 06:43:58 -05:00
if claim . Spec . AllocationMode == resourcev1alpha2 . AllocationModeWaitForFirstConsumer {
2022-04-12 07:41:56 -04:00
state . unavailableClaims . Insert ( unavailableClaims ... )
}
}
return statusUnschedulable ( logger , "resourceclaim not available on the node" , "pod" , klog . KObj ( pod ) )
}
return nil
}
// PostFilter checks whether there are allocated claims that could get
// deallocated to help get the Pod schedulable. If yes, it picks one and
// requests its deallocation. This only gets called when filtering found no
// suitable node.
func ( pl * dynamicResources ) PostFilter ( ctx context . Context , cs * framework . CycleState , pod * v1 . Pod , filteredNodeStatusMap framework . NodeToStatusMap ) ( * framework . PostFilterResult , * framework . Status ) {
if ! pl . enabled {
return nil , framework . NewStatus ( framework . Unschedulable , "plugin disabled" )
}
logger := klog . FromContext ( ctx )
state , err := getStateData ( cs )
if err != nil {
return nil , statusError ( logger , err )
}
if len ( state . claims ) == 0 {
return nil , framework . NewStatus ( framework . Unschedulable , "no new claims to deallocate" )
}
// Iterating over a map is random. This is intentional here, we want to
// pick one claim randomly because there is no better heuristic.
for index := range state . unavailableClaims {
claim := state . claims [ index ]
if len ( claim . Status . ReservedFor ) == 0 ||
len ( claim . Status . ReservedFor ) == 1 && claim . Status . ReservedFor [ 0 ] . UID == pod . UID {
2023-09-05 09:01:09 -04:00
// Before we tell a driver to deallocate a claim, we
// have to stop telling it to allocate. Otherwise,
// depending on timing, it will deallocate the claim,
// see a PodSchedulingContext with selected node, and
// allocate again for that same node.
if state . podSchedulingState . schedulingCtx != nil &&
state . podSchedulingState . schedulingCtx . Spec . SelectedNode != "" {
state . podSchedulingState . selectedNode = ptr . To ( "" )
if err := state . podSchedulingState . publish ( ctx , pod , pl . clientset ) ; err != nil {
return nil , statusError ( logger , err )
}
}
2022-04-12 07:41:56 -04:00
claim := state . claims [ index ] . DeepCopy ( )
claim . Status . DeallocationRequested = true
claim . Status . ReservedFor = nil
logger . V ( 5 ) . Info ( "Requesting deallocation of ResourceClaim" , "pod" , klog . KObj ( pod ) , "resourceclaim" , klog . KObj ( claim ) )
if err := state . updateClaimStatus ( ctx , pl . clientset , index , claim ) ; err != nil {
return nil , statusError ( logger , err )
}
return nil , nil
}
}
return nil , framework . NewStatus ( framework . Unschedulable , "still not schedulable" )
}
// PreScore is passed a list of all nodes that would fit the pod. Not all
// claims are necessarily allocated yet, so here we can set the SuitableNodes
// field for those which are pending.
func ( pl * dynamicResources ) PreScore ( ctx context . Context , cs * framework . CycleState , pod * v1 . Pod , nodes [ ] * v1 . Node ) * framework . Status {
if ! pl . enabled {
return nil
}
state , err := getStateData ( cs )
if err != nil {
return statusError ( klog . FromContext ( ctx ) , err )
}
2023-08-30 03:13:31 -04:00
defer func ( ) {
state . preScored = true
} ( )
2022-04-12 07:41:56 -04:00
if len ( state . claims ) == 0 {
return nil
}
logger := klog . FromContext ( ctx )
pending := false
for _ , claim := range state . claims {
if claim . Status . Allocation == nil {
pending = true
}
}
2023-08-30 03:13:31 -04:00
if ! pending {
logger . V ( 5 ) . Info ( "no pending claims" , "pod" , klog . KObj ( pod ) )
return nil
}
if haveAllPotentialNodes ( state . podSchedulingState . schedulingCtx , nodes ) {
logger . V ( 5 ) . Info ( "all potential nodes already set" , "pod" , klog . KObj ( pod ) , "potentialnodes" , klog . KObjSlice ( nodes ) )
return nil
}
// Remember the potential nodes. The object will get created or
// updated in Reserve. This is both an optimization and
// covers the case that PreScore doesn't get called when there
// is only a single node.
logger . V ( 5 ) . Info ( "remembering potential nodes" , "pod" , klog . KObj ( pod ) , "potentialnodes" , klog . KObjSlice ( nodes ) )
numNodes := len ( nodes )
if numNodes > resourcev1alpha2 . PodSchedulingNodeListMaxSize {
numNodes = resourcev1alpha2 . PodSchedulingNodeListMaxSize
}
potentialNodes := make ( [ ] string , 0 , numNodes )
if numNodes == len ( nodes ) {
// Copy all node names.
for _ , node := range nodes {
potentialNodes = append ( potentialNodes , node . Name )
}
} else {
// Select a random subset of the nodes to comply with
// the PotentialNodes length limit. Randomization is
// done for us by Go which iterates over map entries
// randomly.
nodeNames := map [ string ] struct { } { }
for _ , node := range nodes {
nodeNames [ node . Name ] = struct { } { }
}
for nodeName := range nodeNames {
if len ( potentialNodes ) >= resourcev1alpha2 . PodSchedulingNodeListMaxSize {
break
2022-04-12 07:41:56 -04:00
}
2023-08-30 03:13:31 -04:00
potentialNodes = append ( potentialNodes , nodeName )
2022-04-12 07:41:56 -04:00
}
}
2023-08-30 03:13:31 -04:00
sort . Strings ( potentialNodes )
state . podSchedulingState . potentialNodes = & potentialNodes
2022-04-12 07:41:56 -04:00
return nil
}
2023-08-30 03:13:31 -04:00
func haveAllPotentialNodes ( schedulingCtx * resourcev1alpha2 . PodSchedulingContext , nodes [ ] * v1 . Node ) bool {
if schedulingCtx == nil {
return false
}
2022-04-12 07:41:56 -04:00
for _ , node := range nodes {
2023-08-30 03:13:31 -04:00
if ! haveNode ( schedulingCtx . Spec . PotentialNodes , node . Name ) {
2022-04-12 07:41:56 -04:00
return false
}
}
return true
}
// haveNode reports whether nodeName occurs in nodeNames.
// NOTE(review): duplicates containsNode below; consider consolidating.
func haveNode(nodeNames []string, nodeName string) bool {
	for i := range nodeNames {
		if nodeNames[i] == nodeName {
			return true
		}
	}
	return false
}
// Reserve reserves claims for the pod.
//
// Already-allocated claims get the pod added to their ReservedFor list via
// an UpdateStatus call. For claims still waiting for allocation, a
// PodSchedulingContext is published so the resource driver(s) can react,
// and a Pending status is returned until allocation completes.
func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
	if !pl.enabled {
		return nil
	}
	state, err := getStateData(cs)
	if err != nil {
		return statusError(klog.FromContext(ctx), err)
	}
	if len(state.claims) == 0 {
		// Pod uses no claims, nothing to reserve.
		return nil
	}

	// Counters used below to decide whether allocation can be triggered
	// now or whether we must wait for more driver information.
	numDelayedAllocationPending := 0
	numClaimsWithStatusInfo := 0
	logger := klog.FromContext(ctx)
	for index, claim := range state.claims {
		if claim.Status.Allocation != nil {
			// Allocated, but perhaps not reserved yet.
			if resourceclaim.IsReservedForPod(pod, claim) {
				logger.V(5).Info("is reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim))
				continue
			}
			claim := claim.DeepCopy()
			claim.Status.ReservedFor = append(claim.Status.ReservedFor,
				resourcev1alpha2.ResourceClaimConsumerReference{
					Resource: "pods",
					Name:     pod.Name,
					UID:      pod.UID,
				})
			logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim))
			_, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
			// TODO: metric for update errors.
			if err != nil {
				return statusError(logger, err)
			}
			// If we get here, we know that reserving the claim for
			// the pod worked and we can proceed with scheduling it.
		} else {
			// Must be delayed allocation.
			numDelayedAllocationPending++
			// Did the driver provide information that steered node
			// selection towards a node that it can support?
			if statusForClaim(state.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name) != nil {
				numClaimsWithStatusInfo++
			}
		}
	}

	if numDelayedAllocationPending == 0 {
		// Nothing left to do.
		return nil
	}

	if !state.preScored {
		// There was only one candidate that passed the Filters and
		// therefore PreScore was not called.
		//
		// We need to ask whether that node is suitable, otherwise the
		// scheduler will pick it forever even when it cannot satisfy
		// the claim.
		if state.podSchedulingState.schedulingCtx == nil ||
			!containsNode(state.podSchedulingState.schedulingCtx.Spec.PotentialNodes, nodeName) {
			potentialNodes := []string{nodeName}
			state.podSchedulingState.potentialNodes = &potentialNodes
			logger.V(5).Info("asking for information about single potential node", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
		}
	}

	// When there is only one pending resource, we can go ahead with
	// requesting allocation even when we don't have the information from
	// the driver yet. Otherwise we wait for information before blindly
	// making a decision that might have to be reversed later.
	if numDelayedAllocationPending == 1 || numClaimsWithStatusInfo == numDelayedAllocationPending {
		// TODO: can we increase the chance that the scheduler picks
		// the same node as before when allocation is on-going,
		// assuming that that node still fits the pod? Picking a
		// different node may lead to some claims being allocated for
		// one node and others for another, which then would have to be
		// resolved with deallocation.
		if state.podSchedulingState.schedulingCtx == nil ||
			state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
			state.podSchedulingState.selectedNode = &nodeName
			logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
			// Publish before returning Pending so the driver sees the
			// selected node and can start allocating.
			if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
				return statusError(logger, err)
			}
			return statusPending(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
		}
	}

	// May have been modified earlier in PreScore or above.
	if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
		return statusError(logger, err)
	}

	// More than one pending claim and not enough information about all of them.
	//
	// TODO: can or should we ensure that scheduling gets aborted while
	// waiting for resources *before* triggering delayed volume
	// provisioning? On the one hand, volume provisioning is currently
	// irreversible, so it better should come last. On the other hand,
	// triggering both in parallel might be faster.
	return statusPending(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod))
}
// containsNode reports whether needle is one of the node names in hay.
// NOTE(review): duplicates haveNode above; consider consolidating.
func containsNode(hay []string, needle string) bool {
	for i := range hay {
		if hay[i] == needle {
			return true
		}
	}
	return false
}
2022-04-12 07:41:56 -04:00
// Unreserve clears the ReservedFor field for all claims.
// It's idempotent, and does nothing if no state found for the given pod.
func ( pl * dynamicResources ) Unreserve ( ctx context . Context , cs * framework . CycleState , pod * v1 . Pod , nodeName string ) {
if ! pl . enabled {
return
}
state , err := getStateData ( cs )
if err != nil {
return
}
if len ( state . claims ) == 0 {
return
}
logger := klog . FromContext ( ctx )
for index , claim := range state . claims {
if claim . Status . Allocation != nil &&
resourceclaim . IsReservedForPod ( pod , claim ) {
// Remove pod from ReservedFor.
claim := claim . DeepCopy ( )
2023-03-06 06:43:58 -05:00
reservedFor := make ( [ ] resourcev1alpha2 . ResourceClaimConsumerReference , 0 , len ( claim . Status . ReservedFor ) - 1 )
2022-04-12 07:41:56 -04:00
for _ , reserved := range claim . Status . ReservedFor {
// TODO: can UID be assumed to be unique all resources or do we also need to compare Group/Version/Resource?
if reserved . UID != pod . UID {
reservedFor = append ( reservedFor , reserved )
}
}
claim . Status . ReservedFor = reservedFor
logger . V ( 5 ) . Info ( "unreserve" , "resourceclaim" , klog . KObj ( claim ) )
if err := state . updateClaimStatus ( ctx , pl . clientset , index , claim ) ; err != nil {
2023-03-06 14:57:35 -05:00
// We will get here again when pod schedulingCtx
2022-04-12 07:41:56 -04:00
// is retried.
logger . Error ( err , "unreserve" , "resourceclaim" , klog . KObj ( claim ) )
}
}
}
}
// PostBind is called after a pod is successfully bound to a node. Now we are
2023-03-06 14:57:35 -05:00
// sure that a PodSchedulingContext object, if it exists, is definitely not going to
2022-04-12 07:41:56 -04:00
// be needed anymore and can delete it. This is a one-shot thing, there won't
// be any retries. This is okay because it should usually work and in those
// cases where it doesn't, the garbage collector will eventually clean up.
func ( pl * dynamicResources ) PostBind ( ctx context . Context , cs * framework . CycleState , pod * v1 . Pod , nodeName string ) {
if ! pl . enabled {
return
}
state , err := getStateData ( cs )
if err != nil {
return
}
if len ( state . claims ) == 0 {
return
}
2023-03-06 14:57:35 -05:00
// We cannot know for sure whether the PodSchedulingContext object exists. We
// might have created it in the previous pod schedulingCtx cycle and not
2022-04-12 07:41:56 -04:00
// have it in our informer cache yet. Let's try to delete, just to be
// on the safe side.
logger := klog . FromContext ( ctx )
2023-03-06 14:57:35 -05:00
err = pl . clientset . ResourceV1alpha2 ( ) . PodSchedulingContexts ( pod . Namespace ) . Delete ( ctx , pod . Name , metav1 . DeleteOptions { } )
2022-04-12 07:41:56 -04:00
switch {
case apierrors . IsNotFound ( err ) :
2023-03-06 14:57:35 -05:00
logger . V ( 5 ) . Info ( "no PodSchedulingContext object to delete" )
2022-04-12 07:41:56 -04:00
case err != nil :
2023-03-06 14:57:35 -05:00
logger . Error ( err , "delete PodSchedulingContext" )
2022-04-12 07:41:56 -04:00
default :
2023-03-06 14:57:35 -05:00
logger . V ( 5 ) . Info ( "PodSchedulingContext object deleted" )
2022-04-12 07:41:56 -04:00
}
}
// statusUnschedulable ensures that there is a log message associated with the
// line where the status originated.
func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
	loggerV := logger.V(5)
	if loggerV.Enabled() {
		helper, loggerV := loggerV.WithCallStackHelper()
		helper()
		// nolint: logcheck // warns because it cannot check key/values
		loggerV.Info("pod unschedulable", append(kv, "reason", reason)...)
	}
	return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason)
}
2023-10-19 07:02:11 -04:00
// statusPending ensures that there is a log message associated with the
// line where the status originated.
func statusPending ( logger klog . Logger , reason string , kv ... interface { } ) * framework . Status {
if loggerV := logger . V ( 5 ) ; loggerV . Enabled ( ) {
helper , loggerV := loggerV . WithCallStackHelper ( )
helper ( )
kv = append ( kv , "reason" , reason )
// nolint: logcheck // warns because it cannot check key/values
loggerV . Info ( "pod waiting for external component" , kv ... )
}
// When we return Pending, we want to block the Pod at the same time.
return framework . NewStatus ( framework . Pending , reason )
}
2022-04-12 07:41:56 -04:00
// statusError ensures that there is a log message associated with the
// line where the error originated.
func statusError ( logger klog . Logger , err error , kv ... interface { } ) * framework . Status {
if loggerV := logger . V ( 5 ) ; loggerV . Enabled ( ) {
helper , loggerV := loggerV . WithCallStackHelper ( )
helper ( )
// nolint: logcheck // warns because it cannot check key/values
loggerV . Error ( err , "dynamic resource plugin failed" , kv ... )
}
return framework . AsStatus ( err )
}