diff --git a/README.md b/README.md index 49a599393..79adb1760 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,7 @@ The Default Evictor Plugin is used by default for filtering pods before processi | `minReplicas` |`uint`|`0`| ignore eviction of pods where owner (e.g. `ReplicaSet`) replicas is below this threshold | | `minPodAge` |`metav1.Duration`|`0`| ignore eviction of pods with a creation time within this threshold | | `ignorePodsWithoutPDB` |`bool`|`false`| set whether pods without PodDisruptionBudget should be evicted or ignored | +| `noEvictionPolicy` |`enum`|``| sets whether a `descheduler.alpha.kubernetes.io/prefer-no-eviction` pod annotation is considered preferred or mandatory. Accepted values: "", "Preferred", "Mandatory". Defaults to "Preferred". | ### Example policy @@ -1013,12 +1014,16 @@ never evicted because these pods won't be recreated. (Standalone pods in failed * Pods with PVCs are evicted (unless `ignorePvcPods: true` is set). * In `LowNodeUtilization` and `RemovePodsViolatingInterPodAntiAffinity`, pods are evicted by their priority from low to high, and if they have same priority, best effort pods are evicted before burstable and guaranteed pods. -* All types of pods with the annotation `descheduler.alpha.kubernetes.io/evict` are eligible for eviction. This +* All types of pods with the `descheduler.alpha.kubernetes.io/evict` annotation are eligible for eviction. This annotation is used to override checks which prevent eviction and users can select which pod is evicted. Users should know how and if the pod will be recreated. The annotation only affects internal descheduler checks. The anti-disruption protection provided by the [/eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/) subresource is still respected. +* Pods with the `descheduler.alpha.kubernetes.io/prefer-no-eviction` annotation voice their preference not to be evicted. + Each plugin decides whether the annotation gets respected or not. When the `DefaultEvictor` plugin sets `noEvictionPolicy` + to `Mandatory` all such pods are excluded from eviction. Needs to be used with caution as some plugins may enfore + various policies that are expected to be always met. * Pods with a non-nil DeletionTimestamp are not evicted by default. Setting `--v=4` or greater on the Descheduler will log all reasons why any pod is not evictable. diff --git a/pkg/descheduler/evictions/utils/utils.go b/pkg/descheduler/evictions/utils/utils.go index 68ca25a8c..f84a56293 100644 --- a/pkg/descheduler/evictions/utils/utils.go +++ b/pkg/descheduler/evictions/utils/utils.go @@ -17,12 +17,17 @@ limitations under the License. package utils import ( + corev1 "k8s.io/api/core/v1" clientset "k8s.io/client-go/kubernetes" ) const ( EvictionKind = "Eviction" EvictionSubresource = "pods/eviction" + // A new experimental feature for soft no-eviction preference. + // Each plugin will decide whether the soft preference will be respected. + // If configured the soft preference turns into a mandatory no-eviction policy for the DefaultEvictor plugin. + SoftNoEvictionAnnotationKey = "descheduler.alpha.kubernetes.io/prefer-no-eviction" ) // SupportEviction uses Discovery API to find out if the server support eviction subresource @@ -56,3 +61,9 @@ func SupportEviction(client clientset.Interface) (string, error) { } return "", nil } + +// HaveEvictAnnotation checks if the pod have evict annotation +func HaveNoEvictionAnnotation(pod *corev1.Pod) bool { + _, found := pod.ObjectMeta.Annotations[SoftNoEvictionAnnotationKey] + return found +} diff --git a/pkg/descheduler/pod/pods.go b/pkg/descheduler/pod/pods.go index 1c640bcf2..164fb43e4 100644 --- a/pkg/descheduler/pod/pods.go +++ b/pkg/descheduler/pod/pods.go @@ -25,6 +25,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/cache" + evictionutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils" "sigs.k8s.io/descheduler/pkg/utils" ) @@ -254,14 +255,32 @@ func SortPodsBasedOnPriorityLowToHigh(pods []*v1.Pod) { return false } if (pods[j].Spec.Priority == nil && pods[i].Spec.Priority == nil) || (*pods[i].Spec.Priority == *pods[j].Spec.Priority) { - if IsBestEffortPod(pods[i]) { + iIsBestEffortPod := IsBestEffortPod(pods[i]) + jIsBestEffortPod := IsBestEffortPod(pods[j]) + iIsBurstablePod := IsBurstablePod(pods[i]) + jIsBurstablePod := IsBurstablePod(pods[j]) + iIsGuaranteedPod := IsGuaranteedPod(pods[i]) + jIsGuaranteedPod := IsGuaranteedPod(pods[j]) + if (iIsBestEffortPod && jIsBestEffortPod) || (iIsBurstablePod && jIsBurstablePod) || (iIsGuaranteedPod && jIsGuaranteedPod) { + iHasNoEvictonPolicy := evictionutils.HaveNoEvictionAnnotation(pods[i]) + jHasNoEvictonPolicy := evictionutils.HaveNoEvictionAnnotation(pods[j]) + if !iHasNoEvictonPolicy { + return true + } + if !jHasNoEvictonPolicy { + return false + } return true } - if IsBurstablePod(pods[i]) && IsGuaranteedPod(pods[j]) { + if iIsBestEffortPod { + return true + } + if iIsBurstablePod && jIsGuaranteedPod { return true } return false } + return *pods[i].Spec.Priority < *pods[j].Spec.Priority }) } diff --git a/pkg/descheduler/pod/pods_test.go b/pkg/descheduler/pod/pods_test.go index c890f8aaf..d318c6f04 100644 --- a/pkg/descheduler/pod/pods_test.go +++ b/pkg/descheduler/pod/pods_test.go @@ -157,8 +157,66 @@ func TestSortPodsBasedOnPriorityLowToHigh(t *testing.T) { p6 := test.BuildTestPod("p6", 400, 100, n1.Name, test.MakeGuaranteedPod) p6.Spec.Priority = nil - podList := []*v1.Pod{p4, p3, p2, p1, p6, p5} - expectedPodList := []*v1.Pod{p5, p6, p1, p2, p3, p4} + p7 := test.BuildTestPod("p7", 400, 0, n1.Name, func(pod *v1.Pod) { + test.SetPodPriority(pod, lowPriority) + pod.Annotations = map[string]string{ + "descheduler.alpha.kubernetes.io/prefer-no-eviction": "", + } + }) + + // BestEffort + p8 := test.BuildTestPod("p8", 400, 0, n1.Name, func(pod *v1.Pod) { + test.SetPodPriority(pod, highPriority) + test.MakeBestEffortPod(pod) + pod.Annotations = map[string]string{ + "descheduler.alpha.kubernetes.io/prefer-no-eviction": "", + } + }) + + // Burstable + p9 := test.BuildTestPod("p9", 400, 0, n1.Name, func(pod *v1.Pod) { + test.SetPodPriority(pod, highPriority) + test.MakeBurstablePod(pod) + pod.Annotations = map[string]string{ + "descheduler.alpha.kubernetes.io/prefer-no-eviction": "", + } + }) + + // Guaranteed + p10 := test.BuildTestPod("p10", 400, 100, n1.Name, func(pod *v1.Pod) { + test.SetPodPriority(pod, highPriority) + test.MakeGuaranteedPod(pod) + pod.Annotations = map[string]string{ + "descheduler.alpha.kubernetes.io/prefer-no-eviction": "", + } + }) + + // Burstable + p11 := test.BuildTestPod("p11", 400, 0, n1.Name, func(pod *v1.Pod) { + test.MakeBurstablePod(pod) + }) + + // Burstable + p12 := test.BuildTestPod("p12", 400, 0, n1.Name, func(pod *v1.Pod) { + test.MakeBurstablePod(pod) + pod.Annotations = map[string]string{ + "descheduler.alpha.kubernetes.io/prefer-no-eviction": "", + } + }) + + podList := []*v1.Pod{p1, p8, p9, p10, p2, p3, p4, p5, p6, p7, p11, p12} + // p5: no priority, best effort + // p11: no priority, burstable + // p6: no priority, guaranteed + // p1: low priority + // p7: low priority, prefer-no-eviction + // p2: high priority, best effort + // p8: high priority, best effort, prefer-no-eviction + // p3: high priority, burstable + // p9: high priority, burstable, prefer-no-eviction + // p4: high priority, guaranteed + // p10: high priority, guaranteed, prefer-no-eviction + expectedPodList := []*v1.Pod{p5, p11, p12, p6, p1, p7, p2, p8, p3, p9, p4, p10} SortPodsBasedOnPriorityLowToHigh(podList) if !reflect.DeepEqual(getPodListNames(podList), getPodListNames(expectedPodList)) { diff --git a/pkg/framework/plugins/defaultevictor/defaultevictor.go b/pkg/framework/plugins/defaultevictor/defaultevictor.go index f99d63ecb..236af45ae 100644 --- a/pkg/framework/plugins/defaultevictor/defaultevictor.go +++ b/pkg/framework/plugins/defaultevictor/defaultevictor.go @@ -24,6 +24,7 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + evictionutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils" nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" @@ -140,6 +141,10 @@ func (d *DefaultEvictor) Filter(pod *v1.Pod) bool { return true } + if d.args.NoEvictionPolicy == MandatoryNoEvictionPolicy && evictionutils.HaveNoEvictionAnnotation(pod) { + return false + } + if utils.IsMirrorPod(pod) { checkErrs = append(checkErrs, fmt.Errorf("pod is a mirror pod")) } diff --git a/pkg/framework/plugins/defaultevictor/defaultevictor_test.go b/pkg/framework/plugins/defaultevictor/defaultevictor_test.go index 11fadf7fe..865a1b795 100644 --- a/pkg/framework/plugins/defaultevictor/defaultevictor_test.go +++ b/pkg/framework/plugins/defaultevictor/defaultevictor_test.go @@ -29,6 +29,7 @@ import ( "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" "sigs.k8s.io/descheduler/pkg/api" + evictionutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" frameworkfake "sigs.k8s.io/descheduler/pkg/framework/fake" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" @@ -51,6 +52,7 @@ type testCase struct { minPodAge *metav1.Duration result bool ignorePodsWithoutPDB bool + noEvictionPolicy NoEvictionPolicy } func TestDefaultEvictorPreEvictionFilter(t *testing.T) { @@ -358,6 +360,29 @@ func TestDefaultEvictorFilter(t *testing.T) { }), }, result: true, + }, { + description: "Normal pod eviction with normal ownerRefs and " + evictionutils.SoftNoEvictionAnnotationKey + " annotation (preference)", + pods: []*v1.Pod{ + test.BuildTestPod("p2", 400, 0, n1.Name, func(pod *v1.Pod) { + pod.Annotations = map[string]string{evictionutils.SoftNoEvictionAnnotationKey: ""} + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + }), + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + result: true, + }, { + description: "Normal pod eviction with normal ownerRefs and " + evictionutils.SoftNoEvictionAnnotationKey + " annotation (mandatory)", + pods: []*v1.Pod{ + test.BuildTestPod("p2", 400, 0, n1.Name, func(pod *v1.Pod) { + pod.Annotations = map[string]string{evictionutils.SoftNoEvictionAnnotationKey: ""} + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + }), + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + noEvictionPolicy: MandatoryNoEvictionPolicy, + result: false, }, { description: "Normal pod eviction with replicaSet ownerRefs", pods: []*v1.Pod{ @@ -831,6 +856,7 @@ func initializePlugin(ctx context.Context, test testCase) (frameworktypes.Plugin MinReplicas: test.minReplicas, MinPodAge: test.minPodAge, IgnorePodsWithoutPDB: test.ignorePodsWithoutPDB, + NoEvictionPolicy: test.noEvictionPolicy, } evictorPlugin, err := New( diff --git a/pkg/framework/plugins/defaultevictor/types.go b/pkg/framework/plugins/defaultevictor/types.go index 3a39cbc91..3ab0bb9ad 100644 --- a/pkg/framework/plugins/defaultevictor/types.go +++ b/pkg/framework/plugins/defaultevictor/types.go @@ -37,4 +37,23 @@ type DefaultEvictorArgs struct { MinReplicas uint `json:"minReplicas,omitempty"` MinPodAge *metav1.Duration `json:"minPodAge,omitempty"` IgnorePodsWithoutPDB bool `json:"ignorePodsWithoutPDB,omitempty"` + NoEvictionPolicy NoEvictionPolicy `json:"noEvictionPolicy,omitempty"` } + +// NoEvictionPolicy dictates whether a no-eviction policy is preferred or mandatory. +// Needs to be used with caution as this will give users ability to protect their pods +// from eviction. Which might work against enfored policies. E.g. plugins evicting pods +// violating security policies. +type NoEvictionPolicy string + +const ( + // PreferredNoEvictionPolicy interprets the no-eviction policy as a preference. + // Meaning the annotation will get ignored by the DefaultEvictor plugin. + // Yet, plugins may optionally sort their pods based on the annotation + // and focus on evicting pods that do not set the annotation. + PreferredNoEvictionPolicy NoEvictionPolicy = "Preferred" + + // MandatoryNoEvictionPolicy interprets the no-eviction policy as mandatory. + // Every pod carying the annotation will get excluded from eviction. + MandatoryNoEvictionPolicy NoEvictionPolicy = "Mandatory" +) diff --git a/pkg/framework/plugins/defaultevictor/validation.go b/pkg/framework/plugins/defaultevictor/validation.go index a7db5366b..8c074955e 100644 --- a/pkg/framework/plugins/defaultevictor/validation.go +++ b/pkg/framework/plugins/defaultevictor/validation.go @@ -25,12 +25,18 @@ func ValidateDefaultEvictorArgs(obj runtime.Object) error { args := obj.(*DefaultEvictorArgs) if args.PriorityThreshold != nil && args.PriorityThreshold.Value != nil && len(args.PriorityThreshold.Name) > 0 { - return fmt.Errorf("priority threshold misconfigured, only one of priorityThreshold fields can be set, got %v", args) + return fmt.Errorf("priority threshold misconfigured, only one of priorityThreshold fields can be set") } if args.MinReplicas == 1 { klog.V(4).Info("DefaultEvictor minReplicas must be greater than 1 to check for min pods during eviction. This check will be ignored during eviction.") } + if args.NoEvictionPolicy != "" { + if args.NoEvictionPolicy != PreferredNoEvictionPolicy && args.NoEvictionPolicy != MandatoryNoEvictionPolicy { + return fmt.Errorf("noEvictionPolicy accepts only %q values", []NoEvictionPolicy{PreferredNoEvictionPolicy, MandatoryNoEvictionPolicy}) + } + } + return nil } diff --git a/pkg/framework/plugins/defaultevictor/validation_test.go b/pkg/framework/plugins/defaultevictor/validation_test.go new file mode 100644 index 000000000..254bd1ad9 --- /dev/null +++ b/pkg/framework/plugins/defaultevictor/validation_test.go @@ -0,0 +1,64 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package defaultevictor + +import ( + "fmt" + "testing" + + "k8s.io/apimachinery/pkg/runtime" + utilptr "k8s.io/utils/ptr" + "sigs.k8s.io/descheduler/pkg/api" +) + +func TestValidateDefaultEvictorArgs(t *testing.T) { + tests := []struct { + name string + args *DefaultEvictorArgs + errInfo error + }{ + { + name: "passing invalid priority", + args: &DefaultEvictorArgs{ + PriorityThreshold: &api.PriorityThreshold{ + Value: utilptr.To[int32](1), + Name: "priority-name", + }, + }, + errInfo: fmt.Errorf("priority threshold misconfigured, only one of priorityThreshold fields can be set"), + }, { + name: "passing invalid no eviction policy", + args: &DefaultEvictorArgs{ + NoEvictionPolicy: "invalid-no-eviction-policy", + }, + errInfo: fmt.Errorf("noEvictionPolicy accepts only %q values", []NoEvictionPolicy{PreferredNoEvictionPolicy, MandatoryNoEvictionPolicy}), + }, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + validateErr := ValidateDefaultEvictorArgs(runtime.Object(testCase.args)) + if validateErr == nil || testCase.errInfo == nil { + if validateErr != testCase.errInfo { + t.Errorf("expected validity of plugin config: %q but got %q instead", testCase.errInfo, validateErr) + } + } else if validateErr.Error() != testCase.errInfo.Error() { + t.Errorf("expected validity of plugin config: %q but got %q instead", testCase.errInfo, validateErr) + } + }) + } +}