From ef0c2c1c47c6aa6afb0d08f4a2488d7df004c2f7 Mon Sep 17 00:00:00 2001 From: Simon Scharf Date: Tue, 15 Oct 2024 22:21:04 +0200 Subject: [PATCH] add ignorePodsWithoutPDB option (#1529) * add ignoreNonPDBPods option * take2 * add test * poddisruptionbudgets are now used by defaultevictor plugin * add poddisruptionbudgets to rbac * review comments * don't use GetPodPodDisruptionBudgets * review comment, don't hide error --- README.md | 23 ++++++----- kubernetes/base/rbac.yaml | 3 ++ pkg/descheduler/descheduler.go | 8 +++- pkg/descheduler/policyconfig.go | 1 + .../plugins/defaultevictor/defaultevictor.go | 14 +++++++ .../defaultevictor/defaultevictor_test.go | 41 +++++++++++++++++-- .../plugins/defaultevictor/defaults_test.go | 7 +++- pkg/framework/plugins/defaultevictor/types.go | 1 + pkg/utils/pod.go | 37 +++++++++++++++++ test/test_utils.go | 22 ++++++++++ 10 files changed, 139 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 115df6fd5..1ffce0ad5 100644 --- a/README.md +++ b/README.md @@ -129,18 +129,19 @@ These are top level keys in the Descheduler Policy that you can use to configure The Default Evictor Plugin is used by default for filtering pods before processing them in an strategy plugin, or for applying a PreEvictionFilter of pods before eviction. You can also create your own Evictor Plugin or use the Default one provided by Descheduler. Other uses for the Evictor plugin can be to sort, filter, validate or group pods by different criteria, and that's why this is handled by a plugin and not configured in the top level config. -| Name |type| Default Value | Description | -|------|----|---------------|-------------| -| `nodeSelector` |`string`| `nil` | limiting the nodes which are processed | -| `evictLocalStoragePods` |`bool`| `false` | allows eviction of pods with local storage | +| Name |type| Default Value | Description | +|---------------------------|----|---------------|-----------------------------------------------------------------------------------------------------------------------------| +| `nodeSelector` |`string`| `nil` | limiting the nodes which are processed | +| `evictLocalStoragePods` |`bool`| `false` | allows eviction of pods with local storage | | `evictSystemCriticalPods` |`bool`| `false` | [Warning: Will evict Kubernetes system pods] allows eviction of pods with any priority, including system pods like kube-dns | -| `ignorePvcPods` |`bool`| `false` | set whether PVC pods should be evicted or ignored | -| `evictFailedBarePods` |`bool`| `false` | allow eviction of pods without owner references and in failed phase | -|`labelSelector`|`metav1.LabelSelector`||(see [label filtering](#label-filtering))| -|`priorityThreshold`|`priorityThreshold`||(see [priority filtering](#priority-filtering))| -|`nodeFit`|`bool`|`false`|(see [node fit filtering](#node-fit-filtering))| -|`minReplicas`|`uint`|`0`| ignore eviction of pods where owner (e.g. `ReplicaSet`) replicas is below this threshold | -|`minPodAge`|`metav1.Duration`|`0`| ignore eviction of pods with a creation time within this threshold | +| `ignorePvcPods` |`bool`| `false` | set whether PVC pods should be evicted or ignored | +| `evictFailedBarePods` |`bool`| `false` | allow eviction of pods without owner references and in failed phase | +| `labelSelector` |`metav1.LabelSelector`|| (see [label filtering](#label-filtering)) | +| `priorityThreshold` |`priorityThreshold`|| (see [priority filtering](#priority-filtering)) | +| `nodeFit` |`bool`|`false`| (see [node fit filtering](#node-fit-filtering)) | +| `minReplicas` |`uint`|`0`| ignore eviction of pods where owner (e.g. `ReplicaSet`) replicas is below this threshold | +| `minPodAge` |`metav1.Duration`|`0`| ignore eviction of pods with a creation time within this threshold | +| `ignorePodsWithoutPDB` |`bool`|`false`| set whether pods without PodDisruptionBudget should be evicted or ignored | ### Example policy diff --git a/kubernetes/base/rbac.yaml b/kubernetes/base/rbac.yaml index b2288e343..f38ed3eb6 100644 --- a/kubernetes/base/rbac.yaml +++ b/kubernetes/base/rbac.yaml @@ -22,6 +22,9 @@ rules: - apiGroups: ["scheduling.k8s.io"] resources: ["priorityclasses"] verbs: ["get", "watch", "list"] +- apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["get", "watch", "list"] - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["create", "update"] diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index b50757e1b..c039bdac1 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -23,6 +23,7 @@ import ( "strconv" "time" + policyv1 "k8s.io/api/policy/v1" schedulingv1 "k8s.io/api/scheduling/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -132,8 +133,11 @@ func newDescheduler(rs *options.DeschedulerServer, deschedulerPolicy *api.Desche v1.SchemeGroupVersion.WithResource("nodes"), // Future work could be to let each plugin declare what type of resources it needs; that way dry runs would stay // consistent with the real runs without having to keep the list here in sync. - v1.SchemeGroupVersion.WithResource("namespaces"), // Used by the defaultevictor plugin - schedulingv1.SchemeGroupVersion.WithResource("priorityclasses")) // Used by the defaultevictor plugin + v1.SchemeGroupVersion.WithResource("namespaces"), // Used by the defaultevictor plugin + schedulingv1.SchemeGroupVersion.WithResource("priorityclasses"), // Used by the defaultevictor plugin + policyv1.SchemeGroupVersion.WithResource("poddisruptionbudgets"), // Used by the defaultevictor plugin + + ) // Used by the defaultevictor plugin getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer) if err != nil { diff --git a/pkg/descheduler/policyconfig.go b/pkg/descheduler/policyconfig.go index 96538f480..393b66d24 100644 --- a/pkg/descheduler/policyconfig.go +++ b/pkg/descheduler/policyconfig.go @@ -105,6 +105,7 @@ func setDefaultEvictor(profile api.DeschedulerProfile, client clientset.Interfac EvictSystemCriticalPods: false, IgnorePvcPods: false, EvictFailedBarePods: false, + IgnorePodsWithoutPDB: false, }, } diff --git a/pkg/framework/plugins/defaultevictor/defaultevictor.go b/pkg/framework/plugins/defaultevictor/defaultevictor.go index 7b20517db..f23bf13a1 100644 --- a/pkg/framework/plugins/defaultevictor/defaultevictor.go +++ b/pkg/framework/plugins/defaultevictor/defaultevictor.go @@ -195,6 +195,20 @@ func New(args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plug return nil }) } + + if defaultEvictorArgs.IgnorePodsWithoutPDB { + ev.constraints = append(ev.constraints, func(pod *v1.Pod) error { + hasPdb, err := utils.IsPodCoveredByPDB(pod, handle.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()) + if err != nil { + return fmt.Errorf("unable to check if pod is covered by PodDisruptionBudget: %w", err) + } + if !hasPdb { + return fmt.Errorf("no PodDisruptionBudget found for pod") + } + return nil + }) + } + return ev, nil } diff --git a/pkg/framework/plugins/defaultevictor/defaultevictor_test.go b/pkg/framework/plugins/defaultevictor/defaultevictor_test.go index ab3f3c646..dd07d570f 100644 --- a/pkg/framework/plugins/defaultevictor/defaultevictor_test.go +++ b/pkg/framework/plugins/defaultevictor/defaultevictor_test.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/uuid" @@ -39,6 +40,7 @@ type testCase struct { description string pods []*v1.Pod nodes []*v1.Node + pdbs []*policyv1.PodDisruptionBudget evictFailedBarePods bool evictLocalStoragePods bool evictSystemCriticalPods bool @@ -47,6 +49,7 @@ type testCase struct { minReplicas uint minPodAge *metav1.Duration result bool + ignorePodsWithoutPDB bool } func TestDefaultEvictorPreEvictionFilter(t *testing.T) { @@ -739,6 +742,33 @@ func TestDefaultEvictorFilter(t *testing.T) { }), }, result: true, + }, { + description: "ignorePodsWithoutPDB, pod with no PDBs, no eviction", + pods: []*v1.Pod{ + test.BuildTestPod("p1", 1, 1, n1.Name, func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + pod.Labels = map[string]string{ + "app": "foo", + } + }), + }, + ignorePodsWithoutPDB: true, + result: false, + }, { + description: "ignorePodsWithoutPDB, pod with PDBs, evicts", + pods: []*v1.Pod{ + test.BuildTestPod("p1", 1, 1, n1.Name, func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + pod.Labels = map[string]string{ + "app": "foo", + } + }), + }, + pdbs: []*policyv1.PodDisruptionBudget{ + test.BuildTestPDB("pdb1", "foo"), + }, + ignorePodsWithoutPDB: true, + result: true, }, } @@ -811,11 +841,15 @@ func initializePlugin(ctx context.Context, test testCase) (frameworktypes.Plugin for _, pod := range test.pods { objs = append(objs, pod) } + for _, pdb := range test.pdbs { + objs = append(objs, pdb) + } fakeClient := fake.NewSimpleClientset(objs...) sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0) podInformer := sharedInformerFactory.Core().V1().Pods().Informer() + _ = sharedInformerFactory.Policy().V1().PodDisruptionBudgets().Lister() getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer) if err != nil { @@ -833,9 +867,10 @@ func initializePlugin(ctx context.Context, test testCase) (frameworktypes.Plugin PriorityThreshold: &api.PriorityThreshold{ Value: test.priorityThreshold, }, - NodeFit: test.nodeFit, - MinReplicas: test.minReplicas, - MinPodAge: test.minPodAge, + NodeFit: test.nodeFit, + MinReplicas: test.minReplicas, + MinPodAge: test.minPodAge, + IgnorePodsWithoutPDB: test.ignorePodsWithoutPDB, } evictorPlugin, err := New( diff --git a/pkg/framework/plugins/defaultevictor/defaults_test.go b/pkg/framework/plugins/defaultevictor/defaults_test.go index 026af73be..2a76e778c 100644 --- a/pkg/framework/plugins/defaultevictor/defaults_test.go +++ b/pkg/framework/plugins/defaultevictor/defaults_test.go @@ -42,6 +42,7 @@ func TestSetDefaults_DefaultEvictorArgs(t *testing.T) { LabelSelector: nil, PriorityThreshold: nil, NodeFit: false, + IgnorePodsWithoutPDB: false, }, }, { @@ -57,7 +58,8 @@ func TestSetDefaults_DefaultEvictorArgs(t *testing.T) { PriorityThreshold: &api.PriorityThreshold{ Value: utilptr.To[int32](800), }, - NodeFit: true, + NodeFit: true, + IgnorePodsWithoutPDB: true, }, want: &DefaultEvictorArgs{ NodeSelector: "NodeSelector", @@ -70,7 +72,8 @@ func TestSetDefaults_DefaultEvictorArgs(t *testing.T) { PriorityThreshold: &api.PriorityThreshold{ Value: utilptr.To[int32](800), }, - NodeFit: true, + NodeFit: true, + IgnorePodsWithoutPDB: true, }, }, } diff --git a/pkg/framework/plugins/defaultevictor/types.go b/pkg/framework/plugins/defaultevictor/types.go index 6e4d84f55..3a39cbc91 100644 --- a/pkg/framework/plugins/defaultevictor/types.go +++ b/pkg/framework/plugins/defaultevictor/types.go @@ -36,4 +36,5 @@ type DefaultEvictorArgs struct { NodeFit bool `json:"nodeFit,omitempty"` MinReplicas uint `json:"minReplicas,omitempty"` MinPodAge *metav1.Duration `json:"minPodAge,omitempty"` + IgnorePodsWithoutPDB bool `json:"ignorePodsWithoutPDB,omitempty"` } diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 4de6a6c26..3b015317e 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -3,6 +3,11 @@ package utils import ( "fmt" + policy "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/labels" + + policyv1 "k8s.io/client-go/listers/policy/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -115,6 +120,38 @@ func IsPodWithPVC(pod *v1.Pod) bool { return false } +// IsPodCoveredByPDB returns true if the pod is covered by at least one PodDisruptionBudget. +func IsPodCoveredByPDB(pod *v1.Pod, lister policyv1.PodDisruptionBudgetLister) (bool, error) { + // We can't use the GetPodPodDisruptionBudgets expansion method here because it treats no pdb as an error, + // but we want to return false. + + list, err := lister.PodDisruptionBudgets(pod.Namespace).List(labels.Everything()) + if err != nil { + return false, err + } + + if len(list) == 0 { + return false, nil + } + + podLabels := labels.Set(pod.Labels) + var pdbList []*policy.PodDisruptionBudget + for _, pdb := range list { + selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector) + if err != nil { + // This object has an invalid selector, it will never match the pod + continue + } + + if !selector.Matches(podLabels) { + continue + } + pdbList = append(pdbList, pdb) + } + + return len(pdbList) > 0, nil +} + // GetPodSource returns the source of the pod based on the annotation. func GetPodSource(pod *v1.Pod) (string, error) { if pod.Annotations != nil { diff --git a/test/test_utils.go b/test/test_utils.go index 58485c117..21c85a07b 100644 --- a/test/test_utils.go +++ b/test/test_utils.go @@ -23,6 +23,9 @@ import ( "testing" "time" + policyv1 "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/util/intstr" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -67,6 +70,25 @@ func BuildTestPod(name string, cpu, memory int64, nodeName string, apply func(*v return pod } +func BuildTestPDB(name, appLabel string) *policyv1.PodDisruptionBudget { + maxUnavailable := intstr.FromInt32(1) + pdb := &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: name, + }, + Spec: policyv1.PodDisruptionBudgetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": appLabel, + }, + }, + MaxUnavailable: &maxUnavailable, + }, + } + return pdb +} + // GetMirrorPodAnnotation returns the annotation needed for mirror pod. func GetMirrorPodAnnotation() map[string]string { return map[string]string{