From 5bf11813e63db1c14b52eb659512436f1d2794d3 Mon Sep 17 00:00:00 2001
From: Jan Chaloupka
Date: Fri, 7 Mar 2025 14:18:26 +0100
Subject: [PATCH] lownodeutilization: evictionLimits to limit the evictions per plugin

In some cases it might be useful to limit how many evictions per domain
can be performed, to avoid burning the whole per-descheduling-cycle
budget. Limiting the number of evictions per node is a prerequisite for
evicting pods whose usage can't be easily subtracted from overall node
resource usage to predict the final usage. E.g. when a pod is evicted
due to high PSI pressure, which takes into account many factors that
can't be fully captured by the current predictive resource model.
---
 README.md                                     |  9 ++-
 pkg/api/types.go                              |  6 ++
 pkg/api/zz_generated.deepcopy.go              | 21 ++++++
 .../nodeutilization/highnodeutilization.go    |  1 +
 .../nodeutilization/lownodeutilization.go     |  6 ++
 .../lownodeutilization_test.go                | 69 +++++++++++++++++++
 .../nodeutilization/nodeutilization.go        | 10 ++-
 .../plugins/nodeutilization/types.go          |  3 +
 .../nodeutilization/zz_generated.deepcopy.go  |  5 ++
 9 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index d103d0e4f..3e7bd4de6 100644
--- a/README.md
+++ b/README.md
@@ -300,6 +300,7 @@ See `metricsCollector` field at [Top Level configuration](#top-level-configurati
 |`thresholds`|map(string:int)|
 |`targetThresholds`|map(string:int)|
 |`numberOfNodes`|int|
+|`evictionLimits`|object|
 |`evictableNamespaces`|(see [namespace filtering](#namespace-filtering))|
 |`metricsUtilization`|object|
 |`metricsUtilization.metricsServer`|bool|
@@ -325,6 +326,8 @@ profiles:
         "pods": 50
       metricsUtilization:
         metricsServer: true
+      evictionLimits:
+        node: 5
     plugins:
       balance:
         enabled:
@@ -340,10 +343,12 @@ and will not be used to compute node's usage if it's not specified in `threshold
 * The valid range of the resource's percentage value is \[0, 100\]
 * Percentage value of `thresholds` can not be greater than `targetThresholds` for the same resource.
 
-There is another parameter associated with the `LowNodeUtilization` strategy, called `numberOfNodes`.
-This parameter can be configured to activate the strategy only when the number of under utilized nodes
+There are two more parameters associated with the `LowNodeUtilization` strategy, called `numberOfNodes` and `evictionLimits`.
+The first parameter can be configured to activate the strategy only when the number of under utilized nodes
 are above the configured value. This could be helpful in large clusters where a few nodes could go
 under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero.
+The second parameter is useful when the number of evictions per plugin per descheduling cycle needs to be limited.
+The parameter currently allows limiting the number of evictions per node through the `node` field.
 
 ### HighNodeUtilization
 
diff --git a/pkg/api/types.go b/pkg/api/types.go
index f3edce0c4..53c61b72b 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -64,6 +64,12 @@ type Namespaces struct {
 	Exclude []string `json:"exclude,omitempty"`
 }
 
+// EvictionLimits limits the number of evictions per domain. E.g. node, namespace, total.
+type EvictionLimits struct {
+	// node restricts the maximum number of evictions per node
+	Node *uint `json:"node,omitempty"`
+}
+
 type (
 	Percentage         float64
 	ResourceThresholds map[v1.ResourceName]Percentage
diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go
index 137fcefbd..fac17715e 100644
--- a/pkg/api/zz_generated.deepcopy.go
+++ b/pkg/api/zz_generated.deepcopy.go
@@ -112,6 +112,27 @@ func (in *DeschedulerProfile) DeepCopy() *DeschedulerProfile {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *EvictionLimits) DeepCopyInto(out *EvictionLimits) {
+	*out = *in
+	if in.Node != nil {
+		in, out := &in.Node, &out.Node
+		*out = new(uint)
+		**out = **in
+	}
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EvictionLimits.
+func (in *EvictionLimits) DeepCopy() *EvictionLimits {
+	if in == nil {
+		return nil
+	}
+	out := new(EvictionLimits)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *MetricsCollector) DeepCopyInto(out *MetricsCollector) {
 	*out = *in
diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization.go b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
index 297567071..f9254e6cd 100644
--- a/pkg/framework/plugins/nodeutilization/highnodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
@@ -161,6 +161,7 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr
 		h.resourceNames,
 		continueEvictionCond,
 		h.usageClient,
+		nil,
 	)
 
 	return nil
diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization.go b/pkg/framework/plugins/nodeutilization/lownodeutilization.go
index 8c12beccc..7602f455b 100644
--- a/pkg/framework/plugins/nodeutilization/lownodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/lownodeutilization.go
@@ -183,6 +183,11 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
 	// Sort the nodes by the usage in descending order
 	sortNodesByUsage(sourceNodes, false)
 
+	var nodeLimit *uint
+	if l.args.EvictionLimits != nil {
+		nodeLimit = l.args.EvictionLimits.Node
+	}
+
 	evictPodsFromSourceNodes(
 		ctx,
 		l.args.EvictableNamespaces,
@@ -194,6 +199,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
 		l.resourceNames,
 		continueEvictionCond,
 		l.usageClient,
+		nodeLimit,
 	)
 
 	return nil
diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go b/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go
index 7a8c441ce..671344a11 100644
--- a/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go
+++ b/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go
@@ -31,6 +31,7 @@ import (
 	core "k8s.io/client-go/testing"
 	"k8s.io/metrics/pkg/apis/metrics/v1beta1"
 	fakemetricsclient "k8s.io/metrics/pkg/client/clientset/versioned/fake"
+	"k8s.io/utils/ptr"
 
 	"sigs.k8s.io/descheduler/pkg/api"
 	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
@@ -63,6 +64,7 @@ func TestLowNodeUtilization(t *testing.T) {
 		expectedPodsWithMetricsEvicted uint
 		evictedPods                    []string
 		evictableNamespaces            *api.Namespaces
+		evictionLimits                 *api.EvictionLimits
 	}{
 		{
 			name: "no evictable pods",
@@ -1122,6 +1124,72 @@ func TestLowNodeUtilization(t *testing.T) {
 			expectedPodsEvicted:            3,
 			expectedPodsWithMetricsEvicted: 2,
 		},
+		{
+			name: "without priorities with node eviction limit",
+			thresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  30,
+				v1.ResourcePods: 30,
+			},
+			targetThresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  50,
+				v1.ResourcePods: 50,
+			},
+			evictionLimits: &api.EvictionLimits{
+				Node: ptr.To[uint](2),
+			},
+			nodes: []*v1.Node{
+				test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
+				test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+				test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
+			},
+			pods: []*v1.Pod{
+				test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				// These won't be evicted.
+				test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef),
+				test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) {
+					// A pod with local storage.
+					test.SetNormalOwnerRef(pod)
+					pod.Spec.Volumes = []v1.Volume{
+						{
+							Name: "sample",
+							VolumeSource: v1.VolumeSource{
+								HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
+								EmptyDir: &v1.EmptyDirVolumeSource{
+									SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI),
+								},
+							},
+						},
+					}
+					// A Mirror Pod.
+					pod.Annotations = test.GetMirrorPodAnnotation()
+				}),
+				test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) {
+					// A Critical Pod.
+					pod.Namespace = "kube-system"
+					priority := utils.SystemCriticalPriority
+					pod.Spec.Priority = &priority
+				}),
+				test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
+			},
+			nodemetricses: []*v1beta1.NodeMetrics{
+				test.BuildNodeMetrics(n1NodeName, 3201, 0),
+				test.BuildNodeMetrics(n2NodeName, 401, 0),
+				test.BuildNodeMetrics(n3NodeName, 11, 0),
+			},
+			podmetricses: []*v1beta1.PodMetrics{
+				test.BuildPodMetrics("p1", 401, 0),
+				test.BuildPodMetrics("p2", 401, 0),
+				test.BuildPodMetrics("p3", 401, 0),
+				test.BuildPodMetrics("p4", 401, 0),
+				test.BuildPodMetrics("p5", 401, 0),
+			},
+			expectedPodsEvicted:            2,
+			expectedPodsWithMetricsEvicted: 2,
+		},
 	}
 
 	for _, tc := range testCases {
@@ -1193,6 +1261,7 @@ func TestLowNodeUtilization(t *testing.T) {
 				Thresholds:             tc.thresholds,
 				TargetThresholds:       tc.targetThresholds,
 				UseDeviationThresholds: tc.useDeviationThresholds,
+				EvictionLimits:         tc.evictionLimits,
 				EvictableNamespaces:    tc.evictableNamespaces,
 				MetricsUtilization: MetricsUtilization{
 					MetricsServer: metricsEnabled,
diff --git a/pkg/framework/plugins/nodeutilization/nodeutilization.go b/pkg/framework/plugins/nodeutilization/nodeutilization.go
index 9750b7b6b..cca77e89b 100644
--- a/pkg/framework/plugins/nodeutilization/nodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/nodeutilization.go
@@ -239,6 +239,7 @@ func evictPodsFromSourceNodes(
 	resourceNames []v1.ResourceName,
 	continueEviction continueEvictionCond,
 	usageClient usageClient,
+	maxNoOfPodsToEvictPerNode *uint,
 ) {
 	// upper bound on total number of pods/cpu/memory and optional extended resources to be moved
 	totalAvailableUsage := api.ReferencedResourceList{}
@@ -280,7 +281,7 @@ func evictPodsFromSourceNodes(
 		klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
 		// sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
 		podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
-		err := evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction, usageClient)
+		err := evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction, usageClient, maxNoOfPodsToEvictPerNode)
 		if err != nil {
 			switch err.(type) {
 			case *evictions.EvictionTotalLimitError:
@@ -302,14 +303,20 @@ func evictPods(
 	evictOptions evictions.EvictOptions,
 	continueEviction continueEvictionCond,
 	usageClient usageClient,
+	maxNoOfPodsToEvictPerNode *uint,
 ) error {
 	var excludedNamespaces sets.Set[string]
 	if evictableNamespaces != nil {
 		excludedNamespaces = sets.New(evictableNamespaces.Exclude...)
 	}
 
+	var evictionCounter uint = 0
 	if continueEviction(nodeInfo, totalAvailableUsage) {
 		for _, pod := range inputPods {
+			if maxNoOfPodsToEvictPerNode != nil && evictionCounter >= *maxNoOfPodsToEvictPerNode {
+				klog.V(3).InfoS("Max number of evictions per node per plugin reached", "limit", *maxNoOfPodsToEvictPerNode)
+				break
+			}
 			if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
 				klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod))
 				continue
@@ -334,6 +341,7 @@ func evictPods(
 			}
 			err = podEvictor.Evict(ctx, pod, evictOptions)
 			if err == nil {
+				evictionCounter++
 				klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))
 
 				for name := range totalAvailableUsage {
diff --git a/pkg/framework/plugins/nodeutilization/types.go b/pkg/framework/plugins/nodeutilization/types.go
index 8e005fa02..e670edab0 100644
--- a/pkg/framework/plugins/nodeutilization/types.go
+++ b/pkg/framework/plugins/nodeutilization/types.go
@@ -34,6 +34,9 @@ type LowNodeUtilizationArgs struct {
 	// considered while considering resources used by pods
 	// but then filtered out before eviction
 	EvictableNamespaces *api.Namespaces `json:"evictableNamespaces,omitempty"`
+
+	// evictionLimits limits the number of evictions per domain. E.g. node, namespace, total.
+	EvictionLimits *api.EvictionLimits `json:"evictionLimits,omitempty"`
 }
 
 // +k8s:deepcopy-gen=true
diff --git a/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go b/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
index 6a426b5fb..adbc058e9 100644
--- a/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
+++ b/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
@@ -88,6 +88,11 @@ func (in *LowNodeUtilizationArgs) DeepCopyInto(out *LowNodeUtilizationArgs) {
 		*out = new(api.Namespaces)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.EvictionLimits != nil {
+		in, out := &in.EvictionLimits, &out.EvictionLimits
+		*out = new(api.EvictionLimits)
+		(*in).DeepCopyInto(*out)
+	}
 	return
 }
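
For reviewers: the core behavioral change is the capped loop in `evictPods`. The
following self-contained Go sketch illustrates just that capping logic; `evictUpTo`,
its arguments, and the printed output are invented for illustration and are not the
descheduler's API.

```go
package main

import "fmt"

// evictUpTo mimics the bounded-eviction loop added to evictPods: a nil
// limit preserves the previous unbounded behavior, while a non-nil limit
// stops the loop once the per-node budget is spent.
func evictUpTo(pods []string, limit *uint, evict func(string) error) uint {
	var counter uint
	for _, pod := range pods {
		if limit != nil && counter >= *limit {
			fmt.Printf("per-node eviction limit %d reached\n", *limit)
			break
		}
		if err := evict(pod); err != nil {
			// A failed eviction does not consume the budget; in the patch,
			// evictionCounter++ runs only when podEvictor.Evict returns nil.
			continue
		}
		counter++
	}
	return counter
}

func main() {
	limit := uint(2)
	evicted := evictUpTo([]string{"p1", "p2", "p3", "p4"}, &limit, func(pod string) error {
		fmt.Println("evicting", pod)
		return nil
	})
	fmt.Println("total evicted:", evicted) // total evicted: 2
}
```

A nil limit deliberately leaves the old behavior untouched, which is why
HighNodeUtilization passes nil and only LowNodeUtilization wires the new
`evictionLimits.node` value through to `evictPodsFromSourceNodes`.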