From d998d82357318f30adbfc1d871c68ae47cc72fb4 Mon Sep 17 00:00:00 2001
From: Jan Chaloupka
Date: Tue, 8 Jun 2021 16:55:22 +0200
Subject: [PATCH] HighNodeUtilization: add NodeFit feature

---
 README.md                                    | 25 ++++--
 .../nodeutilization/highnodeutilization.go   | 19 +++--
 .../highnodeutilization_test.go              | 81 ++++++++++++++++++-
 3 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 145536abd..fc8e8ce85 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory.
 ## Policy and Strategies

 Descheduler's policy is configurable and includes strategies that can be enabled or disabled.
-Nine strategies
+Nine strategies
 1. `RemoveDuplicates`
 2. `LowNodeUtilization`
 3. `HighNodeUtilization`
@@ -252,8 +252,10 @@ strategies:
 ```

 Policy should pass the following validation checks:
-* Three basic native types of resources are supported: `cpu`, `memory` and `pods`. If any of these resource types is not specified, all its thresholds default to 100% to avoid nodes going from underutilized to overutilized.
-* Extended resources are supported. For example, resource type `nvidia.com/gpu` is specified for GPU node utilization. Extended resources are optional, and will not be used to compute node's usage if it's not specified in `thresholds` and `targetThresholds` explicitly.
+* Three basic native types of resources are supported: `cpu`, `memory` and `pods`.
+If any of these resource types is not specified, all its thresholds default to 100% to avoid nodes going from underutilized to overutilized.
+* Extended resources are supported. For example, resource type `nvidia.com/gpu` is specified for GPU node utilization. Extended resources are optional,
+and will not be used to compute the node's usage unless they are specified explicitly in `thresholds` and `targetThresholds`.
 * `thresholds` or `targetThresholds` can not be nil and they must configure exactly the same types of resources.
 * The valid range of the resource's percentage value is \[0, 100\]
 * Percentage value of `thresholds` can not be greater than `targetThresholds` for the same resource.
@@ -265,7 +267,8 @@ under utilized frequently or for a short period of time. By default, `numberOfNo

 ### HighNodeUtilization

-This strategy finds nodes that are under utilized and evicts pods in the hope that these pods will be scheduled compactly into fewer nodes. This strategy **must** be used with the
+This strategy finds nodes that are under utilized and evicts pods in the hope that these pods will be scheduled compactly into fewer nodes.
+This strategy **must** be used with the
 scheduler strategy `MostRequestedPriority`. The parameters of this strategy are configured under `nodeResourceUtilizationThresholds`.

 The under utilization of nodes is determined by a configurable threshold `thresholds`. The threshold
@@ -274,10 +277,13 @@ calculated as the current resources requested on the node vs [total allocatable]
 For pods, this means the number of pods on the node as a fraction of the pod capacity set for that node.
 If a node's usage is below threshold for all (cpu, memory, number of pods and extended resources), the node is considered underutilized.

-Currently, pods request resource requirements are considered for computing node resource utilization. Any node above `thresholds` is considered appropriately utilized and is not considered for eviction.
+Currently, pods' resource requests are considered when computing node resource utilization.
+Any node above `thresholds` is considered appropriately utilized and is not considered for eviction.

 The `thresholds` param could be tuned as per your cluster requirements. Note that this
-strategy evicts pods from `underutilized nodes` (those with usage below `thresholds`) so that they can be recreated in appropriately utilized nodes. The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero.
+strategy evicts pods from `underutilized nodes` (those with usage below `thresholds`)
+so that they can be recreated in appropriately utilized nodes.
+The strategy will abort if the number of `underutilized nodes` or `appropriately utilized nodes` is zero.

 **Parameters:**

@@ -287,6 +293,7 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho
 |`numberOfNodes`|int|
 |`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
 |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
+|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|

 **Example:**

@@ -644,6 +651,7 @@ strategies:
 The following strategies accept a `nodeFit` boolean parameter which can optimize descheduling:
 * `RemoveDuplicates`
 * `LowNodeUtilization`
+* `HighNodeUtilization`
 * `RemovePodsViolatingInterPodAntiAffinity`
 * `RemovePodsViolatingNodeAffinity`
 * `RemovePodsViolatingNodeTaints`
@@ -677,7 +685,10 @@ strategies:
       nodeFit: true
 ```

-Note that node fit filtering references the current pod spec, and not that of it's owner. Thus, if the pod is owned by a ReplicationController (and that ReplicationController was modified recently), the pod may be running with an outdated spec, which the descheduler will reference when determining node fit. This is expected behavior as the descheduler is a "best-effort" mechanism.
+Note that node fit filtering references the current pod spec, and not that of its owner.
+Thus, if the pod is owned by a ReplicationController (and that ReplicationController was modified recently),
+the pod may be running with an outdated spec, which the descheduler will reference when determining node fit.
+This is expected behavior as the descheduler is a "best-effort" mechanism.

 Using Deployments instead of ReplicationControllers provides an automated rollout of pod spec changes, therefore ensuring that the descheduler has an up-to-date view of the cluster state.
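For readers wiring this up from Go rather than from a YAML policy file, here is a minimal sketch of the strategy configuration that the new `nodeFit` parameter hooks into. The `api` types and the `NodeFit` field are taken from this patch itself (mirroring the test fixture further below); the threshold values, the `main` wrapper, and the assumption of the usual `sigs.k8s.io/descheduler/pkg/api` import path are only illustrative:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"sigs.k8s.io/descheduler/pkg/api"
)

func main() {
	// Enable HighNodeUtilization with 30% CPU/pods thresholds and ask the
	// evictor to only evict pods that would fit on another node.
	strategy := api.DeschedulerStrategy{
		Enabled: true,
		Params: &api.StrategyParameters{
			NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
				Thresholds: api.ResourceThresholds{
					v1.ResourceCPU:  30,
					v1.ResourcePods: 30,
				},
			},
			NodeFit: true,
		},
	}
	fmt.Printf("nodeFit enabled: %v\n", strategy.Params.NodeFit)
}
```

Setting `NodeFit: true` here corresponds to `nodeFit: true` under the strategy's `params` in the YAML policy shown above.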
diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
index 27e354f54..791500fb6 100644
--- a/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
@@ -19,6 +19,7 @@ package nodeutilization
 import (
 	"context"
 	"fmt"
+
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	clientset "k8s.io/client-go/kubernetes"
@@ -36,6 +37,12 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
 		klog.ErrorS(err, "Invalid HighNodeUtilization parameters")
 		return
 	}
+
+	nodeFit := false
+	if strategy.Params != nil {
+		nodeFit = strategy.Params.NodeFit
+	}
+
 	thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
 	if err != nil {
 		klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
@@ -68,13 +75,13 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate

 	// log message in one line
 	keysAndValues := []interface{}{
-		"CPU", targetThresholds[v1.ResourceCPU],
-		"Mem", targetThresholds[v1.ResourceMemory],
-		"Pods", targetThresholds[v1.ResourcePods],
+		"CPU", thresholds[v1.ResourceCPU],
+		"Mem", thresholds[v1.ResourceMemory],
+		"Pods", thresholds[v1.ResourcePods],
 	}
-	for name := range targetThresholds {
+	for name := range thresholds {
 		if !isBasicResource(name) {
-			keysAndValues = append(keysAndValues, string(name), int64(targetThresholds[name]))
+			keysAndValues = append(keysAndValues, string(name), int64(thresholds[name]))
 		}
 	}

@@ -98,7 +105,7 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
 		return
 	}

-	evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
+	evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))

 	// stop if the total available usage has dropped to zero - no more pods can be scheduled
 	continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
index d3928587b..5fa3994ff 100644
--- a/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
+++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
@@ -19,6 +19,9 @@ package nodeutilization
 import (
 	"context"
 	"fmt"
+	"strings"
+	"testing"
+
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/api/policy/v1beta1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -29,8 +32,6 @@ import (
 	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
 	"sigs.k8s.io/descheduler/pkg/utils"
 	"sigs.k8s.io/descheduler/test"
-	"strings"
-	"testing"
 )

 func TestHighNodeUtilization(t *testing.T) {
@@ -39,6 +40,9 @@ func TestHighNodeUtilization(t *testing.T) {
 	n2NodeName := "n2"
 	n3NodeName := "n3"

+	nodeSelectorKey := "datacenter"
+	nodeSelectorValue := "west"
+
 	testCases := []struct {
 		name                  string
 		thresholds            api.ResourceThresholds
@@ -445,6 +449,78 @@ func TestHighNodeUtilization(t *testing.T) {
 			maxPodsToEvictPerNode: 0,
 			expectedPodsEvicted:   0,
 		},
+		{
+			name: "Other node match pod node selector",
+			thresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  30,
+				v1.ResourcePods: 30,
+			},
+			nodes: map[string]*v1.Node{
+				n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
+					node.ObjectMeta.Labels = map[string]string{
+						nodeSelectorKey: nodeSelectorValue,
+					}
+				}),
+				n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+			},
+			pods: map[string]*v1.PodList{
+				n1NodeName: {
+					Items: []v1.Pod{
+						*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
+					},
+				},
+				n2NodeName: {
+					Items: []v1.Pod{
+						*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
+							// A pod selecting nodes in the "west" datacenter
+							test.SetRSOwnerRef(pod)
+							pod.Spec.NodeSelector = map[string]string{
+								nodeSelectorKey: nodeSelectorValue,
+							}
+						}),
+					},
+				},
+			},
+			maxPodsToEvictPerNode: 0,
+			expectedPodsEvicted:   1,
+		},
+		{
+			name: "Other node does not match pod node selector",
+			thresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  30,
+				v1.ResourcePods: 30,
+			},
+			nodes: map[string]*v1.Node{
+				n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
+				n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+			},
+			pods: map[string]*v1.PodList{
+				n1NodeName: {
+					Items: []v1.Pod{
+						*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+						*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
+					},
+				},
+				n2NodeName: {
+					Items: []v1.Pod{
+						*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
+							// A pod selecting nodes in the "west" datacenter
+							test.SetRSOwnerRef(pod)
+							pod.Spec.NodeSelector = map[string]string{
+								nodeSelectorKey: nodeSelectorValue,
+							}
+						}),
+					},
+				},
+			},
+			maxPodsToEvictPerNode: 0,
+			expectedPodsEvicted:   0,
+		},
 	}

 	for _, test := range testCases {
@@ -514,6 +590,7 @@ func TestHighNodeUtilization(t *testing.T) {
 				NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
 					Thresholds: test.thresholds,
 				},
+				NodeFit: true,
 			},
 		}
 		HighNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor)
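The two new test cases turn on `NodeFit: true` and then differ only in whether some other node carries the `datacenter: west` label that pod `p5` selects. As a rough, self-contained illustration of the label-matching slice of that node-fit idea (this is not the descheduler's actual `WithNodeFit` logic, which also weighs taints, affinity, and available resources; the helper below is hypothetical):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// matchesNodeSelector reports whether a node's labels satisfy a pod's simple
// nodeSelector. This is only the label-matching piece of a real node-fit
// check; taints, affinity, and free resources are ignored here.
func matchesNodeSelector(pod *v1.Pod, node *v1.Node) bool {
	for key, value := range pod.Spec.NodeSelector {
		if node.Labels[key] != value {
			return false
		}
	}
	return true
}

func main() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "p5"},
		Spec:       v1.PodSpec{NodeSelector: map[string]string{"datacenter": "west"}},
	}
	west := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "n1", Labels: map[string]string{"datacenter": "west"}}}
	other := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "n2"}}

	fmt.Println(matchesNodeSelector(pod, west))  // true:  n1 could host p5, so eviction is allowed
	fmt.Println(matchesNodeSelector(pod, other)) // false: a node like n2 alone could not host p5
}
```

This mirrors why the first test case expects one eviction (n1 is labeled `datacenter: west`) while the second expects none (no node matches p5's selector).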