From af495e65f7aef29451e30bc5e1959055e95f7889 Mon Sep 17 00:00:00 2001
From: Hao Fan
Date: Mon, 23 Sep 2024 00:22:45 +0800
Subject: [PATCH] e2e: TopologySpreadConstraint: build a descheduler image and
 run the descheduler as a pod

---
 test/e2e/e2e_topologyspreadconstraint_test.go | 272 ++++++++++++------
 1 file changed, 188 insertions(+), 84 deletions(-)

diff --git a/test/e2e/e2e_topologyspreadconstraint_test.go b/test/e2e/e2e_topologyspreadconstraint_test.go
index c033dc9d9..09007f104 100644
--- a/test/e2e/e2e_topologyspreadconstraint_test.go
+++ b/test/e2e/e2e_topologyspreadconstraint_test.go
@@ -6,29 +6,70 @@ import (
 	"os"
 	"strings"
 	"testing"
+	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/util/wait"
 	componentbaseconfig "k8s.io/component-base/config"
 
+	"sigs.k8s.io/descheduler/pkg/api"
+	apiv1alpha2 "sigs.k8s.io/descheduler/pkg/api/v1alpha2"
 	"sigs.k8s.io/descheduler/pkg/descheduler/client"
-	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
-	eutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils"
 	"sigs.k8s.io/descheduler/pkg/framework/plugins/defaultevictor"
 	"sigs.k8s.io/descheduler/pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint"
-	frameworktesting "sigs.k8s.io/descheduler/pkg/framework/testing"
-	frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
 )
 
 const zoneTopologyKey string = "topology.kubernetes.io/zone"
 
+func topologySpreadConstraintPolicy(constraintArgs *removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraintArgs,
+	evictorArgs *defaultevictor.DefaultEvictorArgs,
+) *apiv1alpha2.DeschedulerPolicy {
+	return &apiv1alpha2.DeschedulerPolicy{
+		Profiles: []apiv1alpha2.DeschedulerProfile{
+			{
+				Name: removepodsviolatingtopologyspreadconstraint.PluginName + "Profile",
+				PluginConfigs: []apiv1alpha2.PluginConfig{
+					{
+						Name: removepodsviolatingtopologyspreadconstraint.PluginName,
+						Args: runtime.RawExtension{
+							Object: constraintArgs,
+						},
+					},
+					{
+						Name: defaultevictor.PluginName,
+						Args: runtime.RawExtension{
+							Object: evictorArgs,
+						},
+					},
+				},
+				Plugins: apiv1alpha2.Plugins{
+					Filter: apiv1alpha2.PluginSet{
+						Enabled: []string{
+							defaultevictor.PluginName,
+						},
+					},
+					Balance: apiv1alpha2.PluginSet{
+						Enabled: []string{
+							removepodsviolatingtopologyspreadconstraint.PluginName,
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
 func TestTopologySpreadConstraint(t *testing.T) {
 	ctx := context.Background()
+
 	clientSet, err := client.CreateClient(componentbaseconfig.ClientConnectionConfiguration{Kubeconfig: os.Getenv("KUBECONFIG")}, "")
 	if err != nil {
-		t.Errorf("Error during client creation with %v", err)
+		t.Errorf("Error during kubernetes client creation with %v", err)
 	}
 
 	nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
@@ -43,14 +84,16 @@ func TestTopologySpreadConstraint(t *testing.T) {
 	}
 	defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{})
 
-	testCases := map[string]struct {
-		expectedEvictedCount uint
+	testCases := []struct {
+		name                     string
+		expectedEvictedPodCount  int
 		replicaCount             int
 		topologySpreadConstraint v1.TopologySpreadConstraint
 	}{
-		"test-topology-spread-hard-constraint": {
-			expectedEvictedCount: 1,
-			replicaCount:         4,
+		{
+			name:                    "test-topology-spread-hard-constraint",
+			expectedEvictedPodCount: 1,
+			replicaCount:            4,
 			topologySpreadConstraint: v1.TopologySpreadConstraint{
 				LabelSelector: &metav1.LabelSelector{
 					MatchLabels: map[string]string{
@@ -62,9 +105,10 @@ func TestTopologySpreadConstraint(t *testing.T) {
 				WhenUnsatisfiable: v1.DoNotSchedule,
 			},
 		},
-		"test-topology-spread-soft-constraint": {
-			expectedEvictedCount: 1,
-			replicaCount:         4,
+		{
+			name:                    "test-topology-spread-soft-constraint",
+			expectedEvictedPodCount: 1,
+			replicaCount:            4,
 			topologySpreadConstraint: v1.TopologySpreadConstraint{
 				LabelSelector: &metav1.LabelSelector{
 					MatchLabels: map[string]string{
@@ -76,9 +120,10 @@ func TestTopologySpreadConstraint(t *testing.T) {
 				WhenUnsatisfiable: v1.ScheduleAnyway,
 			},
 		},
-		"test-node-taints-policy-honor": {
-			expectedEvictedCount: 1,
-			replicaCount:         4,
+		{
+			name:                    "test-node-taints-policy-honor",
+			expectedEvictedPodCount: 1,
+			replicaCount:            4,
 			topologySpreadConstraint: v1.TopologySpreadConstraint{
 				LabelSelector: &metav1.LabelSelector{
 					MatchLabels: map[string]string{
@@ -91,9 +136,10 @@ func TestTopologySpreadConstraint(t *testing.T) {
 				WhenUnsatisfiable: v1.DoNotSchedule,
 			},
 		},
-		"test-node-affinity-policy-ignore": {
-			expectedEvictedCount: 1,
-			replicaCount:         4,
+		{
+			name:                    "test-node-affinity-policy-ignore",
+			expectedEvictedPodCount: 1,
+			replicaCount:            4,
 			topologySpreadConstraint: v1.TopologySpreadConstraint{
 				LabelSelector: &metav1.LabelSelector{
 					MatchLabels: map[string]string{
@@ -106,9 +152,10 @@ func TestTopologySpreadConstraint(t *testing.T) {
 				WhenUnsatisfiable: v1.DoNotSchedule,
 			},
 		},
-		"test-match-label-keys": {
-			expectedEvictedCount: 0,
-			replicaCount:         4,
+		{
+			name:                    "test-match-label-keys",
+			expectedEvictedPodCount: 0,
+			replicaCount:            4,
 			topologySpreadConstraint: v1.TopologySpreadConstraint{
 				LabelSelector: &metav1.LabelSelector{
 					MatchLabels: map[string]string{
@@ -122,16 +169,16 @@ func TestTopologySpreadConstraint(t *testing.T) {
 			},
 		},
 	}
-	for name, tc := range testCases {
-		t.Run(name, func(t *testing.T) {
-			t.Logf("Creating Deployment %s with %d replicas", name, tc.replicaCount)
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Logf("Creating Deployment %s with %d replicas", tc.name, tc.replicaCount)
 			deployLabels := tc.topologySpreadConstraint.LabelSelector.DeepCopy().MatchLabels
-			deployLabels["name"] = name
-			deployment := buildTestDeployment(name, testNamespace.Name, int32(tc.replicaCount), deployLabels, func(d *appsv1.Deployment) {
+			deployLabels["name"] = tc.name
+			deployment := buildTestDeployment(tc.name, testNamespace.Name, int32(tc.replicaCount), deployLabels, func(d *appsv1.Deployment) {
 				d.Spec.Template.Spec.TopologySpreadConstraints = []v1.TopologySpreadConstraint{tc.topologySpreadConstraint}
 			})
 			if _, err := clientSet.AppsV1().Deployments(deployment.Namespace).Create(ctx, deployment, metav1.CreateOptions{}); err != nil {
-				t.Fatalf("Error creating Deployment %s %v", name, err)
+				t.Fatalf("Error creating Deployment %s %v", tc.name, err)
 			}
 			defer func() {
 				clientSet.AppsV1().Deployments(deployment.Namespace).Delete(ctx, deployment.Name, metav1.DeleteOptions{})
@@ -140,7 +187,7 @@ func TestTopologySpreadConstraint(t *testing.T) {
 			waitForPodsRunning(ctx, t, clientSet, deployment.Labels, tc.replicaCount, deployment.Namespace)
 
 			// Create a "Violator" Deployment that has the same label and is forced to be on the same node using a nodeSelector
-			violatorDeploymentName := name + "-violator"
+			violatorDeploymentName := tc.name + "-violator"
 			violatorCount := tc.topologySpreadConstraint.MaxSkew + 1
 			violatorDeployLabels := tc.topologySpreadConstraint.LabelSelector.DeepCopy().MatchLabels
 			violatorDeployLabels["name"] = violatorDeploymentName
@@ -156,82 +203,139 @@ func TestTopologySpreadConstraint(t *testing.T) {
 			}()
 			waitForPodsRunning(ctx, t, clientSet, violatorDeployment.Labels, int(violatorCount), violatorDeployment.Namespace)
 
-			evictionPolicyGroupVersion, err := eutils.SupportEviction(clientSet)
-			if err != nil || len(evictionPolicyGroupVersion) == 0 {
-				t.Fatalf("Error detecting eviction policy group: %v", err)
-			}
-
-			handle, podEvictor, err := frameworktesting.InitFrameworkHandle(
-				ctx,
-				clientSet,
-				evictions.NewOptions().
-					WithPolicyGroupVersion(evictionPolicyGroupVersion),
-				defaultevictor.DefaultEvictorArgs{
-					EvictLocalStoragePods: true,
-				},
-				nil,
-			)
-			if err != nil {
-				t.Fatalf("Unable to initialize a framework handle: %v", err)
-			}
-
 			// Run TopologySpreadConstraint strategy
-			t.Logf("Running RemovePodsViolatingTopologySpreadConstraint strategy for %s", name)
+			t.Logf("Running RemovePodsViolatingTopologySpreadConstraint strategy for %s", tc.name)
 
-			plugin, err := removepodsviolatingtopologyspreadconstraint.New(&removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraintArgs{
+			preRunNames := sets.NewString(getCurrentPodNames(ctx, clientSet, testNamespace.Name, t)...)
+
+			evictorArgs := &defaultevictor.DefaultEvictorArgs{
+				EvictLocalStoragePods:   true,
+				EvictSystemCriticalPods: false,
+				IgnorePvcPods:           false,
+				EvictFailedBarePods:     false,
+			}
+			constraintArgs := &removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraintArgs{
 				Constraints: []v1.UnsatisfiableConstraintAction{tc.topologySpreadConstraint.WhenUnsatisfiable},
-			},
-				handle,
-			)
+				Namespaces: &api.Namespaces{
+					Include: []string{testNamespace.Name},
+				},
+			}
+			deschedulerPolicyConfigMapObj, err := deschedulerPolicyConfigMap(topologySpreadConstraintPolicy(constraintArgs, evictorArgs))
 			if err != nil {
-				t.Fatalf("Unable to initialize the plugin: %v", err)
+				t.Fatalf("Error building the descheduler policy ConfigMap: %v", err)
 			}
 
-			plugin.(frameworktypes.BalancePlugin).Balance(ctx, workerNodes)
-			t.Logf("Finished RemovePodsViolatingTopologySpreadConstraint strategy for %s", name)
+			t.Logf("Creating %q policy CM with RemovePodsViolatingTopologySpreadConstraint configured...", deschedulerPolicyConfigMapObj.Name)
+			_, err = clientSet.CoreV1().ConfigMaps(deschedulerPolicyConfigMapObj.Namespace).Create(ctx, deschedulerPolicyConfigMapObj, metav1.CreateOptions{})
+			if err != nil {
+				t.Fatalf("Error creating %q CM: %v", deschedulerPolicyConfigMapObj.Name, err)
+			}
 
-			t.Logf("Wait for terminating pods of %s to disappear", name)
-			waitForTerminatingPodsToDisappear(ctx, t, clientSet, deployment.Namespace)
+			defer func() {
+				t.Logf("Deleting %q CM...", deschedulerPolicyConfigMapObj.Name)
+				err = clientSet.CoreV1().ConfigMaps(deschedulerPolicyConfigMapObj.Namespace).Delete(ctx, deschedulerPolicyConfigMapObj.Name, metav1.DeleteOptions{})
+				if err != nil {
+					t.Fatalf("Unable to delete %q CM: %v", deschedulerPolicyConfigMapObj.Name, err)
+				}
+			}()
+			deschedulerDeploymentObj := deschedulerDeployment(testNamespace.Name)
+			t.Logf("Creating descheduler deployment %v", deschedulerDeploymentObj.Name)
+			_, err = clientSet.AppsV1().Deployments(deschedulerDeploymentObj.Namespace).Create(ctx, deschedulerDeploymentObj, metav1.CreateOptions{})
+			if err != nil {
+				t.Fatalf("Error creating %q deployment: %v", deschedulerDeploymentObj.Name, err)
+			}
 
-			if totalEvicted := podEvictor.TotalEvicted(); totalEvicted == tc.expectedEvictedCount {
-				t.Logf("Total of %d Pods were evicted for %s", totalEvicted, name)
+			deschedulerPodName := ""
+			defer func() {
+				if deschedulerPodName != "" {
+					printPodLogs(ctx, t, clientSet, deschedulerPodName)
+				}
+
+				t.Logf("Deleting %q deployment...", deschedulerDeploymentObj.Name)
+				err = clientSet.AppsV1().Deployments(deschedulerDeploymentObj.Namespace).Delete(ctx, deschedulerDeploymentObj.Name, metav1.DeleteOptions{})
+				if err != nil {
+					t.Fatalf("Unable to delete %q deployment: %v", deschedulerDeploymentObj.Name, err)
+				}
+				waitForPodsToDisappear(ctx, t, clientSet, deschedulerDeploymentObj.Labels, deschedulerDeploymentObj.Namespace)
+			}()
+
+			t.Logf("Waiting for the descheduler pod to be running")
+			deschedulerPods := waitForPodsRunning(ctx, t, clientSet, deschedulerDeploymentObj.Labels, 1, deschedulerDeploymentObj.Namespace)
+			if len(deschedulerPods) != 0 {
+				deschedulerPodName = deschedulerPods[0].Name
+			}
+
+			// Run RemovePodsViolatingTopologySpreadConstraint strategy
+			var meetsEvictedExpectations bool
+			var actualEvictedPodCount int
+			t.Logf("Checking whether the number of evicted pods meets the expectation")
+			if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
+				currentRunNames := sets.NewString(getCurrentPodNames(ctx, clientSet, testNamespace.Name, t)...)
+				actualEvictedPod := preRunNames.Difference(currentRunNames)
+				actualEvictedPodCount = actualEvictedPod.Len()
+				t.Logf("preRunNames: %v, currentRunNames: %v, actualEvictedPodCount: %v", preRunNames.List(), currentRunNames.List(), actualEvictedPodCount)
+				if actualEvictedPodCount != tc.expectedEvictedPodCount {
+					t.Logf("Expected %v pods to be evicted, got %v instead", tc.expectedEvictedPodCount, actualEvictedPodCount)
+					return false, nil
+				}
+				meetsEvictedExpectations = true
+				return true, nil
+			}); err != nil {
+				t.Errorf("Error waiting for the expected number of evicted pods: %v", err)
+			}
+
+			if !meetsEvictedExpectations {
+				t.Errorf("Unexpected number of pods evicted: got %v, expected %v", actualEvictedPodCount, tc.expectedEvictedPodCount)
 			} else {
-				t.Fatalf("Expected %d evictions but got %d for %s TopologySpreadConstraint", tc.expectedEvictedCount, totalEvicted, name)
+				t.Logf("Total of %d Pods were evicted for %s", actualEvictedPodCount, tc.name)
 			}
 
-			if tc.expectedEvictedCount == 0 {
+			if tc.expectedEvictedPodCount == 0 {
 				return
 			}
 
-			// Ensure recently evicted Pod are rescheduled and running before asserting for a balanced topology spread
-			waitForPodsRunning(ctx, t, clientSet, deployment.Labels, tc.replicaCount, deployment.Namespace)
+			var meetsSkewExpectations bool
+			var skewVal int
+			t.Logf("Checking whether the skew meets the expectation")
+			if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
+				listOptions := metav1.ListOptions{LabelSelector: labels.SelectorFromSet(tc.topologySpreadConstraint.LabelSelector.MatchLabels).String()}
+				pods, err := clientSet.CoreV1().Pods(testNamespace.Name).List(ctx, listOptions)
+				if err != nil {
+					return false, err
+				}
 
-			listOptions := metav1.ListOptions{LabelSelector: labels.SelectorFromSet(tc.topologySpreadConstraint.LabelSelector.MatchLabels).String()}
-			pods, err := clientSet.CoreV1().Pods(testNamespace.Name).List(ctx, listOptions)
-			if err != nil {
-				t.Errorf("Error listing pods for %s: %v", name, err)
+				nodePodCountMap := make(map[string]int)
+				for _, pod := range pods.Items {
+					nodePodCountMap[pod.Spec.NodeName]++
+				}
+
+				if len(nodePodCountMap) != len(workerNodes) {
t.Errorf("%s Pods were scheduled on only '%d' nodes and were not properly distributed on the nodes", tc.name, len(nodePodCountMap)) + return false, nil + } + + skewVal = getSkewValPodDistribution(nodePodCountMap) + if skewVal > int(tc.topologySpreadConstraint.MaxSkew) { + t.Errorf("Pod distribution for %s is still violating the max skew of %d as it is %d", tc.name, tc.topologySpreadConstraint.MaxSkew, skewVal) + return false, nil + } + + meetsSkewExpectations = true + return true, nil + }); err != nil { + t.Errorf("Error waiting for descheduler running: %v", err) } - nodePodCountMap := make(map[string]int) - for _, pod := range pods.Items { - nodePodCountMap[pod.Spec.NodeName]++ + if !meetsSkewExpectations { + t.Errorf("Pod distribution for %s is still violating the max skew of %d as it is %d", tc.name, tc.topologySpreadConstraint.MaxSkew, skewVal) + } else { + t.Logf("Pods for %s were distributed in line with max skew of %d", tc.name, tc.topologySpreadConstraint.MaxSkew) } - - if len(nodePodCountMap) != len(workerNodes) { - t.Errorf("%s Pods were scheduled on only '%d' nodes and were not properly distributed on the nodes", name, len(nodePodCountMap)) - } - - min, max := getMinAndMaxPodDistribution(nodePodCountMap) - if max-min > int(tc.topologySpreadConstraint.MaxSkew) { - t.Errorf("Pod distribution for %s is still violating the max skew of %d as it is %d", name, tc.topologySpreadConstraint.MaxSkew, max-min) - } - - t.Logf("Pods for %s were distributed in line with max skew of %d", name, tc.topologySpreadConstraint.MaxSkew) }) } } -func getMinAndMaxPodDistribution(nodePodCountMap map[string]int) (int, int) { +func getSkewValPodDistribution(nodePodCountMap map[string]int) int { min := math.MaxInt32 max := math.MinInt32 for _, podCount := range nodePodCountMap { @@ -243,7 +347,7 @@ func getMinAndMaxPodDistribution(nodePodCountMap map[string]int) (int, int) { } } - return min, max + return max - min } func nodeInclusionPolicyRef(policy v1.NodeInclusionPolicy) *v1.NodeInclusionPolicy {