Mirror of https://github.com/kubernetes-sigs/descheduler.git

Merge pull request #285 from lixiang233/Ft_change_lowUtilization_break

Change break condition and thresholds validation for lowUtilization
Authored by Kubernetes Prow Robot on 2020-05-28 07:36:02 -07:00, committed by GitHub
3 changed files with 284 additions and 58 deletions
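
For context, the fields this change validates and defaults live on the strategy params. Below is an illustrative sketch (not part of the patch) of a LowNodeUtilization configuration of that shape, mirroring the struct the updated test builds further down and assuming the repository's api package import path; the percentages are made up.

package main

import (
	v1 "k8s.io/api/core/v1"
	"sigs.k8s.io/descheduler/pkg/api"
)

// Illustrative LowNodeUtilization strategy config: thresholds marks nodes as
// under-utilized, targetThresholds marks them as over-utilized.
var strategy = api.DeschedulerStrategy{
	Enabled: true,
	Params: api.StrategyParameters{
		NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
			Thresholds: api.ResourceThresholds{
				v1.ResourceCPU:    20,
				v1.ResourceMemory: 20,
				v1.ResourcePods:   20,
			},
			TargetThresholds: api.ResourceThresholds{
				v1.ResourceCPU:    50,
				v1.ResourceMemory: 50,
				v1.ResourcePods:   50,
			},
		},
	},
}

func main() {}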


@@ -18,6 +18,7 @@ package strategies
import (
"context"
"fmt"
"sort"
v1 "k8s.io/api/core/v1"
@@ -40,6 +41,11 @@ type NodeUsageMap struct {
type NodePodsMap map[*v1.Node][]*v1.Pod
const (
MinResourcePercentage = 0
MaxResourcePercentage = 100
)
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, evictLocalStoragePods bool, podEvictor *evictions.PodEvictor) {
// todo: move to config validation?
// TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params?
@@ -49,12 +55,23 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
}
thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
if !validateThresholds(thresholds) {
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if err := validateStrategyConfig(thresholds, targetThresholds); err != nil {
klog.Errorf("LowNodeUtilization config is not valid: %v", err)
return
}
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if !validateTargetThresholds(targetThresholds) {
return
// check if Pods/CPU/Mem are set, if not, set them to 100
if _, ok := thresholds[v1.ResourcePods]; !ok {
thresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceCPU]; !ok {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceMemory]; !ok {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
}
npm := createNodePodsMap(ctx, client, nodes)
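
A minimal sketch (not from the patch, assuming the same api and v1 types) of what the new defaulting does: any resource missing from the policy is pinned to 100 percent in both maps, so it can never flag a node as over-utilized on its own, yet it still participates in the capacity bookkeeping below.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"sigs.k8s.io/descheduler/pkg/api"
)

func main() {
	// Only cpu and pods are configured; memory is omitted, as in the new
	// "stop when cpu capacity is depleted" test case further down.
	thresholds := api.ResourceThresholds{v1.ResourceCPU: 30, v1.ResourcePods: 30}
	targetThresholds := api.ResourceThresholds{v1.ResourceCPU: 50, v1.ResourcePods: 50}

	// Same defaulting rule as the block above.
	for _, name := range []v1.ResourceName{v1.ResourcePods, v1.ResourceCPU, v1.ResourceMemory} {
		if _, ok := thresholds[name]; !ok {
			thresholds[name] = 100
			targetThresholds[name] = 100
		}
	}

	fmt.Println(thresholds)       // map[cpu:30 memory:100 pods:30]
	fmt.Println(targetThresholds) // map[cpu:50 memory:100 pods:50]
}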
@@ -99,37 +116,46 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
klog.V(1).Infof("Total number of pods evicted: %v", podEvictor.TotalEvicted())
}
func validateThresholds(thresholds api.ResourceThresholds) bool {
if thresholds == nil || len(thresholds) == 0 {
klog.V(1).Infof("no resource threshold is configured")
return false
// validateStrategyConfig checks if the strategy's config is valid
func validateStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
// validate thresholds and targetThresholds config
if err := validateThresholds(thresholds); err != nil {
return fmt.Errorf("thresholds config is not valid: %v", err)
}
for name := range thresholds {
switch name {
case v1.ResourceCPU:
continue
case v1.ResourceMemory:
continue
case v1.ResourcePods:
continue
default:
klog.Errorf("only cpu, memory, or pods thresholds can be specified")
return false
if err := validateThresholds(targetThresholds); err != nil {
return fmt.Errorf("targetThresholds config is not valid: %v", err)
}
// validate if thresholds and targetThresholds have same resources configured
if len(thresholds) != len(targetThresholds) {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
}
for resourceName, value := range thresholds {
if targetValue, ok := targetThresholds[resourceName]; !ok {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
} else if value > targetValue {
return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName)
}
}
return true
return nil
}
//This function could be merged into above once we are clear.
func validateTargetThresholds(targetThresholds api.ResourceThresholds) bool {
if targetThresholds == nil {
klog.V(1).Infof("no target resource threshold is configured")
return false
} else if _, ok := targetThresholds[v1.ResourcePods]; !ok {
klog.V(1).Infof("no target resource threshold for pods is configured")
return false
// validateThresholds checks if thresholds have valid resource name and resource percentage configured
func validateThresholds(thresholds api.ResourceThresholds) error {
if thresholds == nil || len(thresholds) == 0 {
return fmt.Errorf("no resource threshold is configured")
}
return true
for name, percent := range thresholds {
switch name {
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods:
if percent < MinResourcePercentage || percent > MaxResourcePercentage {
return fmt.Errorf("%v threshold not in [%v, %v] range", name, MinResourcePercentage, MaxResourcePercentage)
}
default:
return fmt.Errorf("only cpu, memory, or pods thresholds can be specified")
}
}
return nil
}
// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
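
A hypothetical usage sketch of the new validation (a same-package fragment, since the helpers are unexported): the cross-check rejects a config where a threshold exceeds its targetThreshold, matching the error expected by the test cases added below.

// Hypothetical fragment inside package strategies.
thresholds := api.ResourceThresholds{v1.ResourceCPU: 90, v1.ResourceMemory: 20}
targetThresholds := api.ResourceThresholds{v1.ResourceCPU: 80, v1.ResourceMemory: 80}

if err := validateStrategyConfig(thresholds, targetThresholds); err != nil {
	// err: "thresholds' cpu percentage is greater than targetThresholds'"
	klog.Errorf("LowNodeUtilization config is not valid: %v", err)
}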
@@ -184,16 +210,12 @@ func evictPodsFromTargetNodes(
totalPods += ((float64(podsPercentage) * float64(nodeCapacity.Pods().Value())) / 100)
// totalCPU capacity to be moved
if _, ok := targetThresholds[v1.ResourceCPU]; ok {
cpuPercentage := targetThresholds[v1.ResourceCPU] - node.usage[v1.ResourceCPU]
totalCPU += ((float64(cpuPercentage) * float64(nodeCapacity.Cpu().MilliValue())) / 100)
}
cpuPercentage := targetThresholds[v1.ResourceCPU] - node.usage[v1.ResourceCPU]
totalCPU += ((float64(cpuPercentage) * float64(nodeCapacity.Cpu().MilliValue())) / 100)
// totalMem capacity to be moved
if _, ok := targetThresholds[v1.ResourceMemory]; ok {
memPercentage := targetThresholds[v1.ResourceMemory] - node.usage[v1.ResourceMemory]
totalMem += ((float64(memPercentage) * float64(nodeCapacity.Memory().Value())) / 100)
}
memPercentage := targetThresholds[v1.ResourceMemory] - node.usage[v1.ResourceMemory]
totalMem += ((float64(memPercentage) * float64(nodeCapacity.Memory().Value())) / 100)
}
klog.V(1).Infof("Total capacity to be moved: CPU:%v, Mem:%v, Pods:%v", totalCPU, totalMem, totalPods)
@@ -246,7 +268,8 @@ func evictPods(
taintsOfLowNodes map[string][]v1.Taint,
podEvictor *evictions.PodEvictor,
node *v1.Node) {
if IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) && (*totalPods > 0 || *totalCPU > 0 || *totalMem > 0) {
// stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved
if IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) && *totalPods > 0 && *totalCPU > 0 && *totalMem > 0 {
onePodPercentage := api.Percentage((float64(1) * 100) / float64(nodeCapacity.Pods().Value()))
for _, pod := range inputPods {
if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
@@ -278,8 +301,8 @@ func evictPods(
nodeUsage[v1.ResourceMemory] -= api.Percentage(float64(mUsage) / float64(nodeCapacity.Memory().Value()) * 100)
klog.V(3).Infof("updated node usage: %#v", nodeUsage)
// check if node utilization drops below target threshold or required capacity (cpu, memory, pods) is moved
if !IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) || (*totalPods <= 0 && *totalCPU <= 0 && *totalMem <= 0) {
// check if node utilization drops below target threshold or any required capacity (cpu, memory, pods) is moved
if !IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) || *totalPods <= 0 || *totalCPU <= 0 || *totalMem <= 0 {
break
}
}
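
A small sketch of the break-condition change, with hypothetical remaining budgets: the old condition kept evicting while any budget was still positive, while the new one stops as soon as any single budget (pods, cpu, or memory) is used up.

package main

import "fmt"

func main() {
	// Hypothetical remaining capacity to be moved after a few evictions.
	totalPods, totalCPU, totalMem := 1.0, 400.0, 0.0

	// Old condition: keep evicting while ANY budget is left.
	oldKeepEvicting := totalPods > 0 || totalCPU > 0 || totalMem > 0 // true

	// New condition: keep evicting only while ALL budgets are left,
	// i.e. stop once any one of them is depleted.
	newKeepEvicting := totalPods > 0 && totalCPU > 0 && totalMem > 0 // false

	fmt.Println(oldKeepEvicting, newKeepEvicting)
}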


@@ -80,8 +80,11 @@ func TestLowNodeUtilization(t *testing.T) {
thresholds, targetThresholds api.ResourceThresholds
nodes map[string]*v1.Node
pods map[string]*v1.PodList
expectedPodsEvicted int
evictedPods []string
// TODO: divide expectedPodsEvicted into two params like other tests
// expectedPodsEvicted should be the number of pods this test case expects to be evicted,
// but it currently represents both MaxNoOfPodsToEvictPerNode and the test's expected result
expectedPodsEvicted int
evictedPods []string
}{
{
name: "without priorities",
@@ -141,6 +144,65 @@ func TestLowNodeUtilization(t *testing.T) {
},
expectedPodsEvicted: 3,
},
{
name: "without priorities stop when cpu capacity is depleted",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, setNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 300, n1NodeName, setRSOwnerRef),
*test.BuildTestPod("p2", 400, 300, n1NodeName, setRSOwnerRef),
*test.BuildTestPod("p3", 400, 300, n1NodeName, setRSOwnerRef),
*test.BuildTestPod("p4", 400, 300, n1NodeName, setRSOwnerRef),
*test.BuildTestPod("p5", 400, 300, n1NodeName, setRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p6", 400, 300, n1NodeName, setDSOwnerRef),
*test.BuildTestPod("p7", 400, 300, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
setNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p8", 400, 300, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 2100, n1NodeName, setRSOwnerRef),
},
},
n3NodeName: {},
},
// 4 pods available for eviction based on v1.ResourcePods, only 3 pods can be evicted before cpu is depleted
expectedPodsEvicted: 3,
},
{
name: "with priorities",
thresholds: api.ResourceThresholds{
@@ -346,11 +408,6 @@ func TestLowNodeUtilization(t *testing.T) {
nodes = append(nodes, node)
}
npm := createNodePodsMap(ctx, fakeClient, nodes)
lowNodes, targetNodes := classifyNodes(npm, test.thresholds, test.targetThresholds, false)
if len(lowNodes) != 1 {
t.Errorf("After ignoring unschedulable nodes, expected only one node to be under utilized.")
}
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
@@ -359,7 +416,17 @@ func TestLowNodeUtilization(t *testing.T) {
nodes,
)
evictPodsFromTargetNodes(ctx, targetNodes, lowNodes, test.targetThresholds, false, podEvictor)
strategy := api.DeschedulerStrategy{
Enabled: true,
Params: api.StrategyParameters{
NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
Thresholds: test.thresholds,
TargetThresholds: test.targetThresholds,
},
},
}
LowNodeUtilization(ctx, fakeClient, strategy, nodes, false, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if test.expectedPodsEvicted != podsEvicted {
t.Errorf("Expected %#v pods to be evicted but %#v got evicted", test.expectedPodsEvicted, podsEvicted)
@@ -409,21 +476,120 @@ func TestSortPodsByPriority(t *testing.T) {
}
}
func TestValidateStrategyConfig(t *testing.T) {
tests := []struct {
name string
thresholds api.ResourceThresholds
targetThresholds api.ResourceThresholds
errInfo error
}{
{
name: "passing invalid thresholds",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 120,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("thresholds config is not valid: %v", fmt.Errorf(
"%v threshold not in [%v, %v] range", v1.ResourceMemory, MinResourcePercentage, MaxResourcePercentage)),
},
{
name: "passing invalid targetThresholds",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
"resourceInvalid": 80,
},
errInfo: fmt.Errorf("targetThresholds config is not valid: %v",
fmt.Errorf("only cpu, memory, or pods thresholds can be specified")),
},
{
name: "thresholds and targetThresholds configured different num of resources",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
v1.ResourcePods: 80,
},
errInfo: fmt.Errorf("thresholds and targetThresholds configured different resources"),
},
{
name: "thresholds and targetThresholds configured different resources",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourcePods: 80,
},
errInfo: fmt.Errorf("thresholds and targetThresholds configured different resources"),
},
{
name: "thresholds' CPU config value is greater than targetThresholds'",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 90,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", v1.ResourceCPU),
},
{
name: "passing valid strategy config",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: nil,
},
}
for _, testCase := range tests {
validateErr := validateStrategyConfig(testCase.thresholds, testCase.targetThresholds)
if validateErr == nil || testCase.errInfo == nil {
if validateErr != testCase.errInfo {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v\nto be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
} else if validateErr.Error() != testCase.errInfo.Error() {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v\nto be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
}
}
func TestValidateThresholds(t *testing.T) {
tests := []struct {
name string
input api.ResourceThresholds
succeed bool
errInfo error
}{
{
name: "passing nil map for threshold",
input: nil,
succeed: false,
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing no threshold",
input: api.ResourceThresholds{},
succeed: false,
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing unsupported resource name",
@@ -431,7 +597,7 @@ func TestValidateThresholds(t *testing.T) {
v1.ResourceCPU: 40,
v1.ResourceStorage: 25.5,
},
succeed: false,
errInfo: fmt.Errorf("only cpu, memory, or pods thresholds can be specified"),
},
{
name: "passing invalid resource name",
@@ -439,7 +605,30 @@ func TestValidateThresholds(t *testing.T) {
v1.ResourceCPU: 40,
"coolResource": 42.0,
},
succeed: false,
errInfo: fmt.Errorf("only cpu, memory, or pods thresholds can be specified"),
},
{
name: "passing invalid resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 110,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("%v threshold not in [%v, %v] range", v1.ResourceCPU, MinResourcePercentage, MaxResourcePercentage),
},
{
name: "passing a valid threshold with max and min resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 100,
v1.ResourceMemory: 0,
},
errInfo: nil,
},
{
name: "passing a valid threshold with only cpu",
input: api.ResourceThresholds{
v1.ResourceCPU: 80,
},
errInfo: nil,
},
{
name: "passing a valid threshold with cpu, memory and pods",
@@ -448,15 +637,19 @@ func TestValidateThresholds(t *testing.T) {
v1.ResourceMemory: 30,
v1.ResourcePods: 40,
},
succeed: true,
errInfo: nil,
},
}
for _, test := range tests {
isValid := validateThresholds(test.input)
validateErr := validateThresholds(test.input)
if isValid != test.succeed {
t.Errorf("expected validity of threshold: %#v\nto be %v but got %v instead", test.input, test.succeed, isValid)
if validateErr == nil || test.errInfo == nil {
if validateErr != test.errInfo {
t.Errorf("expected validity of threshold: %#v\nto be %v but got %v instead", test.input, test.errInfo, validateErr)
}
} else if validateErr.Error() != test.errInfo.Error() {
t.Errorf("expected validity of threshold: %#v\nto be %v but got %v instead", test.input, test.errInfo, validateErr)
}
}
}