From 6e71068f4f76a1bd230ec3a24d5bf55fa1dc95b9 Mon Sep 17 00:00:00 2001 From: Hanu Date: Tue, 1 Jun 2021 21:14:28 +0800 Subject: [PATCH 1/3] Refractoring lownodeutilization - extracting common functions --- pkg/descheduler/descheduler.go | 3 +- .../nodeutilization/lownodeutilization.go | 170 ++++++++++++++++++ .../lownodeutilization_test.go | 133 +------------- .../nodeutilization.go} | 159 +--------------- .../nodeutilization/nodeutilization_test.go | 158 ++++++++++++++++ test/e2e/e2e_test.go | 3 +- 6 files changed, 341 insertions(+), 285 deletions(-) create mode 100644 pkg/descheduler/strategies/nodeutilization/lownodeutilization.go rename pkg/descheduler/strategies/{ => nodeutilization}/lownodeutilization_test.go (90%) rename pkg/descheduler/strategies/{lownodeutilization.go => nodeutilization/nodeutilization.go} (72%) create mode 100644 pkg/descheduler/strategies/nodeutilization/nodeutilization_test.go diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index dd0b159d5..8dd00b0b4 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -19,6 +19,7 @@ package descheduler import ( "context" "fmt" + "sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization" v1 "k8s.io/api/core/v1" clientset "k8s.io/client-go/kubernetes" @@ -74,7 +75,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer strategyFuncs := map[api.StrategyName]strategyFunction{ "RemoveDuplicates": strategies.RemoveDuplicatePods, - "LowNodeUtilization": strategies.LowNodeUtilization, + "LowNodeUtilization": nodeutilization.LowNodeUtilization, "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go new file mode 100644 index 000000000..66da7b94c --- /dev/null +++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go @@ -0,0 +1,170 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeutilization + +import ( + "context" + "fmt" + v1 "k8s.io/api/core/v1" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "sigs.k8s.io/descheduler/pkg/api" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" + "sigs.k8s.io/descheduler/pkg/utils" +) + +// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used +// to calculate nodes' utilization and not the actual resource usage. 
+func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) { + // TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params? + if err := validateLowNodeUtilizationParams(strategy.Params); err != nil { + klog.ErrorS(err, "Invalid LowNodeUtilization parameters") + return + } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.ErrorS(err, "Failed to get threshold priority from strategy's params") + return + } + + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds + targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds + if err := validateStrategyConfig(thresholds, targetThresholds); err != nil { + klog.ErrorS(err, "LowNodeUtilization config is not valid") + return + } + // check if Pods/CPU/Mem are set, if not, set them to 100 + if _, ok := thresholds[v1.ResourcePods]; !ok { + thresholds[v1.ResourcePods] = MaxResourcePercentage + targetThresholds[v1.ResourcePods] = MaxResourcePercentage + } + if _, ok := thresholds[v1.ResourceCPU]; !ok { + thresholds[v1.ResourceCPU] = MaxResourcePercentage + targetThresholds[v1.ResourceCPU] = MaxResourcePercentage + } + if _, ok := thresholds[v1.ResourceMemory]; !ok { + thresholds[v1.ResourceMemory] = MaxResourcePercentage + targetThresholds[v1.ResourceMemory] = MaxResourcePercentage + } + resourceNames := getResourceNames(thresholds) + + lowNodes, targetNodes := classifyNodes( + getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames), + // The node has to be schedulable (to be able to move workload there) + func(node *v1.Node, usage NodeUsage) bool { + if nodeutil.IsNodeUnschedulable(node) { + klog.V(2).InfoS("Node is unschedulable, thus not considered as underutilized", "node", klog.KObj(node)) + return false + } + return isNodeWithLowUtilization(usage) + }, + func(node *v1.Node, usage NodeUsage) bool { + return isNodeAboveTargetUtilization(usage) + }, + ) + + // log message in one line + keysAndValues := []interface{}{ + "CPU", thresholds[v1.ResourceCPU], + "Mem", thresholds[v1.ResourceMemory], + "Pods", thresholds[v1.ResourcePods], + } + for name := range thresholds { + if !isBasicResource(name) { + keysAndValues = append(keysAndValues, string(name), int64(thresholds[name])) + } + } + klog.V(1).InfoS("Criteria for a node under utilization", keysAndValues...) + klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(lowNodes)) + + // log message in one line + keysAndValues = []interface{}{ + "CPU", targetThresholds[v1.ResourceCPU], + "Mem", targetThresholds[v1.ResourceMemory], + "Pods", targetThresholds[v1.ResourcePods], + } + for name := range targetThresholds { + if !isBasicResource(name) { + keysAndValues = append(keysAndValues, string(name), int64(targetThresholds[name])) + } + } + klog.V(1).InfoS("Criteria for a node above target utilization", keysAndValues...) 
+ klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(targetNodes)) + + if len(lowNodes) == 0 { + klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further") + return + } + + if len(lowNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes { + klog.V(1).InfoS("Number of nodes underutilized is less than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes) + return + } + + if len(lowNodes) == len(nodes) { + klog.V(1).InfoS("All nodes are underutilized, nothing to do here") + return + } + + if len(targetNodes) == 0 { + klog.V(1).InfoS("All nodes are under target utilization, nothing to do here") + return + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) + + evictPodsFromTargetNodes( + ctx, + targetNodes, + lowNodes, + podEvictor, + evictable.IsEvictable, + resourceNames) + + klog.V(1).InfoS("Total number of pods evicted", "evictedPods", podEvictor.TotalEvicted()) +} + +// validateStrategyConfig checks if the strategy's config is valid +func validateStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error { + // validate thresholds and targetThresholds config + if err := validateThresholds(thresholds); err != nil { + return fmt.Errorf("thresholds config is not valid: %v", err) + } + if err := validateThresholds(targetThresholds); err != nil { + return fmt.Errorf("targetThresholds config is not valid: %v", err) + } + + // validate if thresholds and targetThresholds have same resources configured + if len(thresholds) != len(targetThresholds) { + return fmt.Errorf("thresholds and targetThresholds configured different resources") + } + for resourceName, value := range thresholds { + if targetValue, ok := targetThresholds[resourceName]; !ok { + return fmt.Errorf("thresholds and targetThresholds configured different resources") + } else if value > targetValue { + return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName) + } + } + return nil +} diff --git a/pkg/descheduler/strategies/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go similarity index 90% rename from pkg/descheduler/strategies/lownodeutilization_test.go rename to pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go index 8accdc3b4..1324de091 100644 --- a/pkg/descheduler/strategies/lownodeutilization_test.go +++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go @@ -14,12 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package strategies +package nodeutilization import ( "context" "fmt" - "math" "strings" "testing" @@ -36,12 +35,6 @@ import ( "sigs.k8s.io/descheduler/test" ) -var ( - lowPriority = int32(0) - highPriority = int32(10000) - extendedResource = v1.ResourceName("example.com/foo") -) - func TestLowNodeUtilization(t *testing.T) { ctx := context.Background() n1NodeName := "n1" @@ -979,93 +972,7 @@ func TestValidateStrategyConfig(t *testing.T) { } } -func TestValidateThresholds(t *testing.T) { - tests := []struct { - name string - input api.ResourceThresholds - errInfo error - }{ - { - name: "passing nil map for threshold", - input: nil, - errInfo: fmt.Errorf("no resource threshold is configured"), - }, - { - name: "passing no threshold", - input: api.ResourceThresholds{}, - errInfo: fmt.Errorf("no resource threshold is configured"), - }, - { - name: "passing extended resource name other than cpu/memory/pods", - input: api.ResourceThresholds{ - v1.ResourceCPU: 40, - extendedResource: 50, - }, - errInfo: nil, - }, - { - name: "passing invalid resource value", - input: api.ResourceThresholds{ - v1.ResourceCPU: 110, - v1.ResourceMemory: 80, - }, - errInfo: fmt.Errorf("%v threshold not in [%v, %v] range", v1.ResourceCPU, MinResourcePercentage, MaxResourcePercentage), - }, - { - name: "passing a valid threshold with max and min resource value", - input: api.ResourceThresholds{ - v1.ResourceCPU: 100, - v1.ResourceMemory: 0, - }, - errInfo: nil, - }, - { - name: "passing a valid threshold with only cpu", - input: api.ResourceThresholds{ - v1.ResourceCPU: 80, - }, - errInfo: nil, - }, - { - name: "passing a valid threshold with cpu, memory and pods", - input: api.ResourceThresholds{ - v1.ResourceCPU: 20, - v1.ResourceMemory: 30, - v1.ResourcePods: 40, - }, - errInfo: nil, - }, - { - name: "passing a valid threshold with only extended resource", - input: api.ResourceThresholds{ - extendedResource: 80, - }, - errInfo: nil, - }, - { - name: "passing a valid threshold with cpu, memory, pods and extended resource", - input: api.ResourceThresholds{ - v1.ResourceCPU: 20, - v1.ResourceMemory: 30, - v1.ResourcePods: 40, - extendedResource: 50, - }, - errInfo: nil, - }, - } - for _, test := range tests { - validateErr := validateThresholds(test.input) - - if validateErr == nil || test.errInfo == nil { - if validateErr != test.errInfo { - t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", test.input, test.errInfo, validateErr) - } - } else if validateErr.Error() != test.errInfo.Error() { - t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", test.input, test.errInfo, validateErr) - } - } -} func TestWithTaints(t *testing.T) { ctx := context.Background() @@ -1198,41 +1105,3 @@ func TestWithTaints(t *testing.T) { }) } } - -func TestResourceUsagePercentages(t *testing.T) { - resourceUsagePercentage := resourceUsagePercentages(NodeUsage{ - node: &v1.Node{ - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), - }, - Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), - }, - }, - }, - usage: map[v1.ResourceName]*resource.Quantity{ - v1.ResourceCPU: 
resource.NewMilliQuantity(1220, resource.DecimalSI), - v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI), - v1.ResourcePods: resource.NewQuantity(11, resource.BinarySI), - }, - }) - - expectedUsageInIntPercentage := map[v1.ResourceName]float64{ - v1.ResourceCPU: 63, - v1.ResourceMemory: 90, - v1.ResourcePods: 37, - } - - for resourceName, percentage := range expectedUsageInIntPercentage { - if math.Floor(resourceUsagePercentage[resourceName]) != percentage { - t.Errorf("Incorrect percentange computation, expected %v, got math.Floor(%v) instead", percentage, resourceUsagePercentage[resourceName]) - } - } - - t.Logf("resourceUsagePercentage: %#v\n", resourceUsagePercentage) -} diff --git a/pkg/descheduler/strategies/lownodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go similarity index 72% rename from pkg/descheduler/strategies/lownodeutilization.go rename to pkg/descheduler/strategies/nodeutilization/nodeutilization.go index d53c2b931..01a874da5 100644 --- a/pkg/descheduler/strategies/lownodeutilization.go +++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go @@ -1,5 +1,5 @@ /* -Copyright 2017 The Kubernetes Authors. +Copyright 2021 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,23 +14,20 @@ See the License for the specific language governing permissions and limitations under the License. */ -package strategies +package nodeutilization import ( "context" "fmt" - "sort" - v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" - "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" - nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/pkg/utils" + "sort" ) // NodeUsage stores a node's info, pods on it, thresholds and its resource usage @@ -64,146 +61,6 @@ func validateLowNodeUtilizationParams(params *api.StrategyParameters) error { return nil } -// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used -// to calculate nodes' utilization and not the actual resource usage. -func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) { - // TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params? 
- if err := validateLowNodeUtilizationParams(strategy.Params); err != nil { - klog.ErrorS(err, "Invalid LowNodeUtilization parameters") - return - } - thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) - if err != nil { - klog.ErrorS(err, "Failed to get threshold priority from strategy's params") - return - } - - nodeFit := false - if strategy.Params != nil { - nodeFit = strategy.Params.NodeFit - } - - thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds - targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds - if err := validateStrategyConfig(thresholds, targetThresholds); err != nil { - klog.ErrorS(err, "LowNodeUtilization config is not valid") - return - } - // check if Pods/CPU/Mem are set, if not, set them to 100 - if _, ok := thresholds[v1.ResourcePods]; !ok { - thresholds[v1.ResourcePods] = MaxResourcePercentage - targetThresholds[v1.ResourcePods] = MaxResourcePercentage - } - if _, ok := thresholds[v1.ResourceCPU]; !ok { - thresholds[v1.ResourceCPU] = MaxResourcePercentage - targetThresholds[v1.ResourceCPU] = MaxResourcePercentage - } - if _, ok := thresholds[v1.ResourceMemory]; !ok { - thresholds[v1.ResourceMemory] = MaxResourcePercentage - targetThresholds[v1.ResourceMemory] = MaxResourcePercentage - } - resourceNames := getResourceNames(thresholds) - - lowNodes, targetNodes := classifyNodes( - getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames), - // The node has to be schedulable (to be able to move workload there) - func(node *v1.Node, usage NodeUsage) bool { - if nodeutil.IsNodeUnschedulable(node) { - klog.V(2).InfoS("Node is unschedulable, thus not considered as underutilized", "node", klog.KObj(node)) - return false - } - return isNodeWithLowUtilization(usage) - }, - func(node *v1.Node, usage NodeUsage) bool { - return isNodeAboveTargetUtilization(usage) - }, - ) - - // log message in one line - keysAndValues := []interface{}{ - "CPU", thresholds[v1.ResourceCPU], - "Mem", thresholds[v1.ResourceMemory], - "Pods", thresholds[v1.ResourcePods], - } - for name := range thresholds { - if !isBasicResource(name) { - keysAndValues = append(keysAndValues, string(name), int64(float64(thresholds[name]))) - } - } - klog.V(1).InfoS("Criteria for a node under utilization", keysAndValues...) - klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(lowNodes)) - - // log message in one line - keysAndValues = []interface{}{ - "CPU", targetThresholds[v1.ResourceCPU], - "Mem", targetThresholds[v1.ResourceMemory], - "Pods", targetThresholds[v1.ResourcePods], - } - for name := range targetThresholds { - if !isBasicResource(name) { - keysAndValues = append(keysAndValues, string(name), int64(float64(targetThresholds[name]))) - } - } - klog.V(1).InfoS("Criteria for a node above target utilization", keysAndValues...) 
- klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(targetNodes)) - - if len(lowNodes) == 0 { - klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further") - return - } - - if len(lowNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes { - klog.V(1).InfoS("Number of nodes underutilized is less than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes) - return - } - - if len(lowNodes) == len(nodes) { - klog.V(1).InfoS("All nodes are underutilized, nothing to do here") - return - } - - if len(targetNodes) == 0 { - klog.V(1).InfoS("All nodes are under target utilization, nothing to do here") - return - } - - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) - - evictPodsFromTargetNodes( - ctx, - targetNodes, - lowNodes, - podEvictor, - evictable.IsEvictable, - resourceNames) - - klog.V(1).InfoS("Total number of pods evicted", "evictedPods", podEvictor.TotalEvicted()) -} - -// validateStrategyConfig checks if the strategy's config is valid -func validateStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error { - // validate thresholds and targetThresholds config - if err := validateThresholds(thresholds); err != nil { - return fmt.Errorf("thresholds config is not valid: %v", err) - } - if err := validateThresholds(targetThresholds); err != nil { - return fmt.Errorf("targetThresholds config is not valid: %v", err) - } - - // validate if thresholds and targetThresholds have same resources configured - if len(thresholds) != len(targetThresholds) { - return fmt.Errorf("thresholds and targetThresholds configured different resources") - } - for resourceName, value := range thresholds { - if targetValue, ok := targetThresholds[resourceName]; !ok { - return fmt.Errorf("thresholds and targetThresholds configured different resources") - } else if value > targetValue { - return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName) - } - } - return nil -} - // validateThresholds checks if thresholds have valid resource name and resource percentage configured func validateThresholds(thresholds api.ResourceThresholds) error { if thresholds == nil || len(thresholds) == 0 { @@ -224,7 +81,7 @@ func getNodeUsage( lowThreshold, highThreshold api.ResourceThresholds, resourceNames []v1.ResourceName, ) []NodeUsage { - nodeUsageList := []NodeUsage{} + var nodeUsageList []NodeUsage for _, node := range nodes { pods, err := podutil.ListPodsOnANode(ctx, client, node) @@ -388,8 +245,9 @@ func evictPods( taintsOfLowNodes map[string][]v1.Taint, podEvictor *evictions.PodEvictor, ) { + // stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved - continueCond := func() bool { + continueCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool { if !isNodeAboveTargetUtilization(nodeUsage) { return false } @@ -401,8 +259,7 @@ func evictPods( return true } - - if continueCond() { + if continueCond(nodeUsage, totalAvailableUsage) { for _, pod := range inputPods { if !utils.PodToleratesTaints(pod, taintsOfLowNodes) { klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod)) @@ -443,7 +300,7 @@ func evictPods( klog.V(3).InfoS("Updated node usage", keysAndValues...) 
// check if node utilization drops below target threshold or any required capacity (cpu, memory, pods) is moved - if !continueCond() { + if !continueCond(nodeUsage, totalAvailableUsage) { break } } diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization_test.go new file mode 100644 index 000000000..b623368b6 --- /dev/null +++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization_test.go @@ -0,0 +1,158 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeutilization + +import ( + "fmt" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "math" + "sigs.k8s.io/descheduler/pkg/api" + "testing" +) + +var ( + lowPriority = int32(0) + highPriority = int32(10000) + extendedResource = v1.ResourceName("example.com/foo") +) + +func TestValidateThresholds(t *testing.T) { + tests := []struct { + name string + input api.ResourceThresholds + errInfo error + }{ + { + name: "passing nil map for threshold", + input: nil, + errInfo: fmt.Errorf("no resource threshold is configured"), + }, + { + name: "passing no threshold", + input: api.ResourceThresholds{}, + errInfo: fmt.Errorf("no resource threshold is configured"), + }, + { + name: "passing extended resource name other than cpu/memory/pods", + input: api.ResourceThresholds{ + v1.ResourceCPU: 40, + extendedResource: 50, + }, + errInfo: nil, + }, + { + name: "passing invalid resource value", + input: api.ResourceThresholds{ + v1.ResourceCPU: 110, + v1.ResourceMemory: 80, + }, + errInfo: fmt.Errorf("%v threshold not in [%v, %v] range", v1.ResourceCPU, MinResourcePercentage, MaxResourcePercentage), + }, + { + name: "passing a valid threshold with max and min resource value", + input: api.ResourceThresholds{ + v1.ResourceCPU: 100, + v1.ResourceMemory: 0, + }, + errInfo: nil, + }, + { + name: "passing a valid threshold with only cpu", + input: api.ResourceThresholds{ + v1.ResourceCPU: 80, + }, + errInfo: nil, + }, + { + name: "passing a valid threshold with cpu, memory and pods", + input: api.ResourceThresholds{ + v1.ResourceCPU: 20, + v1.ResourceMemory: 30, + v1.ResourcePods: 40, + }, + errInfo: nil, + }, + { + name: "passing a valid threshold with only extended resource", + input: api.ResourceThresholds{ + extendedResource: 80, + }, + errInfo: nil, + }, + { + name: "passing a valid threshold with cpu, memory, pods and extended resource", + input: api.ResourceThresholds{ + v1.ResourceCPU: 20, + v1.ResourceMemory: 30, + v1.ResourcePods: 40, + extendedResource: 50, + }, + errInfo: nil, + }, + } + + for _, test := range tests { + validateErr := validateThresholds(test.input) + + if validateErr == nil || test.errInfo == nil { + if validateErr != test.errInfo { + t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", test.input, test.errInfo, validateErr) + } + } else if validateErr.Error() != test.errInfo.Error() { + t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", 
test.input, test.errInfo, validateErr) + } + } +} + +func TestResourceUsagePercentages(t *testing.T) { + resourceUsagePercentage := resourceUsagePercentages(NodeUsage{ + node: &v1.Node{ + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), + }, + }, + }, + usage: map[v1.ResourceName]*resource.Quantity{ + v1.ResourceCPU: resource.NewMilliQuantity(1220, resource.DecimalSI), + v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI), + v1.ResourcePods: resource.NewQuantity(11, resource.BinarySI), + }, + }) + + expectedUsageInIntPercentage := map[v1.ResourceName]float64{ + v1.ResourceCPU: 63, + v1.ResourceMemory: 90, + v1.ResourcePods: 37, + } + + for resourceName, percentage := range expectedUsageInIntPercentage { + if math.Floor(resourceUsagePercentage[resourceName]) != percentage { + t.Errorf("Incorrect percentange computation, expected %v, got math.Floor(%v) instead", percentage, resourceUsagePercentage[resourceName]) + } + } + + t.Logf("resourceUsagePercentage: %#v\n", resourceUsagePercentage) +} diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 4d4b9e3f0..db599dad3 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -48,6 +48,7 @@ import ( nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/pkg/descheduler/strategies" + "sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization" "sigs.k8s.io/descheduler/pkg/utils" ) @@ -319,7 +320,7 @@ func TestLowNodeUtilization(t *testing.T) { podsBefore := len(podsOnMosttUtilizedNode) t.Log("Running LowNodeUtilization strategy") - strategies.LowNodeUtilization( + nodeutilization.LowNodeUtilization( ctx, clientSet, deschedulerapi.DeschedulerStrategy{ From 2f18864fa5b62245422a416a0b8cba68c8ebb7c5 Mon Sep 17 00:00:00 2001 From: Hanu Date: Tue, 1 Jun 2021 21:31:11 +0800 Subject: [PATCH 2/3] Refractor - Modify the common functions to be used by high utilisation --- .../nodeutilization/lownodeutilization.go | 37 +++++++++++---- .../lownodeutilization_test.go | 8 ++-- .../nodeutilization/nodeutilization.go | 47 ++++++++----------- 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go index 66da7b94c..45520ecb7 100644 --- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go +++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go @@ -20,6 +20,7 @@ import ( "context" "fmt" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" @@ -33,7 +34,7 @@ import ( // to calculate nodes' utilization and not the actual resource usage. func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) { // TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params? 
- if err := validateLowNodeUtilizationParams(strategy.Params); err != nil { + if err := validateNodeUtilizationParams(strategy.Params); err != nil { klog.ErrorS(err, "Invalid LowNodeUtilization parameters") return } @@ -50,7 +51,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds - if err := validateStrategyConfig(thresholds, targetThresholds); err != nil { + if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds); err != nil { klog.ErrorS(err, "LowNodeUtilization config is not valid") return } @@ -69,7 +70,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg } resourceNames := getResourceNames(thresholds) - lowNodes, targetNodes := classifyNodes( + lowNodes, sourceNodes := classifyNodes( getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames), // The node has to be schedulable (to be able to move workload there) func(node *v1.Node, usage NodeUsage) bool { @@ -110,7 +111,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg } } klog.V(1).InfoS("Criteria for a node above target utilization", keysAndValues...) - klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(targetNodes)) + klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(sourceNodes)) if len(lowNodes) == 0 { klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further") @@ -127,26 +128,42 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg return } - if len(targetNodes) == 0 { + if len(sourceNodes) == 0 { klog.V(1).InfoS("All nodes are under target utilization, nothing to do here") return } evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) - evictPodsFromTargetNodes( + // stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved + continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool { + if !isNodeAboveTargetUtilization(nodeUsage) { + return false + } + for name := range totalAvailableUsage { + if totalAvailableUsage[name].CmpInt64(0) < 1 { + return false + } + } + + return true + } + + evictPodsFromSourceNodes( ctx, - targetNodes, + sourceNodes, lowNodes, podEvictor, evictable.IsEvictable, - resourceNames) + resourceNames, + "LowNodeUtilization", + continueEvictionCond) klog.V(1).InfoS("Total number of pods evicted", "evictedPods", podEvictor.TotalEvicted()) } -// validateStrategyConfig checks if the strategy's config is valid -func validateStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error { +// validateLowUtilizationStrategyConfig checks if the strategy's config is valid +func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error { // validate thresholds and targetThresholds config if err := validateThresholds(thresholds); err != nil { return fmt.Errorf("thresholds config is not valid: %v", err) diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go index 1324de091..dca875caf 100644 --- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go +++ 
b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go @@ -818,7 +818,7 @@ func TestLowNodeUtilization(t *testing.T) { } } -func TestValidateStrategyConfig(t *testing.T) { +func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) { tests := []struct { name string thresholds api.ResourceThresholds @@ -958,7 +958,7 @@ func TestValidateStrategyConfig(t *testing.T) { } for _, testCase := range tests { - validateErr := validateStrategyConfig(testCase.thresholds, testCase.targetThresholds) + validateErr := validateLowUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds) if validateErr == nil || testCase.errInfo == nil { if validateErr != testCase.errInfo { @@ -972,9 +972,7 @@ func TestValidateStrategyConfig(t *testing.T) { } } - - -func TestWithTaints(t *testing.T) { +func TestLowNodeUtilizationWithTaints(t *testing.T) { ctx := context.Background() strategy := api.DeschedulerStrategy{ Enabled: true, diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go index 01a874da5..fca2330b7 100644 --- a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go +++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go @@ -40,6 +40,8 @@ type NodeUsage struct { highResourceThreshold map[v1.ResourceName]*resource.Quantity } +type continueEvictionCond func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool + // NodePodsMap is a set of (node, pods) pairs type NodePodsMap map[*v1.Node][]*v1.Pod @@ -50,7 +52,7 @@ const ( MaxResourcePercentage = 100 ) -func validateLowNodeUtilizationParams(params *api.StrategyParameters) error { +func validateNodeUtilizationParams(params *api.StrategyParameters) error { if params == nil || params.NodeResourceUtilizationThresholds == nil { return fmt.Errorf("NodeResourceUtilizationThresholds not set") } @@ -172,18 +174,20 @@ func classifyNodes( return lowNodes, highNodes } -// evictPodsFromTargetNodes evicts pods based on priority, if all the pods on the node have priority, if not +// evictPodsFromSourceNodes evicts pods based on priority, if all the pods on the node have priority, if not // evicts them based on QoS as fallback option. // TODO: @ravig Break this function into smaller functions. -func evictPodsFromTargetNodes( +func evictPodsFromSourceNodes( ctx context.Context, - targetNodes, lowNodes []NodeUsage, + sourceNodes, destinationNodes []NodeUsage, podEvictor *evictions.PodEvictor, podFilter func(pod *v1.Pod) bool, resourceNames []v1.ResourceName, + strategy string, + continueEviction continueEvictionCond, ) { - sortNodesByUsage(targetNodes) + sortNodesByUsage(sourceNodes) // upper bound on total number of pods/cpu/memory and optional extended resources to be moved totalAvailableUsage := map[v1.ResourceName]*resource.Quantity{ @@ -192,9 +196,9 @@ func evictPodsFromTargetNodes( v1.ResourceMemory: {}, } - var taintsOfLowNodes = make(map[string][]v1.Taint, len(lowNodes)) - for _, node := range lowNodes { - taintsOfLowNodes[node.node.Name] = node.node.Spec.Taints + var taintsOfDestinationNodes = make(map[string][]v1.Taint, len(destinationNodes)) + for _, node := range destinationNodes { + taintsOfDestinationNodes[node.node.Name] = node.node.Spec.Taints for _, name := range resourceNames { if _, ok := totalAvailableUsage[name]; !ok { @@ -218,7 +222,7 @@ func evictPodsFromTargetNodes( } klog.V(1).InfoS("Total capacity to be moved", keysAndValues...) 
- for _, node := range targetNodes { + for _, node := range sourceNodes { klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage) nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter) @@ -232,7 +236,7 @@ func evictPodsFromTargetNodes( klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers") // sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers. podutil.SortPodsBasedOnPriorityLowToHigh(removablePods) - evictPods(ctx, removablePods, node, totalAvailableUsage, taintsOfLowNodes, podEvictor) + evictPods(ctx, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, strategy, continueEviction) klog.V(1).InfoS("Evicted pods from node", "node", klog.KObj(node.node), "evictedPods", podEvictor.NodeEvicted(node.node), "usage", node.usage) } } @@ -244,29 +248,18 @@ func evictPods( totalAvailableUsage map[v1.ResourceName]*resource.Quantity, taintsOfLowNodes map[string][]v1.Taint, podEvictor *evictions.PodEvictor, + strategy string, + continueEviction continueEvictionCond, ) { - // stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved - continueCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool { - if !isNodeAboveTargetUtilization(nodeUsage) { - return false - } - for name := range totalAvailableUsage { - if totalAvailableUsage[name].CmpInt64(0) < 1 { - return false - } - } - - return true - } - if continueCond(nodeUsage, totalAvailableUsage) { + if continueEviction(nodeUsage, totalAvailableUsage) { for _, pod := range inputPods { if !utils.PodToleratesTaints(pod, taintsOfLowNodes) { klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod)) continue } - success, err := podEvictor.EvictPod(ctx, pod, nodeUsage.node, "LowNodeUtilization") + success, err := podEvictor.EvictPod(ctx, pod, nodeUsage.node, strategy) if err != nil { klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod)) break @@ -299,8 +292,8 @@ func evictPods( } klog.V(3).InfoS("Updated node usage", keysAndValues...) 
- // check if node utilization drops below target threshold or any required capacity (cpu, memory, pods) is moved - if !continueCond(nodeUsage, totalAvailableUsage) { + // check if pods can be still evicted + if !continueEviction(nodeUsage, totalAvailableUsage) { break } } From 4cd1e66ef3202d8d4ba3f5d529c1b17e397cd92f Mon Sep 17 00:00:00 2001 From: Hanu Date: Fri, 7 May 2021 11:11:33 +0800 Subject: [PATCH 3/3] Adding highnodeutilization strategy --- README.md | 67 +- docs/user-guide.md | 23 +- examples/high-node-utilization.yml | 10 + pkg/descheduler/descheduler.go | 1 + .../nodeutilization/highnodeutilization.go | 157 ++++ .../highnodeutilization_test.go | 707 ++++++++++++++++++ 6 files changed, 959 insertions(+), 6 deletions(-) create mode 100644 examples/high-node-utilization.yml create mode 100644 pkg/descheduler/strategies/nodeutilization/highnodeutilization.go create mode 100644 pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go diff --git a/README.md b/README.md index 87b88475f..21494b942 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Table of Contents - [Policy and Strategies](#policy-and-strategies) - [RemoveDuplicates](#removeduplicates) - [LowNodeUtilization](#lownodeutilization) + - [HighNodeUtilization](#highnodeutilization) - [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity) - [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity) - [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints) @@ -107,9 +108,17 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory. ## Policy and Strategies Descheduler's policy is configurable and includes strategies that can be enabled or disabled. -Eight strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, -`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingTopologySpreadConstraint`, -`RemovePodsHavingTooManyRestarts`, and `PodLifeTime` are currently implemented. As part of the policy, the +Nine strategies +1. `RemoveDuplicates` +2. `LowNodeUtilization` +3. `HighNodeUtilization` +4. `RemovePodsViolatingInterPodAntiAffinity` +5. `RemovePodsViolatingNodeAffinity` +6. `RemovePodsViolatingNodeTaints` +7. `RemovePodsViolatingTopologySpreadConstraint` +8. `RemovePodsHavingTooManyRestarts` +9. `PodLifeTime` +are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too. By default, all strategies are enabled. The following diagram provides a visualization of most of the strategies to help @@ -240,6 +249,58 @@ This parameter can be configured to activate the strategy only when the number o are above the configured value. This could be helpful in large clusters where a few nodes could go under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero. +### HighNodeUtilization + +This strategy finds nodes that are under utilized and evicts pods in the hope that these pods will be scheduled compactly into fewer nodes. This strategy **must** be used with the +scheduler strategy `MostRequestedPriority`. The parameters of this strategy are configured under `nodeResourceUtilizationThresholds`. + +The under utilization of nodes is determined by a configurable threshold `thresholds`. The threshold +`thresholds` can be configured for cpu, memory, number of pods, and extended resources in terms of percentage. 
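+For illustration, a `thresholds` stanza that combines the basic resources with an extended resource
+might look like the sketch below; the `nvidia.com/gpu` entry and all percentage values here are only
+examples, not recommended settings:
+
+```yaml
+nodeResourceUtilizationThresholds:
+  thresholds:
+    "cpu": 20
+    "memory": 20
+    "pods": 20
+    "nvidia.com/gpu": 30
+```
+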
The percentage is +calculated as the current resources requested on the node vs [total allocatable](https://kubernetes.io/docs/concepts/architecture/nodes/#capacity). +For pods, this means the number of pods on the node as a fraction of the pod capacity set for that node. + +If a node's usage is below threshold for all (cpu, memory, number of pods and extended resources), the node is considered underutilized. +Currently, pods request resource requirements are considered for computing node resource utilization. Any node above `thresholds` is considered appropriately utilized and is not considered for eviction. + +The `thresholds` param could be tuned as per your cluster requirements. Note that this +strategy evicts pods from `underutilized nodes` (those with usage below `thresholds`) so that they can be recreated in appropriately utilized nodes. The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero. + +**Parameters:** + +|Name|Type| +|---|---| +|`thresholds`|map(string:int)| +|`numberOfNodes`|int| +|`thresholdPriority`|int (see [priority filtering](#priority-filtering))| +|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| + +**Example:** + +```yaml +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "HighNodeUtilization": + enabled: true + params: + nodeResourceUtilizationThresholds: + thresholds: + "cpu" : 20 + "memory": 20 + "pods": 20 +``` + +Policy should pass the following validation checks: +* Three basic native types of resources are supported: `cpu`, `memory` and `pods`. If any of these resource types is not specified, all its thresholds default to 100%. +* Extended resources are supported. For example, resource type `nvidia.com/gpu` is specified for GPU node utilization. Extended resources are optional, and will not be used to compute node's usage if it's not specified in `thresholds` explicitly. +* `thresholds` can not be nil. +* The valid range of the resource's percentage value is \[0, 100\] + +There is another parameter associated with the `HighNodeUtilization` strategy, called `numberOfNodes`. +This parameter can be configured to activate the strategy only when the number of under utilized nodes +is above the configured value. This could be helpful in large clusters where a few nodes could go +under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero. + ### RemovePodsViolatingInterPodAntiAffinity This strategy makes sure that pods violating interpod anti-affinity are removed from nodes. For example, diff --git a/docs/user-guide.md b/docs/user-guide.md index 156dd0866..f9ba86ce5 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -92,12 +92,12 @@ strategies: ``` ### Balance Cluster By Node Memory Utilization - If your cluster has been running for a long period of time, you may find that the resource utilization is not very -balanced. The `LowNodeUtilization` strategy can be used to rebalance your cluster based on `cpu`, `memory` +balanced. The following two strategies can be used to rebalance your cluster based on `cpu`, `memory` or `number of pods`. -Using the following policy configuration file, descheduler will rebalance the cluster based on memory by evicting pods +#### Balance high utilization nodes +Using `LowNodeUtilization`, descheduler will rebalance the cluster based on memory by evicting pods from nodes with memory utilization over 70% to nodes with memory utilization below 20%. 
``` @@ -114,6 +114,23 @@ strategies: "memory": 70 ``` +#### Balance low utilization nodes +Using `HighNodeUtilization`, descheduler will rebalance the cluster based on memory by evicting pods +from nodes with memory utilization lower than 20%. This should be used along with scheduler strategy `MostRequestedPriority`. +The evicted pods will be compacted into minimal set of nodes. + +``` +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "HighNodeUtilization": + enabled: true + params: + nodeResourceUtilizationThresholds: + thresholds: + "memory": 20 +``` + ### Autoheal Node Problems Descheduler's `RemovePodsViolatingNodeTaints` strategy can be combined with [Node Problem Detector](https://github.com/kubernetes/node-problem-detector/) and diff --git a/examples/high-node-utilization.yml b/examples/high-node-utilization.yml new file mode 100644 index 000000000..67733ba37 --- /dev/null +++ b/examples/high-node-utilization.yml @@ -0,0 +1,10 @@ +--- +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "HighNodeUtilization": + enabled: true + params: + nodeResourceUtilizationThresholds: + thresholds: + "memory": 20 diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index 8dd00b0b4..06bdefaf1 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -76,6 +76,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer strategyFuncs := map[api.StrategyName]strategyFunction{ "RemoveDuplicates": strategies.RemoveDuplicatePods, "LowNodeUtilization": nodeutilization.LowNodeUtilization, + "HighNodeUtilization": nodeutilization.HighNodeUtilization, "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go new file mode 100644 index 000000000..27e354f54 --- /dev/null +++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go @@ -0,0 +1,157 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeutilization + +import ( + "context" + "fmt" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + "sigs.k8s.io/descheduler/pkg/api" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" + "sigs.k8s.io/descheduler/pkg/utils" +) + +// HighNodeUtilization evicts pods from under utilized nodes so that scheduler can schedule according to its strategy. +// Note that CPU/Memory requests are used to calculate nodes' utilization and not the actual resource usage. 
+func HighNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) { + if err := validateNodeUtilizationParams(strategy.Params); err != nil { + klog.ErrorS(err, "Invalid HighNodeUtilization parameters") + return + } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.ErrorS(err, "Failed to get threshold priority from strategy's params") + return + } + + thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds + targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds + if err := validateHighUtilizationStrategyConfig(thresholds, targetThresholds); err != nil { + klog.ErrorS(err, "HighNodeUtilization config is not valid") + return + } + targetThresholds = make(api.ResourceThresholds) + + setDefaultForThresholds(thresholds, targetThresholds) + resourceNames := getResourceNames(targetThresholds) + + sourceNodes, highNodes := classifyNodes( + getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames), + func(node *v1.Node, usage NodeUsage) bool { + return isNodeWithLowUtilization(usage) + }, + func(node *v1.Node, usage NodeUsage) bool { + if nodeutil.IsNodeUnschedulable(node) { + klog.V(2).InfoS("Node is unschedulable", "node", klog.KObj(node)) + return false + } + return !isNodeWithLowUtilization(usage) + }) + + // log message in one line + keysAndValues := []interface{}{ + "CPU", targetThresholds[v1.ResourceCPU], + "Mem", targetThresholds[v1.ResourceMemory], + "Pods", targetThresholds[v1.ResourcePods], + } + for name := range targetThresholds { + if !isBasicResource(name) { + keysAndValues = append(keysAndValues, string(name), int64(targetThresholds[name])) + } + } + + klog.V(1).InfoS("Criteria for a node below target utilization", keysAndValues...) 
+ klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(sourceNodes)) + + if len(sourceNodes) == 0 { + klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further") + return + } + if len(sourceNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes { + klog.V(1).InfoS("Number of nodes underutilized is less than NumberOfNodes, nothing to do here", "underutilizedNodes", len(sourceNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes) + return + } + if len(sourceNodes) == len(nodes) { + klog.V(1).InfoS("All nodes are underutilized, nothing to do here") + return + } + if len(highNodes) == 0 { + klog.V(1).InfoS("No node is available to schedule the pods, nothing to do here") + return + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + + // stop if the total available usage has dropped to zero - no more pods can be scheduled + continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool { + for name := range totalAvailableUsage { + if totalAvailableUsage[name].CmpInt64(0) < 1 { + return false + } + } + + return true + } + evictPodsFromSourceNodes( + ctx, + sourceNodes, + highNodes, + podEvictor, + evictable.IsEvictable, + resourceNames, + "HighNodeUtilization", + continueEvictionCond) + +} + +func validateHighUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error { + if targetThresholds != nil { + return fmt.Errorf("targetThresholds is not applicable for HighNodeUtilization") + } + if err := validateThresholds(thresholds); err != nil { + return fmt.Errorf("thresholds config is not valid: %v", err) + } + return nil +} + +func setDefaultForThresholds(thresholds, targetThresholds api.ResourceThresholds) { + // check if Pods/CPU/Mem are set, if not, set them to 100 + if _, ok := thresholds[v1.ResourcePods]; !ok { + thresholds[v1.ResourcePods] = MaxResourcePercentage + } + if _, ok := thresholds[v1.ResourceCPU]; !ok { + thresholds[v1.ResourceCPU] = MaxResourcePercentage + } + if _, ok := thresholds[v1.ResourceMemory]; !ok { + thresholds[v1.ResourceMemory] = MaxResourcePercentage + } + + // Default targetThreshold resource values to 100 + targetThresholds[v1.ResourcePods] = MaxResourcePercentage + targetThresholds[v1.ResourceCPU] = MaxResourcePercentage + targetThresholds[v1.ResourceMemory] = MaxResourcePercentage + + for name := range thresholds { + if !isBasicResource(name) { + targetThresholds[name] = MaxResourcePercentage + } + } +} diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go new file mode 100644 index 000000000..d3928587b --- /dev/null +++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go @@ -0,0 +1,707 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nodeutilization + +import ( + "context" + "fmt" + v1 "k8s.io/api/core/v1" + "k8s.io/api/policy/v1beta1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + "sigs.k8s.io/descheduler/pkg/api" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + "sigs.k8s.io/descheduler/pkg/utils" + "sigs.k8s.io/descheduler/test" + "strings" + "testing" +) + +func TestHighNodeUtilization(t *testing.T) { + ctx := context.Background() + n1NodeName := "n1" + n2NodeName := "n2" + n3NodeName := "n3" + + testCases := []struct { + name string + thresholds api.ResourceThresholds + nodes map[string]*v1.Node + pods map[string]*v1.PodList + maxPodsToEvictPerNode int + expectedPodsEvicted int + evictedPods []string + }{ + { + name: "no node below threshold usage", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 20, + v1.ResourcePods: 20, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetRSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 0, + }, + { + name: "no evictable pods", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 40, + v1.ResourcePods: 40, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with local storage. + test.SetNormalOwnerRef(pod) + pod.Spec.Volumes = []v1.Volume{ + { + Name: "sample", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, + }, + }, + } + // A Mirror Pod. + pod.Annotations = test.GetMirrorPodAnnotation() + }), + *test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. + pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. 
+ *test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetDSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetDSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 0, + }, + { + name: "no node to schedule evicted pods", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 20, + v1.ResourcePods: 20, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + // These can't be evicted. + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These can't be evicted. + *test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p3", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n3NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 0, + }, + { + name: "without priorities", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. + *test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. + pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 2, + evictedPods: []string{"p1", "p7"}, + }, + { + name: "without priorities stop when resource capacity is depleted", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 2000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. 
+ *test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 1, + }, + { + name: "with priorities", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodPriority(pod, lowPriority) + }), + *test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodPriority(pod, highPriority) + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p7", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p8", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{ + // These won't be evicted. + *test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetDSOwnerRef), + }, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 1, + evictedPods: []string{"p1"}, + }, + { + name: "without priorities evicting best-effort pods only", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 3000, 3000, 10, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 3000, 3000, 5, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 3000, 3000, 10, test.SetNodeUnschedulable), + }, + // All pods are assumed to be burstable (test.BuildTestNode always sets both cpu/memory resource requests to some value) + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.MakeBestEffortPod(pod) + }), + *test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + // These won't be evicted. 
+ *test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{}, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 1, + evictedPods: []string{"p1"}, + }, + { + name: "with extended resource", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 20, + extendedResource: 40, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) { + test.SetNodeExtendedResource(node, extendedResource, 8) + }), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) { + test.SetNodeExtendedResource(node, extendedResource, 8) + }), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + *test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + // These won't be evicted + *test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) { + test.SetDSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + *test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + *test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + *test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + }, + }, + n3NodeName: { + Items: []v1.Pod{}, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 2, + evictedPods: []string{"p1", "p2"}, + }, + { + name: "with extended resource in some of nodes", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 40, + extendedResource: 40, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) { + test.SetNodeExtendedResource(node, extendedResource, 8) + }), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + //These won't be evicted + *test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + *test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + test.SetPodExtendedResourceRequest(pod, extendedResource, 1) + }), + }, + }, + n2NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p3", 500, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 500, 0, n2NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 500, 0, n2NodeName, test.SetRSOwnerRef), + 
*test.BuildTestPod("p6", 500, 0, n2NodeName, test.SetRSOwnerRef), + }, + }, + n3NodeName: { + Items: []v1.Pod{}, + }, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 0, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + list := action.(core.ListAction) + fieldString := list.GetListRestrictions().Fields.String() + if strings.Contains(fieldString, n1NodeName) { + return true, test.pods[n1NodeName], nil + } + if strings.Contains(fieldString, n2NodeName) { + return true, test.pods[n2NodeName], nil + } + if strings.Contains(fieldString, n3NodeName) { + return true, test.pods[n3NodeName], nil + } + return true, nil, fmt.Errorf("Failed to list: %v", list) + }) + fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { + getAction := action.(core.GetAction) + if node, exists := test.nodes[getAction.GetName()]; exists { + return true, node, nil + } + return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName()) + }) + podsForEviction := make(map[string]struct{}) + for _, pod := range test.evictedPods { + podsForEviction[pod] = struct{}{} + } + + evictionFailed := false + if len(test.evictedPods) > 0 { + fakeClient.Fake.AddReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) { + getAction := action.(core.CreateAction) + obj := getAction.GetObject() + if eviction, ok := obj.(*v1beta1.Eviction); ok { + if _, exists := podsForEviction[eviction.Name]; exists { + return true, obj, nil + } + evictionFailed = true + return true, nil, fmt.Errorf("pod %q was unexpectedly evicted", eviction.Name) + } + return true, obj, nil + }) + } + + var nodes []*v1.Node + for _, node := range test.nodes { + nodes = append(nodes, node) + } + + podEvictor := evictions.NewPodEvictor( + fakeClient, + "v1", + false, + test.maxPodsToEvictPerNode, + nodes, + false, + false, + false, + ) + + strategy := api.DeschedulerStrategy{ + Enabled: true, + Params: &api.StrategyParameters{ + NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{ + Thresholds: test.thresholds, + }, + }, + } + HighNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor) + + podsEvicted := podEvictor.TotalEvicted() + if test.expectedPodsEvicted != podsEvicted { + t.Errorf("Expected %#v pods to be evicted but %#v got evicted", test.expectedPodsEvicted, podsEvicted) + } + if evictionFailed { + t.Errorf("Pod evictions failed unexpectedly") + } + }) + } +} + +func TestValidateHighNodeUtilizationStrategyConfig(t *testing.T) { + tests := []struct { + name string + thresholds api.ResourceThresholds + targetThresholds api.ResourceThresholds + errInfo error + }{ + { + name: "passing target thresholds", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 20, + v1.ResourceMemory: 20, + }, + targetThresholds: api.ResourceThresholds{ + v1.ResourceCPU: 80, + v1.ResourceMemory: 80, + }, + errInfo: fmt.Errorf("targetThresholds is not applicable for HighNodeUtilization"), + }, + { + name: "passing empty thresholds", + thresholds: api.ResourceThresholds{}, + errInfo: fmt.Errorf("thresholds config is not valid: no resource threshold is configured"), + }, + { + name: "passing invalid thresholds", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 80, + v1.ResourceMemory: 120, + }, + errInfo: fmt.Errorf("thresholds config is not valid: %v", fmt.Errorf( + "%v threshold not in [%v, %v] range", 
v1.ResourceMemory, MinResourcePercentage, MaxResourcePercentage)), + }, + { + name: "passing valid strategy config", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 80, + v1.ResourceMemory: 80, + }, + errInfo: nil, + }, + { + name: "passing valid strategy config with extended resource", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 80, + v1.ResourceMemory: 80, + extendedResource: 80, + }, + errInfo: nil, + }, + } + + for _, testCase := range tests { + validateErr := validateHighUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds) + + if validateErr == nil || testCase.errInfo == nil { + if validateErr != testCase.errInfo { + t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead", + testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr) + } + } else if validateErr.Error() != testCase.errInfo.Error() { + t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead", + testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr) + } + } +} + +func TestHighNodeUtilizationWithTaints(t *testing.T) { + ctx := context.Background() + strategy := api.DeschedulerStrategy{ + Enabled: true, + Params: &api.StrategyParameters{ + NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{ + Thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 40, + }, + }, + }, + } + + n1 := test.BuildTestNode("n1", 1000, 3000, 10, nil) + n2 := test.BuildTestNode("n2", 1000, 3000, 10, nil) + n3 := test.BuildTestNode("n3", 1000, 3000, 10, nil) + n3withTaints := n3.DeepCopy() + n3withTaints.Spec.Taints = []v1.Taint{ + { + Key: "key", + Value: "value", + Effect: v1.TaintEffectNoSchedule, + }, + } + + podThatToleratesTaint := test.BuildTestPod("tolerate_pod", 200, 0, n1.Name, test.SetRSOwnerRef) + podThatToleratesTaint.Spec.Tolerations = []v1.Toleration{ + { + Key: "key", + Value: "value", + }, + } + + tests := []struct { + name string + nodes []*v1.Node + pods []*v1.Pod + evictionsExpected int + }{ + { + name: "No taints", + nodes: []*v1.Node{n1, n2, n3}, + pods: []*v1.Pod{ + //Node 1 pods + test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef), + test.BuildTestPod(fmt.Sprintf("pod_2_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef), + test.BuildTestPod(fmt.Sprintf("pod_3_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef), + // Node 2 pods + test.BuildTestPod(fmt.Sprintf("pod_4_%s", n2.Name), 200, 0, n2.Name, test.SetRSOwnerRef), + }, + evictionsExpected: 1, + }, + { + name: "No pod tolerates node taint", + nodes: []*v1.Node{n1, n3withTaints}, + pods: []*v1.Pod{ + //Node 1 pods + test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef), + // Node 3 pods + test.BuildTestPod(fmt.Sprintf("pod_2_%s", n3withTaints.Name), 200, 0, n3withTaints.Name, test.SetRSOwnerRef), + }, + evictionsExpected: 0, + }, + { + name: "Pod which tolerates node taint", + nodes: []*v1.Node{n1, n3withTaints}, + pods: []*v1.Pod{ + //Node 1 pods + test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 100, 0, n1.Name, test.SetRSOwnerRef), + podThatToleratesTaint, + // Node 3 pods + test.BuildTestPod(fmt.Sprintf("pod_9_%s", n3withTaints.Name), 500, 0, n3withTaints.Name, test.SetRSOwnerRef), + }, + evictionsExpected: 1, + }, + } + + for _, item := range tests { + t.Run(item.name, func(t *testing.T) { + var objs []runtime.Object + for _, node := range item.nodes { + objs = append(objs, node) 
+ } + + for _, pod := range item.pods { + objs = append(objs, pod) + } + + fakeClient := fake.NewSimpleClientset(objs...) + + podEvictor := evictions.NewPodEvictor( + fakeClient, + "policy/v1", + false, + item.evictionsExpected, + item.nodes, + false, + false, + false, + ) + + HighNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor) + + if item.evictionsExpected != podEvictor.TotalEvicted() { + t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, podEvictor.TotalEvicted()) + } + }) + } +}
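
A minimal sketch of how the new HighNodeUtilization strategy might be configured and invoked, mirroring the test wiring above; the example package, helper name, and threshold values are assumptions, and the context, client, node list, and PodEvictor are assumed to come from the existing descheduler setup rather than from this patch.

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"

	"sigs.k8s.io/descheduler/pkg/api"
	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
	"sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization"
)

// runHighNodeUtilization is a hypothetical helper showing the expected call
// shape; client, nodes, and podEvictor come from the caller's own wiring.
func runHighNodeUtilization(ctx context.Context, client kubernetes.Interface, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
	strategy := api.DeschedulerStrategy{
		Enabled: true,
		Params: &api.StrategyParameters{
			NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
				// Nodes whose CPU and pod-count request utilization fall below
				// these percentages are treated as underutilized sources.
				Thresholds: api.ResourceThresholds{
					v1.ResourceCPU:  20,
					v1.ResourcePods: 20,
				},
				// TargetThresholds is intentionally left unset:
				// validateHighUtilizationStrategyConfig rejects it, and
				// setDefaultForThresholds pins every target value to 100.
			},
		},
	}
	nodeutilization.HighNodeUtilization(ctx, client, strategy, nodes, podEvictor)
}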