diff --git a/README.md b/README.md
index d5ffda6a7..f20a79a32 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,11 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you
 strategy evicts pods from `overutilized nodes` (those with usage above `targetThresholds`) to `underutilized nodes`
 (those with usage below `thresholds`), it will abort if any number of `underutilized nodes` or `overutilized nodes` is zero.
 
+Additionally, the strategy accepts a `useDeviationThresholds` parameter.
+If it is set to `true`, the thresholds are interpreted as percentage deviations from the mean resource usage across all nodes:
+`thresholds` is subtracted from the mean to form the lower bound, and `targetThresholds` is added to it to form the upper bound.
+Resource consumption above this window counts as overutilization, and consumption below it counts as underutilization.
+
 **NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
 This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
 design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
@@ -232,6 +237,7 @@ actual usage metrics. Implementing metrics-based descheduling is currently TODO
 |`thresholds`|map(string:int)|
 |`targetThresholds`|map(string:int)|
 |`numberOfNodes`|int|
+|`useDeviationThresholds`|bool|
 |`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
 |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
 |`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 919807c48..ef4c65123 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -96,9 +96,10 @@ type Percentage float64
 type ResourceThresholds map[v1.ResourceName]Percentage
 
 type NodeResourceUtilizationThresholds struct {
-	Thresholds       ResourceThresholds
-	TargetThresholds ResourceThresholds
-	NumberOfNodes    int
+	UseDeviationThresholds bool
+	Thresholds             ResourceThresholds
+	TargetThresholds       ResourceThresholds
+	NumberOfNodes          int
 }
 
 type PodsHavingTooManyRestarts struct {
diff --git a/pkg/api/v1alpha1/types.go b/pkg/api/v1alpha1/types.go
index 43fe680eb..472e2c86a 100644
--- a/pkg/api/v1alpha1/types.go
+++ b/pkg/api/v1alpha1/types.go
@@ -94,9 +94,10 @@ type Percentage float64
 type ResourceThresholds map[v1.ResourceName]Percentage
 
 type NodeResourceUtilizationThresholds struct {
-	Thresholds       ResourceThresholds `json:"thresholds,omitempty"`
-	TargetThresholds ResourceThresholds `json:"targetThresholds,omitempty"`
-	NumberOfNodes    int                `json:"numberOfNodes,omitempty"`
+	UseDeviationThresholds bool               `json:"useDeviationThresholds,omitempty"`
+	Thresholds             ResourceThresholds `json:"thresholds,omitempty"`
+	TargetThresholds       ResourceThresholds `json:"targetThresholds,omitempty"`
+	NumberOfNodes          int                `json:"numberOfNodes,omitempty"`
 }
 
 type PodsHavingTooManyRestarts struct {
diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go
index 8798136f4..038a4ec40 100644
--- a/pkg/api/v1alpha1/zz_generated.conversion.go
+++ b/pkg/api/v1alpha1/zz_generated.conversion.go
@@ -261,6 +261,7 @@ func Convert_api_Namespaces_To_v1alpha1_Namespaces(in *api.Namespaces, out *Name
 }
 
 func autoConvert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtilizationThresholds(in *NodeResourceUtilizationThresholds, out *api.NodeResourceUtilizationThresholds, s conversion.Scope) error {
+	out.UseDeviationThresholds = in.UseDeviationThresholds
 	out.Thresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.Thresholds))
 	out.TargetThresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.TargetThresholds))
 	out.NumberOfNodes = in.NumberOfNodes
@@ -273,6 +274,7 @@ func Convert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtili
 }
 
 func autoConvert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtilizationThresholds(in *api.NodeResourceUtilizationThresholds, out *NodeResourceUtilizationThresholds, s conversion.Scope) error {
+	out.UseDeviationThresholds = in.UseDeviationThresholds
 	out.Thresholds = *(*ResourceThresholds)(unsafe.Pointer(&in.Thresholds))
 	out.TargetThresholds = *(*ResourceThresholds)(unsafe.Pointer(&in.TargetThresholds))
 	out.NumberOfNodes = in.NumberOfNodes
diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
index aa90c317b..9d6b8b695 100644
--- a/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization.go
@@ -64,7 +64,7 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
 
 	sourceNodes, highNodes := classifyNodes(
 		getNodeUsage(nodes, resourceNames, getPodsAssignedToNode),
-		getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames),
+		getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames, getPodsAssignedToNode, false),
 		func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
 			return isNodeWithLowUtilization(usage, threshold.lowResourceThreshold)
 		},
diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go
index dde109359..3e9ff5a28 100644
--- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization.go
@@ -50,31 +50,47 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
 	if strategy.Params != nil {
 		nodeFit = strategy.Params.NodeFit
 	}
-
+	useDeviationThresholds := strategy.Params.NodeResourceUtilizationThresholds.UseDeviationThresholds
 	thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
 	targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
-	if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds); err != nil {
+	if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds, useDeviationThresholds); err != nil {
 		klog.ErrorS(err, "LowNodeUtilization config is not valid")
 		return
 	}
+
 	// check if Pods/CPU/Mem are set, if not, set them to 100
 	if _, ok := thresholds[v1.ResourcePods]; !ok {
-		thresholds[v1.ResourcePods] = MaxResourcePercentage
-		targetThresholds[v1.ResourcePods] = MaxResourcePercentage
+		if useDeviationThresholds {
+			thresholds[v1.ResourcePods] = MinResourcePercentage
+			targetThresholds[v1.ResourcePods] = MinResourcePercentage
+		} else {
+			thresholds[v1.ResourcePods] = MaxResourcePercentage
+			targetThresholds[v1.ResourcePods] = MaxResourcePercentage
+		}
 	}
 	if _, ok := thresholds[v1.ResourceCPU]; !ok {
-		thresholds[v1.ResourceCPU] = MaxResourcePercentage
-		targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
+		if useDeviationThresholds {
+			thresholds[v1.ResourceCPU] = MinResourcePercentage
+			targetThresholds[v1.ResourceCPU] = MinResourcePercentage
+		} else {
+			thresholds[v1.ResourceCPU] = MaxResourcePercentage
+			targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
+		}
 	}
 	if _, ok := thresholds[v1.ResourceMemory]; !ok {
-		thresholds[v1.ResourceMemory] = MaxResourcePercentage
-		targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
+		if useDeviationThresholds {
+			thresholds[v1.ResourceMemory] = MinResourcePercentage
+			targetThresholds[v1.ResourceMemory] = MinResourcePercentage
+		} else {
+			thresholds[v1.ResourceMemory] = MaxResourcePercentage
+			targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
+		}
 	}
 	resourceNames := getResourceNames(thresholds)
 
 	lowNodes, sourceNodes := classifyNodes(
 		getNodeUsage(nodes, resourceNames, getPodsAssignedToNode),
-		getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames),
+		getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames, getPodsAssignedToNode, useDeviationThresholds),
 		// The node has to be schedulable (to be able to move workload there)
 		func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
 			if nodeutil.IsNodeUnschedulable(node) {
@@ -166,7 +182,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
 }
 
 // validateLowUtilizationStrategyConfig checks if the strategy's config is valid
-func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
+func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds, useDeviationThresholds bool) error {
 	// validate thresholds and targetThresholds config
 	if err := validateThresholds(thresholds); err != nil {
 		return fmt.Errorf("thresholds config is not valid: %v", err)
@@ -182,7 +198,7 @@ func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.Resou
 	for resourceName, value := range thresholds {
 		if targetValue, ok := targetThresholds[resourceName]; !ok {
 			return fmt.Errorf("thresholds and targetThresholds configured different resources")
-		} else if value > targetValue {
+		} else if value > targetValue && !useDeviationThresholds {
 			return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName)
 		}
 	}
diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
index 8cdcb299b..6b2c9fa1f 100644
--- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
+++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
@@ -48,6 +48,7 @@ func TestLowNodeUtilization(t *testing.T) {
 
 	testCases := []struct {
 		name                         string
+		useDeviationThresholds       bool
 		thresholds, targetThresholds api.ResourceThresholds
 		nodes                        []*v1.Node
 		pods                         []*v1.Pod
@@ -643,6 +644,57 @@
 			},
 			expectedPodsEvicted: 3,
 		},
+		{
+			name: "deviation thresholds",
+			thresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  5,
+				v1.ResourcePods: 5,
+			},
+			targetThresholds: api.ResourceThresholds{
+				v1.ResourceCPU:  5,
+				v1.ResourcePods: 5,
+			},
+			useDeviationThresholds: true,
+			nodes: []*v1.Node{
+				test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
+				test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
+				test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
+			},
+			pods: []*v1.Pod{
+				test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
+				// These won't be evicted.
+				test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef),
+				test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) {
+					// A pod with local storage.
+					test.SetNormalOwnerRef(pod)
+					pod.Spec.Volumes = []v1.Volume{
+						{
+							Name: "sample",
+							VolumeSource: v1.VolumeSource{
+								HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
+								EmptyDir: &v1.EmptyDirVolumeSource{
+									SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
+							},
+						},
+					}
+					// A Mirror Pod.
+					pod.Annotations = test.GetMirrorPodAnnotation()
+				}),
+				test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) {
+					// A Critical Pod.
+					pod.Namespace = "kube-system"
+					priority := utils.SystemCriticalPriority
+					pod.Spec.Priority = &priority
+				}),
+				test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
+			},
+			expectedPodsEvicted: 2,
+			evictedPods:         []string{},
+		},
 	}
 
 	for _, test := range testCases {
@@ -731,8 +783,9 @@ func TestLowNodeUtilization(t *testing.T) {
 					Enabled: true,
 					Params: &api.StrategyParameters{
 						NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
-							Thresholds:       test.thresholds,
-							TargetThresholds: test.targetThresholds,
+							Thresholds:             test.thresholds,
+							TargetThresholds:       test.targetThresholds,
+							UseDeviationThresholds: test.useDeviationThresholds,
 						},
 						NodeFit: true,
 					},
@@ -890,7 +943,7 @@ func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) {
 	}
 
 	for _, testCase := range tests {
-		validateErr := validateLowUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds)
+		validateErr := validateLowUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds, false)
 
 		if validateErr == nil || testCase.errInfo == nil {
 			if validateErr != testCase.errInfo {
diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
index f9259de34..d9141f5a4 100644
--- a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
@@ -84,12 +84,30 @@ func validateThresholds(thresholds api.ResourceThresholds) error {
 	return nil
 }
 
+func normalizePercentage(percent api.Percentage) api.Percentage {
+	if percent > MaxResourcePercentage {
+		return MaxResourcePercentage
+	}
+	if percent < MinResourcePercentage {
+		return MinResourcePercentage
+	}
+	return percent
+}
+
 func getNodeThresholds(
 	nodes []*v1.Node,
 	lowThreshold, highThreshold api.ResourceThresholds,
 	resourceNames []v1.ResourceName,
+	getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
+	useDeviationThresholds bool,
 ) map[string]NodeThresholds {
 	nodeThresholdsMap := map[string]NodeThresholds{}
+
+	averageResourceUsagePercent := api.ResourceThresholds{}
+	if useDeviationThresholds {
+		averageResourceUsagePercent = averageNodeBasicresources(nodes, getPodsAssignedToNode, resourceNames)
+	}
+
 	for _, node := range nodes {
 		nodeCapacity := node.Status.Capacity
 		if len(node.Status.Allocatable) > 0 {
@@ -102,8 +120,19 @@ func getNodeThresholds(
 		}
 
 		for _, resourceName := range resourceNames {
-			nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, lowThreshold[resourceName])
-			nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, highThreshold[resourceName])
+			if useDeviationThresholds {
+				cap := nodeCapacity[resourceName]
+				if lowThreshold[resourceName] == MinResourcePercentage {
+					nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = &cap
+					nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = &cap
+				} else {
+					nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]-lowThreshold[resourceName]))
+					nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]+highThreshold[resourceName]))
+				}
+			} else {
+				nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, lowThreshold[resourceName])
+				nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, highThreshold[resourceName])
+			}
 		}
 	}
 
@@ -430,3 +459,34 @@ func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*
 
 	return nonRemovablePods, removablePods
 }
+
+func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, resourceNames []v1.ResourceName) api.ResourceThresholds {
+	total := api.ResourceThresholds{}
+	average := api.ResourceThresholds{}
+	numberOfNodes := len(nodes)
+	for _, node := range nodes {
+		pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, nil)
+		if err != nil {
+			numberOfNodes--
+			continue
+		}
+		usage := nodeUtilization(node, pods, resourceNames)
+		nodeCapacity := node.Status.Capacity
+		if len(node.Status.Allocatable) > 0 {
+			nodeCapacity = node.Status.Allocatable
+		}
+		for resource, value := range usage {
+			nodeCapacityValue := nodeCapacity[resource]
+			if resource == v1.ResourceCPU {
+				total[resource] += api.Percentage(value.MilliValue()) / api.Percentage(nodeCapacityValue.MilliValue()) * 100.0
+			} else {
+				total[resource] += api.Percentage(value.Value()) / api.Percentage(nodeCapacityValue.Value()) * 100.0
+
+			}
+		}
+	}
+	for resource, value := range total {
+		average[resource] = value / api.Percentage(numberOfNodes)
+	}
+	return average
+}
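
For reviewers, here is a small standalone sketch of the window arithmetic the README change describes; it is not part of the patch, and the sample numbers plus the `clamp` helper are illustrative stand-ins for `normalizePercentage` and the per-resource bookkeeping done by `averageNodeBasicresources` and `getNodeThresholds`: the mean usage is computed across all nodes, `thresholds` is subtracted from it to get the underutilization bound, `targetThresholds` is added to get the overutilization bound, and both bounds are clamped to [0, 100].

```go
package main

import "fmt"

// clamp mirrors normalizePercentage from the patch: keep percentages within [0, 100].
func clamp(p float64) float64 {
	if p > 100 {
		return 100
	}
	if p < 0 {
		return 0
	}
	return p
}

func main() {
	// Hypothetical per-node CPU request usage, in percent of allocatable capacity.
	nodeUsage := []float64{80, 50, 20}

	// Illustrative strategy parameters with useDeviationThresholds enabled.
	threshold := 10.0       // thresholds["cpu"]
	targetThreshold := 20.0 // targetThresholds["cpu"]

	// Mean usage across nodes (what averageNodeBasicresources computes per resource).
	mean := 0.0
	for _, u := range nodeUsage {
		mean += u
	}
	mean /= float64(len(nodeUsage))

	// Deviation window derived in getNodeThresholds: below low a node is
	// underutilized, above high it is overutilized.
	low := clamp(mean - threshold)
	high := clamp(mean + targetThreshold)
	fmt.Printf("mean=%.0f%% window=[%.0f%%, %.0f%%]\n", mean, low, high) // mean=50% window=[40%, 70%]
}
```

In the v1alpha1 policy this surfaces as a `useDeviationThresholds` field under `nodeResourceUtilizationThresholds` (per the json tag added in pkg/api/v1alpha1/types.go); with the numbers above, only nodes whose usage falls outside the 40–70% band would be classified for balancing.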