In some cases it might be useful to limit how many evictions per domain can be performed, to avoid burning the whole per-descheduling-cycle budget. Limiting the number of evictions per node is a prerequisite for evicting pods whose usage can't be easily subtracted from the overall node resource usage to predict the final usage, e.g. when a pod is evicted due to high PSI pressure, which takes into account many factors that can't be fully captured by the current predictive resource model.
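A minimal sketch of that per-node cap (a hypothetical helper, mirroring the check performed in evictPods below): count successful evictions and stop once an optional limit is reached.

// capEvictions is a hypothetical, illustrative helper; it is not part of the upstream API.
func capEvictions(podNames []string, limit *uint, evict func(string) error) uint {
    var evicted uint
    for _, name := range podNames {
        if limit != nil && evicted >= *limit {
            break // per-node eviction budget exhausted
        }
        if err := evict(name); err == nil {
            evicted++
        }
    }
    return evicted
}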
/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodeutilization

import (
    "context"
    "math"
    "sort"

    "sigs.k8s.io/descheduler/pkg/api"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/apimachinery/pkg/util/sets"
    "k8s.io/klog/v2"
    "sigs.k8s.io/descheduler/pkg/descheduler/evictions"
    nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
    podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
    frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
    "sigs.k8s.io/descheduler/pkg/utils"
)

// NodeUsage stores a node's info, the pods on it, and its resource usage
type NodeUsage struct {
    node    *v1.Node
    usage   api.ReferencedResourceList
    allPods []*v1.Pod
}

type NodeThresholds struct {
    lowResourceThreshold  api.ReferencedResourceList
    highResourceThreshold api.ReferencedResourceList
}

type NodeInfo struct {
    NodeUsage
    thresholds NodeThresholds
}

type continueEvictionCond func(nodeInfo NodeInfo, totalAvailableUsage api.ReferencedResourceList) bool

// NodePodsMap is a set of (node, pods) pairs
type NodePodsMap map[*v1.Node][]*v1.Pod

const (
    // MinResourcePercentage is the minimum value of a resource's percentage
    MinResourcePercentage = 0
    // MaxResourcePercentage is the maximum value of a resource's percentage
    MaxResourcePercentage = 100
)

func normalizePercentage(percent api.Percentage) api.Percentage {
    if percent > MaxResourcePercentage {
        return MaxResourcePercentage
    }
    if percent < MinResourcePercentage {
        return MinResourcePercentage
    }
    return percent
}

func getNodeThresholds(
    nodes []*v1.Node,
    lowThreshold, highThreshold api.ResourceThresholds,
    resourceNames []v1.ResourceName,
    useDeviationThresholds bool,
    usageClient usageClient,
) map[string]NodeThresholds {
    nodeThresholdsMap := map[string]NodeThresholds{}

    averageResourceUsagePercent := api.ResourceThresholds{}
    if useDeviationThresholds {
        averageResourceUsagePercent = averageNodeBasicresources(nodes, usageClient)
    }

    for _, node := range nodes {
        nodeCapacity := node.Status.Capacity
        if len(node.Status.Allocatable) > 0 {
            nodeCapacity = node.Status.Allocatable
        }

        nodeThresholdsMap[node.Name] = NodeThresholds{
            lowResourceThreshold:  api.ReferencedResourceList{},
            highResourceThreshold: api.ReferencedResourceList{},
        }

        for _, resourceName := range resourceNames {
            if useDeviationThresholds {
                cap := nodeCapacity[resourceName]
                if lowThreshold[resourceName] == MinResourcePercentage {
                    nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = &cap
                    nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = &cap
                } else {
                    nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]-lowThreshold[resourceName]))
                    nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]+highThreshold[resourceName]))
                }
            } else {
                nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, lowThreshold[resourceName])
                nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, highThreshold[resourceName])
            }
        }
    }
    return nodeThresholdsMap
}
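
// Worked example (illustrative numbers): with useDeviationThresholds enabled and an average
// CPU usage of 50% across nodes, lowThreshold=10 and highThreshold=10 yield a per-node low
// threshold of 40% and a high threshold of 60% of that node's allocatable CPU. If the
// configured lowThreshold is 0 (MinResourcePercentage), both thresholds collapse to the
// node's full capacity.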

func getNodeUsage(
    nodes []*v1.Node,
    usageClient usageClient,
) []NodeUsage {
    var nodeUsageList []NodeUsage

    for _, node := range nodes {
        nodeUsageList = append(nodeUsageList, NodeUsage{
            node:    node,
            usage:   usageClient.nodeUtilization(node.Name),
            allPods: usageClient.pods(node.Name),
        })
    }

    return nodeUsageList
}

func resourceThreshold(nodeCapacity v1.ResourceList, resourceName v1.ResourceName, threshold api.Percentage) *resource.Quantity {
    defaultFormat := resource.DecimalSI
    if resourceName == v1.ResourceMemory {
        defaultFormat = resource.BinarySI
    }

    resourceCapacityFraction := func(resourceNodeCapacity int64) int64 {
        // A threshold is a percentage in the <0;100> interval.
        // Computing `threshold * 0.01` converts the <0;100> interval into <0;1>.
        // Multiplying that by the capacity gives the fraction of the capacity corresponding to the given resource threshold, in Quantity units.
        return int64(float64(threshold) * 0.01 * float64(resourceNodeCapacity))
    }

    resourceCapacityQuantity := nodeCapacity.Name(resourceName, defaultFormat)

    if resourceName == v1.ResourceCPU {
        return resource.NewMilliQuantity(resourceCapacityFraction(resourceCapacityQuantity.MilliValue()), defaultFormat)
    }
    return resource.NewQuantity(resourceCapacityFraction(resourceCapacityQuantity.Value()), defaultFormat)
}
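
// Worked example (illustrative numbers): for a node with 4 CPUs (4000m) allocatable and a
// threshold of 20, resourceThreshold returns 4000m * 20 * 0.01 = 800m. For 8Gi of memory and
// the same threshold it returns 8589934592 * 0.2 ≈ 1717986918 bytes (~1.6Gi), formatted as
// BinarySI.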

func roundTo2Decimals(percentage float64) float64 {
    return math.Round(percentage*100) / 100
}

func resourceUsagePercentages(nodeUsage NodeUsage) map[v1.ResourceName]float64 {
    nodeCapacity := nodeUsage.node.Status.Capacity
    if len(nodeUsage.node.Status.Allocatable) > 0 {
        nodeCapacity = nodeUsage.node.Status.Allocatable
    }

    resourceUsagePercentage := map[v1.ResourceName]float64{}
    for resourceName, resourceUsage := range nodeUsage.usage {
        cap := nodeCapacity[resourceName]
        if !cap.IsZero() {
            resourceUsagePercentage[resourceName] = 100 * float64(resourceUsage.MilliValue()) / float64(cap.MilliValue())
            resourceUsagePercentage[resourceName] = roundTo2Decimals(resourceUsagePercentage[resourceName])
        }
    }

    return resourceUsagePercentage
}
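
// Worked example (illustrative numbers): a node using 1500m of 4000m allocatable CPU reports
// 100 * 1500 / 4000 = 37.5. MilliValue() is used for both usage and capacity, so the ratio is
// unit-consistent for any resource, and roundTo2Decimals trims the result to two decimal places.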

// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
// low and high thresholds, it is simply ignored.
func classifyNodes(
    nodeUsages []NodeUsage,
    nodeThresholds map[string]NodeThresholds,
    lowThresholdFilter, highThresholdFilter func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool,
) ([]NodeInfo, []NodeInfo) {
    lowNodes, highNodes := []NodeInfo{}, []NodeInfo{}

    for _, nodeUsage := range nodeUsages {
        nodeInfo := NodeInfo{
            NodeUsage:  nodeUsage,
            thresholds: nodeThresholds[nodeUsage.node.Name],
        }
        if lowThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
            klog.InfoS("Node is underutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
            lowNodes = append(lowNodes, nodeInfo)
        } else if highThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
            klog.InfoS("Node is overutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
            highNodes = append(highNodes, nodeInfo)
        } else {
            klog.InfoS("Node is appropriately utilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
        }
    }

    return lowNodes, highNodes
}

func usageToKeysAndValues(usage api.ReferencedResourceList) []interface{} {
    // log message in one line
    keysAndValues := []interface{}{}
    if quantity, exists := usage[v1.ResourceCPU]; exists {
        keysAndValues = append(keysAndValues, "CPU", quantity.MilliValue())
    }
    if quantity, exists := usage[v1.ResourceMemory]; exists {
        keysAndValues = append(keysAndValues, "Mem", quantity.Value())
    }
    if quantity, exists := usage[v1.ResourcePods]; exists {
        keysAndValues = append(keysAndValues, "Pods", quantity.Value())
    }
    for name := range usage {
        if !nodeutil.IsBasicResource(name) {
            keysAndValues = append(keysAndValues, string(name), usage[name].Value())
        }
    }
    return keysAndValues
}

// evictPodsFromSourceNodes evicts pods based on priority if all the pods on the node have a
// priority set; if not, it falls back to evicting them based on QoS class.
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromSourceNodes(
    ctx context.Context,
    evictableNamespaces *api.Namespaces,
    sourceNodes, destinationNodes []NodeInfo,
    podEvictor frameworktypes.Evictor,
    evictOptions evictions.EvictOptions,
    podFilter func(pod *v1.Pod) bool,
    resourceNames []v1.ResourceName,
    continueEviction continueEvictionCond,
    usageClient usageClient,
    maxNoOfPodsToEvictPerNode *uint,
) {
    // upper bound on total number of pods/cpu/memory and optional extended resources to be moved
    totalAvailableUsage := api.ReferencedResourceList{}
    for _, resourceName := range resourceNames {
        totalAvailableUsage[resourceName] = &resource.Quantity{}
    }

    taintsOfDestinationNodes := make(map[string][]v1.Taint, len(destinationNodes))
    for _, node := range destinationNodes {
        taintsOfDestinationNodes[node.node.Name] = node.node.Spec.Taints

        for _, name := range resourceNames {
            if _, exists := node.usage[name]; !exists {
                klog.Errorf("unable to find %q resource in node's %q usage, terminating eviction", name, node.node.Name)
                return
            }
            if _, ok := totalAvailableUsage[name]; !ok {
                totalAvailableUsage[name] = resource.NewQuantity(0, resource.DecimalSI)
            }
            totalAvailableUsage[name].Add(*node.thresholds.highResourceThreshold[name])
            totalAvailableUsage[name].Sub(*node.usage[name])
        }
    }

    // log message in one line
    klog.V(1).InfoS("Total capacity to be moved", usageToKeysAndValues(totalAvailableUsage)...)

    for _, node := range sourceNodes {
        klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage)

        nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter)
        klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods))

        if len(removablePods) == 0 {
            klog.V(1).InfoS("No removable pods on node, try next node", "node", klog.KObj(node.node))
            continue
        }

        klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
        // sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
        podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
        err := evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction, usageClient, maxNoOfPodsToEvictPerNode)
        if err != nil {
            switch err.(type) {
            case *evictions.EvictionTotalLimitError:
                return
            default:
            }
        }
    }
}
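
// Worked example (illustrative numbers): a single underutilized destination node whose high
// CPU threshold is 2400m and whose current usage is 1000m contributes 2400m - 1000m = 1400m to
// totalAvailableUsage, i.e. roughly that much CPU worth of pods can be moved off the
// overutilized nodes before the destination would itself cross its high threshold.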

func evictPods(
    ctx context.Context,
    evictableNamespaces *api.Namespaces,
    inputPods []*v1.Pod,
    nodeInfo NodeInfo,
    totalAvailableUsage api.ReferencedResourceList,
    taintsOfLowNodes map[string][]v1.Taint,
    podEvictor frameworktypes.Evictor,
    evictOptions evictions.EvictOptions,
    continueEviction continueEvictionCond,
    usageClient usageClient,
    maxNoOfPodsToEvictPerNode *uint,
) error {
    var excludedNamespaces sets.Set[string]
    if evictableNamespaces != nil {
        excludedNamespaces = sets.New(evictableNamespaces.Exclude...)
    }

    var evictionCounter uint = 0
    if continueEviction(nodeInfo, totalAvailableUsage) {
        for _, pod := range inputPods {
            if maxNoOfPodsToEvictPerNode != nil && evictionCounter >= *maxNoOfPodsToEvictPerNode {
                klog.V(3).InfoS("Max number of evictions per node per plugin reached", "limit", *maxNoOfPodsToEvictPerNode)
                break
            }
            if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
                klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod))
                continue
            }

            preEvictionFilterWithOptions, err := podutil.NewOptions().
                WithFilter(podEvictor.PreEvictionFilter).
                WithoutNamespaces(excludedNamespaces).
                BuildFilterFunc()
            if err != nil {
                klog.ErrorS(err, "could not build preEvictionFilter with namespace exclusion")
                continue
            }

            if !preEvictionFilterWithOptions(pod) {
                continue
            }
            podUsage, err := usageClient.podUsage(pod)
            if err != nil {
                klog.Errorf("unable to get pod usage for %v/%v: %v", pod.Namespace, pod.Name, err)
                continue
            }
            err = podEvictor.Evict(ctx, pod, evictOptions)
            if err == nil {
                evictionCounter++
                klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))

                for name := range totalAvailableUsage {
                    if name == v1.ResourcePods {
                        nodeInfo.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
                        totalAvailableUsage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
                    } else {
                        nodeInfo.usage[name].Sub(*podUsage[name])
                        totalAvailableUsage[name].Sub(*podUsage[name])
                    }
                }

                keysAndValues := []interface{}{
                    "node", nodeInfo.node.Name,
                }
                keysAndValues = append(keysAndValues, usageToKeysAndValues(nodeInfo.usage)...)
                klog.V(3).InfoS("Updated node usage", keysAndValues...)
                // check if pods can be still evicted
                if !continueEviction(nodeInfo, totalAvailableUsage) {
                    break
                }
                continue
            }
            switch err.(type) {
            case *evictions.EvictionNodeLimitError, *evictions.EvictionTotalLimitError:
                return err
            default:
                klog.Errorf("eviction failed: %v", err)
            }
        }
    }
    return nil
}
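
// Worked example (illustrative numbers): after successfully evicting a pod that uses 200m CPU
// and 256Mi of memory, both the node's tracked usage and the remaining totalAvailableUsage
// shrink by those amounts, while the pods resource is decremented by exactly one; the loop
// stops early once continueEviction returns false or the per-node limit is reached.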

// sortNodesByUsage sorts nodes based on usage according to the given plugin.
func sortNodesByUsage(nodes []NodeInfo, ascending bool) {
    sort.Slice(nodes, func(i, j int) bool {
        ti := resource.NewQuantity(0, resource.DecimalSI).Value()
        tj := resource.NewQuantity(0, resource.DecimalSI).Value()
        for resourceName := range nodes[i].usage {
            if resourceName == v1.ResourceCPU {
                ti += nodes[i].usage[resourceName].MilliValue()
            } else {
                ti += nodes[i].usage[resourceName].Value()
            }
        }
        for resourceName := range nodes[j].usage {
            if resourceName == v1.ResourceCPU {
                tj += nodes[j].usage[resourceName].MilliValue()
            } else {
                tj += nodes[j].usage[resourceName].Value()
            }
        }

        // Return ascending order for HighNodeUtilization plugin
        if ascending {
            return ti < tj
        }

        // Return descending order for LowNodeUtilization plugin
        return ti > tj
    })
}

// isNodeAboveTargetUtilization checks if a node is overutilized
// At least one resource has to be above the high threshold
func isNodeAboveTargetUtilization(usage NodeUsage, threshold api.ReferencedResourceList) bool {
    for name, nodeValue := range usage.usage {
        // usage.highResourceThreshold[name] < nodeValue
        if threshold[name].Cmp(*nodeValue) == -1 {
            return true
        }
    }
    return false
}

// isNodeWithLowUtilization checks if a node is underutilized
// All resources have to be below the low threshold
func isNodeWithLowUtilization(usage NodeUsage, threshold api.ReferencedResourceList) bool {
    for name, nodeValue := range usage.usage {
        // usage.lowResourceThreshold[name] < nodeValue
        if threshold[name].Cmp(*nodeValue) == -1 {
            return false
        }
    }

    return true
}

// getResourceNames returns list of resource names in resource thresholds
func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
    resourceNames := make([]v1.ResourceName, 0, len(thresholds))
    for name := range thresholds {
        resourceNames = append(resourceNames, name)
    }
    return resourceNames
}

func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
    var nonRemovablePods, removablePods []*v1.Pod

    for _, pod := range pods {
        if !filter(pod) {
            nonRemovablePods = append(nonRemovablePods, pod)
        } else {
            removablePods = append(removablePods, pod)
        }
    }

    return nonRemovablePods, removablePods
}

func averageNodeBasicresources(nodes []*v1.Node, usageClient usageClient) api.ResourceThresholds {
    total := api.ResourceThresholds{}
    average := api.ResourceThresholds{}
    numberOfNodes := len(nodes)
    for _, node := range nodes {
        usage := usageClient.nodeUtilization(node.Name)
        nodeCapacity := node.Status.Capacity
        if len(node.Status.Allocatable) > 0 {
            nodeCapacity = node.Status.Allocatable
        }
        for resource, value := range usage {
            nodeCapacityValue := nodeCapacity[resource]
            if resource == v1.ResourceCPU {
                total[resource] += api.Percentage(value.MilliValue()) / api.Percentage(nodeCapacityValue.MilliValue()) * 100.0
            } else {
                total[resource] += api.Percentage(value.Value()) / api.Percentage(nodeCapacityValue.Value()) * 100.0
            }
        }
    }
    for resource, value := range total {
        average[resource] = value / api.Percentage(numberOfNodes)
    }
    return average
}