/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package strategies

import (
	"context"
	"sort"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog"

	"sigs.k8s.io/descheduler/pkg/api"
	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
	"sigs.k8s.io/descheduler/pkg/utils"
)

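// NodeUsageMap associates a node with its computed resource usage (expressed
// as percentages of allocatable capacity) and the full list of pods running on it.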
type NodeUsageMap struct {
	node    *v1.Node
	usage   api.ResourceThresholds
	allPods []*v1.Pod
}

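// NodePodsMap maps each node to the pods scheduled onto it.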
type NodePodsMap map[*v1.Node][]*v1.Pod

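// LowNodeUtilization evicts pods from nodes whose usage is above the configured
// targetThresholds so that they can be rescheduled onto under-utilized nodes,
// i.e. nodes whose usage is below the configured thresholds for cpu, memory and pods.
//
// Illustrative example (the values are hypothetical, not defaults, and
// api.StrategyParameters is assumed to be the params struct type):
//
//	strategy := api.DeschedulerStrategy{
//		Enabled: true,
//		Params: api.StrategyParameters{
//			NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
//				Thresholds:       api.ResourceThresholds{v1.ResourceCPU: 20, v1.ResourceMemory: 20, v1.ResourcePods: 20},
//				TargetThresholds: api.ResourceThresholds{v1.ResourceCPU: 50, v1.ResourceMemory: 50, v1.ResourcePods: 50},
//			},
//		},
//	}
//
// With these values a node at 10% CPU, 15% memory and 5% pod usage is treated
// as under-utilized, while a node exceeding 50% on any of the three resources
// becomes a source of evictions.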
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, evictLocalStoragePods bool, podEvictor *evictions.PodEvictor) {
	if !strategy.Enabled {
		return
	}
	// TODO: Move this to config validation?
	// TODO: Maybe create a struct for the strategy as well, so that we don't have to pass along all the params?
	if strategy.Params.NodeResourceUtilizationThresholds == nil {
		klog.V(1).Infof("NodeResourceUtilizationThresholds not set")
		return
	}

	thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
	if !validateThresholds(thresholds) {
		return
	}
	targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
	if !validateTargetThresholds(targetThresholds) {
		return
	}

	npm := createNodePodsMap(ctx, client, nodes)
	lowNodes, targetNodes := classifyNodes(npm, thresholds, targetThresholds, evictLocalStoragePods)

	klog.V(1).Infof("Criteria for a node under utilization: CPU: %v, Mem: %v, Pods: %v",
		thresholds[v1.ResourceCPU], thresholds[v1.ResourceMemory], thresholds[v1.ResourcePods])

	if len(lowNodes) == 0 {
		klog.V(1).Infof("No node is underutilized, nothing to do here, you might tune your thresholds further")
		return
	}
	klog.V(1).Infof("Total number of underutilized nodes: %v", len(lowNodes))

	if len(lowNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
		klog.V(1).Infof("number of nodes underutilized (%v) is less than NumberOfNodes (%v), nothing to do here", len(lowNodes), strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
		return
	}

	if len(lowNodes) == len(nodes) {
		klog.V(1).Infof("all nodes are underutilized, nothing to do here")
		return
	}

	if len(targetNodes) == 0 {
		klog.V(1).Infof("all nodes are under target utilization, nothing to do here")
		return
	}

	klog.V(1).Infof("Criteria for a node above target utilization: CPU: %v, Mem: %v, Pods: %v",
		targetThresholds[v1.ResourceCPU], targetThresholds[v1.ResourceMemory], targetThresholds[v1.ResourcePods])
	klog.V(1).Infof("Total number of nodes above target utilization: %v", len(targetNodes))

	evictPodsFromTargetNodes(
		ctx,
		targetNodes,
		lowNodes,
		targetThresholds,
		evictLocalStoragePods,
		podEvictor)

	klog.V(1).Infof("Total number of pods evicted: %v", podEvictor.TotalEvicted())
}

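// validateThresholds returns true only if at least one threshold is configured
// and every configured threshold refers to cpu, memory, or pods.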
func validateThresholds(thresholds api.ResourceThresholds) bool {
	if thresholds == nil || len(thresholds) == 0 {
		klog.V(1).Infof("no resource threshold is configured")
		return false
	}
	for name := range thresholds {
		switch name {
		case v1.ResourceCPU:
			continue
		case v1.ResourceMemory:
			continue
		case v1.ResourcePods:
			continue
		default:
			klog.Errorf("only cpu, memory, or pods thresholds can be specified")
			return false
		}
	}
	return true
}

// validateTargetThresholds could be merged into validateThresholds once the requirements are clear.
func validateTargetThresholds(targetThresholds api.ResourceThresholds) bool {
	if targetThresholds == nil {
		klog.V(1).Infof("no target resource threshold is configured")
		return false
	} else if _, ok := targetThresholds[v1.ResourcePods]; !ok {
		klog.V(1).Infof("no target resource threshold for pods is configured")
		return false
	}
	return true
}

// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
// low and high thresholds, it is simply ignored.
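// A node is under-utilized only when its usage is below every configured
// threshold (and the node is schedulable); it is over-utilized when its usage
// exceeds any of the targetThresholds. See IsNodeWithLowUtilization and
// IsNodeAboveTargetUtilization.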
func classifyNodes(npm NodePodsMap, thresholds api.ResourceThresholds, targetThresholds api.ResourceThresholds, evictLocalStoragePods bool) ([]NodeUsageMap, []NodeUsageMap) {
	lowNodes, targetNodes := []NodeUsageMap{}, []NodeUsageMap{}
	for node, pods := range npm {
		usage := nodeUtilization(node, pods, evictLocalStoragePods)
		nuMap := NodeUsageMap{
			node:    node,
			usage:   usage,
			allPods: pods,
		}
		// Check if node is underutilized and if we can schedule pods on it.
		if !nodeutil.IsNodeUnschedulable(node) && IsNodeWithLowUtilization(usage, thresholds) {
			klog.V(2).Infof("Node %#v is under utilized with usage: %#v", node.Name, usage)
			lowNodes = append(lowNodes, nuMap)
		} else if IsNodeAboveTargetUtilization(usage, targetThresholds) {
			klog.V(2).Infof("Node %#v is over utilized with usage: %#v", node.Name, usage)
			targetNodes = append(targetNodes, nuMap)
		} else {
			klog.V(2).Infof("Node %#v is appropriately utilized with usage: %#v", node.Name, usage)
		}
	}
	return lowNodes, targetNodes
}

// evictPodsFromTargetNodes evicts pods based on priority if all the pods on the
// node have a priority set; otherwise it falls back to evicting them based on
// their QoS class.
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromTargetNodes(
	ctx context.Context,
	targetNodes, lowNodes []NodeUsageMap,
	targetThresholds api.ResourceThresholds,
	evictLocalStoragePods bool,
	podEvictor *evictions.PodEvictor,
) {

	SortNodesByUsage(targetNodes)

	// upper bound on total number of pods/cpu/memory to be moved
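	// For example (illustrative numbers only): with targetThresholds[pods] at 50
	// and an under-utilized node at 20% of a 110-pod capacity, (50-20)*110/100 = 33
	// more pods could land on that node, so totalPods grows by 33. CPU is
	// accumulated in millicores and memory in bytes.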
	var totalPods, totalCPU, totalMem float64
	var taintsOfLowNodes = make(map[string][]v1.Taint, len(lowNodes))
	for _, node := range lowNodes {
		taintsOfLowNodes[node.node.Name] = node.node.Spec.Taints
		nodeCapacity := node.node.Status.Capacity
		if len(node.node.Status.Allocatable) > 0 {
			nodeCapacity = node.node.Status.Allocatable
		}
		// totalPods to be moved
		podsPercentage := targetThresholds[v1.ResourcePods] - node.usage[v1.ResourcePods]
		totalPods += ((float64(podsPercentage) * float64(nodeCapacity.Pods().Value())) / 100)

		// totalCPU capacity to be moved
		if _, ok := targetThresholds[v1.ResourceCPU]; ok {
			cpuPercentage := targetThresholds[v1.ResourceCPU] - node.usage[v1.ResourceCPU]
			totalCPU += ((float64(cpuPercentage) * float64(nodeCapacity.Cpu().MilliValue())) / 100)
		}

		// totalMem capacity to be moved
		if _, ok := targetThresholds[v1.ResourceMemory]; ok {
			memPercentage := targetThresholds[v1.ResourceMemory] - node.usage[v1.ResourceMemory]
			totalMem += ((float64(memPercentage) * float64(nodeCapacity.Memory().Value())) / 100)
		}
	}

	klog.V(1).Infof("Total capacity to be moved: CPU:%v, Mem:%v, Pods:%v", totalCPU, totalMem, totalPods)
	klog.V(1).Infof("********Number of pods evicted from each node:***********")

	for _, node := range targetNodes {
		nodeCapacity := node.node.Status.Capacity
		if len(node.node.Status.Allocatable) > 0 {
			nodeCapacity = node.node.Status.Allocatable
		}
		klog.V(3).Infof("evicting pods from node %#v with usage: %#v", node.node.Name, node.usage)

		nonRemovablePods, bestEffortPods, burstablePods, guaranteedPods := classifyPods(node.allPods, evictLocalStoragePods)
		klog.V(2).Infof("allPods:%v, nonRemovablePods:%v, bestEffortPods:%v, burstablePods:%v, guaranteedPods:%v", len(node.allPods), len(nonRemovablePods), len(bestEffortPods), len(burstablePods), len(guaranteedPods))

		// Check if one pod has priority; if yes, assume that all pods have priority and evict pods based on priority.
		if node.allPods[0].Spec.Priority != nil {
			klog.V(1).Infof("All pods have priority associated with them. Evicting pods based on priority")
			evictablePods := make([]*v1.Pod, 0)
			evictablePods = append(append(burstablePods, bestEffortPods...), guaranteedPods...)

			// Sort the evictable pods based on priority. If there are multiple pods with the same priority, they are sorted based on their QoS tiers.
			sortPodsBasedOnPriority(evictablePods)
			evictPods(ctx, evictablePods, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, taintsOfLowNodes, podEvictor, node.node)
		} else {
			// TODO: Remove this when we support only priority.
			// Falling back to evicting pods based on QoS classes.
			klog.V(1).Infof("Evicting pods based on QoS")
			klog.V(1).Infof("There are %v non-evictable pods on the node", len(nonRemovablePods))
			// evict best effort pods
			evictPods(ctx, bestEffortPods, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, taintsOfLowNodes, podEvictor, node.node)
			// evict burstable pods
			evictPods(ctx, burstablePods, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, taintsOfLowNodes, podEvictor, node.node)
			// evict guaranteed pods
			evictPods(ctx, guaranteedPods, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, taintsOfLowNodes, podEvictor, node.node)
		}
		klog.V(1).Infof("%v pods evicted from node %#v with usage %v", podEvictor.NodeEvicted(node.node), node.node.Name, node.usage)
	}
}

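// evictPods tries to evict the given pods from node while the node is still
// above the target utilization and there is still capacity (pods, cpu, memory)
// left to move to the under-utilized nodes. After each successful eviction it
// decrements nodeUsage and the remaining totals by the evicted pod's requests.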
func evictPods(
	ctx context.Context,
	inputPods []*v1.Pod,
	targetThresholds api.ResourceThresholds,
	nodeCapacity v1.ResourceList,
	nodeUsage api.ResourceThresholds,
	totalPods *float64,
	totalCPU *float64,
	totalMem *float64,
	taintsOfLowNodes map[string][]v1.Taint,
	podEvictor *evictions.PodEvictor,
	node *v1.Node) {
	if IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) && (*totalPods > 0 || *totalCPU > 0 || *totalMem > 0) {
		onePodPercentage := api.Percentage((float64(1) * 100) / float64(nodeCapacity.Pods().Value()))
		for _, pod := range inputPods {
			if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
				klog.V(3).Infof("Skipping eviction for Pod: %#v, doesn't tolerate node taint", pod.Name)
				continue
			}

			cUsage := utils.GetResourceRequest(pod, v1.ResourceCPU)
			mUsage := utils.GetResourceRequest(pod, v1.ResourceMemory)

			success, err := podEvictor.EvictPod(ctx, pod, node)
			if err != nil {
				break
			}

			if success {
				klog.V(3).Infof("Evicted pod: %#v", pod.Name)
				// update remaining pods
				nodeUsage[v1.ResourcePods] -= onePodPercentage
				*totalPods--

				// update remaining cpu
				*totalCPU -= float64(cUsage)
				nodeUsage[v1.ResourceCPU] -= api.Percentage((float64(cUsage) * 100) / float64(nodeCapacity.Cpu().MilliValue()))

				// update remaining memory
				*totalMem -= float64(mUsage)
				nodeUsage[v1.ResourceMemory] -= api.Percentage(float64(mUsage) / float64(nodeCapacity.Memory().Value()) * 100)

				klog.V(3).Infof("updated node usage: %#v", nodeUsage)
				// check if node utilization drops below target threshold or required capacity (cpu, memory, pods) is moved
				if !IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) || (*totalPods <= 0 && *totalCPU <= 0 && *totalMem <= 0) {
					break
				}
			}
		}
	}
}

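// SortNodesByUsage sorts nodes in descending order of their combined cpu,
// memory, and pods usage percentages, so that the most utilized nodes are
// processed first.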
func SortNodesByUsage(nodes []NodeUsageMap) {
	sort.Slice(nodes, func(i, j int) bool {
		var ti, tj api.Percentage
		for name, value := range nodes[i].usage {
			if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
				ti += value
			}
		}
		for name, value := range nodes[j].usage {
			if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
				tj += value
			}
		}
		// To return sorted in descending order
		return ti > tj
	})
}

// sortPodsBasedOnPriority sorts pods based on priority and if their priorities are equal, they are sorted based on QoS tiers.
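// Pods without a priority sort before pods with one; among pods of equal (or
// absent) priority, BestEffort pods sort before Burstable, which sort before
// Guaranteed, so the cheapest pods to displace come first.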
func sortPodsBasedOnPriority(evictablePods []*v1.Pod) {
	sort.Slice(evictablePods, func(i, j int) bool {
		if evictablePods[i].Spec.Priority == nil && evictablePods[j].Spec.Priority != nil {
			return true
		}
		if evictablePods[j].Spec.Priority == nil && evictablePods[i].Spec.Priority != nil {
			return false
		}
		if (evictablePods[j].Spec.Priority == nil && evictablePods[i].Spec.Priority == nil) || (*evictablePods[i].Spec.Priority == *evictablePods[j].Spec.Priority) {
			if podutil.IsBestEffortPod(evictablePods[i]) {
				return true
			}
			if podutil.IsBurstablePod(evictablePods[i]) && podutil.IsGuaranteedPod(evictablePods[j]) {
				return true
			}
			return false
		}
		return *evictablePods[i].Spec.Priority < *evictablePods[j].Spec.Priority
	})
}

// createNodePodsMap builds a NodePodsMap with the pods found on each node;
// nodes whose pods cannot be listed are skipped.
func createNodePodsMap(ctx context.Context, client clientset.Interface, nodes []*v1.Node) NodePodsMap {
	npm := NodePodsMap{}
	for _, node := range nodes {
		pods, err := podutil.ListPodsOnANode(ctx, client, node)
		if err != nil {
			klog.Warningf("node %s will not be processed, error in accessing its pods (%#v)", node.Name, err)
		} else {
			npm[node] = pods
		}
	}
	return npm
}

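// IsNodeAboveTargetUtilization returns true if the node's usage exceeds any
// configured threshold among cpu, memory, and pods.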
func IsNodeAboveTargetUtilization(nodeThresholds api.ResourceThresholds, thresholds api.ResourceThresholds) bool {
	for name, nodeValue := range nodeThresholds {
		if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
			if value, ok := thresholds[name]; !ok {
				continue
			} else if nodeValue > value {
				return true
			}
		}
	}
	return false
}

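// IsNodeWithLowUtilization returns true only if the node's usage is at or
// below every configured threshold (cpu, memory, and pods).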
func IsNodeWithLowUtilization(nodeThresholds api.ResourceThresholds, thresholds api.ResourceThresholds) bool {
	for name, nodeValue := range nodeThresholds {
		if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
			if value, ok := thresholds[name]; !ok {
				continue
			} else if nodeValue > value {
				return false
			}
		}
	}
	return true
}

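// nodeUtilization computes the node's usage as percentages of its allocatable
// capacity (falling back to capacity): cpu and memory are the sums of the pod
// requests, pods is the pod count. For example (illustrative numbers only),
// 2000m of CPU requests on a node with 4000m allocatable is reported as 50.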
func nodeUtilization(node *v1.Node, pods []*v1.Pod, evictLocalStoragePods bool) api.ResourceThresholds {
	totalReqs := map[v1.ResourceName]*resource.Quantity{
		v1.ResourceCPU:    {},
		v1.ResourceMemory: {},
	}
	for _, pod := range pods {
		req, _ := utils.PodRequestsAndLimits(pod)
		for name, quantity := range req {
			if name == v1.ResourceCPU || name == v1.ResourceMemory {
				// As Quantity.Add says: Add adds the provided y quantity to the current value. If the current value is zero,
				// the format of the quantity will be updated to the format of y.
				totalReqs[name].Add(quantity)
			}
		}
	}

	nodeCapacity := node.Status.Capacity
	if len(node.Status.Allocatable) > 0 {
		nodeCapacity = node.Status.Allocatable
	}

	totalPods := len(pods)
	return api.ResourceThresholds{
		v1.ResourceCPU:    api.Percentage((float64(totalReqs[v1.ResourceCPU].MilliValue()) * 100) / float64(nodeCapacity.Cpu().MilliValue())),
		v1.ResourceMemory: api.Percentage(float64(totalReqs[v1.ResourceMemory].Value()) / float64(nodeCapacity.Memory().Value()) * 100),
		v1.ResourcePods:   api.Percentage((float64(totalPods) * 100) / float64(nodeCapacity.Pods().Value())),
	}
}

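// classifyPods splits the pods into non-removable pods and, for the removable
// ones, one bucket per QoS class (BestEffort, Burstable, Guaranteed).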
func classifyPods(pods []*v1.Pod, evictLocalStoragePods bool) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) {
	var nonRemovablePods, bestEffortPods, burstablePods, guaranteedPods []*v1.Pod

	// From https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/
	//
	// For a Pod to be given a QoS class of Guaranteed:
	// - every Container in the Pod must have a memory limit and a memory request, and they must be the same.
	// - every Container in the Pod must have a CPU limit and a CPU request, and they must be the same.
	// A Pod is given a QoS class of Burstable if:
	// - the Pod does not meet the criteria for QoS class Guaranteed.
	// - at least one Container in the Pod has a memory or CPU request.
	// For a Pod to be given a QoS class of BestEffort, the Containers in the Pod must not have any memory or CPU limits or requests.

	for _, pod := range pods {
		if !podutil.IsEvictable(pod, evictLocalStoragePods) {
			nonRemovablePods = append(nonRemovablePods, pod)
			continue
		}

		switch utils.GetPodQOS(pod) {
		case v1.PodQOSGuaranteed:
			guaranteedPods = append(guaranteedPods, pod)
		case v1.PodQOSBurstable:
			burstablePods = append(burstablePods, pod)
		default: // v1.PodQOSBestEffort
			bestEffortPods = append(bestEffortPods, pod)
		}
	}

	return nonRemovablePods, bestEffortPods, burstablePods, guaranteedPods
}