1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 13:29:11 +01:00
Files
descheduler/pkg/framework/plugins/nodeutilization/nodeutilization.go
Gabriel Tiossi bb5930eb21 Improve PodEvictor observability through EvictOptions (#1349)
* feat: profile name for pods_evicted metric

Support new label "profile" for "pods_evicted" metric to allow
understand which profiles are evicting more pods, allowing better
observability

* refactor: evictoptions improved observability

Send profile and strategy names for EvictOptions, allowing Evictors to
access observability information

* cleanup: remove unnecessary evictoption reference

* feat: evictoptions for nodeutilzation

Explicit usage of options when invoking evictPods from the helper
function from nodeutilization for both highnodeutilization and
lownodeutilization
2024-03-02 12:06:05 -08:00

456 lines
16 KiB
Go

/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"context"
"math"
"sort"
"sigs.k8s.io/descheduler/pkg/api"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/pkg/descheduler/node"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
"sigs.k8s.io/descheduler/pkg/utils"
)
// NodeUsage stores a node's info, pods on it, thresholds and its resource usage
type NodeUsage struct {
node *v1.Node
usage map[v1.ResourceName]*resource.Quantity
allPods []*v1.Pod
}
type NodeThresholds struct {
lowResourceThreshold map[v1.ResourceName]*resource.Quantity
highResourceThreshold map[v1.ResourceName]*resource.Quantity
}
type NodeInfo struct {
NodeUsage
thresholds NodeThresholds
}
type continueEvictionCond func(nodeInfo NodeInfo, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool
// NodePodsMap is a set of (node, pods) pairs
type NodePodsMap map[*v1.Node][]*v1.Pod
const (
// MinResourcePercentage is the minimum value of a resource's percentage
MinResourcePercentage = 0
// MaxResourcePercentage is the maximum value of a resource's percentage
MaxResourcePercentage = 100
)
func normalizePercentage(percent api.Percentage) api.Percentage {
if percent > MaxResourcePercentage {
return MaxResourcePercentage
}
if percent < MinResourcePercentage {
return MinResourcePercentage
}
return percent
}
func getNodeThresholds(
nodes []*v1.Node,
lowThreshold, highThreshold api.ResourceThresholds,
resourceNames []v1.ResourceName,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
useDeviationThresholds bool,
) map[string]NodeThresholds {
nodeThresholdsMap := map[string]NodeThresholds{}
averageResourceUsagePercent := api.ResourceThresholds{}
if useDeviationThresholds {
averageResourceUsagePercent = averageNodeBasicresources(nodes, getPodsAssignedToNode, resourceNames)
}
for _, node := range nodes {
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
nodeThresholdsMap[node.Name] = NodeThresholds{
lowResourceThreshold: map[v1.ResourceName]*resource.Quantity{},
highResourceThreshold: map[v1.ResourceName]*resource.Quantity{},
}
for _, resourceName := range resourceNames {
if useDeviationThresholds {
cap := nodeCapacity[resourceName]
if lowThreshold[resourceName] == MinResourcePercentage {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = &cap
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = &cap
} else {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]-lowThreshold[resourceName]))
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]+highThreshold[resourceName]))
}
} else {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, lowThreshold[resourceName])
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, highThreshold[resourceName])
}
}
}
return nodeThresholdsMap
}
func getNodeUsage(
nodes []*v1.Node,
resourceNames []v1.ResourceName,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) []NodeUsage {
var nodeUsageList []NodeUsage
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, nil)
if err != nil {
klog.V(2).InfoS("Node will not be processed, error accessing its pods", "node", klog.KObj(node), "err", err)
continue
}
nodeUsageList = append(nodeUsageList, NodeUsage{
node: node,
usage: nodeutil.NodeUtilization(pods, resourceNames),
allPods: pods,
})
}
return nodeUsageList
}
func resourceThreshold(nodeCapacity v1.ResourceList, resourceName v1.ResourceName, threshold api.Percentage) *resource.Quantity {
defaultFormat := resource.DecimalSI
if resourceName == v1.ResourceMemory {
defaultFormat = resource.BinarySI
}
resourceCapacityFraction := func(resourceNodeCapacity int64) int64 {
// A threshold is in percentages but in <0;100> interval.
// Performing `threshold * 0.01` will convert <0;100> interval into <0;1>.
// Multiplying it with capacity will give fraction of the capacity corresponding to the given resource threshold in Quantity units.
return int64(float64(threshold) * 0.01 * float64(resourceNodeCapacity))
}
resourceCapacityQuantity := nodeCapacity.Name(resourceName, defaultFormat)
if resourceName == v1.ResourceCPU {
return resource.NewMilliQuantity(resourceCapacityFraction(resourceCapacityQuantity.MilliValue()), defaultFormat)
}
return resource.NewQuantity(resourceCapacityFraction(resourceCapacityQuantity.Value()), defaultFormat)
}
func roundTo2Decimals(percentage float64) float64 {
return math.Round(percentage*100) / 100
}
func resourceUsagePercentages(nodeUsage NodeUsage) map[v1.ResourceName]float64 {
nodeCapacity := nodeUsage.node.Status.Capacity
if len(nodeUsage.node.Status.Allocatable) > 0 {
nodeCapacity = nodeUsage.node.Status.Allocatable
}
resourceUsagePercentage := map[v1.ResourceName]float64{}
for resourceName, resourceUsage := range nodeUsage.usage {
cap := nodeCapacity[resourceName]
if !cap.IsZero() {
resourceUsagePercentage[resourceName] = 100 * float64(resourceUsage.MilliValue()) / float64(cap.MilliValue())
resourceUsagePercentage[resourceName] = roundTo2Decimals(resourceUsagePercentage[resourceName])
}
}
return resourceUsagePercentage
}
// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
// low and high thresholds, it is simply ignored.
func classifyNodes(
nodeUsages []NodeUsage,
nodeThresholds map[string]NodeThresholds,
lowThresholdFilter, highThresholdFilter func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool,
) ([]NodeInfo, []NodeInfo) {
lowNodes, highNodes := []NodeInfo{}, []NodeInfo{}
for _, nodeUsage := range nodeUsages {
nodeInfo := NodeInfo{
NodeUsage: nodeUsage,
thresholds: nodeThresholds[nodeUsage.node.Name],
}
if lowThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
klog.InfoS("Node is underutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
lowNodes = append(lowNodes, nodeInfo)
} else if highThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
klog.InfoS("Node is overutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
highNodes = append(highNodes, nodeInfo)
} else {
klog.InfoS("Node is appropriately utilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
}
}
return lowNodes, highNodes
}
// evictPodsFromSourceNodes evicts pods based on priority, if all the pods on the node have priority, if not
// evicts them based on QoS as fallback option.
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromSourceNodes(
ctx context.Context,
evictableNamespaces *api.Namespaces,
sourceNodes, destinationNodes []NodeInfo,
podEvictor frameworktypes.Evictor,
evictOptions evictions.EvictOptions,
podFilter func(pod *v1.Pod) bool,
resourceNames []v1.ResourceName,
continueEviction continueEvictionCond,
) {
// upper bound on total number of pods/cpu/memory and optional extended resources to be moved
totalAvailableUsage := map[v1.ResourceName]*resource.Quantity{
v1.ResourcePods: {},
v1.ResourceCPU: {},
v1.ResourceMemory: {},
}
taintsOfDestinationNodes := make(map[string][]v1.Taint, len(destinationNodes))
for _, node := range destinationNodes {
taintsOfDestinationNodes[node.node.Name] = node.node.Spec.Taints
for _, name := range resourceNames {
if _, ok := totalAvailableUsage[name]; !ok {
totalAvailableUsage[name] = resource.NewQuantity(0, resource.DecimalSI)
}
totalAvailableUsage[name].Add(*node.thresholds.highResourceThreshold[name])
totalAvailableUsage[name].Sub(*node.usage[name])
}
}
// log message in one line
keysAndValues := []interface{}{
"CPU", totalAvailableUsage[v1.ResourceCPU].MilliValue(),
"Mem", totalAvailableUsage[v1.ResourceMemory].Value(),
"Pods", totalAvailableUsage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !node.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
klog.V(1).InfoS("Total capacity to be moved", keysAndValues...)
for _, node := range sourceNodes {
klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage)
nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter)
klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods))
if len(removablePods) == 0 {
klog.V(1).InfoS("No removable pods on node, try next node", "node", klog.KObj(node.node))
continue
}
klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
// sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction)
}
}
func evictPods(
ctx context.Context,
evictableNamespaces *api.Namespaces,
inputPods []*v1.Pod,
nodeInfo NodeInfo,
totalAvailableUsage map[v1.ResourceName]*resource.Quantity,
taintsOfLowNodes map[string][]v1.Taint,
podEvictor frameworktypes.Evictor,
evictOptions evictions.EvictOptions,
continueEviction continueEvictionCond,
) {
var excludedNamespaces sets.Set[string]
if evictableNamespaces != nil {
excludedNamespaces = sets.New(evictableNamespaces.Exclude...)
}
if continueEviction(nodeInfo, totalAvailableUsage) {
for _, pod := range inputPods {
if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod))
continue
}
preEvictionFilterWithOptions, err := podutil.NewOptions().
WithFilter(podEvictor.PreEvictionFilter).
WithoutNamespaces(excludedNamespaces).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "could not build preEvictionFilter with namespace exclusion")
continue
}
if preEvictionFilterWithOptions(pod) {
if podEvictor.Evict(ctx, pod, evictOptions) {
klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))
for name := range totalAvailableUsage {
if name == v1.ResourcePods {
nodeInfo.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
totalAvailableUsage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
} else {
quantity := utils.GetResourceRequestQuantity(pod, name)
nodeInfo.usage[name].Sub(quantity)
totalAvailableUsage[name].Sub(quantity)
}
}
keysAndValues := []interface{}{
"node", nodeInfo.node.Name,
"CPU", nodeInfo.usage[v1.ResourceCPU].MilliValue(),
"Mem", nodeInfo.usage[v1.ResourceMemory].Value(),
"Pods", nodeInfo.usage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !nodeutil.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
klog.V(3).InfoS("Updated node usage", keysAndValues...)
// check if pods can be still evicted
if !continueEviction(nodeInfo, totalAvailableUsage) {
break
}
}
}
if podEvictor.NodeLimitExceeded(nodeInfo.node) {
return
}
}
}
}
// sortNodesByUsage sorts nodes based on usage according to the given plugin.
func sortNodesByUsage(nodes []NodeInfo, ascending bool) {
sort.Slice(nodes, func(i, j int) bool {
ti := nodes[i].usage[v1.ResourceMemory].Value() + nodes[i].usage[v1.ResourceCPU].MilliValue() + nodes[i].usage[v1.ResourcePods].Value()
tj := nodes[j].usage[v1.ResourceMemory].Value() + nodes[j].usage[v1.ResourceCPU].MilliValue() + nodes[j].usage[v1.ResourcePods].Value()
// extended resources
for name := range nodes[i].usage {
if !nodeutil.IsBasicResource(name) {
ti = ti + nodes[i].usage[name].Value()
tj = tj + nodes[j].usage[name].Value()
}
}
// Return ascending order for HighNodeUtilization plugin
if ascending {
return ti < tj
}
// Return descending order for LowNodeUtilization plugin
return ti > tj
})
}
// isNodeAboveTargetUtilization checks if a node is overutilized
// At least one resource has to be above the high threshold
func isNodeAboveTargetUtilization(usage NodeUsage, threshold map[v1.ResourceName]*resource.Quantity) bool {
for name, nodeValue := range usage.usage {
// usage.highResourceThreshold[name] < nodeValue
if threshold[name].Cmp(*nodeValue) == -1 {
return true
}
}
return false
}
// isNodeWithLowUtilization checks if a node is underutilized
// All resources have to be below the low threshold
func isNodeWithLowUtilization(usage NodeUsage, threshold map[v1.ResourceName]*resource.Quantity) bool {
for name, nodeValue := range usage.usage {
// usage.lowResourceThreshold[name] < nodeValue
if threshold[name].Cmp(*nodeValue) == -1 {
return false
}
}
return true
}
// getResourceNames returns list of resource names in resource thresholds
func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
resourceNames := make([]v1.ResourceName, 0, len(thresholds))
for name := range thresholds {
resourceNames = append(resourceNames, name)
}
return resourceNames
}
func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
var nonRemovablePods, removablePods []*v1.Pod
for _, pod := range pods {
if !filter(pod) {
nonRemovablePods = append(nonRemovablePods, pod)
} else {
removablePods = append(removablePods, pod)
}
}
return nonRemovablePods, removablePods
}
func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, resourceNames []v1.ResourceName) api.ResourceThresholds {
total := api.ResourceThresholds{}
average := api.ResourceThresholds{}
numberOfNodes := len(nodes)
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, nil)
if err != nil {
numberOfNodes--
continue
}
usage := nodeutil.NodeUtilization(pods, resourceNames)
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
for resource, value := range usage {
nodeCapacityValue := nodeCapacity[resource]
if resource == v1.ResourceCPU {
total[resource] += api.Percentage(value.MilliValue()) / api.Percentage(nodeCapacityValue.MilliValue()) * 100.0
} else {
total[resource] += api.Percentage(value.Value()) / api.Percentage(nodeCapacityValue.Value()) * 100.0
}
}
}
for resource, value := range total {
average[resource] = value / api.Percentage(numberOfNodes)
}
return average
}