Mirror of https://github.com/kubernetes-sigs/descheduler.git, synced 2026-01-26 05:14:13 +01:00
* feat: Implement preferredDuringSchedulingIgnoredDuringExecution for RemovePodsViolatingNodeAffinity. The descheduler can now detect and evict pods that are not optimally placed according to their "preferred..." node affinity. A pod is only evicted if it can be scheduled on a node that scores higher on preferred node affinity than its current one. This behavior is activated by enabling the RemovePodsViolatingNodeAffinity plugin and passing "preferredDuringSchedulingIgnoredDuringExecution" in its args. For example, imagine a pod that prefers nodes labeled "key1: value1" with a weight of 10 (see the sketch after this message). If that pod is scheduled on a node without the "key1: value1" label while another node both carries the label and can run the pod, the descheduler will evict it. Another effect of this commit is that the RemovePodsViolatingNodeAffinity plugin no longer removes pods that do not fit their current node for reasons other than violating node affinity. Previously, enabling this plugin could evict pods running on tainted nodes without the necessary tolerations. This commit also fixes the wording of some tests in node_affinity_test.go, along with some test parameters and expectations that were wrong.
* Optimization on RemovePodsViolatingNodeAffinity: before checking whether a pod can be evicted or scheduled somewhere else, we first check whether it has the corresponding nodeAffinity field defined; otherwise the pod is immediately discarded as a candidate. In addition, the method that calculates the weight a pod gives to a node based on its preferred node affinity has been renamed to better reflect what it does.
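For illustration, here is a minimal sketch of the node affinity described in the example above. It is not part of the file below; the label key1: value1 and the weight of 10 come from the commit message, while the package wrapper and pod fields are hypothetical.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// A pod that prefers, but does not require, nodes labeled key1=value1.
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Affinity: &v1.Affinity{
				NodeAffinity: &v1.NodeAffinity{
					PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{
						{
							// A higher weight means a stronger preference when nodes are scored.
							Weight: 10,
							Preference: v1.NodeSelectorTerm{
								MatchExpressions: []v1.NodeSelectorRequirement{
									{Key: "key1", Operator: v1.NodeSelectorOpIn, Values: []string{"value1"}},
								},
							},
						},
					},
				},
			},
		},
	}
	fmt.Println(pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution[0].Weight)
}

With "preferredDuringSchedulingIgnoredDuringExecution" listed in the plugin args, a pod like this running on a node without the key1=value1 label becomes an eviction candidate as soon as another node that carries the label could accommodate it.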
241 lines
7.2 KiB
Go
package utils

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

// GetResourceRequest finds and returns the request value for a specific resource.
func GetResourceRequest(pod *v1.Pod, resource v1.ResourceName) int64 {
	if resource == v1.ResourcePods {
		return 1
	}

	requestQuantity := GetResourceRequestQuantity(pod, resource)

	if resource == v1.ResourceCPU {
		return requestQuantity.MilliValue()
	}

	return requestQuantity.Value()
}

// GetResourceRequestQuantity finds and returns the request quantity for a specific resource.
func GetResourceRequestQuantity(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
	requestQuantity := resource.Quantity{}

	switch resourceName {
	case v1.ResourceCPU:
		requestQuantity = resource.Quantity{Format: resource.DecimalSI}
	case v1.ResourceMemory, v1.ResourceStorage, v1.ResourceEphemeralStorage:
		requestQuantity = resource.Quantity{Format: resource.BinarySI}
	default:
		requestQuantity = resource.Quantity{Format: resource.DecimalSI}
	}

	for _, container := range pod.Spec.Containers {
		if rQuantity, ok := container.Resources.Requests[resourceName]; ok {
			requestQuantity.Add(rQuantity)
		}
	}

	for _, container := range pod.Spec.InitContainers {
		if rQuantity, ok := container.Resources.Requests[resourceName]; ok {
			if requestQuantity.Cmp(rQuantity) < 0 {
				requestQuantity = rQuantity.DeepCopy()
			}
		}
	}

	// We assume pod overhead feature gate is enabled.
	// We can't import the scheduler settings so we will inherit the default.
	if pod.Spec.Overhead != nil {
		if podOverhead, ok := pod.Spec.Overhead[resourceName]; ok && !requestQuantity.IsZero() {
			requestQuantity.Add(podOverhead)
		}
	}

	return requestQuantity
}

// IsMirrorPod returns true if the pod is a Mirror Pod.
func IsMirrorPod(pod *v1.Pod) bool {
	_, ok := pod.Annotations[v1.MirrorPodAnnotationKey]
	return ok
}

// IsPodTerminating returns true if the pod DeletionTimestamp is set.
func IsPodTerminating(pod *v1.Pod) bool {
	return pod.DeletionTimestamp != nil
}

// IsStaticPod returns true if the pod is a static pod.
func IsStaticPod(pod *v1.Pod) bool {
	source, err := GetPodSource(pod)
	return err == nil && source != "api"
}

// IsCriticalPriorityPod returns true if the pod has critical priority.
func IsCriticalPriorityPod(pod *v1.Pod) bool {
	return pod.Spec.Priority != nil && *pod.Spec.Priority >= SystemCriticalPriority
}
// IsDaemonsetPod returns true if the pod is owned by a DaemonSet.
func IsDaemonsetPod(ownerRefList []metav1.OwnerReference) bool {
	for _, ownerRef := range ownerRefList {
		if ownerRef.Kind == "DaemonSet" {
			return true
		}
	}
	return false
}

// IsPodWithLocalStorage returns true if the pod has local storage.
func IsPodWithLocalStorage(pod *v1.Pod) bool {
	for _, volume := range pod.Spec.Volumes {
		if volume.HostPath != nil || volume.EmptyDir != nil {
			return true
		}
	}

	return false
}

// IsPodWithPVC returns true if the pod has claimed a persistent volume.
func IsPodWithPVC(pod *v1.Pod) bool {
	for _, volume := range pod.Spec.Volumes {
		if volume.PersistentVolumeClaim != nil {
			return true
		}
	}
	return false
}

// GetPodSource returns the source of the pod based on the annotation.
func GetPodSource(pod *v1.Pod) (string, error) {
	if pod.Annotations != nil {
		if source, ok := pod.Annotations["kubernetes.io/config.source"]; ok {
			return source, nil
		}
	}
	return "", fmt.Errorf("cannot get source of pod %q", pod.UID)
}

// PodRequestsAndLimits returns a dictionary of all defined resources summed up for all
// containers of the pod. If PodOverhead feature is enabled, pod overhead is added to the
// total container resource requests and to the total container limits which have a
// non-zero quantity.
func PodRequestsAndLimits(pod *v1.Pod) (reqs, limits v1.ResourceList) {
	reqs, limits = v1.ResourceList{}, v1.ResourceList{}
	for _, container := range pod.Spec.Containers {
		addResourceList(reqs, container.Resources.Requests)
		addResourceList(limits, container.Resources.Limits)
	}
	// Init containers set a floor: for each resource, the pod needs at least as much
	// as its largest single init container requests or limits.
	for _, container := range pod.Spec.InitContainers {
		maxResourceList(reqs, container.Resources.Requests)
		maxResourceList(limits, container.Resources.Limits)
	}

	// We assume pod overhead feature gate is enabled.
	// We can't import the scheduler settings so we will inherit the default.
	if pod.Spec.Overhead != nil {
		addResourceList(reqs, pod.Spec.Overhead)

		for name, quantity := range pod.Spec.Overhead {
			if value, ok := limits[name]; ok && !value.IsZero() {
				value.Add(quantity)
				limits[name] = value
			}
		}
	}

	return
}
// addResourceList adds the resources in newList to list.
func addResourceList(list, newList v1.ResourceList) {
	for name, quantity := range newList {
		if value, ok := list[name]; !ok {
			list[name] = quantity.DeepCopy()
		} else {
			value.Add(quantity)
			list[name] = value
		}
	}
}

// maxResourceList sets list to the greater of list/newList for every resource in either list.
func maxResourceList(list, newList v1.ResourceList) {
	for name, quantity := range newList {
		if value, ok := list[name]; !ok || quantity.Cmp(value) > 0 {
			list[name] = quantity.DeepCopy()
		}
	}
}

// PodToleratesTaints returns true if the pod tolerates all taints of at least one of the given nodes.
func PodToleratesTaints(pod *v1.Pod, taintsOfNodes map[string][]v1.Taint) bool {
	for nodeName, taintsForNode := range taintsOfNodes {
		if len(pod.Spec.Tolerations) >= len(taintsForNode) {
			if TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, taintsForNode, nil) {
				return true
			}

			if klog.V(5).Enabled() {
				for i := range taintsForNode {
					if !TolerationsTolerateTaint(pod.Spec.Tolerations, &taintsForNode[i]) {
						klog.V(5).InfoS("Pod doesn't tolerate node taint",
							"pod", klog.KObj(pod),
							"nodeName", nodeName,
							"taint", fmt.Sprintf("%s:%s=%s", taintsForNode[i].Key, taintsForNode[i].Value, taintsForNode[i].Effect),
						)
					}
				}
			}
		} else {
			klog.V(5).InfoS("Pod doesn't tolerate the node's taints, count mismatch",
				"pod", klog.KObj(pod),
				"nodeName", nodeName,
			)
		}
	}
	return false
}
// PodHasNodeAffinity returns true if the pod has a node affinity of type
// `nodeAffinityType` defined. The nodeAffinityType param can take one of these two values:
// "requiredDuringSchedulingIgnoredDuringExecution" or "preferredDuringSchedulingIgnoredDuringExecution".
func PodHasNodeAffinity(pod *v1.Pod, nodeAffinityType NodeAffinityType) bool {
	if pod.Spec.Affinity == nil {
		return false
	}
	if pod.Spec.Affinity.NodeAffinity == nil {
		return false
	}
	if nodeAffinityType == RequiredDuringSchedulingIgnoredDuringExecution {
		return pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
	} else if nodeAffinityType == PreferredDuringSchedulingIgnoredDuringExecution {
		return len(pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
	}
	return false
}
type NodeAffinityType string

const (
	RequiredDuringSchedulingIgnoredDuringExecution  NodeAffinityType = "requiredDuringSchedulingIgnoredDuringExecution"
	PreferredDuringSchedulingIgnoredDuringExecution NodeAffinityType = "preferredDuringSchedulingIgnoredDuringExecution"
)
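A minimal usage sketch of a few of the helpers above. It is not part of the file; the import path sigs.k8s.io/descheduler/pkg/utils, the package main wrapper, and the pod values are assumptions made for illustration.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"sigs.k8s.io/descheduler/pkg/utils"
)

func main() {
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			InitContainers: []v1.Container{{
				Name: "init",
				Resources: v1.ResourceRequirements{
					Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("700m")},
				},
			}},
			Containers: []v1.Container{{
				Name: "app",
				Resources: v1.ResourceRequirements{
					Requests: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("500m"),
						v1.ResourceMemory: resource.MustParse("256Mi"),
					},
				},
			}},
		},
	}

	// CPU is reported in millicores; the init container's 700m outweighs the
	// 500m summed across app containers, so the effective request is 700.
	fmt.Println(utils.GetResourceRequest(pod, v1.ResourceCPU))

	// Memory has no init-container floor here, so the app container's 256Mi
	// (268435456 bytes) is reported.
	fmt.Println(utils.GetResourceRequest(pod, v1.ResourceMemory))

	// Requests and limits aggregated per resource, including the init-container floor.
	reqs, limits := utils.PodRequestsAndLimits(pod)
	fmt.Println(reqs, limits)

	// No affinity is set on this pod, so this prints false.
	fmt.Println(utils.PodHasNodeAffinity(pod, utils.PreferredDuringSchedulingIgnoredDuringExecution))
}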