diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go
index c03f10a29..1cf8338b9 100644
--- a/pkg/descheduler/descheduler.go
+++ b/pkg/descheduler/descheduler.go
@@ -302,7 +302,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
                     continue
                 }
                 evictorFilter := evictions.NewEvictorFilter(nodes, getPodsAssignedToNode, evictLocalStoragePods, evictSystemCriticalPods, ignorePvcPods, evictBarePods, evictions.WithNodeFit(nodeFit), evictions.WithPriorityThreshold(thresholdPriority))
-                f(ctx, rs.Client, strategy, nodes, podEvictor, evictorFilter, getPodsAssignedToNode)
+                f(context.WithValue(ctx, "strategyName", string(name)), rs.Client, strategy, nodes, podEvictor, evictorFilter, getPodsAssignedToNode)
             }
         } else {
             klog.ErrorS(fmt.Errorf("unknown strategy name"), "skipping strategy", "strategy", name)
diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go
index 39f5ef6e8..a83dc704d 100644
--- a/pkg/descheduler/evictions/evictions.go
+++ b/pkg/descheduler/evictions/evictions.go
@@ -19,7 +19,6 @@ package evictions
 import (
     "context"
     "fmt"
-    "strings"
 
     v1 "k8s.io/api/core/v1"
     policy "k8s.io/api/policy/v1beta1"
@@ -45,7 +44,7 @@ const (
 )
 
 // nodePodEvictedCount keeps count of pods evicted on node
-type nodePodEvictedCount map[*v1.Node]uint
+type nodePodEvictedCount map[string]uint
 type namespacePodEvictCount map[string]uint
 
 type PodEvictor struct {
@@ -73,7 +72,7 @@ func NewPodEvictor(
     var namespacePodCount = make(namespacePodEvictCount)
     for _, node := range nodes {
         // Initialize podsEvicted till now with 0.
-        nodePodCount[node] = 0
+        nodePodCount[node.Name] = 0
     }
 
     return &PodEvictor{
@@ -91,7 +90,7 @@ func NewPodEvictor(
 
 // NodeEvicted gives a number of pods evicted for node
 func (pe *PodEvictor) NodeEvicted(node *v1.Node) uint {
-    return pe.nodepodCount[node]
+    return pe.nodepodCount[node.Name]
 }
 
 // TotalEvicted gives a number of pods evicted through all nodes
@@ -103,26 +102,43 @@ func (pe *PodEvictor) TotalEvicted() uint {
     return total
 }
 
-// EvictPod returns non-nil error only when evicting a pod on a node is not
-// possible (due to maxPodsToEvictPerNode constraint). Success is true when the pod
-// is evicted on the server side.
-func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node, strategy string, reasons ...string) (bool, error) {
-    reason := strategy
-    if len(reasons) > 0 {
-        reason += " (" + strings.Join(reasons, ", ") + ")"
+// NodeLimitExceeded checks whether the number of evictions for the given node has reached the configured per-node limit
+func (pe *PodEvictor) NodeLimitExceeded(node *v1.Node) bool {
+    if pe.maxPodsToEvictPerNode != nil {
+        return pe.nodepodCount[node.Name] == *pe.maxPodsToEvictPerNode
     }
-    if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[node]+1 > *pe.maxPodsToEvictPerNode {
-        if pe.metricsEnabled {
-            metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+    return false
+}
+
+// EvictPod evicts a pod while exercising eviction limits.
+// Returns true when the pod is evicted on the server side.
+// The strategy name and eviction reason can be passed through ctx as string values under the "strategyName" and "evictionReason" keys.
+func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) bool {
+    strategy := ""
+    if ctx.Value("strategyName") != nil {
+        strategy = ctx.Value("strategyName").(string)
+    }
+    reason := ""
+    if ctx.Value("evictionReason") != nil {
+        reason = ctx.Value("evictionReason").(string)
+    }
+
+    if pod.Spec.NodeName != "" {
+        if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[pod.Spec.NodeName]+1 > *pe.maxPodsToEvictPerNode {
+            if pe.metricsEnabled {
+                metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
+            }
+            klog.ErrorS(fmt.Errorf("Maximum number of evicted pods per node reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
+            return false
+        }
     }
-        return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", *pe.maxPodsToEvictPerNode, node.Name)
     }
 
     if pe.maxPodsToEvictPerNamespace != nil && pe.namespacePodCount[pod.Namespace]+1 > *pe.maxPodsToEvictPerNamespace {
         if pe.metricsEnabled {
-            metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+            metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
         }
-        return false, fmt.Errorf("Maximum number %v of evicted pods per %q namespace reached", *pe.maxPodsToEvictPerNamespace, pod.Namespace)
+        klog.ErrorS(fmt.Errorf("Maximum number of evicted pods per namespace reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace)
+        return false
     }
 
     err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion)
@@ -130,29 +146,31 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node,
         // err is used only for logging purposes
         klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", reason)
         if pe.metricsEnabled {
-            metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+            metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
         }
-        return false, nil
+        return false
     }
 
-    pe.nodepodCount[node]++
+    if pod.Spec.NodeName != "" {
+        pe.nodepodCount[pod.Spec.NodeName]++
+    }
     pe.namespacePodCount[pod.Namespace]++
 
     if pe.metricsEnabled {
-        metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+        metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
     }
 
     if pe.dryRun {
-        klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", node.Name)
+        klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", pod.Spec.NodeName)
     } else {
-        klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", node.Name)
+        klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", pod.Spec.NodeName)
         eventBroadcaster := record.NewBroadcaster()
         eventBroadcaster.StartStructuredLogging(3)
         eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: pe.client.CoreV1().Events(pod.Namespace)})
         r := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "sigs.k8s.io.descheduler"})
         r.Event(pod, v1.EventTypeNormal, "Descheduled", fmt.Sprintf("pod evicted by sigs.k8s.io/descheduler%s", reason))
     }
-    return true, nil
+    return true
 }
 
 func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string) error {
diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go
index d5e11f0f5..f4dc13bf6 100644
--- a/pkg/descheduler/strategies/duplicates.go
+++ b/pkg/descheduler/strategies/duplicates.go
@@ -192,6 +192,7 @@ func RemoveDuplicatePods(
         }
 
         upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
+    loop:
         for nodeName, pods := range podNodes {
             klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
             // list of duplicated pods does not contain the original referential pod
@@ -199,9 +200,9 @@ func RemoveDuplicatePods(
             // It's assumed all duplicated pods are in the same priority class
             // TODO(jchaloup): check if the pod has a different node to lend to
             for _, pod := range pods[upperAvg-1:] {
-                if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil {
-                    klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-                    break
+                podEvictor.EvictPod(ctx, pod)
+                if podEvictor.NodeLimitExceeded(nodeMap[nodeName]) {
+                    continue loop
                 }
             }
         }
diff --git a/pkg/descheduler/strategies/failedpods.go b/pkg/descheduler/strategies/failedpods.go
index b9bfabaea..0dac0c0c0 100644
--- a/pkg/descheduler/strategies/failedpods.go
+++ b/pkg/descheduler/strategies/failedpods.go
@@ -75,9 +75,9 @@ func RemoveFailedPods(
                 continue
             }
 
-            if _, err = podEvictor.EvictPod(ctx, pods[i], node, "FailedPod"); err != nil {
-                klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-                break
+            podEvictor.EvictPod(ctx, pods[i])
+            if podEvictor.NodeLimitExceeded(node) {
+                continue
             }
         }
     }
diff --git a/pkg/descheduler/strategies/node_affinity.go b/pkg/descheduler/strategies/node_affinity.go
index c3c21fec9..39145b095 100644
--- a/pkg/descheduler/strategies/node_affinity.go
+++ b/pkg/descheduler/strategies/node_affinity.go
@@ -74,6 +74,7 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
 
     switch nodeAffinity {
     case "requiredDuringSchedulingIgnoredDuringExecution":
+    loop:
         for _, node := range nodes {
             klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
 
@@ -93,9 +94,9 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
             for _, pod := range pods {
                 if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
                     klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
-                    if _, err := podEvictor.EvictPod(ctx, pod, node, "NodeAffinity"); err != nil {
-                        klog.ErrorS(err, "Error evicting pod")
-                        break
+                    podEvictor.EvictPod(ctx, pod)
+                    if podEvictor.NodeLimitExceeded(node) {
+                        continue loop
                     }
                 }
             }
diff --git a/pkg/descheduler/strategies/node_taint.go b/pkg/descheduler/strategies/node_taint.go
index 094ded915..48cdcf48c 100644
--- a/pkg/descheduler/strategies/node_taint.go
+++ b/pkg/descheduler/strategies/node_taint.go
@@ -91,6 +91,7 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
         }
     }
 
+loop:
     for _, node := range nodes {
         klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
         pods, err := podutil.ListAllPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -106,9 +107,9 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
                 taintFilterFnc,
             ) {
                 klog.V(2).InfoS("Not all taints with NoSchedule effect are tolerated after update for pod on node", "pod", klog.KObj(pods[i]), "node", klog.KObj(node))
-                if _, err := podEvictor.EvictPod(ctx, pods[i], node, "NodeTaint"); err != nil {
-                    klog.ErrorS(err, "Error evicting pod")
-                    break
+                podEvictor.EvictPod(ctx, pods[i])
+                if podEvictor.NodeLimitExceeded(node) {
+                    continue loop
                 }
             }
         }
diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
index 9c155851d..5790141f9 100644
--- a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
@@ -313,14 +313,8 @@ func evictPods(
                 continue
             }
 
-            success, err := podEvictor.EvictPod(ctx, pod, nodeInfo.node, strategy)
-            if err != nil {
-                klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-                break
-            }
-
-            if success {
-                klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod), "err", err)
+            if podEvictor.EvictPod(ctx, pod) {
+                klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))
 
                 for name := range totalAvailableUsage {
                     if name == v1.ResourcePods {
@@ -351,6 +345,9 @@ func evictPods(
                     break
                 }
             }
+            if podEvictor.NodeLimitExceeded(nodeInfo.node) {
+                return
+            }
         }
     }
 }
diff --git a/pkg/descheduler/strategies/pod_antiaffinity.go b/pkg/descheduler/strategies/pod_antiaffinity.go
index ded994f97..b13418c13 100644
--- a/pkg/descheduler/strategies/pod_antiaffinity.go
+++ b/pkg/descheduler/strategies/pod_antiaffinity.go
@@ -75,6 +75,7 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
         return
     }
 
+loop:
     for _, node := range nodes {
         klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
         pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -86,13 +87,7 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
         totalPods := len(pods)
         for i := 0; i < totalPods; i++ {
             if checkPodsWithAntiAffinityExist(pods[i], pods) && evictorFilter.Filter(pods[i]) {
-                success, err := podEvictor.EvictPod(ctx, pods[i], node, "InterPodAntiAffinity")
-                if err != nil {
-                    klog.ErrorS(err, "Error evicting pod")
-                    break
-                }
-
-                if success {
+                if podEvictor.EvictPod(ctx, pods[i]) {
                     // Since the current pod is evicted all other pods which have anti-affinity with this
                     // pod need not be evicted.
                     // Update pods.
@@ -101,6 +96,9 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
                     totalPods--
                 }
             }
+            if podEvictor.NodeLimitExceeded(node) {
+                continue loop
+            }
         }
     }
 }
diff --git a/pkg/descheduler/strategies/pod_lifetime.go b/pkg/descheduler/strategies/pod_lifetime.go
index 20010b42b..6149ea27a 100644
--- a/pkg/descheduler/strategies/pod_lifetime.go
+++ b/pkg/descheduler/strategies/pod_lifetime.go
@@ -126,15 +126,16 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D
     // in the event that PDB or settings such maxNoOfPodsToEvictPer* prevent too much eviction
     podutil.SortPodsBasedOnAge(podsToEvict)
 
+    nodeLimitExceeded := map[string]bool{}
     for _, pod := range podsToEvict {
-        success, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodLifeTime")
-        if success {
+        if nodeLimitExceeded[pod.Spec.NodeName] {
+            continue
+        }
+        if podEvictor.EvictPod(ctx, pod) {
             klog.V(1).InfoS("Evicted pod because it exceeded its lifetime", "pod", klog.KObj(pod), "maxPodLifeTime", *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds)
         }
-
-        if err != nil {
-            klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-            break
+        if podEvictor.NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
+            nodeLimitExceeded[pod.Spec.NodeName] = true
         }
     }
 }
diff --git a/pkg/descheduler/strategies/toomanyrestarts.go b/pkg/descheduler/strategies/toomanyrestarts.go
index 7e5431e01..914046623 100644
--- a/pkg/descheduler/strategies/toomanyrestarts.go
+++ b/pkg/descheduler/strategies/toomanyrestarts.go
@@ -72,6 +72,7 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
         return
     }
 
+loop:
     for _, node := range nodes {
         klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
         pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -89,9 +90,9 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
             } else if restarts < strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold {
                 continue
             }
-            if _, err := podEvictor.EvictPod(ctx, pods[i], node, "TooManyRestarts"); err != nil {
-                klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-                break
+            podEvictor.EvictPod(ctx, pods[i])
+            if podEvictor.NodeLimitExceeded(node) {
+                continue loop
             }
         }
     }
diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go
index a5254dcdc..dbd903fbe 100644
--- a/pkg/descheduler/strategies/topologyspreadconstraint.go
+++ b/pkg/descheduler/strategies/topologyspreadconstraint.go
@@ -183,13 +183,17 @@ func RemovePodsViolatingTopologySpreadConstraint(
         }
     }
 
+    nodeLimitExceeded := map[string]bool{}
     for pod := range podsForEviction {
+        if nodeLimitExceeded[pod.Spec.NodeName] {
+            continue
+        }
         if !isEvictable(pod) {
             continue
         }
-        if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodTopologySpread"); err != nil {
-            klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-            break
+        podEvictor.EvictPod(ctx, pod)
+        if podEvictor.NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
+            nodeLimitExceeded[pod.Spec.NodeName] = true
         }
     }
 }
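
A minimal sketch (not part of the diff itself) of how a strategy is expected to drive the reworked PodEvictor after this change; the package name `example` and the helper `evictPodsOnNode` are hypothetical, while `EvictPod`, `NodeLimitExceeded`, and the `strategyName`/`evictionReason` context keys come from the hunks above:

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"

	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
)

// evictPodsOnNode (hypothetical helper) evicts candidate pods one at a time and
// stops for this node once the per-node eviction limit is reached, mirroring the
// pattern the strategies in this diff now inline.
func evictPodsOnNode(ctx context.Context, podEvictor *evictions.PodEvictor, node *v1.Node, pods []*v1.Pod) {
	// The strategy name is injected by RunDeschedulerStrategies via
	// context.WithValue(ctx, "strategyName", ...); the eviction reason is optional
	// and travels the same way per the new EvictPod doc comment.
	ctx = context.WithValue(ctx, "evictionReason", "violates strategy constraints")

	for _, pod := range pods {
		// EvictPod now returns only a bool; limit violations are logged and counted internally.
		podEvictor.EvictPod(ctx, pod)
		if podEvictor.NodeLimitExceeded(node) {
			// Further eviction attempts on this node would be rejected anyway.
			return
		}
	}
}
```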