Mirror of https://github.com/kubernetes-sigs/descheduler.git (synced 2026-01-26 05:14:13 +01:00)
EvictPod: stop returning an error
When an error is returned, a strategy either stops completely or moves on to another node. Since the error can be transient, or only one of the limits may have been exceeded, it is reasonable to simply skip the pod whose eviction failed and proceed to the next one. To optimize the processing and stop earlier, it is more practical to provide a check that reports when a limit has been exceeded.
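In the strategies this results in the calling pattern visible in the diff below: EvictPod is called without any error handling and NodeLimitExceeded is consulted to short-circuit a node. The following is a minimal sketch of that pattern (illustrative only, not part of this commit); it assumes the evictions.PodEvictor API from the diff, the pkg/descheduler/evictions import path, and a hypothetical podsOnNode helper.

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"

	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
)

// evictLoop walks the pods of each node and attempts to evict them.
func evictLoop(ctx context.Context, podEvictor *evictions.PodEvictor, nodes []*v1.Node, podsOnNode func(*v1.Node) []*v1.Pod) {
	for _, node := range nodes {
		for _, pod := range podsOnNode(node) {
			// EvictPod no longer returns an error; a pod whose eviction fails is simply skipped.
			podEvictor.EvictPod(ctx, pod)
			// Stop processing this node as soon as its eviction limit is reached.
			if podEvictor.NodeLimitExceeded(node) {
				break
			}
		}
	}
}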
@@ -102,11 +102,18 @@ func (pe *PodEvictor) TotalEvicted() uint {
 	return total
 }
 
-// EvictPod returns non-nil error only when evicting a pod on a node is not
-// possible (due to maxPodsToEvictPerNode constraint). Success is true when the pod
-// is evicted on the server side.
-// eviction reason can be set through the ctx's evictionReason:STRING pair
-func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
+// NodeLimitExceeded checks if the number of evictions for a node was exceeded
+func (pe *PodEvictor) NodeLimitExceeded(node *v1.Node) bool {
+	if pe.maxPodsToEvictPerNode != nil {
+		return pe.nodepodCount[node.Name] == *pe.maxPodsToEvictPerNode
+	}
+	return false
+}
+
+// EvictPod evicts a pod while exercising eviction limits.
+// Returns true when the pod is evicted on the server side.
+// Eviction reason can be set through the ctx's evictionReason:STRING pair
+func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) bool {
 	strategy := ""
 	if ctx.Value("strategyName") != nil {
 		strategy = ctx.Value("strategyName").(string)
@@ -121,7 +128,8 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
 			if pe.metricsEnabled {
 				metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
 			}
-			return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", *pe.maxPodsToEvictPerNode, pod.Spec.NodeName)
+			klog.ErrorS(fmt.Errorf("Maximum number of evicted pods per node reached"), "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
+			return false
 		}
 	}
 
@@ -129,7 +137,8 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
 		if pe.metricsEnabled {
 			metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
 		}
-		return false, fmt.Errorf("Maximum number %v of evicted pods per %q namespace reached", *pe.maxPodsToEvictPerNamespace, pod.Namespace)
+		klog.ErrorS(fmt.Errorf("Maximum number of evicted pods per namespace reached"), "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace)
+		return false
 	}
 
 	err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion)
@@ -139,7 +148,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
 		if pe.metricsEnabled {
 			metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
 		}
-		return false, nil
+		return false
 	}
 
 	if pod.Spec.NodeName != "" {
@@ -161,7 +170,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
 		r := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "sigs.k8s.io.descheduler"})
 		r.Event(pod, v1.EventTypeNormal, "Descheduled", fmt.Sprintf("pod evicted by sigs.k8s.io/descheduler%s", reason))
 	}
-	return true, nil
+	return true
 }
 
 func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string) error {
@@ -192,6 +192,7 @@ func RemoveDuplicatePods(
 			}
 
 			upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
+		loop:
 			for nodeName, pods := range podNodes {
 				klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
 				// list of duplicated pods does not contain the original referential pod
@@ -199,9 +200,9 @@ func RemoveDuplicatePods(
 				// It's assumed all duplicated pods are in the same priority class
 				// TODO(jchaloup): check if the pod has a different node to lend to
 				for _, pod := range pods[upperAvg-1:] {
-					if _, err := podEvictor.EvictPod(ctx, pod); err != nil {
-						klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-						break
+					podEvictor.EvictPod(ctx, pod)
+					if podEvictor.NodeLimitExceeded(nodeMap[nodeName]) {
+						continue loop
 					}
 				}
 			}
@@ -75,9 +75,9 @@ func RemoveFailedPods(
 				continue
 			}
 
-			if _, err = podEvictor.EvictPod(ctx, pods[i]); err != nil {
-				klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-				break
+			podEvictor.EvictPod(ctx, pods[i])
+			if podEvictor.NodeLimitExceeded(node) {
+				continue
 			}
 		}
 	}
@@ -74,6 +74,7 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
 
 	switch nodeAffinity {
 	case "requiredDuringSchedulingIgnoredDuringExecution":
+	loop:
 		for _, node := range nodes {
 			klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
 
@@ -93,9 +94,9 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
 			for _, pod := range pods {
 				if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
 					klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
-					if _, err := podEvictor.EvictPod(ctx, pod); err != nil {
-						klog.ErrorS(err, "Error evicting pod")
-						break
+					podEvictor.EvictPod(ctx, pod)
+					if podEvictor.NodeLimitExceeded(node) {
+						continue loop
 					}
 				}
 			}
@@ -91,6 +91,7 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
 		}
 	}
 
+loop:
 	for _, node := range nodes {
 		klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
 		pods, err := podutil.ListAllPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -106,9 +107,9 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
 				taintFilterFnc,
 			) {
 				klog.V(2).InfoS("Not all taints with NoSchedule effect are tolerated after update for pod on node", "pod", klog.KObj(pods[i]), "node", klog.KObj(node))
-				if _, err := podEvictor.EvictPod(ctx, pods[i]); err != nil {
-					klog.ErrorS(err, "Error evicting pod")
-					break
+				podEvictor.EvictPod(ctx, pods[i])
+				if podEvictor.NodeLimitExceeded(node) {
+					continue loop
 				}
 			}
 		}
@@ -313,14 +313,8 @@ func evictPods(
 				continue
 			}
 
-			success, err := podEvictor.EvictPod(ctx, pod)
-			if err != nil {
-				klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-				break
-			}
-
-			if success {
-				klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod), "err", err)
+			if podEvictor.EvictPod(ctx, pod) {
+				klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))
 
 				for name := range totalAvailableUsage {
 					if name == v1.ResourcePods {
@@ -351,6 +345,9 @@ func evictPods(
 					break
 				}
 			}
+			if podEvictor.NodeLimitExceeded(nodeInfo.node) {
+				return
+			}
 		}
 	}
 }
@@ -75,6 +75,7 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
 		return
 	}
 
+loop:
 	for _, node := range nodes {
 		klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
 		pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -86,13 +87,7 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
 		totalPods := len(pods)
 		for i := 0; i < totalPods; i++ {
 			if checkPodsWithAntiAffinityExist(pods[i], pods) && evictorFilter.Filter(pods[i]) {
-				success, err := podEvictor.EvictPod(ctx, pods[i])
-				if err != nil {
-					klog.ErrorS(err, "Error evicting pod")
-					break
-				}
-
-				if success {
+				if podEvictor.EvictPod(ctx, pods[i]) {
 					// Since the current pod is evicted all other pods which have anti-affinity with this
 					// pod need not be evicted.
 					// Update pods.
@@ -101,6 +96,9 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
 					totalPods--
 				}
 			}
+			if podEvictor.NodeLimitExceeded(node) {
+				continue loop
+			}
 		}
 	}
 }
@@ -106,15 +106,16 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D
 	// in the event that PDB or settings such maxNoOfPodsToEvictPer* prevent too much eviction
 	podutil.SortPodsBasedOnAge(podsToEvict)
 
+	nodeLimitExceeded := map[string]bool{}
 	for _, pod := range podsToEvict {
-		success, err := podEvictor.EvictPod(ctx, pod)
-		if success {
+		if nodeLimitExceeded[pod.Spec.NodeName] {
+			continue
+		}
+		if podEvictor.EvictPod(ctx, pod) {
 			klog.V(1).InfoS("Evicted pod because it exceeded its lifetime", "pod", klog.KObj(pod), "maxPodLifeTime", *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds)
 		}
-
-		if err != nil {
-			klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-			break
+		if podEvictor.NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
+			nodeLimitExceeded[pod.Spec.NodeName] = true
 		}
 	}
 }
@@ -72,6 +72,7 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
 		return
 	}
 
+loop:
 	for _, node := range nodes {
 		klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
 		pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
@@ -89,9 +90,9 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
 			} else if restarts < strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold {
 				continue
 			}
-			if _, err := podEvictor.EvictPod(ctx, pods[i]); err != nil {
-				klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-				break
+			podEvictor.EvictPod(ctx, pods[i])
+			if podEvictor.NodeLimitExceeded(node) {
+				continue loop
 			}
 		}
 	}
@@ -183,13 +183,17 @@ func RemovePodsViolatingTopologySpreadConstraint(
 			}
 		}
 
+		nodeLimitExceeded := map[string]bool{}
 		for pod := range podsForEviction {
+			if nodeLimitExceeded[pod.Spec.NodeName] {
+				continue
+			}
 			if !isEvictable(pod) {
 				continue
 			}
-			if _, err := podEvictor.EvictPod(ctx, pod); err != nil {
-				klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
-				break
+			podEvictor.EvictPod(ctx, pod)
+			if podEvictor.NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
+				nodeLimitExceeded[pod.Spec.NodeName] = true
 			}
 		}
 	}