From 3eca2782d41393284784d07de8e1e88fee545445 Mon Sep 17 00:00:00 2001 From: Jan Chaloupka Date: Thu, 28 Apr 2022 12:00:28 +0200 Subject: [PATCH] Addressing review comments Both LowNode and HighNode utilization strategies evict only as many pods as there's free resources on other nodes. Thus, the resource fit test is always true by definition. --- README.md | 24 ++-- pkg/descheduler/node/node.go | 76 ++++++------- pkg/descheduler/node/node_test.go | 6 - pkg/descheduler/strategies/node_taint_test.go | 8 +- .../lownodeutilization_test.go | 106 ------------------ .../nodeutilization/nodeutilization.go | 8 +- .../strategies/topologyspreadconstraint.go | 18 ++- 7 files changed, 70 insertions(+), 176 deletions(-) diff --git a/README.md b/README.md index 1d39039ab..068e45a48 100644 --- a/README.md +++ b/README.md @@ -226,10 +226,10 @@ If that parameter is set to `true`, the thresholds are considered as percentage `thresholds` will be deducted from the mean among all nodes and `targetThresholds` will be added to the mean. A resource consumption above (resp. below) this window is considered as overutilization (resp. underutilization). -**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. -This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same -design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands -like `kubectl top`) may differ from the calculated consumption, due to these components reporting +**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. +This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same +design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands +like `kubectl top`) may differ from the calculated consumption, due to these components reporting actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project. **Parameters:** @@ -280,8 +280,8 @@ under utilized frequently or for a short period of time. By default, `numberOfNo ### HighNodeUtilization -This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be -scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help +This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be +scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help trigger down scaling of under utilized nodes. This strategy **must** be used with the scheduler scoring strategy `MostAllocated`. The parameters of this strategy are configured under `nodeResourceUtilizationThresholds`. @@ -300,10 +300,10 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho so that they can be recreated in appropriately utilized nodes. The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero. -**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. -This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same -design for scheduling pods onto nodes. 
This means that resource usage as reported by Kubelet (or commands -like `kubectl top`) may differ from the calculated consumption, due to these components reporting +**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. +This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same +design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands +like `kubectl top`) may differ from the calculated consumption, due to these components reporting actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project. **Parameters:** @@ -737,9 +737,9 @@ The following strategies accept a `nodeFit` boolean parameter which can optimize If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. If a pod cannot be rescheduled to another node, it will not be evicted. Currently the following criteria are considered when setting `nodeFit` to `true`: - A `nodeSelector` on the pod -- Any `Tolerations` on the pod and any `Taints` on the other nodes +- Any `tolerations` on the pod and any `taints` on the other nodes - `nodeAffinity` on the pod -- Resource `Requests` made by the pod and the resources available on other nodes +- Resource `requests` made by the pod and the resources available on other nodes - Whether any of the other nodes are marked as `unschedulable` E.g. diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go index a088facb3..e95606325 100644 --- a/pkg/descheduler/node/node.go +++ b/pkg/descheduler/node/node.go @@ -99,28 +99,27 @@ func IsReady(node *v1.Node) bool { return true } -// NodeFit returns true if the provided pod can probably be scheduled onto the provided node. +// NodeFit returns true if the provided pod can be scheduled onto the provided node. // This function is used when the NodeFit pod filtering feature of the Descheduler is enabled. // This function currently considers a subset of the Kubernetes Scheduler's predicates when // deciding if a pod would fit on a node, but more predicates may be added in the future. func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) []error { // Check node selector and required affinity var errors []error - ok, err := utils.PodMatchNodeSelector(pod, node) - if err != nil { + if ok, err := utils.PodMatchNodeSelector(pod, node); err != nil { errors = append(errors, err) } else if !ok { errors = append(errors, fmt.Errorf("pod node selector does not match the node label")) } // Check taints (we only care about NoSchedule and NoExecute taints) - ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool { + ok := utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool { return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute }) if !ok { errors = append(errors, fmt.Errorf("pod does not tolerate taints on the node")) } // Check if the pod can fit on a node based off it's requests - ok, reqErrors := FitsRequest(nodeIndexer, pod, node) + ok, reqErrors := fitsRequest(nodeIndexer, pod, node) if !ok { errors = append(errors, reqErrors...) 
} @@ -132,7 +131,7 @@ func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v return errors } -// PodFitsAnyOtherNode checks if the given pod will probably fit any of the given nodes, besides the node +// PodFitsAnyOtherNode checks if the given pod will fit any of the given nodes, besides the node // the pod is already running on. The predicates used to determine if the pod will fit can be found in the NodeFit function. func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool { for _, node := range nodes { @@ -155,7 +154,7 @@ func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1. return false } -// PodFitsAnyNode checks if the given pod will probably fit any of the given nodes. The predicates used +// PodFitsAnyNode checks if the given pod will fit any of the given nodes. The predicates used // to determine if the pod will fit can be found in the NodeFit function. func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool { for _, node := range nodes { @@ -173,7 +172,7 @@ func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, return false } -// PodFitsCurrentNode checks if the given pod will probably fit onto the given node. The predicates used +// PodFitsCurrentNode checks if the given pod will fit onto the given node. The predicates used // to determine if the pod will fit can be found in the NodeFit function. func PodFitsCurrentNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) bool { errors := NodeFit(nodeIndexer, pod, node) @@ -195,9 +194,9 @@ func IsNodeUnschedulable(node *v1.Node) bool { return node.Spec.Unschedulable } -// FitsRequest determines if a pod can fit on a node based on that pod's requests. It returns true if +// fitsRequest determines if a pod can fit on a node based on its resource requests. It returns true if // the pod will fit. 
-func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) { +func fitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) { var insufficientResources []error // Get pod requests @@ -207,7 +206,7 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod resourceNames = append(resourceNames, name) } - availableResources, err := NodeAvailableResources(nodeIndexer, node, resourceNames) + availableResources, err := nodeAvailableResources(nodeIndexer, node, resourceNames) if err != nil { return false, []error{err} } @@ -215,15 +214,8 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod podFitsOnNode := true for _, resource := range resourceNames { podResourceRequest := podRequests[resource] - var requestTooLarge bool - switch resource { - case v1.ResourceCPU: - requestTooLarge = podResourceRequest.MilliValue() > availableResources[resource].MilliValue() - default: - requestTooLarge = podResourceRequest.Value() > availableResources[resource].Value() - } - - if requestTooLarge { + availableResource, ok := availableResources[resource] + if !ok || podResourceRequest.MilliValue() > availableResource.MilliValue() { insufficientResources = append(insufficientResources, fmt.Errorf("insufficient %v", resource)) podFitsOnNode = false } @@ -231,18 +223,34 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod return podFitsOnNode, insufficientResources } -// NodeAvailableResources returns resources mapped to the quanitity available on the node. -func NodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) { +// nodeAvailableResources returns resources mapped to the quanitity available on the node. +func nodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) { podsOnNode, err := podutil.ListPodsOnANode(node.Name, nodeIndexer, nil) if err != nil { return nil, err } - aggregatePodRequests := AggregatePodRequests(podsOnNode, resourceNames) - return nodeRemainingResources(node, aggregatePodRequests, resourceNames), nil + nodeUtilization := NodeUtilization(podsOnNode, resourceNames) + remainingResources := map[v1.ResourceName]*resource.Quantity{ + v1.ResourceCPU: resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-nodeUtilization[v1.ResourceCPU].MilliValue(), resource.DecimalSI), + v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-nodeUtilization[v1.ResourceMemory].Value(), resource.BinarySI), + v1.ResourcePods: resource.NewQuantity(node.Status.Allocatable.Pods().Value()-nodeUtilization[v1.ResourcePods].Value(), resource.DecimalSI), + } + for _, name := range resourceNames { + if !IsBasicResource(name) { + if _, exists := node.Status.Allocatable[name]; exists { + allocatableResource := node.Status.Allocatable[name] + remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-nodeUtilization[name].Value(), resource.DecimalSI) + } else { + remainingResources[name] = resource.NewQuantity(0, resource.DecimalSI) + } + } + } + + return remainingResources, nil } -// AggregatePodRequests returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated. 
-func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity { +// NodeUtilization returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated. +func NodeUtilization(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity { totalReqs := map[v1.ResourceName]*resource.Quantity{ v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI), v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI), @@ -269,22 +277,6 @@ func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v return totalReqs } -func nodeRemainingResources(node *v1.Node, aggregatePodRequests map[v1.ResourceName]*resource.Quantity, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity { - remainingResources := map[v1.ResourceName]*resource.Quantity{ - v1.ResourceCPU: resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-aggregatePodRequests[v1.ResourceCPU].MilliValue(), resource.DecimalSI), - v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-aggregatePodRequests[v1.ResourceMemory].Value(), resource.BinarySI), - v1.ResourcePods: resource.NewQuantity(node.Status.Allocatable.Pods().Value()-aggregatePodRequests[v1.ResourcePods].Value(), resource.DecimalSI), - } - for _, name := range resourceNames { - if !IsBasicResource(name) { - allocatableResource := node.Status.Allocatable[name] - remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-aggregatePodRequests[name].Value(), resource.DecimalSI) - } - } - - return remainingResources -} - // IsBasicResource checks if resource is basic native. func IsBasicResource(name v1.ResourceName) bool { switch name { diff --git a/pkg/descheduler/node/node_test.go b/pkg/descheduler/node/node_test.go index deb086cd9..bd22542ec 100644 --- a/pkg/descheduler/node/node_test.go +++ b/pkg/descheduler/node/node_test.go @@ -26,7 +26,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" - core "k8s.io/client-go/testing" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/test" ) @@ -196,11 +195,6 @@ func TestPodFitsCurrentNode(t *testing.T) { }, } - fakeClient := &fake.Clientset{} - fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { - return true, &v1.PodList{Items: nil}, nil - }) - for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go index bacb663bb..ae2d06d2e 100644 --- a/pkg/descheduler/strategies/node_taint_test.go +++ b/pkg/descheduler/strategies/node_taint_test.go @@ -70,13 +70,17 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { Unschedulable: true, } }) - node5 := test.BuildTestNode("n5", 1, 1, 1, nil) node5 := test.BuildTestNode("n5", 2000, 3000, 10, nil) node5.Spec.Taints = []v1.Taint{ createPreferNoScheduleTaint("testTaint", "test", 1), } + node6 := test.BuildTestNode("n6", 1, 1, 1, nil) + node6.Spec.Taints = []v1.Taint{ + createPreferNoScheduleTaint("testTaint", "test", 1), + } + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil) p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil) @@ -293,7 +297,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { { 
description: "Critical and non critical pods, pods not tolerating node taint can't be evicted because the only available node does not have enough resources.", pods: []*v1.Pod{p2, p7, p9, p10}, - nodes: []*v1.Node{node1, node5}, + nodes: []*v1.Node{node1, node6}, evictLocalStoragePods: false, evictSystemCriticalPods: true, expectedEvictedPodCount: 0, //p2 and p7 can't be evicted diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go index 06ccf3232..3196825f6 100644 --- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go +++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go @@ -695,112 +695,6 @@ func TestLowNodeUtilization(t *testing.T) { expectedPodsEvicted: 2, evictedPods: []string{}, }, - { - name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5", - thresholds: api.ResourceThresholds{ - v1.ResourceCPU: 30, - v1.ResourcePods: 30, - }, - targetThresholds: api.ResourceThresholds{ - v1.ResourceCPU: 50, - v1.ResourcePods: 50, - }, - nodes: []*v1.Node{ - test.BuildTestNode(n1NodeName, 500, 200, 9, nil), - test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) { - node.ObjectMeta.Labels = map[string]string{ - nodeSelectorKey: notMatchingNodeSelectorValue, - } - }), - }, - pods: []*v1.Pod{ - test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef), - test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef), - test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef), - // These won't be evicted. - test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) { - // A pod that requests too much CPU - test.SetNormalOwnerRef(pod) - }), - test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) { - // A pod with local storage. - test.SetNormalOwnerRef(pod) - pod.Spec.Volumes = []v1.Volume{ - { - Name: "sample", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, - EmptyDir: &v1.EmptyDirVolumeSource{ - SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, - }, - }, - } - // A Mirror Pod. - pod.Annotations = test.GetMirrorPodAnnotation() - }), - test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) { - // A Critical Pod. - pod.Namespace = "kube-system" - priority := utils.SystemCriticalPriority - pod.Spec.Priority = &priority - }), - test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef), - }, - expectedPodsEvicted: 3, - }, - { - name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5", - thresholds: api.ResourceThresholds{ - v1.ResourceCPU: 30, - v1.ResourcePods: 30, - }, - targetThresholds: api.ResourceThresholds{ - v1.ResourceCPU: 50, - v1.ResourcePods: 50, - }, - nodes: []*v1.Node{ - test.BuildTestNode(n1NodeName, 500, 200, 9, nil), - test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) { - node.ObjectMeta.Labels = map[string]string{ - nodeSelectorKey: notMatchingNodeSelectorValue, - } - }), - }, - pods: []*v1.Pod{ - test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef), - test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef), - test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef), - // These won't be evicted. - test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) { - // A pod that requests too much CPU - test.SetNormalOwnerRef(pod) - }), - test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) { - // A pod with local storage. 
- test.SetNormalOwnerRef(pod) - pod.Spec.Volumes = []v1.Volume{ - { - Name: "sample", - VolumeSource: v1.VolumeSource{ - HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, - EmptyDir: &v1.EmptyDirVolumeSource{ - SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, - }, - }, - } - // A Mirror Pod. - pod.Annotations = test.GetMirrorPodAnnotation() - }), - test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) { - // A Critical Pod. - pod.Namespace = "kube-system" - priority := utils.SystemCriticalPriority - pod.Spec.Priority = &priority - }), - test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef), - }, - expectedPodsEvicted: 3, - }, } for _, test := range testCases { diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go index 3fcd411e3..87f2ac34f 100644 --- a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go +++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go @@ -157,7 +157,7 @@ func getNodeUsage( nodeUsageList = append(nodeUsageList, NodeUsage{ node: node, - usage: nodeUtilization(node, pods, resourceNames), + usage: nodeutil.NodeUtilization(pods, resourceNames), allPods: pods, }) } @@ -279,7 +279,7 @@ func evictPodsFromSourceNodes( for _, node := range sourceNodes { klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage) - nonRemovablePods, removablePods := classifyPods(ctx, node.allPods, podFilter) + nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter) klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods)) if len(removablePods) == 0 { @@ -413,7 +413,7 @@ func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName { return resourceNames } -func classifyPods(ctx context.Context, pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) { +func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) { var nonRemovablePods, removablePods []*v1.Pod for _, pod := range pods { @@ -437,7 +437,7 @@ func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.G numberOfNodes-- continue } - usage := nodeUtilization(node, pods, resourceNames) + usage := nodeutil.NodeUtilization(pods, resourceNames) nodeCapacity := node.Status.Capacity if len(node.Status.Allocatable) > 0 { nodeCapacity = node.Status.Allocatable diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go index 98c472aee..035896bbc 100644 --- a/pkg/descheduler/strategies/topologyspreadconstraint.go +++ b/pkg/descheduler/strategies/topologyspreadconstraint.go @@ -168,7 +168,7 @@ func RemovePodsViolatingTopologySpreadConstraint( klog.V(2).InfoS("Skipping topology constraint because it is already balanced", "constraint", constraint) continue } - balanceDomains(ctx, client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes) + balanceDomains(client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes) } } @@ -223,7 +223,6 @@ func topologyIsBalanced(topology map[topologyPair][]*v1.Pod, constraint v1.Topol // [5, 5, 5, 5, 5, 5] // (assuming even distribution by the scheduler of the evicted pods) func balanceDomains( - ctx context.Context, client clientset.Interface, 
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, podsForEviction map[*v1.Pod]struct{}, @@ -234,7 +233,7 @@ func balanceDomains( nodes []*v1.Node) { idealAvg := sumPods / float64(len(constraintTopologies)) - sortedDomains := sortDomains(ctx, constraintTopologies, isEvictable) + sortedDomains := sortDomains(constraintTopologies, isEvictable) // i is the index for belowOrEqualAvg // j is the index for aboveAvg i := 0 @@ -274,6 +273,17 @@ func balanceDomains( // also (just for tracking), add them to the list of pods in the lower topology aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:] for k := range aboveToEvict { + // PodFitsAnyOtherNode excludes the current node because, for the sake of domain balancing only, we care about if there is any other + // place it could theoretically fit. + // If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading + // Also, if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node, + // don't bother evicting it as it will just end up back on the same node + // however we still account for it "being evicted" so the algorithm can complete + // TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod + // but still try to get the required # of movePods (instead of just chopping that value off the slice above). + // In other words, PTS can perform suboptimally if some of its chosen pods don't fit on other nodes. + // This is because the chosen pods aren't sorted, but immovable pods still count as "evicted" toward the PTS algorithm. + // So, a better selection heuristic could improve performance. if !node.PodFitsAnyOtherNode(getPodsAssignedToNode, aboveToEvict[k], nodes) { klog.V(2).InfoS("ignoring pod for eviction as it does not fit on any other node", "pod", klog.KObj(aboveToEvict[k])) continue @@ -293,7 +303,7 @@ func balanceDomains( // 3. pods in descending priority // 4. all other pods // We then pop pods off the back of the list for eviction -func sortDomains(ctx context.Context, constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology { +func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology { sortedTopologies := make([]topology, 0, len(constraintTopologyPairs)) // sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted) for pair, list := range constraintTopologyPairs {
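
A few illustrative notes on the behaviour this patch documents and consolidates follow; none of the code below is the repository's code verbatim.

The README hunk above describes the `useDeviationThresholds` mode: `thresholds` are deducted from, and `targetThresholds` added to, the mean utilization across nodes, and usage outside that window counts as under- or overutilization. A minimal sketch of that arithmetic, with made-up example numbers:

```go
package main

import "fmt"

func main() {
	// Example values only; the real numbers come from summing pod requests per node.
	meanCPUPercent := 60.0  // mean CPU request utilization across all nodes
	threshold := 10.0       // `thresholds` entry for CPU (deviation mode)
	targetThreshold := 15.0 // `targetThresholds` entry for CPU (deviation mode)

	low := meanCPUPercent - threshold        // usage below this is underutilization
	high := meanCPUPercent + targetThreshold // usage above this is overutilization

	fmt.Printf("underutilized below %.0f%%, overutilized above %.0f%%\n", low, high)
}
```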
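The core of the node.go change is that `FitsRequest` and `NodeAvailableResources` become unexported, remaining capacity is computed directly from node allocatable minus the requests of pods already on the node (via the renamed `NodeUtilization` helper), and a resource the node does not offer is treated as insufficient. A simplified, self-contained sketch of that logic, under stated assumptions: it ignores init containers and pod overhead, and the helper names are placeholders, not the repository's API.

```go
package nodefitsketch

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// sumRequests totals the container requests of the given pods for the listed
// resource names, roughly what the renamed NodeUtilization helper returns.
func sumRequests(pods []*v1.Pod, names []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
	totals := map[v1.ResourceName]*resource.Quantity{}
	for _, name := range names {
		totals[name] = resource.NewQuantity(0, resource.DecimalSI)
	}
	for _, pod := range pods {
		for _, c := range pod.Spec.Containers {
			for _, name := range names {
				if req, ok := c.Resources.Requests[name]; ok {
					totals[name].Add(req)
				}
			}
		}
	}
	return totals
}

// fitsNode returns one error per resource for which the pod's request does not
// fit into node allocatable minus what the pods already on the node request.
// A resource the node does not advertise at all counts as insufficient, which
// mirrors the "!ok || request > available" comparison introduced by the patch.
func fitsNode(node *v1.Node, podsOnNode []*v1.Pod, podRequests v1.ResourceList) []error {
	names := make([]v1.ResourceName, 0, len(podRequests))
	for name := range podRequests {
		names = append(names, name)
	}
	used := sumRequests(podsOnNode, names)

	var errs []error
	for _, name := range names {
		alloc, ok := node.Status.Allocatable[name]
		if !ok {
			errs = append(errs, fmt.Errorf("insufficient %v", name))
			continue
		}
		remaining := alloc.DeepCopy()
		remaining.Sub(*used[name])
		request := podRequests[name]
		// Comparing in milli-units keeps CPU (milli-based) and the other
		// resources on the same scale, as the patched fitsRequest does.
		if request.MilliValue() > remaining.MilliValue() {
			errs = append(errs, fmt.Errorf("insufficient %v", name))
		}
	}
	return errs
}
```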
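Finally, the comment added to `balanceDomains` explains that a pod which fits no other node is skipped for eviction but still counted toward the number of pods to move, which is why topology spreading can balance suboptimally until the selection heuristic is improved. A hypothetical toy filter showing that behaviour; the `fitsAnyOtherNode` callback stands in for `node.PodFitsAnyOtherNode` and is not the repository's signature.

```go
package topologysketch

import v1 "k8s.io/api/core/v1"

// pickEvictable returns the subset of candidates that could land somewhere
// else, plus how many pods the balancing math will treat as "moved". Note the
// second value is len(candidates): immovable pods are skipped but still
// consume the move budget, matching the TODO in the patched comment.
func pickEvictable(candidates []*v1.Pod, fitsAnyOtherNode func(*v1.Pod) bool) ([]*v1.Pod, int) {
	var evictable []*v1.Pod
	for _, pod := range candidates {
		if !fitsAnyOtherNode(pod) {
			// Evicting this pod would only reschedule it back onto the same
			// node (e.g. a hard nodeSelector that matches nothing else).
			continue
		}
		evictable = append(evictable, pod)
	}
	return evictable, len(candidates)
}
```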