diff --git a/README.md b/README.md
index 1d39039ab..068e45a48 100644
--- a/README.md
+++ b/README.md
@@ -226,10 +226,10 @@ If that parameter is set to `true`, the thresholds are considered as percentage
 `thresholds` will be deducted from the mean among all nodes and `targetThresholds` will be added to the mean.
 A resource consumption above (resp. below) this window is considered as overutilization (resp. underutilization).
 
-**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. 
-This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same 
-design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands 
-like `kubectl top`) may differ from the calculated consumption, due to these components reporting 
+**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
+This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
+design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
+like `kubectl top`) may differ from the calculated consumption, due to these components reporting
 actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project.
 
 **Parameters:**
@@ -280,8 +280,8 @@ under utilized frequently or for a short period of time. By default, `numberOfNo
 
 ### HighNodeUtilization
 
-This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be 
-scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help 
+This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be
+scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help
 trigger down scaling of under utilized nodes. This strategy **must** be used with the scheduler scoring strategy
 `MostAllocated`. The parameters of this strategy are configured under `nodeResourceUtilizationThresholds`.
 
@@ -300,10 +300,10 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho
 so that they can be recreated in appropriately utilized nodes. The strategy will abort if any number of
 `underutilized nodes` or `appropriately utilized nodes` is zero.
 
-**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage. 
-This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same 
-design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands 
-like `kubectl top`) may differ from the calculated consumption, due to these components reporting 
+**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
+This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
+design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
+like `kubectl top`) may differ from the calculated consumption, due to these components reporting
 actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project.
 
 **Parameters:**
@@ -737,9 +737,9 @@ The following strategies accept a `nodeFit` boolean parameter which can optimize
 If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them.
 If a pod cannot be rescheduled to another node, it will not be evicted. Currently the following criteria are considered when setting `nodeFit` to `true`:
 - A `nodeSelector` on the pod
-- Any `Tolerations` on the pod and any `Taints` on the other nodes
+- Any `tolerations` on the pod and any `taints` on the other nodes
 - `nodeAffinity` on the pod
-- Resource `Requests` made by the pod and the resources available on other nodes
+- Resource `requests` made by the pod and the resources available on other nodes
 - Whether any of the other nodes are marked as `unschedulable`
 
 E.g.
diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go
index a088facb3..e95606325 100644
--- a/pkg/descheduler/node/node.go
+++ b/pkg/descheduler/node/node.go
@@ -99,28 +99,27 @@ func IsReady(node *v1.Node) bool {
 	return true
 }
 
-// NodeFit returns true if the provided pod can probably be scheduled onto the provided node.
+// NodeFit returns true if the provided pod can be scheduled onto the provided node.
 // This function is used when the NodeFit pod filtering feature of the Descheduler is enabled.
 // This function currently considers a subset of the Kubernetes Scheduler's predicates when
 // deciding if a pod would fit on a node, but more predicates may be added in the future.
 func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) []error {
 	// Check node selector and required affinity
 	var errors []error
-	ok, err := utils.PodMatchNodeSelector(pod, node)
-	if err != nil {
+	if ok, err := utils.PodMatchNodeSelector(pod, node); err != nil {
 		errors = append(errors, err)
 	} else if !ok {
 		errors = append(errors, fmt.Errorf("pod node selector does not match the node label"))
 	}
 	// Check taints (we only care about NoSchedule and NoExecute taints)
-	ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
+	ok := utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
 		return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
 	})
 	if !ok {
 		errors = append(errors, fmt.Errorf("pod does not tolerate taints on the node"))
 	}
 	// Check if the pod can fit on a node based off it's requests
-	ok, reqErrors := FitsRequest(nodeIndexer, pod, node)
+	ok, reqErrors := fitsRequest(nodeIndexer, pod, node)
 	if !ok {
 		errors = append(errors, reqErrors...)
 	}
@@ -132,7 +131,7 @@ func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v
 	return errors
 }
 
-// PodFitsAnyOtherNode checks if the given pod will probably fit any of the given nodes, besides the node
+// PodFitsAnyOtherNode checks if the given pod will fit any of the given nodes, besides the node
 // the pod is already running on. The predicates used to determine if the pod will fit can be found in the NodeFit function.
 func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
 	for _, node := range nodes {
@@ -155,7 +154,7 @@ func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.
 	return false
 }
 
-// PodFitsAnyNode checks if the given pod will probably fit any of the given nodes. The predicates used
+// PodFitsAnyNode checks if the given pod will fit any of the given nodes. The predicates used
 // to determine if the pod will fit can be found in the NodeFit function.
 func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
 	for _, node := range nodes {
@@ -173,7 +172,7 @@ func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod,
 	return false
 }
 
-// PodFitsCurrentNode checks if the given pod will probably fit onto the given node. The predicates used
+// PodFitsCurrentNode checks if the given pod will fit onto the given node. The predicates used
 // to determine if the pod will fit can be found in the NodeFit function.
 func PodFitsCurrentNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) bool {
 	errors := NodeFit(nodeIndexer, pod, node)
@@ -195,9 +194,9 @@ func IsNodeUnschedulable(node *v1.Node) bool {
 	return node.Spec.Unschedulable
 }
 
-// FitsRequest determines if a pod can fit on a node based on that pod's requests. It returns true if
+// fitsRequest determines if a pod can fit on a node based on its resource requests. It returns true if
 // the pod will fit.
-func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) {
+func fitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) {
 	var insufficientResources []error
 
 	// Get pod requests
@@ -207,7 +206,7 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
 		resourceNames = append(resourceNames, name)
 	}
 
-	availableResources, err := NodeAvailableResources(nodeIndexer, node, resourceNames)
+	availableResources, err := nodeAvailableResources(nodeIndexer, node, resourceNames)
 	if err != nil {
 		return false, []error{err}
 	}
@@ -215,15 +214,8 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
 	podFitsOnNode := true
 	for _, resource := range resourceNames {
 		podResourceRequest := podRequests[resource]
-		var requestTooLarge bool
-		switch resource {
-		case v1.ResourceCPU:
-			requestTooLarge = podResourceRequest.MilliValue() > availableResources[resource].MilliValue()
-		default:
-			requestTooLarge = podResourceRequest.Value() > availableResources[resource].Value()
-		}
-
-		if requestTooLarge {
+		availableResource, ok := availableResources[resource]
+		if !ok || podResourceRequest.MilliValue() > availableResource.MilliValue() {
 			insufficientResources = append(insufficientResources, fmt.Errorf("insufficient %v", resource))
 			podFitsOnNode = false
 		}
@@ -231,18 +223,34 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
 	return podFitsOnNode, insufficientResources
 }
 
-// NodeAvailableResources returns resources mapped to the quanitity available on the node.
-func NodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) {
+// nodeAvailableResources returns resources mapped to the quantity available on the node.
+func nodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) {
 	podsOnNode, err := podutil.ListPodsOnANode(node.Name, nodeIndexer, nil)
 	if err != nil {
 		return nil, err
 	}
-	aggregatePodRequests := AggregatePodRequests(podsOnNode, resourceNames)
-	return nodeRemainingResources(node, aggregatePodRequests, resourceNames), nil
+	nodeUtilization := NodeUtilization(podsOnNode, resourceNames)
+	remainingResources := map[v1.ResourceName]*resource.Quantity{
+		v1.ResourceCPU:    resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-nodeUtilization[v1.ResourceCPU].MilliValue(), resource.DecimalSI),
+		v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-nodeUtilization[v1.ResourceMemory].Value(), resource.BinarySI),
+		v1.ResourcePods:   resource.NewQuantity(node.Status.Allocatable.Pods().Value()-nodeUtilization[v1.ResourcePods].Value(), resource.DecimalSI),
+	}
+	for _, name := range resourceNames {
+		if !IsBasicResource(name) {
+			if _, exists := node.Status.Allocatable[name]; exists {
+				allocatableResource := node.Status.Allocatable[name]
+				remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-nodeUtilization[name].Value(), resource.DecimalSI)
+			} else {
+				remainingResources[name] = resource.NewQuantity(0, resource.DecimalSI)
+			}
+		}
+	}
+
+	return remainingResources, nil
 }
 
-// AggregatePodRequests returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated.
-func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
+// NodeUtilization returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated.
+func NodeUtilization(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
 	totalReqs := map[v1.ResourceName]*resource.Quantity{
 		v1.ResourceCPU:    resource.NewMilliQuantity(0, resource.DecimalSI),
 		v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI),
@@ -269,22 +277,6 @@ func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v
 	return totalReqs
 }
 
-func nodeRemainingResources(node *v1.Node, aggregatePodRequests map[v1.ResourceName]*resource.Quantity, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
-	remainingResources := map[v1.ResourceName]*resource.Quantity{
-		v1.ResourceCPU:    resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-aggregatePodRequests[v1.ResourceCPU].MilliValue(), resource.DecimalSI),
-		v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-aggregatePodRequests[v1.ResourceMemory].Value(), resource.BinarySI),
-		v1.ResourcePods:   resource.NewQuantity(node.Status.Allocatable.Pods().Value()-aggregatePodRequests[v1.ResourcePods].Value(), resource.DecimalSI),
-	}
-	for _, name := range resourceNames {
-		if !IsBasicResource(name) {
-			allocatableResource := node.Status.Allocatable[name]
-			remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-aggregatePodRequests[name].Value(), resource.DecimalSI)
-		}
-	}
-
-	return remainingResources
-}
-
 // IsBasicResource checks if resource is basic native.
 func IsBasicResource(name v1.ResourceName) bool {
 	switch name {
diff --git a/pkg/descheduler/node/node_test.go b/pkg/descheduler/node/node_test.go
index deb086cd9..bd22542ec 100644
--- a/pkg/descheduler/node/node_test.go
+++ b/pkg/descheduler/node/node_test.go
@@ -26,7 +26,6 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes/fake"
-	core "k8s.io/client-go/testing"
 	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
 	"sigs.k8s.io/descheduler/test"
 )
@@ -196,11 +195,6 @@ func TestPodFitsCurrentNode(t *testing.T) {
 		},
 	}
 
-	fakeClient := &fake.Clientset{}
-	fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
-		return true, &v1.PodList{Items: nil}, nil
-	})
-
 	for _, tc := range tests {
 		t.Run(tc.description, func(t *testing.T) {
 			ctx, cancel := context.WithCancel(context.Background())
diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go
index bacb663bb..ae2d06d2e 100644
--- a/pkg/descheduler/strategies/node_taint_test.go
+++ b/pkg/descheduler/strategies/node_taint_test.go
@@ -70,13 +70,17 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
 			Unschedulable: true,
 		}
 	})
 
-	node5 := test.BuildTestNode("n5", 1, 1, 1, nil)
 	node5 := test.BuildTestNode("n5", 2000, 3000, 10, nil)
 	node5.Spec.Taints = []v1.Taint{
 		createPreferNoScheduleTaint("testTaint", "test", 1),
 	}
 
+	node6 := test.BuildTestNode("n6", 1, 1, 1, nil)
+	node6.Spec.Taints = []v1.Taint{
+		createPreferNoScheduleTaint("testTaint", "test", 1),
+	}
+
 	p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
 	p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
 	p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
@@ -293,7 +297,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
 		{
 			description:             "Critical and non critical pods, pods not tolerating node taint can't be evicted because the only available node does not have enough resources.",
 			pods:                    []*v1.Pod{p2, p7, p9, p10},
-			nodes:                   []*v1.Node{node1, node5},
+			nodes:                   []*v1.Node{node1, node6},
 			evictLocalStoragePods:   false,
 			evictSystemCriticalPods: true,
 			expectedEvictedPodCount: 0, //p2 and p7 can't be evicted
diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
index 06ccf3232..3196825f6 100644
--- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
+++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
@@ -695,112 +695,6 @@ func TestLowNodeUtilization(t *testing.T) {
 			expectedPodsEvicted: 2,
 			evictedPods:         []string{},
 		},
-		{
-			name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5",
-			thresholds: api.ResourceThresholds{
-				v1.ResourceCPU:  30,
-				v1.ResourcePods: 30,
-			},
-			targetThresholds: api.ResourceThresholds{
-				v1.ResourceCPU:  50,
-				v1.ResourcePods: 50,
-			},
-			nodes: []*v1.Node{
-				test.BuildTestNode(n1NodeName, 500, 200, 9, nil),
-				test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) {
-					node.ObjectMeta.Labels = map[string]string{
-						nodeSelectorKey: notMatchingNodeSelectorValue,
-					}
-				}),
-			},
-			pods: []*v1.Pod{
-				test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				// These won't be evicted.
-				test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) {
-					// A pod that requests too much CPU
-					test.SetNormalOwnerRef(pod)
-				}),
-				test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
-					// A pod with local storage.
-					test.SetNormalOwnerRef(pod)
-					pod.Spec.Volumes = []v1.Volume{
-						{
-							Name: "sample",
-							VolumeSource: v1.VolumeSource{
-								HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
-								EmptyDir: &v1.EmptyDirVolumeSource{
-									SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
-							},
-						},
-					}
-					// A Mirror Pod.
-					pod.Annotations = test.GetMirrorPodAnnotation()
-				}),
-				test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) {
-					// A Critical Pod.
-					pod.Namespace = "kube-system"
-					priority := utils.SystemCriticalPriority
-					pod.Spec.Priority = &priority
-				}),
-				test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
-			},
-			expectedPodsEvicted: 3,
-		},
-		{
-			name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5",
-			thresholds: api.ResourceThresholds{
-				v1.ResourceCPU:  30,
-				v1.ResourcePods: 30,
-			},
-			targetThresholds: api.ResourceThresholds{
-				v1.ResourceCPU:  50,
-				v1.ResourcePods: 50,
-			},
-			nodes: []*v1.Node{
-				test.BuildTestNode(n1NodeName, 500, 200, 9, nil),
-				test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) {
-					node.ObjectMeta.Labels = map[string]string{
-						nodeSelectorKey: notMatchingNodeSelectorValue,
-					}
-				}),
-			},
-			pods: []*v1.Pod{
-				test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef),
-				// These won't be evicted.
-				test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) {
-					// A pod that requests too much CPU
-					test.SetNormalOwnerRef(pod)
-				}),
-				test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
-					// A pod with local storage.
-					test.SetNormalOwnerRef(pod)
-					pod.Spec.Volumes = []v1.Volume{
-						{
-							Name: "sample",
-							VolumeSource: v1.VolumeSource{
-								HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
-								EmptyDir: &v1.EmptyDirVolumeSource{
-									SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
-							},
-						},
-					}
-					// A Mirror Pod.
-					pod.Annotations = test.GetMirrorPodAnnotation()
-				}),
-				test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) {
-					// A Critical Pod.
-					pod.Namespace = "kube-system"
-					priority := utils.SystemCriticalPriority
-					pod.Spec.Priority = &priority
-				}),
-				test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
-			},
-			expectedPodsEvicted: 3,
-		},
 	}
 
 	for _, test := range testCases {
diff --git a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
index 3fcd411e3..87f2ac34f 100644
--- a/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
+++ b/pkg/descheduler/strategies/nodeutilization/nodeutilization.go
@@ -157,7 +157,7 @@ func getNodeUsage(
 		nodeUsageList = append(nodeUsageList, NodeUsage{
 			node:    node,
-			usage:   nodeUtilization(node, pods, resourceNames),
+			usage:   nodeutil.NodeUtilization(pods, resourceNames),
 			allPods: pods,
 		})
 	}
 
@@ -279,7 +279,7 @@ func evictPodsFromSourceNodes(
 	for _, node := range sourceNodes {
 		klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage)
 
-		nonRemovablePods, removablePods := classifyPods(ctx, node.allPods, podFilter)
+		nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter)
 		klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods))
 
 		if len(removablePods) == 0 {
@@ -413,7 +413,7 @@ func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
 	return resourceNames
 }
 
-func classifyPods(ctx context.Context, pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
+func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
 	var nonRemovablePods, removablePods []*v1.Pod
 
 	for _, pod := range pods {
@@ -437,7 +437,7 @@ func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.G
 			numberOfNodes--
 			continue
 		}
-		usage := nodeUtilization(node, pods, resourceNames)
+		usage := nodeutil.NodeUtilization(pods, resourceNames)
 		nodeCapacity := node.Status.Capacity
 		if len(node.Status.Allocatable) > 0 {
 			nodeCapacity = node.Status.Allocatable
diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go
index 98c472aee..035896bbc 100644
--- a/pkg/descheduler/strategies/topologyspreadconstraint.go
+++ b/pkg/descheduler/strategies/topologyspreadconstraint.go
@@ -168,7 +168,7 @@ func RemovePodsViolatingTopologySpreadConstraint(
 			klog.V(2).InfoS("Skipping topology constraint because it is already balanced", "constraint", constraint)
 			continue
 		}
-		balanceDomains(ctx, client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes)
+		balanceDomains(client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes)
 	}
 }
 
@@ -223,7 +223,6 @@ func topologyIsBalanced(topology map[topologyPair][]*v1.Pod, constraint v1.Topol
 // [5, 5, 5, 5, 5, 5]
 // (assuming even distribution by the scheduler of the evicted pods)
 func balanceDomains(
-	ctx context.Context,
 	client clientset.Interface,
 	getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
 	podsForEviction map[*v1.Pod]struct{},
@@ -234,7 +233,7 @@ func balanceDomains(
 	nodes []*v1.Node) {
 
 	idealAvg := sumPods / float64(len(constraintTopologies))
-	sortedDomains := sortDomains(ctx, constraintTopologies, isEvictable)
+	sortedDomains := sortDomains(constraintTopologies, isEvictable)
 	// i is the index for belowOrEqualAvg
 	// j is the index for aboveAvg
 	i := 0
@@ -274,6 +273,17 @@ func balanceDomains(
 		// also (just for tracking), add them to the list of pods in the lower topology
 		aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
 		for k := range aboveToEvict {
+			// PodFitsAnyOtherNode excludes the current node because, for the sake of domain balancing only, we care about if there is any other
+			// place it could theoretically fit.
+			// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
+			// Also, if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
+			// don't bother evicting it as it will just end up back on the same node
+			// however we still account for it "being evicted" so the algorithm can complete
+			// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
+			// but still try to get the required # of movePods (instead of just chopping that value off the slice above).
+			// In other words, PTS can perform suboptimally if some of its chosen pods don't fit on other nodes.
+			// This is because the chosen pods aren't sorted, but immovable pods still count as "evicted" toward the PTS algorithm.
+			// So, a better selection heuristic could improve performance.
 			if !node.PodFitsAnyOtherNode(getPodsAssignedToNode, aboveToEvict[k], nodes) {
 				klog.V(2).InfoS("ignoring pod for eviction as it does not fit on any other node", "pod", klog.KObj(aboveToEvict[k]))
 				continue
@@ -293,7 +303,7 @@ func balanceDomains(
 // 3. pods in descending priority
 // 4. all other pods
 // We then pop pods off the back of the list for eviction
-func sortDomains(ctx context.Context, constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology {
+func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology {
 	sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
 	// sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted)
 	for pair, list := range constraintTopologyPairs {
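A minimal sketch of the deviation-threshold arithmetic described in the README hunks above, assuming utilization is expressed as a percentage of allocatable; `computeWindow` and its parameters are illustrative names, not descheduler API:

```go
package main

import "fmt"

// computeWindow sketches the arithmetic from the README text above: in the
// deviation mode, `thresholds` is subtracted from the mean utilization across
// all nodes and `targetThresholds` is added to it. Consumption below the low
// watermark counts as underutilization, above the high watermark as overutilization.
func computeWindow(mean, threshold, targetThreshold float64) (low, high float64) {
	low = mean - threshold
	high = mean + targetThreshold
	return low, high
}

func main() {
	// Hypothetical cluster: mean CPU utilization of 60%, thresholds of 10 / 20.
	low, high := computeWindow(60, 10, 20)
	fmt.Printf("underutilized below %.0f%%, overutilized above %.0f%%\n", low, high)
	// A node at 45% would be treated as underutilized, one at 85% as overutilized.
}
```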
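Likewise, a rough, self-contained sketch of the bookkeeping that `nodeAvailableResources` and `fitsRequest` perform in the node.go hunks above (allocatable minus the sum of pod requests, then a milli-unit comparison). It uses the real `resource.Quantity` API, but `remaining` and `fits` are illustrative helpers rather than descheduler functions:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// remaining computes allocatable minus the sum of pod requests for a single
// resource, mirroring the per-resource bookkeeping done by nodeAvailableResources.
func remaining(allocatable resource.Quantity, requests ...resource.Quantity) *resource.Quantity {
	left := resource.NewMilliQuantity(allocatable.MilliValue(), resource.DecimalSI)
	for _, r := range requests {
		left.Sub(r)
	}
	return left
}

// fits mirrors the simplified comparison in fitsRequest: the pod fits if its
// request does not exceed what is left, compared in milli-units.
func fits(request resource.Quantity, available *resource.Quantity) bool {
	return request.MilliValue() <= available.MilliValue()
}

func main() {
	allocatable := resource.MustParse("4") // 4 CPU cores allocatable on the node
	avail := remaining(allocatable, resource.MustParse("2500m"), resource.MustParse("500m"))

	fmt.Println("available:", avail.String())                            // 1 core left
	fmt.Println("1-core pod fits:", fits(resource.MustParse("1"), avail)) // true
	fmt.Println("2-core pod fits:", fits(resource.MustParse("2"), avail)) // false
}
```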
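Finally, a toy illustration of the averaging step in `balanceDomains`: the ideal average is the total pod count divided by the number of topology domains, and above-average domains shed pods down toward that average (the real code additionally caps the moves by `maxSkew` and by how much the below-average domains can absorb). The helper below is illustrative only:

```go
package main

import "fmt"

// movesTowardAverage sketches the balancing arithmetic: compute the ideal
// average number of pods per topology domain, then report how many pods each
// above-average domain would need to shed to reach that average.
func movesTowardAverage(domains map[string]int) map[string]int {
	total := 0
	for _, n := range domains {
		total += n
	}
	idealAvg := float64(total) / float64(len(domains))

	moves := map[string]int{}
	for name, n := range domains {
		if excess := float64(n) - idealAvg; excess > 0 {
			moves[name] = int(excess) // rounded down toward the average
		}
	}
	return moves
}

func main() {
	// An evenly spread set of domains needs no moves.
	fmt.Println(movesTowardAverage(map[string]int{"a": 5, "b": 5, "c": 5})) // map[]
	// A skewed domain sheds pods down toward the average of 5.
	fmt.Println(movesTowardAverage(map[string]int{"a": 9, "b": 3, "c": 3})) // map[a:4]
}
```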