From 5396282e3ddfc2ab8a180ae026f8c3af39b9bc13 Mon Sep 17 00:00:00 2001
From: Jan Chaloupka
Date: Wed, 5 May 2021 14:37:09 +0200
Subject: [PATCH] RemoveDuplicates: take node taints, node affinity and node
 selector into account when computing the number of feasible nodes for the
 average occurrence of pods per node

Nodes with taints that are not tolerated by the evicted pods will never run
those pods. The same holds for node affinity and node selectors. So compute
the average occurrence of pods over the feasible nodes only, which raises the
per-node upper bound and decreases the number of evicted pods.
---
 pkg/descheduler/strategies/duplicates.go      |  81 ++++++-
 pkg/descheduler/strategies/duplicates_test.go | 211 ++++++++++++++++++
 2 files changed, 289 insertions(+), 3 deletions(-)

diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go
index 92de6ec3d..380979d6d 100644
--- a/pkg/descheduler/strategies/duplicates.go
+++ b/pkg/descheduler/strategies/duplicates.go
@@ -183,9 +183,17 @@ func RemoveDuplicatePods(
 	}
 
 	// 1. how many pods can be evicted to respect uniform placement of pods among viable nodes?
-	for ownerKey, nodes := range duplicatePods {
-		upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(nodeCount)))
-		for nodeName, pods := range nodes {
+	for ownerKey, podNodes := range duplicatePods {
+		targetNodes := getTargetNodes(podNodes, nodes)
+
+		klog.V(2).InfoS("Adjusting feasible nodes", "owner", ownerKey, "from", nodeCount, "to", len(targetNodes))
+		if len(targetNodes) < 2 {
+			klog.V(1).InfoS("Less than two feasible nodes for duplicates to land, skipping eviction", "owner", ownerKey)
+			continue
+		}
+
+		upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
+		for nodeName, pods := range podNodes {
 			klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
 			// list of duplicated pods does not contain the original referential pod
 			if len(pods)+1 > upperAvg {
@@ -202,6 +210,73 @@ func RemoveDuplicatePods(
 	}
 }
 
+func getNodeAffinityNodeSelector(pod *v1.Pod) *v1.NodeSelector {
+	if pod.Spec.Affinity == nil {
+		return nil
+	}
+	if pod.Spec.Affinity.NodeAffinity == nil {
+		return nil
+	}
+	return pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+}
+
+func getTargetNodes(podNodes map[string][]*v1.Pod, nodes []*v1.Node) []*v1.Node {
+	// In order to reduce the number of pods processed, identify pods which have
+	// equal (tolerations, node selector, node affinity) terms and consider them
+	// as identical. Pods identical with respect to these terms will produce the
+	// same result when checking whether a pod is feasible for a node. This
+	// improves the efficiency of processing the pods marked for eviction.
+ + // Collect all distinct pods which differ in at least taints, node affinity or node selector terms + distinctPods := map[*v1.Pod]struct{}{} + for _, pods := range podNodes { + for _, pod := range pods { + duplicated := false + for dp := range distinctPods { + if utils.TolerationsEqual(pod.Spec.Tolerations, dp.Spec.Tolerations) && + utils.NodeSelectorsEqual(getNodeAffinityNodeSelector(pod), getNodeAffinityNodeSelector(dp)) && + reflect.DeepEqual(pod.Spec.NodeSelector, dp.Spec.NodeSelector) { + duplicated = true + continue + } + } + if duplicated { + continue + } + distinctPods[pod] = struct{}{} + } + } + + // For each distinct pod get a list of nodes where it can land + targetNodesMap := map[string]*v1.Node{} + for pod := range distinctPods { + matchingNodes := map[string]*v1.Node{} + for _, node := range nodes { + if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool { + return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute + }) { + continue + } + if match, err := utils.PodMatchNodeSelector(pod, node); err == nil && !match { + continue + } + matchingNodes[node.Name] = node + } + if len(matchingNodes) > 1 { + for nodeName := range matchingNodes { + targetNodesMap[nodeName] = matchingNodes[nodeName] + } + } + } + + targetNodes := []*v1.Node{} + for _, node := range targetNodesMap { + targetNodes = append(targetNodes, node) + } + + return targetNodes +} + func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool { if strategy.Params == nil || strategy.Params.RemoveDuplicates == nil { return false diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go index 8121d3a77..fe8d4dab9 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/descheduler/strategies/duplicates_test.go @@ -240,6 +240,117 @@ func TestRemoveDuplicatesUniformly(t *testing.T) { } } + setTolerationsK1 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + pod.Spec.Tolerations = []v1.Toleration{ + { + Key: "k1", + Value: "v1", + Operator: v1.TolerationOpEqual, + Effect: v1.TaintEffectNoSchedule, + }, + } + } + setTolerationsK2 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + pod.Spec.Tolerations = []v1.Toleration{ + { + Key: "k2", + Value: "v2", + Operator: v1.TolerationOpEqual, + Effect: v1.TaintEffectNoSchedule, + }, + } + } + + setMasterNoScheduleTaint := func(node *v1.Node) { + node.Spec.Taints = []v1.Taint{ + { + Effect: v1.TaintEffectNoSchedule, + Key: "node-role.kubernetes.io/master", + }, + } + } + + setMasterNoScheduleLabel := func(node *v1.Node) { + if node.ObjectMeta.Labels == nil { + node.ObjectMeta.Labels = map[string]string{} + } + node.ObjectMeta.Labels["node-role.kubernetes.io/master"] = "" + } + + setWorkerLabel := func(node *v1.Node) { + if node.ObjectMeta.Labels == nil { + node.ObjectMeta.Labels = map[string]string{} + } + node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k1" + node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k2" + } + + setNotMasterNodeSelectorK1 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + pod.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "node-role.kubernetes.io/master", + Operator: v1.NodeSelectorOpDoesNotExist, + }, + { + Key: "k1", + Operator: 
v1.NodeSelectorOpDoesNotExist, + }, + }, + }, + }, + }, + }, + } + } + + setNotMasterNodeSelectorK2 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + pod.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "node-role.kubernetes.io/master", + Operator: v1.NodeSelectorOpDoesNotExist, + }, + { + Key: "k2", + Operator: v1.NodeSelectorOpDoesNotExist, + }, + }, + }, + }, + }, + }, + } + } + + setWorkerLabelSelectorK1 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + if pod.Spec.NodeSelector == nil { + pod.Spec.NodeSelector = map[string]string{} + } + pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k1" + } + + setWorkerLabelSelectorK2 := func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + if pod.Spec.NodeSelector == nil { + pod.Spec.NodeSelector = map[string]string{} + } + pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k2" + } + testCases := []struct { description string maxPodsToEvictPerNode int @@ -393,6 +504,106 @@ func TestRemoveDuplicatesUniformly(t *testing.T) { }, strategy: api.DeschedulerStrategy{}, }, + { + description: "Evict pods uniformly respecting taints", + pods: []v1.Pod{ + // (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions + *test.BuildTestPod("p1", 100, 0, "worker1", setTolerationsK1), + *test.BuildTestPod("p2", 100, 0, "worker1", setTolerationsK2), + *test.BuildTestPod("p3", 100, 0, "worker1", setTolerationsK1), + *test.BuildTestPod("p4", 100, 0, "worker1", setTolerationsK2), + *test.BuildTestPod("p5", 100, 0, "worker1", setTolerationsK1), + *test.BuildTestPod("p6", 100, 0, "worker2", setTolerationsK2), + *test.BuildTestPod("p7", 100, 0, "worker2", setTolerationsK1), + *test.BuildTestPod("p8", 100, 0, "worker2", setTolerationsK2), + *test.BuildTestPod("p9", 100, 0, "worker3", setTolerationsK1), + }, + expectedEvictedPodCount: 2, + nodes: []*v1.Node{ + test.BuildTestNode("worker1", 2000, 3000, 10, nil), + test.BuildTestNode("worker2", 2000, 3000, 10, nil), + test.BuildTestNode("worker3", 2000, 3000, 10, nil), + test.BuildTestNode("master1", 2000, 3000, 10, setMasterNoScheduleTaint), + test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleTaint), + test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleTaint), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods uniformly respecting RequiredDuringSchedulingIgnoredDuringExecution node affinity", + pods: []v1.Pod{ + // (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions + *test.BuildTestPod("p1", 100, 0, "worker1", setNotMasterNodeSelectorK1), + *test.BuildTestPod("p2", 100, 0, "worker1", setNotMasterNodeSelectorK2), + *test.BuildTestPod("p3", 100, 0, "worker1", setNotMasterNodeSelectorK1), + *test.BuildTestPod("p4", 100, 0, "worker1", setNotMasterNodeSelectorK2), + *test.BuildTestPod("p5", 100, 0, "worker1", setNotMasterNodeSelectorK1), + *test.BuildTestPod("p6", 100, 0, "worker2", setNotMasterNodeSelectorK2), + *test.BuildTestPod("p7", 100, 0, "worker2", setNotMasterNodeSelectorK1), + *test.BuildTestPod("p8", 100, 0, "worker2", setNotMasterNodeSelectorK2), + *test.BuildTestPod("p9", 100, 0, "worker3", setNotMasterNodeSelectorK1), + }, + expectedEvictedPodCount: 2, + nodes: []*v1.Node{ + test.BuildTestNode("worker1", 2000, 3000, 10, nil), + test.BuildTestNode("worker2", 2000, 3000, 10, nil), + test.BuildTestNode("worker3", 2000, 3000, 10, nil), + test.BuildTestNode("master1", 2000, 
3000, 10, setMasterNoScheduleLabel),
+				test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleLabel),
+				test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleLabel),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
+		{
+			description: "Evict pods uniformly respecting node selector",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
+			},
+			expectedEvictedPodCount: 2,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("worker2", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("worker3", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("master1", 2000, 3000, 10, nil),
+				test.BuildTestNode("master2", 2000, 3000, 10, nil),
+				test.BuildTestNode("master3", 2000, 3000, 10, nil),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
+		{
+			description: "Evict pods uniformly respecting node selector with zero target nodes",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0) -> no node carries the worker label, so there are no feasible target nodes -> 0 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
+			},
+			expectedEvictedPodCount: 0,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker2", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker3", 2000, 3000, 10, nil),
+				test.BuildTestNode("master1", 2000, 3000, 10, nil),
+				test.BuildTestNode("master2", 2000, 3000, 10, nil),
+				test.BuildTestNode("master3", 2000, 3000, 10, nil),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
 	}
 
 	for _, testCase := range testCases {
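
For reference, a minimal, self-contained Go sketch (not part of the patch) of the arithmetic the adjusted strategy performs. The figures mirror the "Evict pods uniformly respecting taints" test case above: 9 replicas of one owner spread as (5,3,1) across the three untainted workers, with the tainted masters excluded; all identifiers below are illustrative only.

package main

import (
	"fmt"
	"math"
)

func main() {
	// 9 replicas of one owner; only the 3 untainted workers are feasible,
	// whereas the old code divided by all 6 nodes (nodeCount).
	ownerOccurrence := 9
	feasibleNodes := 3

	// Upper bound on how many replicas a single feasible node should keep.
	upperAvg := int(math.Ceil(float64(ownerOccurrence) / float64(feasibleNodes))) // 3

	// Per-node duplicate counts exclude the one referential pod, hence the +1 below.
	duplicatesPerNode := map[string]int{"worker1": 4, "worker2": 2, "worker3": 0}

	evictions := 0
	for node, dups := range duplicatesPerNode {
		if dups+1 > upperAvg {
			fmt.Printf("%s: evict %d pod(s)\n", node, dups+1-upperAvg)
			evictions += dups + 1 - upperAvg
		}
	}
	fmt.Println("total evictions:", evictions) // 2, matching expectedEvictedPodCount
}

Had the average still been taken over all six nodes, upperAvg would drop to ceil(9/6) = 2 and four pods would be evicted instead of two, which is exactly the over-eviction the patch avoids.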
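The feasibility filter in getTargetNodes relies on the descheduler's own helpers (utils.TolerationsTolerateTaintsWithFilter and utils.PodMatchNodeSelector). The sketch below is a simplified stand-in built only on upstream Kubernetes API helpers (v1.Toleration.ToleratesTaint and apimachinery label selectors); it covers taints and pod.Spec.NodeSelector but, unlike the real helper, does not evaluate required node-affinity terms. The function names toleratesNodeTaints, matchesNodeSelector and feasibleNodes are hypothetical and exist only for this illustration.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
)

// toleratesNodeTaints reports whether the pod tolerates every NoSchedule/NoExecute
// taint on the node (the same taint-effect filter getTargetNodes applies).
func toleratesNodeTaints(pod *v1.Pod, node *v1.Node) bool {
	for i := range node.Spec.Taints {
		taint := &node.Spec.Taints[i]
		if taint.Effect != v1.TaintEffectNoSchedule && taint.Effect != v1.TaintEffectNoExecute {
			continue
		}
		tolerated := false
		for j := range pod.Spec.Tolerations {
			if pod.Spec.Tolerations[j].ToleratesTaint(taint) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			return false
		}
	}
	return true
}

// matchesNodeSelector checks pod.Spec.NodeSelector against the node's labels.
// Note: the descheduler helper additionally evaluates required node-affinity terms.
func matchesNodeSelector(pod *v1.Pod, node *v1.Node) bool {
	if len(pod.Spec.NodeSelector) == 0 {
		return true
	}
	return labels.SelectorFromSet(labels.Set(pod.Spec.NodeSelector)).Matches(labels.Set(node.Labels))
}

// feasibleNodes returns the nodes the pod could land on under the two checks above.
func feasibleNodes(pod *v1.Pod, nodes []*v1.Node) []*v1.Node {
	var out []*v1.Node
	for _, node := range nodes {
		if toleratesNodeTaints(pod, node) && matchesNodeSelector(pod, node) {
			out = append(out, node)
		}
	}
	return out
}

func main() {
	master := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "master1"},
		Spec: v1.NodeSpec{Taints: []v1.Taint{{
			Key:    "node-role.kubernetes.io/master",
			Effect: v1.TaintEffectNoSchedule,
		}}},
	}
	worker := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "worker1"}}
	pod := &v1.Pod{} // no tolerations, no node selector

	for _, n := range feasibleNodes(pod, []*v1.Node{master, worker}) {
		fmt.Println("feasible:", n.Name) // prints only "feasible: worker1"
	}
}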