diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go
index 92de6ec3d..380979d6d 100644
--- a/pkg/descheduler/strategies/duplicates.go
+++ b/pkg/descheduler/strategies/duplicates.go
@@ -183,9 +183,17 @@ func RemoveDuplicatePods(
 	}
 
 	// 1. how many pods can be evicted to respect uniform placement of pods among viable nodes?
-	for ownerKey, nodes := range duplicatePods {
-		upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(nodeCount)))
-		for nodeName, pods := range nodes {
+	for ownerKey, podNodes := range duplicatePods {
+		targetNodes := getTargetNodes(podNodes, nodes)
+
+		klog.V(2).InfoS("Adjusting feasible nodes", "owner", ownerKey, "from", nodeCount, "to", len(targetNodes))
+		if len(targetNodes) < 2 {
+			klog.V(1).InfoS("Less than two feasible nodes for duplicates to land, skipping eviction", "owner", ownerKey)
+			continue
+		}
+
+		upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
+		for nodeName, pods := range podNodes {
 			klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
 			// list of duplicated pods does not contain the original referential pod
 			if len(pods)+1 > upperAvg {
@@ -202,6 +210,77 @@ func RemoveDuplicatePods(
 	}
 }
 
+func getNodeAffinityNodeSelector(pod *v1.Pod) *v1.NodeSelector {
+	if pod.Spec.Affinity == nil {
+		return nil
+	}
+	if pod.Spec.Affinity.NodeAffinity == nil {
+		return nil
+	}
+	return pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+}
+
+func getTargetNodes(podNodes map[string][]*v1.Pod, nodes []*v1.Node) []*v1.Node {
+	// In order to reduce the number of pods processed, identify pods which have
+	// equal (tolerations, node selector, node affinity) terms and consider them
+	// identical. Pods identical w.r.t. (tolerations, node selector, node affinity)
+	// produce the same result when checking whether a pod is feasible for a node.
+	// This improves the efficiency of processing the pods marked for eviction.
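+	//
+	// For example, with nine duplicates of a single owner, three schedulable workers
+	// and three NoSchedule masters, only the three workers are feasible, so
+	// RemoveDuplicatePods computes upperAvg = ceil(9/3) = 3 instead of ceil(9/6) = 2.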
+
+	// Collect all distinct pods which differ in at least tolerations, node affinity or node selector terms
+	distinctPods := map[*v1.Pod]struct{}{}
+	for _, pods := range podNodes {
+		for _, pod := range pods {
+			duplicated := false
+			for dp := range distinctPods {
+				if utils.TolerationsEqual(pod.Spec.Tolerations, dp.Spec.Tolerations) &&
+					utils.NodeSelectorsEqual(getNodeAffinityNodeSelector(pod), getNodeAffinityNodeSelector(dp)) &&
+					reflect.DeepEqual(pod.Spec.NodeSelector, dp.Spec.NodeSelector) {
+					duplicated = true
+					continue
+				}
+			}
+			if duplicated {
+				continue
+			}
+			distinctPods[pod] = struct{}{}
+		}
+	}
+
+	// For each distinct pod get a list of nodes where it can land
+	targetNodesMap := map[string]*v1.Node{}
+	for pod := range distinctPods {
+		matchingNodes := map[string]*v1.Node{}
+		for _, node := range nodes {
+			if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
+				return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
+			}) {
+				continue
+			}
+			if match, err := utils.PodMatchNodeSelector(pod, node); err == nil && !match {
+				continue
+			}
+			matchingNodes[node.Name] = node
+		}
+		if len(matchingNodes) > 1 {
+			for nodeName := range matchingNodes {
+				targetNodesMap[nodeName] = matchingNodes[nodeName]
+			}
+		}
+	}
+
+	targetNodes := []*v1.Node{}
+	for _, node := range targetNodesMap {
+		targetNodes = append(targetNodes, node)
+	}
+
+	return targetNodes
+}
+
 func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool {
 	if strategy.Params == nil || strategy.Params.RemoveDuplicates == nil {
 		return false
diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go
index 8121d3a77..fe8d4dab9 100644
--- a/pkg/descheduler/strategies/duplicates_test.go
+++ b/pkg/descheduler/strategies/duplicates_test.go
@@ -240,6 +240,117 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
 		}
 	}
 
+	setTolerationsK1 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		pod.Spec.Tolerations = []v1.Toleration{
+			{
+				Key:      "k1",
+				Value:    "v1",
+				Operator: v1.TolerationOpEqual,
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+		}
+	}
+	setTolerationsK2 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		pod.Spec.Tolerations = []v1.Toleration{
+			{
+				Key:      "k2",
+				Value:    "v2",
+				Operator: v1.TolerationOpEqual,
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+		}
+	}
+
+	setMasterNoScheduleTaint := func(node *v1.Node) {
+		node.Spec.Taints = []v1.Taint{
+			{
+				Effect: v1.TaintEffectNoSchedule,
+				Key:    "node-role.kubernetes.io/master",
+			},
+		}
+	}
+
+	setMasterNoScheduleLabel := func(node *v1.Node) {
+		if node.ObjectMeta.Labels == nil {
+			node.ObjectMeta.Labels = map[string]string{}
+		}
+		node.ObjectMeta.Labels["node-role.kubernetes.io/master"] = ""
+	}
+
+	setWorkerLabel := func(node *v1.Node) {
+		if node.ObjectMeta.Labels == nil {
+			node.ObjectMeta.Labels = map[string]string{}
+		}
+		node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k1"
+		node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k2"
+	}
+
+	setNotMasterNodeSelectorK1 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		pod.Spec.Affinity = &v1.Affinity{
+			NodeAffinity: &v1.NodeAffinity{
+				RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+					NodeSelectorTerms: []v1.NodeSelectorTerm{
+						{
+							MatchExpressions: []v1.NodeSelectorRequirement{
+								{
+									Key:      "node-role.kubernetes.io/master",
+									Operator: v1.NodeSelectorOpDoesNotExist,
+								},
+								{
+									Key:      "k1",
+									Operator: v1.NodeSelectorOpDoesNotExist,
+								},
+							},
+						},
+					},
+				},
+			},
+		}
+	}
+
+	setNotMasterNodeSelectorK2 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		pod.Spec.Affinity = &v1.Affinity{
+			NodeAffinity: &v1.NodeAffinity{
+				RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+					NodeSelectorTerms: []v1.NodeSelectorTerm{
+						{
+							MatchExpressions: []v1.NodeSelectorRequirement{
+								{
+									Key:      "node-role.kubernetes.io/master",
+									Operator: v1.NodeSelectorOpDoesNotExist,
+								},
+								{
+									Key:      "k2",
+									Operator: v1.NodeSelectorOpDoesNotExist,
+								},
+							},
+						},
+					},
+				},
+			},
+		}
+	}
+
+	setWorkerLabelSelectorK1 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		if pod.Spec.NodeSelector == nil {
+			pod.Spec.NodeSelector = map[string]string{}
+		}
+		pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k1"
+	}
+
+	setWorkerLabelSelectorK2 := func(pod *v1.Pod) {
+		test.SetRSOwnerRef(pod)
+		if pod.Spec.NodeSelector == nil {
+			pod.Spec.NodeSelector = map[string]string{}
+		}
+		pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k2"
+	}
+
 	testCases := []struct {
 		description             string
 		maxPodsToEvictPerNode   int
@@ -393,6 +504,106 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
 			},
 			strategy: api.DeschedulerStrategy{},
 		},
+		{
+			description: "Evict pods uniformly respecting taints",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setTolerationsK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setTolerationsK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setTolerationsK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setTolerationsK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setTolerationsK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setTolerationsK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setTolerationsK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setTolerationsK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setTolerationsK1),
+			},
+			expectedEvictedPodCount: 2,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker2", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker3", 2000, 3000, 10, nil),
+				test.BuildTestNode("master1", 2000, 3000, 10, setMasterNoScheduleTaint),
+				test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleTaint),
+				test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleTaint),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
+		{
+			description: "Evict pods uniformly respecting RequiredDuringSchedulingIgnoredDuringExecution node affinity",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setNotMasterNodeSelectorK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setNotMasterNodeSelectorK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setNotMasterNodeSelectorK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setNotMasterNodeSelectorK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setNotMasterNodeSelectorK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setNotMasterNodeSelectorK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setNotMasterNodeSelectorK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setNotMasterNodeSelectorK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setNotMasterNodeSelectorK1),
+			},
+			expectedEvictedPodCount: 2,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker2", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker3", 2000, 3000, 10, nil),
+				test.BuildTestNode("master1", 2000, 3000, 10, setMasterNoScheduleLabel),
+				test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleLabel),
+				test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleLabel),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
+		{
+			description: "Evict pods uniformly respecting node selector",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
+			},
+			expectedEvictedPodCount: 2,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("worker2", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("worker3", 2000, 3000, 10, setWorkerLabel),
+				test.BuildTestNode("master1", 2000, 3000, 10, nil),
+				test.BuildTestNode("master2", 2000, 3000, 10, nil),
+				test.BuildTestNode("master3", 2000, 3000, 10, nil),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
+		{
+			description: "Evict pods uniformly respecting node selector with zero target nodes",
+			pods: []v1.Pod{
+				// (5,3,1,0,0,0), but no node matches the pod node selectors -> 0 evictions
+				*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
+				*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
+				*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
+			},
+			expectedEvictedPodCount: 0,
+			nodes: []*v1.Node{
+				test.BuildTestNode("worker1", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker2", 2000, 3000, 10, nil),
+				test.BuildTestNode("worker3", 2000, 3000, 10, nil),
+				test.BuildTestNode("master1", 2000, 3000, 10, nil),
+				test.BuildTestNode("master2", 2000, 3000, 10, nil),
+				test.BuildTestNode("master3", 2000, 3000, 10, nil),
+			},
+			strategy: api.DeschedulerStrategy{},
+		},
 	}
 
 	for _, testCase := range testCases {