diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go index dd4be54a9..92de6ec3d 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/descheduler/strategies/duplicates.go @@ -19,6 +19,7 @@ package strategies import ( "context" "fmt" + "math" "reflect" "sort" "strings" @@ -50,6 +51,11 @@ func validateRemoveDuplicatePodsParams(params *api.StrategyParameters) error { return nil } +type podOwner struct { + namespace, kind, name string + imagesHash string +} + // RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same // namespace, and have at least one container with the same image. @@ -79,6 +85,11 @@ func RemoveDuplicatePods( evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + duplicatePods := make(map[podOwner]map[string][]*v1.Pod) + ownerKeyOccurence := make(map[podOwner]int32) + nodeCount := 0 + nodeMap := make(map[string]*v1.Node) + for _, node := range nodes { klog.V(1).InfoS("Processing node", "node", klog.KObj(node)) pods, err := podutil.ListPodsOnANode(ctx, @@ -92,8 +103,8 @@ func RemoveDuplicatePods( klog.ErrorS(err, "Error listing evictable pods on node", "node", klog.KObj(node)) continue } - - duplicatePods := make([]*v1.Pod, 0, len(pods)) + nodeMap[node.Name] = node + nodeCount++ // Each pod has a list of owners and a list of containers, and each container has 1 image spec. // For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string. // All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness. @@ -114,12 +125,25 @@ func RemoveDuplicatePods( continue } podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers)) + imageList := []string{} + for _, container := range pod.Spec.Containers { + imageList = append(imageList, container.Image) + } + sort.Strings(imageList) + imagesHash := strings.Join(imageList, "#") for _, ownerRef := range ownerRefList { - for _, container := range pod.Spec.Containers { + ownerKey := podOwner{ + namespace: pod.ObjectMeta.Namespace, + kind: ownerRef.Kind, + name: ownerRef.Name, + imagesHash: imagesHash, + } + ownerKeyOccurence[ownerKey] = ownerKeyOccurence[ownerKey] + 1 + for _, image := range imageList { // Namespace/Kind/Name should be unique for the cluster. // We also consider the image, as 2 pods could have the same owner but serve different purposes // So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod. - s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, container.Image}, "/") + s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, image}, "/") podContainerKeys = append(podContainerKeys, s) } } @@ -131,7 +155,19 @@ func RemoveDuplicatePods( for _, keys := range existing { if reflect.DeepEqual(keys, podContainerKeys) { matched = true - duplicatePods = append(duplicatePods, pod) + klog.V(3).InfoS("Duplicate found", "pod", klog.KObj(pod)) + for _, ownerRef := range ownerRefList { + ownerKey := podOwner{ + namespace: pod.ObjectMeta.Namespace, + kind: ownerRef.Kind, + name: ownerRef.Name, + imagesHash: imagesHash, + } + if _, ok := duplicatePods[ownerKey]; !ok { + duplicatePods[ownerKey] = make(map[string][]*v1.Pod) + } + duplicatePods[ownerKey][node.Name] = append(duplicatePods[ownerKey][node.Name], pod) + } break } } @@ -144,11 +180,23 @@ func RemoveDuplicatePods( duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys} } } + } - for _, pod := range duplicatePods { - if _, err := podEvictor.EvictPod(ctx, pod, node, "RemoveDuplicatePods"); err != nil { - klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod)) - break + // 1. how many pods can be evicted to respect uniform placement of pods among viable nodes? + for ownerKey, nodes := range duplicatePods { + upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(nodeCount))) + for nodeName, pods := range nodes { + klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg) + // list of duplicated pods does not contain the original referential pod + if len(pods)+1 > upperAvg { + // It's assumed all duplicated pods are in the same priority class + // TODO(jchaloup): check if the pod has a different node to lend to + for _, pod := range pods[upperAvg-1:] { + if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil { + klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod)) + break + } + } } } } diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go index a15196e10..0b9060009 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/descheduler/strategies/duplicates_test.go @@ -20,8 +20,9 @@ import ( "context" "testing" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" @@ -31,34 +32,45 @@ import ( "sigs.k8s.io/descheduler/test" ) +func buildTestPodWithImage(podName, node, image string) *v1.Pod { + pod := test.BuildTestPod(podName, 100, 0, node, test.SetRSOwnerRef) + pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{ + Name: image, + Image: image, + }) + return pod +} + func TestFindDuplicatePods(t *testing.T) { ctx := context.Background() // first setup pods - node := test.BuildTestNode("n1", 2000, 3000, 10, nil) - p1 := test.BuildTestPod("p1", 100, 0, node.Name, nil) + node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) + node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil) + + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) p1.Namespace = "dev" - p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil) + p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil) p2.Namespace = "dev" - p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil) + p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil) p3.Namespace = "dev" - p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil) - p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil) - p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil) - p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil) + p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil) + p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil) + p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil) + p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil) p7.Namespace = "kube-system" - p8 := test.BuildTestPod("p8", 100, 0, node.Name, nil) + p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil) p8.Namespace = "test" - p9 := test.BuildTestPod("p9", 100, 0, node.Name, nil) + p9 := test.BuildTestPod("p9", 100, 0, node1.Name, nil) p9.Namespace = "test" - p10 := test.BuildTestPod("p10", 100, 0, node.Name, nil) + p10 := test.BuildTestPod("p10", 100, 0, node1.Name, nil) p10.Namespace = "test" - p11 := test.BuildTestPod("p11", 100, 0, node.Name, nil) + p11 := test.BuildTestPod("p11", 100, 0, node1.Name, nil) p11.Namespace = "different-images" - p12 := test.BuildTestPod("p12", 100, 0, node.Name, nil) + p12 := test.BuildTestPod("p12", 100, 0, node1.Name, nil) p12.Namespace = "different-images" - p13 := test.BuildTestPod("p13", 100, 0, node.Name, nil) + p13 := test.BuildTestPod("p13", 100, 0, node1.Name, nil) p13.Namespace = "different-images" - p14 := test.BuildTestPod("p14", 100, 0, node.Name, nil) + p14 := test.BuildTestPod("p14", 100, 0, node1.Name, nil) p14.Namespace = "different-images" // ### Evictable Pods ### @@ -121,10 +133,10 @@ func TestFindDuplicatePods(t *testing.T) { strategy api.DeschedulerStrategy }{ { - description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 2 should be evicted.", + description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 1 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3}, - expectedEvictedPodCount: 2, + expectedEvictedPodCount: 1, strategy: api.DeschedulerStrategy{}, }, { @@ -135,17 +147,17 @@ func TestFindDuplicatePods(t *testing.T) { strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}}, }, { - description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 2 should be evicted.", + description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 1 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p8, *p9, *p10}, - expectedEvictedPodCount: 2, + expectedEvictedPodCount: 1, strategy: api.DeschedulerStrategy{}, }, { description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10}, - expectedEvictedPodCount: 4, + expectedEvictedPodCount: 2, strategy: api.DeschedulerStrategy{}, }, { @@ -159,7 +171,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Test all Pods: 4 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10}, - expectedEvictedPodCount: 4, + expectedEvictedPodCount: 2, strategy: api.DeschedulerStrategy{}, }, { @@ -186,27 +198,220 @@ func TestFindDuplicatePods(t *testing.T) { } for _, testCase := range testCases { - fakeClient := &fake.Clientset{} - fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { - return true, &v1.PodList{Items: testCase.pods}, nil - }) - fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { - return true, node, nil - }) - podEvictor := evictions.NewPodEvictor( - fakeClient, - "v1", - false, - testCase.maxPodsToEvictPerNode, - []*v1.Node{node}, - false, - ) + t.Run(testCase.description, func(t *testing.T) { + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + return true, &v1.PodList{Items: testCase.pods}, nil + }) + podEvictor := evictions.NewPodEvictor( + fakeClient, + "v1", + false, + testCase.maxPodsToEvictPerNode, + []*v1.Node{node1, node2}, + false, + ) - RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node}, podEvictor) - podsEvicted := podEvictor.TotalEvicted() - if podsEvicted != testCase.expectedEvictedPodCount { - t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) - } + RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node1, node2}, podEvictor) + podsEvicted := podEvictor.TotalEvicted() + if podsEvicted != testCase.expectedEvictedPodCount { + t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) + } + }) } } + +func TestRemoveDuplicatesUniformly(t *testing.T) { + ctx := context.Background() + + setRSOwnerRef2 := func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-2"}, + } + } + setTwoRSOwnerRef := func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = []metav1.OwnerReference{ + {Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-1"}, + {Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-2"}, + } + } + + testCases := []struct { + description string + maxPodsToEvictPerNode int + pods []v1.Pod + nodes []*v1.Node + expectedEvictedPodCount int + strategy api.DeschedulerStrategy + }{ + { + description: "Evict pods uniformly", + pods: []v1.Pod{ + // (5,3,1) -> (3,3,3) -> 2 evictions + *test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef), + }, + expectedEvictedPodCount: 2, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods uniformly with one node left out", + pods: []v1.Pod{ + // (5,3,1) -> (4,4,1) -> 1 eviction + *test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef), + }, + expectedEvictedPodCount: 1, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods uniformly with two replica sets", + pods: []v1.Pod{ + // (5,3,1) -> (3,3,3) -> 2 evictions + *test.BuildTestPod("p11", 100, 0, "n1", setTwoRSOwnerRef), + *test.BuildTestPod("p12", 100, 0, "n1", setTwoRSOwnerRef), + *test.BuildTestPod("p13", 100, 0, "n1", setTwoRSOwnerRef), + *test.BuildTestPod("p14", 100, 0, "n1", setTwoRSOwnerRef), + *test.BuildTestPod("p15", 100, 0, "n1", setTwoRSOwnerRef), + *test.BuildTestPod("p16", 100, 0, "n2", setTwoRSOwnerRef), + *test.BuildTestPod("p17", 100, 0, "n2", setTwoRSOwnerRef), + *test.BuildTestPod("p18", 100, 0, "n2", setTwoRSOwnerRef), + *test.BuildTestPod("p19", 100, 0, "n3", setTwoRSOwnerRef), + }, + expectedEvictedPodCount: 4, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods uniformly with two owner references", + pods: []v1.Pod{ + // (5,3,1) -> (3,3,3) -> 2 evictions + *test.BuildTestPod("p11", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p12", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p13", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p14", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p15", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p16", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p17", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p18", 100, 0, "n2", test.SetRSOwnerRef), + *test.BuildTestPod("p19", 100, 0, "n3", test.SetRSOwnerRef), + // (1,3,5) -> (3,3,3) -> 2 evictions + *test.BuildTestPod("p21", 100, 0, "n1", setRSOwnerRef2), + *test.BuildTestPod("p22", 100, 0, "n2", setRSOwnerRef2), + *test.BuildTestPod("p23", 100, 0, "n2", setRSOwnerRef2), + *test.BuildTestPod("p24", 100, 0, "n2", setRSOwnerRef2), + *test.BuildTestPod("p25", 100, 0, "n3", setRSOwnerRef2), + *test.BuildTestPod("p26", 100, 0, "n3", setRSOwnerRef2), + *test.BuildTestPod("p27", 100, 0, "n3", setRSOwnerRef2), + *test.BuildTestPod("p28", 100, 0, "n3", setRSOwnerRef2), + *test.BuildTestPod("p29", 100, 0, "n3", setRSOwnerRef2), + }, + expectedEvictedPodCount: 4, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods with number of pods less than nodes", + pods: []v1.Pod{ + // (2,0,0) -> (1,1,0) -> 1 eviction + *test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef), + *test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef), + }, + expectedEvictedPodCount: 1, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods with number of pods less than nodes, but ignore different pods with the same ownerref", + pods: []v1.Pod{ + // (1, 0, 0) for "bar","baz" images -> no eviction, even with a matching ownerKey + // (2, 0, 0) for "foo" image -> (1,1,0) - 1 eviction + // In this case the only "real" duplicates are p1 and p4, so one of those should be evicted + *buildTestPodWithImage("p1", "n1", "foo"), + *buildTestPodWithImage("p2", "n1", "bar"), + *buildTestPodWithImage("p3", "n1", "baz"), + *buildTestPodWithImage("p4", "n1", "foo"), + }, + expectedEvictedPodCount: 1, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Evict pods with a single pod with three nodes", + pods: []v1.Pod{ + // (2,0,0) -> (1,1,0) -> 1 eviction + *test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef), + }, + expectedEvictedPodCount: 0, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, nil), + test.BuildTestNode("n2", 2000, 3000, 10, nil), + test.BuildTestNode("n3", 2000, 3000, 10, nil), + }, + strategy: api.DeschedulerStrategy{}, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + return true, &v1.PodList{Items: testCase.pods}, nil + }) + podEvictor := evictions.NewPodEvictor( + fakeClient, + "v1", + false, + testCase.maxPodsToEvictPerNode, + testCase.nodes, + false, + ) + + RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor) + podsEvicted := podEvictor.TotalEvicted() + if podsEvicted != testCase.expectedEvictedPodCount { + t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) + } + }) + } +}