diff --git a/README.md b/README.md index 6af68990b..16fe806ba 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ $ kubectl create -f descheduler-job.yaml ## Policy and Strategies Descheduler's policy is configurable and includes strategies to be enabled or disabled. -Four strategies, `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, `RemovePodsViolatingNodeAffinity` are currently implemented. +Five strategies, `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, `RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints` are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too. By default, all strategies are enabled. @@ -248,7 +248,17 @@ strategies: nodeAffinityType: - "requiredDuringSchedulingIgnoredDuringExecution" ``` +### RemovePodsViolatingNodeTaints +This strategy makes sure that pods violating NoSchedule taints on nodes are removed. For example, there is a pod "podA" with a toleration for the taint ``key=value:NoSchedule`` scheduled and running on the tainted node. If the node's taint is subsequently updated/removed, the taint is no longer satisfied by the pod's tolerations and the pod will be evicted. The policy file should look like: + +```` +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "RemovePodsViolatingNodeTaints": + enabled: true +```` ## Pod Evictions When the descheduler decides to evict pods from a node, it employs following general mechanism: @@ -271,7 +281,6 @@ disruption budget (PDB). The pods are evicted by using eviction subresource to h This roadmap is not in any particular order. 
-* Strategy to consider taints and tolerations
 * Consideration of pod affinity
 * Strategy to consider pod life time
 * Strategy to consider number of pending pods
diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go
index f23427472..3bcc4da5f 100644
--- a/pkg/descheduler/descheduler.go
+++ b/pkg/descheduler/descheduler.go
@@ -65,6 +65,6 @@ func Run(rs *options.DeschedulerServer) error {
 	strategies.LowNodeUtilization(rs, deschedulerPolicy.Strategies["LowNodeUtilization"], evictionPolicyGroupVersion, nodes, nodePodCount)
 	strategies.RemovePodsViolatingInterPodAntiAffinity(rs, deschedulerPolicy.Strategies["RemovePodsViolatingInterPodAntiAffinity"], evictionPolicyGroupVersion, nodes, nodePodCount)
 	strategies.RemovePodsViolatingNodeAffinity(rs, deschedulerPolicy.Strategies["RemovePodsViolatingNodeAffinity"], evictionPolicyGroupVersion, nodes, nodePodCount)
-
+	strategies.RemovePodsViolatingNodeTaints(rs, deschedulerPolicy.Strategies["RemovePodsViolatingNodeTaints"], evictionPolicyGroupVersion, nodes, nodePodCount)
 	return nil
 }
diff --git a/pkg/descheduler/strategies/node_taint.go b/pkg/descheduler/strategies/node_taint.go
new file mode 100644
index 000000000..d2903e817
--- /dev/null
+++ b/pkg/descheduler/strategies/node_taint.go
@@ -0,0 +1,148 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package strategies
+
+import (
+	"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
+	"sigs.k8s.io/descheduler/pkg/api"
+	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
+	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
+
+	"k8s.io/api/core/v1"
+	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/klog"
+)
+
+const (
+	// TolerationOpExists is the operator that tolerates a taint regardless of its value.
+	TolerationOpExists v1.TolerationOperator = "Exists"
+	// TolerationOpEqual is the operator that tolerates a taint only when the values are equal.
+	TolerationOpEqual v1.TolerationOperator = "Equal"
+)
+
+// RemovePodsViolatingNodeTaints evicts pods that do not tolerate the NoSchedule
+// taints of the node they run on. It does nothing when the strategy is disabled.
+func RemovePodsViolatingNodeTaints(ds *options.DeschedulerServer, strategy api.DeschedulerStrategy, policyGroupVersion string, nodes []*v1.Node, nodePodCount nodePodEvictedCount) {
+	if !strategy.Enabled {
+		return
+	}
+	deletePodsViolatingNodeTaints(ds.Client, policyGroupVersion, nodes, ds.DryRun, nodePodCount, ds.MaxNoOfPodsToEvictPerNode, ds.EvictLocalStoragePods)
+}
+
+// deletePodsViolatingNodeTaints evicts the evictable pods on each node that do
+// not tolerate all of that node's NoSchedule taints, and returns the number of
+// pods evicted by this call.
+//
+// nodePodCount is shared with the other strategies (see Run), so it is used only
+// to enforce maxPodsToEvict per node; the return value is tracked separately so
+// that evictions made by earlier strategies are not counted again.
+func deletePodsViolatingNodeTaints(client clientset.Interface, policyGroupVersion string, nodes []*v1.Node, dryRun bool, nodePodCount nodePodEvictedCount, maxPodsToEvict int, evictLocalStoragePods bool) int {
+	podsEvicted := 0
+	for _, node := range nodes {
+		klog.V(1).Infof("Processing node: %#v\n", node.Name)
+		pods, err := podutil.ListEvictablePodsOnNode(client, node, evictLocalStoragePods)
+		if err != nil {
+			// Stop processing, but still report what was evicted on earlier nodes.
+			klog.Errorf("Error when listing evictable pods on node: %#v (%#v)\n", node.Name, err)
+			return podsEvicted
+		}
+		for _, pod := range pods {
+			if maxPodsToEvict > 0 && nodePodCount[node]+1 > maxPodsToEvict {
+				break
+			}
+			if checkPodsSatisfyTolerations(pod, node) {
+				continue
+			}
+			success, err := evictions.EvictPod(client, pod, policyGroupVersion, dryRun)
+			if !success {
+				klog.Errorf("Error when evicting pod: %#v (%#v)\n", pod.Name, err)
+				continue
+			}
+			nodePodCount[node]++
+			podsEvicted++
+			klog.V(1).Infof("Evicted pod: %#v (%#v)", pod.Name, err)
+		}
+	}
+	return podsEvicted
+}
+
+// checkPodsSatisfyTolerations reports whether the pod's tolerations cover every
+// NoSchedule taint on the node. Taints with other effects are ignored.
+func checkPodsSatisfyTolerations(pod *v1.Pod, node *v1.Node) bool {
+	taints := node.Spec.Taints
+	if len(taints) == 0 {
+		return true
+	}
+	if !allTaintsTolerated(getNoScheduleTaints(taints), pod.Spec.Tolerations) {
+		klog.V(2).Infof("Not all taints are tolerated after update for Pod %v on node %v", pod.Name, node.Name)
+		return false
+	}
+	return true
+}
+
+// getNoScheduleTaints returns the subset of the given taints whose effect is NoSchedule.
+func getNoScheduleTaints(taints []v1.Taint) []v1.Taint {
+	result := []v1.Taint{}
+	for i := range taints {
+		if taints[i].Effect == v1.TaintEffectNoSchedule {
+			result = append(result, taints[i])
+		}
+	}
+	return result
+}
+
+// toleratesTaint returns true if the toleration tolerates the taint. An empty
+// toleration key or effect matches any taint key or effect respectively, and an
+// empty operator is treated as Equal.
+func toleratesTaint(toleration *v1.Toleration, taint *v1.Taint) bool {
+	if (len(toleration.Key) > 0 && toleration.Key != taint.Key) ||
+		(len(toleration.Effect) > 0 && toleration.Effect != taint.Effect) {
+		return false
+	}
+	switch toleration.Operator {
+	// empty operator means Equal
+	case "", TolerationOpEqual:
+		return toleration.Value == taint.Value
+	case TolerationOpExists:
+		return true
+	default:
+		return false
+	}
+}
+
+// allTaintsTolerated returns true if every taint is tolerated by at least one toleration, or false otherwise.
+func allTaintsTolerated(taints []v1.Taint, tolerations []v1.Toleration) bool {
+	if len(taints) == 0 {
+		return true
+	}
+	if len(tolerations) == 0 { // taints is non-empty here, so nothing can tolerate them
+		return false
+	}
+	for i := range taints {
+		tolerated := false
+		for j := range tolerations {
+			if toleratesTaint(&tolerations[j], &taints[i]) {
+				tolerated = true
+				break
+			}
+		}
+		if !tolerated {
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go
new file mode 100644
index 000000000..88b72c02f
--- /dev/null
+++ b/pkg/descheduler/strategies/node_taint_test.go
@@ -0,0 +1,307 @@
+package strategies
+
+import (
+	"fmt"
+	"testing"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/kubernetes/fake"
+	core "k8s.io/client-go/testing"
+	"sigs.k8s.io/descheduler/test"
+)
+
+// createNoScheduleTaint returns a NoSchedule taint whose key and value are the
+// given key and value, each suffixed with index.
+func createNoScheduleTaint(key, value string, index int) v1.Taint {
+	return v1.Taint{
+		Key:    key + fmt.Sprintf("%v", index),
+		Value:  value + fmt.Sprintf("%v", index),
+		Effect: v1.TaintEffectNoSchedule,
+	}
+}
+
+// addTaintsToNode replaces the node's taints with one NoSchedule taint per
+// index, built from key and value, and returns the same node.
+func addTaintsToNode(node *v1.Node, key, value string, indices []int) *v1.Node {
+	taints := []v1.Taint{}
+	for _, index := range indices {
+		taints = append(taints, createNoScheduleTaint(key, value, index))
+	}
+	node.Spec.Taints = taints
+	return node
+}
+
+// addTolerationToPod replaces the pod's tolerations with a single NoSchedule
+// toleration for key+index / value+index and returns the same pod.
+func addTolerationToPod(pod *v1.Pod, key, value string, index int) *v1.Pod {
+	if pod.Annotations == nil {
+		pod.Annotations = map[string]string{}
+	}
+
+	pod.Spec.Tolerations = []v1.Toleration{{Key: key + fmt.Sprintf("%v", index), Value: value + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoSchedule}}
+
+	return pod
+}
+
+// TestDeletePodsViolatingNodeTaints checks how many pods are evicted for
+// different combinations of node taints, pod tolerations and eviction limits.
+func TestDeletePodsViolatingNodeTaints(t *testing.T) {
+
+	node1 := test.BuildTestNode("n1", 2000, 3000, 10)
+	node1 = addTaintsToNode(node1, "testTaint", "test", []int{1})
+	node2 := test.BuildTestNode("n2", 2000, 3000, 10)
+	node2 = addTaintsToNode(node2, "testingTaint", "testing", []int{1})
+
+	p1 := test.BuildTestPod("p1", 100, 0, node1.Name)
+	p2 := test.BuildTestPod("p2", 100, 0, node1.Name)
+	p3 := test.BuildTestPod("p3", 100, 0, node1.Name)
+	p4 := test.BuildTestPod("p4", 100, 0, node1.Name)
+	p5 := test.BuildTestPod("p5", 100, 0, node1.Name)
+	p6 := test.BuildTestPod("p6", 100, 0, node1.Name)
+
+	p1.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p2.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p3.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p4.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p5.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p6.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p7 := test.BuildTestPod("p7", 100, 0, node2.Name)
+	p8 := test.BuildTestPod("p8", 100, 0, node2.Name)
+	p9 := test.BuildTestPod("p9", 100, 0, node2.Name)
+	p10 := test.BuildTestPod("p10", 100, 0, node2.Name)
+	p11 := test.BuildTestPod("p11", 100, 0, node2.Name)
+	p11.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+
+	// The following 4 pods won't get evicted.
+	// A Critical Pod.
+	p7.Namespace = "kube-system"
+	p7.Annotations = test.GetCriticalPodAnnotation()
+
+	// A daemonset.
+	p8.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
+	// A pod with local storage.
+	p9.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p9.Spec.Volumes = []v1.Volume{
+		{
+			Name: "sample",
+			VolumeSource: v1.VolumeSource{
+				HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
+				EmptyDir: &v1.EmptyDirVolumeSource{
+					SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
+			},
+		},
+	}
+	// A Mirror Pod.
+	p10.Annotations = test.GetMirrorPodAnnotation()
+
+	p1 = addTolerationToPod(p1, "testTaint", "test", 1)
+	p3 = addTolerationToPod(p3, "testTaint", "test", 1)
+	p4 = addTolerationToPod(p4, "testTaintX", "testX", 1)
+
+	tests := []struct {
+		description             string
+		nodes                   []*v1.Node
+		pods                    []v1.Pod
+		evictLocalStoragePods   bool
+		npe                     nodePodEvictedCount
+		maxPodsToEvict          int
+		expectedEvictedPodCount int
+	}{
+
+		{
+			description:             "Pods not tolerating node taint should be evicted",
+			pods:                    []v1.Pod{*p1, *p2, *p3},
+			nodes:                   []*v1.Node{node1},
+			evictLocalStoragePods:   false,
+			npe:                     nodePodEvictedCount{node1: 0},
+			maxPodsToEvict:          0,
+			expectedEvictedPodCount: 1, //p2 gets evicted
+		},
+		{
+			description:             "Pods with tolerations but not tolerating node taint should be evicted",
+			pods:                    []v1.Pod{*p1, *p3, *p4},
+			nodes:                   []*v1.Node{node1},
+			evictLocalStoragePods:   false,
+			npe:                     nodePodEvictedCount{node1: 0},
+			maxPodsToEvict:          0,
+			expectedEvictedPodCount: 1, //p4 gets evicted
+		},
+		{
+			description:             "Only number of Pods not tolerating node taint should be evicted",
+			pods:                    []v1.Pod{*p1, *p5, *p6},
+			nodes:                   []*v1.Node{node1},
+			evictLocalStoragePods:   false,
+			npe:                     nodePodEvictedCount{node1: 0},
+			maxPodsToEvict:          1,
+			expectedEvictedPodCount: 1, //p5 or p6 gets evicted
+		},
+		{
+			description:             "Critical pods not tolerating node taint should not be evicted",
+			pods:                    []v1.Pod{*p7, *p8, *p9, *p10},
+			nodes:                   []*v1.Node{node2},
+			evictLocalStoragePods:   false,
+			npe:                     nodePodEvictedCount{node2: 0},
+			maxPodsToEvict:          0,
+			expectedEvictedPodCount: 0,
+		},
+		{
+			description:             "Critical pods except storage pods not tolerating node taint should not be evicted",
+			pods:                    []v1.Pod{*p7, *p8, *p9, *p10},
+			nodes:                   []*v1.Node{node2},
+			evictLocalStoragePods:   true,
+			npe:                     nodePodEvictedCount{node2: 0},
+			maxPodsToEvict:          0,
+			expectedEvictedPodCount: 1,
+		},
+		{
+			description:             "Critical and non critical pods, only non critical pods not tolerating node taint should be evicted",
+			pods:                    []v1.Pod{*p7, *p8, *p10, *p11},
+			nodes:                   []*v1.Node{node2},
+			evictLocalStoragePods:   false,
+			npe:                     nodePodEvictedCount{node2: 0},
+			maxPodsToEvict:          0,
+			expectedEvictedPodCount: 1,
+		},
+	}
+
+	for _, tc := range tests {
+
+		// create fake client
+		fakeClient := &fake.Clientset{}
+		fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
+			return true, &v1.PodList{Items: tc.pods}, nil
+		})
+
+		actualEvictedPodCount := deletePodsViolatingNodeTaints(fakeClient, "v1", tc.nodes, false, tc.npe, tc.maxPodsToEvict, tc.evictLocalStoragePods)
+		if actualEvictedPodCount != tc.expectedEvictedPodCount {
+			t.Errorf("Test %#v failed, Unexpected no of pods evicted: pods evicted: %d, expected: %d", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount)
+		}
+	}
+
+}
+
+// TestToleratesTaint checks toleration/taint matching for the Exists and Equal operators.
+func TestToleratesTaint(t *testing.T) {
+
+	testCases := []struct {
+		description     string
+		toleration      v1.Toleration
+		taint           v1.Taint
+		expectTolerated bool
+	}{
+		{
+			description: "toleration and taint have the same key and effect, and operator is Exists, and taint has no value, expect tolerated",
+			toleration: v1.Toleration{
+				Key:      "foo",
+				Operator: TolerationOpExists,
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Effect: v1.TaintEffectNoSchedule,
+			},
+			expectTolerated: true,
+		},
+		{
+			description: "toleration and taint have the same key and effect, and operator is Exists, and taint has some value, expect tolerated",
+			toleration: v1.Toleration{
+				Key:      "foo",
+				Operator: TolerationOpExists,
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Value:  "bar",
+				Effect: v1.TaintEffectNoSchedule,
+			},
+			expectTolerated: true,
+		},
+		{
+			description: "toleration and taint have the same effect, toleration has empty key and operator is Exists, means match all taints, expect tolerated",
+			toleration: v1.Toleration{
+				Key:      "",
+				Operator: TolerationOpExists,
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Value:  "bar",
+				Effect: v1.TaintEffectNoSchedule,
+			},
+			expectTolerated: true,
+		},
+		{
+			description: "toleration and taint have the same key, effect and value, and operator is Equal, expect tolerated",
+			toleration: v1.Toleration{
+				Key:      "foo",
+				Operator: TolerationOpEqual,
+				Value:    "bar",
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Value:  "bar",
+				Effect: v1.TaintEffectNoSchedule,
+			},
+			expectTolerated: true,
+		},
+		{
+			description: "toleration and taint have the same key and effect, but different values, and operator is Equal, expect not tolerated",
+			toleration: v1.Toleration{
+				Key:      "foo",
+				Operator: TolerationOpEqual,
+				Value:    "value1",
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Value:  "value2",
+				Effect: v1.TaintEffectNoSchedule,
+			},
+			expectTolerated: false,
+		},
+		{
+			description: "toleration and taint have the same key and value, but different effects, and operator is Equal, expect not tolerated",
+			toleration: v1.Toleration{
+				Key:      "foo",
+				Operator: TolerationOpEqual,
+				Value:    "bar",
+				Effect:   v1.TaintEffectNoSchedule,
+			},
+			taint: v1.Taint{
+				Key:    "foo",
+				Value:  "bar",
+				Effect: v1.TaintEffectNoExecute,
+			},
+			expectTolerated: false,
+		},
+	}
+	for _, tc := range testCases {
+		if tolerated := toleratesTaint(&tc.toleration, &tc.taint); tc.expectTolerated != tolerated {
+			t.Errorf("[%s] expect %v, got %v: toleration %+v, taint %s", tc.description, tc.expectTolerated, tolerated, tc.toleration, tc.taint.ToString())
+		}
+	}
+}
+
+// TestFilterNoExecuteTaints checks that getNoScheduleTaints drops taints whose
+// effect is not NoSchedule.
+func TestFilterNoExecuteTaints(t *testing.T) {
+	taints := []v1.Taint{
+		{
+			Key:    "one",
+			Value:  "one",
+			Effect: v1.TaintEffectNoExecute,
+		},
+		{
+			Key:    "two",
+			Value:  "two",
+			Effect: v1.TaintEffectNoSchedule,
+		},
+	}
+	taints = getNoScheduleTaints(taints)
+	if len(taints) != 1 || taints[0].Key != "two" {
+		t.Errorf("Filtering doesn't work. Got %v", taints)
+	}
+}