diff --git a/.gitignore b/.gitignore
index eb94dbb3e..65f7e6348 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 _output/
 _tmp/
 vendordiff.patch
+.idea/
+
diff --git a/README.md b/README.md
index 3ea9a147e..4df278c17 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,22 @@ strategies:
      enabled: true
 ```
 
+### RemovePodsHavingTooManyRestarts
+
+This strategy ensures that pods having too many restarts are removed from nodes. For example, a pod with an EBS/PD volume that can't be attached to its current instance should be re-scheduled to another node.
+
+```
+apiVersion: "descheduler/v1alpha1"
+kind: "DeschedulerPolicy"
+strategies:
+  "RemovePodsHavingTooManyRestarts":
+     enabled: true
+     params:
+       podsHavingTooManyRestarts:
+         podRestartThreshold: 100
+         includingInitContainers: true
+```
+
 ## Pod Evictions
 
 When the descheduler decides to evict pods from a node, it employs the following general mechanism:
diff --git a/examples/policy.yaml b/examples/policy.yaml
index 38fd0d8f3..e1aaf624b 100644
--- a/examples/policy.yaml
+++ b/examples/policy.yaml
@@ -17,3 +17,9 @@ strategies:
        "cpu" : 50
        "memory": 50
        "pods": 50
+  "RemovePodsHavingTooManyRestarts":
+     enabled: true
+     params:
+       podsHavingTooManyRestarts:
+         podRestartThreshold: 100
+         includingInitContainers: true
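For reference, the policy entry above corresponds to the following Go structure, using the API types this patch introduces below. This is a minimal sketch for orientation, not part of the diff itself:

```go
// What the YAML above decodes into, assuming the usual import alias
// api "sigs.k8s.io/descheduler/pkg/api" for the internal API package.
strategy := api.DeschedulerStrategy{
	Enabled: true,
	Params: api.StrategyParameters{
		PodsHavingTooManyRestarts: api.PodsHavingTooManyRestarts{
			PodRestartThreshold:     100,
			IncludingInitContainers: true,
		},
	},
}
```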
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 5b1d1c30c..5d814f55c 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -48,6 +48,7 @@ type DeschedulerStrategy struct {
 type StrategyParameters struct {
 	NodeResourceUtilizationThresholds NodeResourceUtilizationThresholds
 	NodeAffinityType                  []string
+	PodsHavingTooManyRestarts         PodsHavingTooManyRestarts
 }
 
 type Percentage float64
@@ -58,3 +59,8 @@ type NodeResourceUtilizationThresholds struct {
 	TargetThresholds ResourceThresholds
 	NumberOfNodes    int
 }
+
+type PodsHavingTooManyRestarts struct {
+	PodRestartThreshold     int32
+	IncludingInitContainers bool
+}
diff --git a/pkg/api/v1alpha1/types.go b/pkg/api/v1alpha1/types.go
index 1ca0cac9b..213b63fcf 100644
--- a/pkg/api/v1alpha1/types.go
+++ b/pkg/api/v1alpha1/types.go
@@ -48,6 +48,7 @@ type DeschedulerStrategy struct {
 type StrategyParameters struct {
 	NodeResourceUtilizationThresholds NodeResourceUtilizationThresholds `json:"nodeResourceUtilizationThresholds,omitempty"`
 	NodeAffinityType                  []string                          `json:"nodeAffinityType,omitempty"`
+	PodsHavingTooManyRestarts         PodsHavingTooManyRestarts         `json:"podsHavingTooManyRestarts,omitempty"`
 }
 
 type Percentage float64
@@ -58,3 +59,8 @@ type NodeResourceUtilizationThresholds struct {
 	TargetThresholds ResourceThresholds `json:"targetThresholds,omitempty"`
 	NumberOfNodes    int                `json:"numberOfNodes,omitempty"`
 }
+
+type PodsHavingTooManyRestarts struct {
+	PodRestartThreshold     int32 `json:"podRestartThreshold,omitempty"`
+	IncludingInitContainers bool  `json:"includingInitContainers,omitempty"`
+}
diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go
index 829370b0a..d251ac64a 100644
--- a/pkg/api/v1alpha1/zz_generated.conversion.go
+++ b/pkg/api/v1alpha1/zz_generated.conversion.go
@@ -65,6 +65,16 @@ func RegisterConversions(s *runtime.Scheme) error {
 	}); err != nil {
 		return err
 	}
+	if err := s.AddGeneratedConversionFunc((*PodsHavingTooManyRestarts)(nil), (*api.PodsHavingTooManyRestarts)(nil), func(a, b interface{}, scope conversion.Scope) error {
+		return Convert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(a.(*PodsHavingTooManyRestarts), b.(*api.PodsHavingTooManyRestarts), scope)
+	}); err != nil {
+		return err
+	}
+	if err := s.AddGeneratedConversionFunc((*api.PodsHavingTooManyRestarts)(nil), (*PodsHavingTooManyRestarts)(nil), func(a, b interface{}, scope conversion.Scope) error {
+		return Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(a.(*api.PodsHavingTooManyRestarts), b.(*PodsHavingTooManyRestarts), scope)
+	}); err != nil {
+		return err
+	}
 	if err := s.AddGeneratedConversionFunc((*StrategyParameters)(nil), (*api.StrategyParameters)(nil), func(a, b interface{}, scope conversion.Scope) error {
 		return Convert_v1alpha1_StrategyParameters_To_api_StrategyParameters(a.(*StrategyParameters), b.(*api.StrategyParameters), scope)
 	}); err != nil {
@@ -150,11 +160,36 @@ func Convert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtili
 	return autoConvert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtilizationThresholds(in, out, s)
 }
 
+func autoConvert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(in *PodsHavingTooManyRestarts, out *api.PodsHavingTooManyRestarts, s conversion.Scope) error {
+	out.PodRestartThreshold = in.PodRestartThreshold
+	out.IncludingInitContainers = in.IncludingInitContainers
+	return nil
+}
+
+// Convert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts is an autogenerated conversion function.
+func Convert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(in *PodsHavingTooManyRestarts, out *api.PodsHavingTooManyRestarts, s conversion.Scope) error {
+	return autoConvert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(in, out, s)
+}
+
+func autoConvert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(in *api.PodsHavingTooManyRestarts, out *PodsHavingTooManyRestarts, s conversion.Scope) error {
+	out.PodRestartThreshold = in.PodRestartThreshold
+	out.IncludingInitContainers = in.IncludingInitContainers
+	return nil
+}
+
+// Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts is an autogenerated conversion function.
+func Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(in *api.PodsHavingTooManyRestarts, out *PodsHavingTooManyRestarts, s conversion.Scope) error {
+	return autoConvert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(in, out, s)
+}
+
 func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *StrategyParameters, out *api.StrategyParameters, s conversion.Scope) error {
 	if err := Convert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtilizationThresholds(&in.NodeResourceUtilizationThresholds, &out.NodeResourceUtilizationThresholds, s); err != nil {
 		return err
 	}
 	out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType))
+	if err := Convert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(&in.PodsHavingTooManyRestarts, &out.PodsHavingTooManyRestarts, s); err != nil {
+		return err
+	}
 	return nil
 }
 
@@ -168,6 +203,9 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S
 		return err
 	}
 	out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType))
+	if err := Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(&in.PodsHavingTooManyRestarts, &out.PodsHavingTooManyRestarts, s); err != nil {
+		return err
+	}
 	return nil
 }
 
diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go
index af41198ce..37420ba13 100644
--- a/pkg/api/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go
@@ -103,6 +103,22 @@ func (in *NodeResourceUtilizationThresholds) DeepCopy() *NodeResourceUtilization
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodsHavingTooManyRestarts) DeepCopyInto(out *PodsHavingTooManyRestarts) {
+	*out = *in
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodsHavingTooManyRestarts.
+func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts {
+	if in == nil {
+		return nil
+	}
+	out := new(PodsHavingTooManyRestarts)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) {
 	{
@@ -156,6 +172,7 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
 		*out = make([]string, len(*in))
 		copy(*out, *in)
 	}
+	out.PodsHavingTooManyRestarts = in.PodsHavingTooManyRestarts
 	return
 }
 
diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go
index 1af80dbca..e046e70c1 100644
--- a/pkg/api/zz_generated.deepcopy.go
+++ b/pkg/api/zz_generated.deepcopy.go
@@ -103,6 +103,22 @@ func (in *NodeResourceUtilizationThresholds) DeepCopy() *NodeResourceUtilization
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodsHavingTooManyRestarts) DeepCopyInto(out *PodsHavingTooManyRestarts) {
+	*out = *in
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodsHavingTooManyRestarts.
+func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts {
+	if in == nil {
+		return nil
+	}
+	out := new(PodsHavingTooManyRestarts)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) {
 	{
@@ -156,6 +172,7 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
 		*out = make([]string, len(*in))
 		copy(*out, *in)
 	}
+	out.PodsHavingTooManyRestarts = in.PodsHavingTooManyRestarts
 	return
 }
 
diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go
index 9ff210500..1f77ceeac 100644
--- a/pkg/descheduler/descheduler.go
+++ b/pkg/descheduler/descheduler.go
@@ -90,6 +90,7 @@ func RunDeschedulerStrategies(rs *options.DeschedulerServer, deschedulerPolicy *
 	strategies.RemovePodsViolatingInterPodAntiAffinity(rs, deschedulerPolicy.Strategies["RemovePodsViolatingInterPodAntiAffinity"], nodes, podEvictor)
 	strategies.RemovePodsViolatingNodeAffinity(rs, deschedulerPolicy.Strategies["RemovePodsViolatingNodeAffinity"], nodes, podEvictor)
 	strategies.RemovePodsViolatingNodeTaints(rs, deschedulerPolicy.Strategies["RemovePodsViolatingNodeTaints"], nodes, podEvictor)
+	strategies.RemovePodsHavingTooManyRestarts(rs, deschedulerPolicy.Strategies["RemovePodsHavingTooManyRestarts"], nodes, podEvictor)
 
 	// If there was no interval specified, send a signal to the stopChannel to end the wait.Until loop after 1 iteration
 	if rs.DeschedulingInterval.Seconds() == 0 {
diff --git a/pkg/descheduler/strategies/toomanyrestarts.go b/pkg/descheduler/strategies/toomanyrestarts.go
new file mode 100644
index 000000000..ab919c237
--- /dev/null
+++ b/pkg/descheduler/strategies/toomanyrestarts.go
@@ -0,0 +1,77 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package strategies
+
+import (
+	"k8s.io/api/core/v1"
+	"k8s.io/klog"
+
+	"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
+	"sigs.k8s.io/descheduler/pkg/api"
+	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
+	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
+)
+
+// RemovePodsHavingTooManyRestarts removes pods that have too many restarts on a node.
+// Many situations can lead to this, e.g. a failed volume mount or an application error caused by differing node settings.
+// As of now, this strategy won't evict daemonsets, mirror pods, critical pods, or pods with local storage.
+func RemovePodsHavingTooManyRestarts(ds *options.DeschedulerServer, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
+	if !strategy.Enabled || strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold < 1 {
+		return
+	}
+	removePodsHavingTooManyRestarts(ds, strategy, nodes, podEvictor, ds.EvictLocalStoragePods)
+}
+
+func removePodsHavingTooManyRestarts(ds *options.DeschedulerServer, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, evictLocalStoragePods bool) {
+	for _, node := range nodes {
+		klog.V(1).Infof("Processing node: %s", node.Name)
+		pods, err := podutil.ListEvictablePodsOnNode(ds.Client, node, evictLocalStoragePods)
+		if err != nil {
+			klog.Errorf("Error when listing pods on node %s", node.Name)
+			continue
+		}
+
+		for i, pod := range pods {
+			restarts, initRestarts := calcContainerRestarts(pod)
+			if strategy.Params.PodsHavingTooManyRestarts.IncludingInitContainers {
+				if restarts+initRestarts < strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold {
+					continue
+				}
+			} else if restarts < strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold {
+				continue
+			}
+			if _, err := podEvictor.EvictPod(pods[i], node); err != nil {
+				break
+			}
+		}
+	}
+}
+
+// calcContainerRestarts returns the total restart counts of a pod's containers and init containers.
+func calcContainerRestarts(pod *v1.Pod) (int32, int32) {
+	var restarts, initRestarts int32
+
+	for _, cs := range pod.Status.ContainerStatuses {
+		restarts += cs.RestartCount
+	}
+
+	for _, cs := range pod.Status.InitContainerStatuses {
+		initRestarts += cs.RestartCount
+	}
+
+	return restarts, initRestarts
+}
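To illustrate the eviction decision above, here is a minimal, self-contained sketch (not part of the patch) of the same threshold logic applied to a synthetic pod status; `exceedsThreshold` is a hypothetical name used only for this example:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// exceedsThreshold mirrors the decision in removePodsHavingTooManyRestarts:
// init-container restarts only count when includingInitContainers is set.
func exceedsThreshold(pod *v1.Pod, threshold int32, includingInitContainers bool) bool {
	var restarts, initRestarts int32
	for _, cs := range pod.Status.ContainerStatuses {
		restarts += cs.RestartCount
	}
	for _, cs := range pod.Status.InitContainerStatuses {
		initRestarts += cs.RestartCount
	}
	if includingInitContainers {
		return restarts+initRestarts >= threshold
	}
	return restarts >= threshold
}

func main() {
	pod := &v1.Pod{Status: v1.PodStatus{
		ContainerStatuses:     []v1.ContainerStatus{{RestartCount: 80}},
		InitContainerStatuses: []v1.ContainerStatus{{RestartCount: 30}},
	}}
	fmt.Println(exceedsThreshold(pod, 100, true))  // true: 80+30 >= 100
	fmt.Println(exceedsThreshold(pod, 100, false)) // false: 80 < 100
}
```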
diff --git a/pkg/descheduler/strategies/toomanyrestarts_test.go b/pkg/descheduler/strategies/toomanyrestarts_test.go
new file mode 100644
index 000000000..9948d9df7
--- /dev/null
+++ b/pkg/descheduler/strategies/toomanyrestarts_test.go
@@ -0,0 +1,191 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package strategies
+
+import (
+	"fmt"
+	"testing"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/kubernetes/fake"
+	core "k8s.io/client-go/testing"
+	"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
+	"sigs.k8s.io/descheduler/pkg/api"
+	"sigs.k8s.io/descheduler/pkg/apis/componentconfig"
+	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
+	"sigs.k8s.io/descheduler/test"
+)
+
+func initPods(node *v1.Node) []v1.Pod {
+	pods := make([]v1.Pod, 0)
+
+	for i := int32(0); i <= 9; i++ {
+		pod := test.BuildTestPod(fmt.Sprintf("pod-%d", i), 100, 0, node.Name, nil)
+		pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+
+		// pod at index i will have 20 * i container restarts and 5 * i init container restarts (25 * i in total).
+		pod.Status = v1.PodStatus{
+			InitContainerStatuses: []v1.ContainerStatus{
+				{
+					RestartCount: 5 * i,
+				},
+			},
+			ContainerStatuses: []v1.ContainerStatus{
+				{
+					RestartCount: 10 * i,
+				},
+				{
+					RestartCount: 10 * i,
+				},
+			},
+		}
+		pods = append(pods, *pod)
+	}
+
+	// The following 3 pods won't get evicted.
+	// A daemonset.
+	pods[6].ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
+	// A pod with local storage.
+	pods[7].ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	pods[7].Spec.Volumes = []v1.Volume{
+		{
+			Name: "sample",
+			VolumeSource: v1.VolumeSource{
+				HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
+				EmptyDir: &v1.EmptyDirVolumeSource{
+					SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
+			},
+		},
+	}
+	// A mirror pod.
+	pods[8].Annotations = test.GetMirrorPodAnnotation()
+
+	return pods
+}
+
+func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
+	createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32) api.DeschedulerStrategy {
+		return api.DeschedulerStrategy{
+			Enabled: enabled,
+			Params: api.StrategyParameters{
+				PodsHavingTooManyRestarts: api.PodsHavingTooManyRestarts{
+					PodRestartThreshold:     restartThresholds,
+					IncludingInitContainers: includingInitContainers,
+				},
+			},
+		}
+	}
+
+	node := test.BuildTestNode("node1", 2000, 3000, 10, nil)
+	pods := initPods(node)
+
+	fakeClient := &fake.Clientset{}
+
+	fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
+		return true, &v1.PodList{Items: pods}, nil
+	})
+
+	tests := []struct {
+		description             string
+		pods                    []v1.Pod
+		strategy                api.DeschedulerStrategy
+		expectedEvictedPodCount int
+		maxPodsToEvict          int
+	}{
+		{
+			description:             "All pods have total restarts under threshold, no pod evictions",
+			strategy:                createStrategy(true, true, 10000),
+			expectedEvictedPodCount: 0,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Some pods have total restarts above the threshold",
+			strategy:                createStrategy(true, true, 1),
+			expectedEvictedPodCount: 6,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Nine pods have total restarts at or above the threshold (includingInitContainers=true), 6 pod evictions",
+			strategy:                createStrategy(true, true, 1*25),
+			expectedEvictedPodCount: 6,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Eight pods have container restarts at or above the threshold (includingInitContainers=false), 5 pod evictions",
+			strategy:                createStrategy(true, false, 1*25),
+			expectedEvictedPodCount: 5,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Nine pods have total restarts at or above the threshold (includingInitContainers=true), 6 pod evictions",
+			strategy:                createStrategy(true, true, 1*20),
+			expectedEvictedPodCount: 6,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Nine pods have container restarts at or above the threshold (includingInitContainers=false), 6 pod evictions",
+			strategy:                createStrategy(true, false, 1*20),
+			expectedEvictedPodCount: 6,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Four pods have total restarts above the threshold (includingInitContainers=true), but only 1 pod eviction",
+			strategy:                createStrategy(true, true, 5*25+1),
+			expectedEvictedPodCount: 1,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Four pods have container restarts above the threshold (includingInitContainers=false), but only 1 pod eviction",
+			strategy:                createStrategy(true, false, 5*20+1),
+			expectedEvictedPodCount: 1,
+			maxPodsToEvict:          0,
+		},
+		{
+			description:             "Nine pods have total restarts at or above the threshold (maxPodsToEvict=3), 3 pod evictions",
+			strategy:                createStrategy(true, true, 1),
+			expectedEvictedPodCount: 3,
+			maxPodsToEvict:          3,
+		},
+	}
+
+	for _, tc := range tests {
+
+		ds := options.DeschedulerServer{
+			DeschedulerConfiguration: componentconfig.DeschedulerConfiguration{
+				MaxNoOfPodsToEvictPerNode: tc.maxPodsToEvict,
+			},
+			Client: fakeClient,
+		}
+
+		podEvictor := evictions.NewPodEvictor(
+			fakeClient,
+			"v1",
+			false,
+			tc.maxPodsToEvict,
+			[]*v1.Node{node},
+		)
+
+		removePodsHavingTooManyRestarts(&ds, tc.strategy, []*v1.Node{node}, podEvictor, ds.EvictLocalStoragePods)
+		actualEvictedPodCount := podEvictor.TotalEvicted()
+		if actualEvictedPodCount != tc.expectedEvictedPodCount {
+			t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
+		}
+	}
+
+}
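As a sanity check on the expectations in the table above: pod `i` carries `20*i` container restarts plus `5*i` init-container restarts, and pods 6, 7, and 8 are never evictable (daemonset, local storage, mirror pod). A rough sketch, assuming the same `initPods` fixture, of how the expected counts fall out (`expectedEvictions` is a hypothetical helper, not part of the patch):

```go
// expectedEvictions recomputes the test table's expectations: a pod is
// evicted when its (possibly init-inclusive) restart total reaches the
// threshold and it is not one of the three exempt pods.
func expectedEvictions(threshold int32, includingInitContainers bool, maxPodsToEvict int) int {
	evicted := 0
	for i := int32(0); i <= 9; i++ {
		if i == 6 || i == 7 || i == 8 {
			continue // daemonset, local-storage, and mirror pods are exempt
		}
		total := 20 * i
		if includingInitContainers {
			total += 5 * i
		}
		if total >= threshold {
			evicted++
		}
	}
	if maxPodsToEvict > 0 && evicted > maxPodsToEvict {
		evicted = maxPodsToEvict
	}
	return evicted
}
```

For example, with threshold `5*25+1 = 126` and init containers included, pods 6 through 9 exceed the threshold, but only pod 9 is evictable, matching the expected count of 1.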