Merge pull request #1165 from a7i/toomanyrestarts-CrashLoopBackOff
TooManyRestart: state filter for CrashLoopBackOff
@@ -561,6 +569,13 @@ include `podRestartThreshold`, which is the number of restarts (summed over all
should be evicted, and `includingInitContainers`, which determines whether init container restarts should be factored
into that calculation.

You can also specify the `states` parameter to **only** evict pods matching the following conditions:
- [Pod Phase](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase) status of: `Running`
- [Container State Waiting](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-state-waiting) reason of: `CrashLoopBackOff`

If a value for `states` or `podStatusPhases` is not specified, pods in any state (even `Running`) are considered for eviction.

**Parameters:**

|Name|Type|
@@ -569,6 +576,7 @@ into that calculation.
|`includingInitContainers`|bool|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`states`|list(string)|Only supported in v0.28+|

**Example:**
examples/too-many-restarts.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
apiVersion: "descheduler/v1alpha2"
kind: "DeschedulerPolicy"
profiles:
  - name: ProfileName
    pluginConfig:
      - name: "RemovePodsHavingTooManyRestarts"
        args:
          podRestartThreshold: 100
          includingInitContainers: true
          states:
            - CrashLoopBackOff
    plugins:
      deschedule:
        enabled:
          - "RemovePodsHavingTooManyRestarts"
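With this example policy, only pods that have accumulated at least 100 restarts (init container restarts included) and currently match the `CrashLoopBackOff` waiting state become eviction candidates.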
@@ -44,14 +44,12 @@ func ValidatePodLifeTimeArgs(obj runtime.Object) error {
		}
	}
	podLifeTimeAllowedStates := sets.New(
		// Pod phase reasons
		string(v1.PodRunning),
		string(v1.PodPending),

		// Container state reasons
		// Container state reasons: https://github.com/kubernetes/kubernetes/blob/release-1.24/pkg/kubelet/kubelet_pods.go#L76-L79
		"PodInitializing",
		"ContainerCreating",
		"CrashLoopBackOff",
	)

	if !podLifeTimeAllowedStates.HasAll(args.States...) {
@@ -20,7 +20,6 @@ import (
	"testing"

	v1 "k8s.io/api/core/v1"
	utilpointer "k8s.io/utils/pointer"
)

func TestValidateRemovePodLifeTimeArgs(t *testing.T) {
@@ -32,7 +31,7 @@ func TestValidateRemovePodLifeTimeArgs(t *testing.T) {
		{
			description: "valid arg, no errors",
			args: &PodLifeTimeArgs{
				MaxPodLifeTimeSeconds: utilpointer.Uint(1),
				MaxPodLifeTimeSeconds: func(i uint) *uint { return &i }(1),
				States:                []string{string(v1.PodRunning)},
			},
			expectError: false,
@@ -51,14 +50,6 @@ func TestValidateRemovePodLifeTimeArgs(t *testing.T) {
			},
			expectError: true,
		},
		{
			description: "allows CrashLoopBackOff state",
			args: &PodLifeTimeArgs{
				MaxPodLifeTimeSeconds: utilpointer.Uint(1),
				States:                []string{"CrashLoopBackOff"},
			},
			expectError: false,
		},
	}

	for _, tc := range testCases {
@@ -37,4 +37,7 @@ func SetDefaults_RemovePodsHavingTooManyRestartsArgs(obj runtime.Object) {
	if !args.IncludingInitContainers {
		args.IncludingInitContainers = false
	}
	if args.States == nil {
		args.States = nil
	}
}
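Note that no real default is materialized for `states`: a nil slice stays nil, which (see the `New` constructor below) means no state filter is wrapped around the pod filter, preserving the pre-existing evict-in-any-state behavior.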
@@ -17,6 +17,7 @@ import (
	"testing"

	"github.com/google/go-cmp/cmp"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -37,6 +38,7 @@ func TestSetDefaults_RemovePodsHavingTooManyRestartsArgs(t *testing.T) {
				LabelSelector:           nil,
				PodRestartThreshold:     0,
				IncludingInitContainers: false,
				States:                  nil,
			},
		},
		{
@@ -46,12 +48,14 @@ func TestSetDefaults_RemovePodsHavingTooManyRestartsArgs(t *testing.T) {
				LabelSelector:           &metav1.LabelSelector{},
				PodRestartThreshold:     10,
				IncludingInitContainers: true,
				States:                  []string{string(v1.PodRunning)},
			},
			want: &RemovePodsHavingTooManyRestartsArgs{
				Namespaces:              &api.Namespaces{},
				LabelSelector:           &metav1.LabelSelector{},
				PodRestartThreshold:     10,
				IncludingInitContainers: true,
				States:                  []string{string(v1.PodRunning)},
			},
		},
	}
@@ -75,6 +75,23 @@ func New(args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plug
		return true
	})

	if len(tooManyRestartsArgs.States) > 0 {
		states := sets.New(tooManyRestartsArgs.States...)
		podFilter = podutil.WrapFilterFuncs(podFilter, func(pod *v1.Pod) bool {
			if states.Has(string(pod.Status.Phase)) {
				return true
			}

			for _, containerStatus := range pod.Status.ContainerStatuses {
				if containerStatus.State.Waiting != nil && states.Has(containerStatus.State.Waiting.Reason) {
					return true
				}
			}

			return false
		})
	}

	return &RemovePodsHavingTooManyRestarts{
		handle: handle,
		args:   tooManyRestartsArgs,
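To make the wrapped filter's semantics concrete, here is a minimal, self-contained sketch of the same matching rule: a pod passes when its phase, or any container's waiting reason, appears in the configured states. The `matchesStates` helper and `main` driver are illustrative only, not part of this PR:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// matchesStates mirrors the filter above: a pod matches when its phase,
// or any container's waiting reason, appears in the configured states.
// Illustrative helper only; the real plugin builds this as a closure.
func matchesStates(pod *v1.Pod, states map[string]bool) bool {
	if states[string(pod.Status.Phase)] {
		return true
	}
	for _, cs := range pod.Status.ContainerStatuses {
		if cs.State.Waiting != nil && states[cs.State.Waiting.Reason] {
			return true
		}
	}
	return false
}

func main() {
	crashing := &v1.Pod{Status: v1.PodStatus{
		Phase: v1.PodPending,
		ContainerStatuses: []v1.ContainerStatus{{
			State: v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
		}},
	}}
	fmt.Println(matchesStates(crashing, map[string]bool{"CrashLoopBackOff": true})) // true
	fmt.Println(matchesStates(crashing, map[string]bool{"Running": true}))          // false
}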
@@ -104,8 +104,6 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
	node4 := test.BuildTestNode("node4", 200, 3000, 10, nil)
	node5 := test.BuildTestNode("node5", 2000, 3000, 10, nil)

	pods := append(append(initPods(node1), test.BuildTestPod("CPU-consumer-1", 150, 100, node4.Name, nil)), test.BuildTestPod("CPU-consumer-2", 150, 100, node5.Name, nil))

	createRemovePodsHavingTooManyRestartsAgrs := func(
		podRestartThresholds int32,
		includingInitContainers bool,
@@ -126,6 +124,7 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
		maxPodsToEvictPerNode          *uint
		maxNoOfPodsToEvictPerNamespace *uint
		nodeFit                        bool
		applyFunc                      func([]*v1.Pod)
	}{
		{
			description: "All pods have total restarts under threshold, no pod evictions",
@@ -191,7 +190,7 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
			maxNoOfPodsToEvictPerNamespace: &uint3,
		},
		{
			description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is tained, 0 pod evictions",
			description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is tainted, 0 pod evictions",
			args:                    createRemovePodsHavingTooManyRestartsAgrs(1, true),
			nodes:                   []*v1.Node{node1, node2},
			expectedEvictedPodCount: 0,
@@ -222,10 +221,68 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
			maxPodsToEvictPerNode: &uint3,
			nodeFit:               true,
		},
		{
			description:             "pods are in CrashLoopBackOff with states=CrashLoopBackOff, 3 pod evictions",
			args:                    RemovePodsHavingTooManyRestartsArgs{PodRestartThreshold: 1, States: []string{"CrashLoopBackOff"}},
			nodes:                   []*v1.Node{node1, node5},
			expectedEvictedPodCount: 3,
			maxPodsToEvictPerNode:   &uint3,
			nodeFit:                 true,
			applyFunc: func(pods []*v1.Pod) {
				for _, pod := range pods {
					if len(pod.Status.ContainerStatuses) > 0 {
						pod.Status.ContainerStatuses[0].State = v1.ContainerState{
							Waiting: &v1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
						}
					}
				}
			},
		},
		{
			description:             "pods without CrashLoopBackOff with states=CrashLoopBackOff, 0 pod evictions",
			args:                    RemovePodsHavingTooManyRestartsArgs{PodRestartThreshold: 1, States: []string{"CrashLoopBackOff"}},
			nodes:                   []*v1.Node{node1, node5},
			expectedEvictedPodCount: 0,
			maxPodsToEvictPerNode:   &uint3,
			nodeFit:                 true,
		},
		{
			description:             "pods running with state=Running, 3 pod evictions",
			args:                    RemovePodsHavingTooManyRestartsArgs{PodRestartThreshold: 1, States: []string{string(v1.PodRunning)}},
			nodes:                   []*v1.Node{node1},
			expectedEvictedPodCount: 3,
			maxPodsToEvictPerNode:   &uint3,
			applyFunc: func(pods []*v1.Pod) {
				for _, pod := range pods {
					pod.Status.Phase = v1.PodRunning
				}
			},
		},
		{
			description:             "pods pending with state=Running, 0 pod evictions",
			args:                    RemovePodsHavingTooManyRestartsArgs{PodRestartThreshold: 1, States: []string{string(v1.PodRunning)}},
			nodes:                   []*v1.Node{node1},
			expectedEvictedPodCount: 0,
			maxPodsToEvictPerNode:   &uint3,
			applyFunc: func(pods []*v1.Pod) {
				for _, pod := range pods {
					pod.Status.Phase = v1.PodPending
				}
			},
		},
	}

	for _, tc := range tests {
		t.Run(tc.description, func(t *testing.T) {
			pods := append(
				initPods(node1),
				test.BuildTestPod("CPU-consumer-1", 150, 100, node4.Name, nil),
				test.BuildTestPod("CPU-consumer-2", 150, 100, node5.Name, nil),
			)
			if tc.applyFunc != nil {
				tc.applyFunc(pods)
			}

			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()
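Also worth noting: the shared `pods` fixture moved from the top of the test into each `t.Run`, so a case's `applyFunc` mutates a fresh set of pods rather than state carried over from earlier cases.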
@@ -29,4 +29,5 @@ type RemovePodsHavingTooManyRestartsArgs struct {
	LabelSelector           *metav1.LabelSelector `json:"labelSelector"`
	PodRestartThreshold     int32                 `json:"podRestartThreshold"`
	IncludingInitContainers bool                  `json:"includingInitContainers"`
	States                  []string              `json:"states"`
}
@@ -16,8 +16,10 @@ package removepodshavingtoomanyrestarts
import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
)

// ValidateRemovePodsHavingTooManyRestartsArgs validates RemovePodsHavingTooManyRestarts arguments
@@ -38,5 +40,17 @@ func ValidateRemovePodsHavingTooManyRestartsArgs(obj runtime.Object) error {
		return fmt.Errorf("invalid PodsHavingTooManyRestarts threshold")
	}

	allowedStates := sets.New(
		// Pod phases:
		string(v1.PodRunning),

		// Container state reasons:
		"CrashLoopBackOff",
	)

	if !allowedStates.HasAll(args.States...) {
		return fmt.Errorf("states must be one of %v", allowedStates.UnsortedList())
	}

	return nil
}
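With this check in place, a policy that configures the plugin with, say, `states: ["Pending"]` is rejected at load time: only `Running` and `CrashLoopBackOff` are accepted here, a narrower allow-list than PodLifeTime's above.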
@@ -0,0 +1,74 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package removepodshavingtoomanyrestarts

import (
	"testing"

	v1 "k8s.io/api/core/v1"
)
func TestValidateRemovePodsHavingTooManyRestartsArgs(t *testing.T) {
	testCases := []struct {
		description string
		args        *RemovePodsHavingTooManyRestartsArgs
		expectError bool
	}{
		{
			description: "valid arg, no errors",
			args: &RemovePodsHavingTooManyRestartsArgs{
				PodRestartThreshold: 1,
				States:              []string{string(v1.PodRunning)},
			},
			expectError: false,
		},
		{
			description: "invalid PodRestartThreshold arg, expects errors",
			args: &RemovePodsHavingTooManyRestartsArgs{
				PodRestartThreshold: 0,
			},
			expectError: true,
		},
		{
			description: "invalid States arg, expects errors",
			args: &RemovePodsHavingTooManyRestartsArgs{
				PodRestartThreshold: 1,
				States:              []string{string(v1.PodFailed)},
			},
			expectError: true,
		},
		{
			description: "allows CrashLoopBackOff state",
			args: &RemovePodsHavingTooManyRestartsArgs{
				PodRestartThreshold: 1,
				States:              []string{"CrashLoopBackOff"},
			},
			expectError: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.description, func(t *testing.T) {
			err := ValidateRemovePodsHavingTooManyRestartsArgs(tc.args)

			hasError := err != nil
			if tc.expectError != hasError {
				t.Error("unexpected arg validation behavior")
			}
		})
	}
}
@@ -41,6 +41,11 @@ func (in *RemovePodsHavingTooManyRestartsArgs) DeepCopyInto(out *RemovePodsHavin
		*out = new(v1.LabelSelector)
		(*in).DeepCopyInto(*out)
	}
	if in.States != nil {
		in, out := &in.States, &out.States
		*out = make([]string, len(*in))
		copy(*out, *in)
	}
	return
}
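For readers skimming the generated deepcopy: the explicit `make`/`copy` is what gives each copy of the args its own `States` backing array. A standalone illustration of why that matters (hypothetical snippet, not part of this change):

package main

import "fmt"

func main() {
	orig := []string{"CrashLoopBackOff"}

	shallow := orig                   // shares the same backing array
	deep := make([]string, len(orig)) // fresh backing array
	copy(deep, orig)

	shallow[0] = "Running"         // mutates orig too
	fmt.Println(orig[0], deep[0])  // Running CrashLoopBackOff
}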