diff --git a/README.md b/README.md index 69ea8fe6d..4c0e3a585 100644 --- a/README.md +++ b/README.md @@ -234,7 +234,9 @@ strategies: maxPodLifeTimeSeconds: 86400 ```` -## Namespace filtering +## Filter Pods + +### Namespace filtering Strategies like `PodLifeTime`, `RemovePodsHavingTooManyRestarts`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingNodeAffinity` and `RemovePodsViolatingInterPodAntiAffinity` can specify `namespaces` @@ -276,6 +278,41 @@ The strategy gets executed over all namespaces but `namespace1` and `namespace2` It's not allowed to combine the `include` with the `exclude` field. +### Priority filtering + +All strategies are able to configure a priority threshold; only pods under the threshold can be evicted. You can +specify this threshold by setting either the `thresholdPriorityClassName` parameter (which sets the threshold to the value of the given +priority class) or the `thresholdPriority` parameter (which sets the threshold directly). By default, this threshold +is set to the value of the `system-cluster-critical` priority class. +For example: + +Setting `thresholdPriority` +``` +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "PodLifeTime": + enabled: true + params: + maxPodLifeTimeSeconds: 86400 + thresholdPriority: 10000 +``` + +Setting `thresholdPriorityClassName` +``` +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "PodLifeTime": + enabled: true + params: + maxPodLifeTimeSeconds: 86400 + thresholdPriorityClassName: "priorityclass1" +``` + +Note that you can't configure both `thresholdPriority` and `thresholdPriorityClassName`. If the given priority class +does not exist, the descheduler won't create it and will return an error. + ## Pod Evictions When the descheduler decides to evict pods from a node, it employs the following general mechanism: diff --git a/charts/descheduler/templates/clusterrole.yaml b/charts/descheduler/templates/clusterrole.yaml index b63ef308f..2a2091736 100644 --- a/charts/descheduler/templates/clusterrole.yaml +++ b/charts/descheduler/templates/clusterrole.yaml @@ -18,4 +18,7 @@ rules: - apiGroups: [""] resources: ["pods/eviction"] verbs: ["create"] +- apiGroups: ["scheduling.k8s.io"] + resources: ["priorityclasses"] + verbs: ["get", "watch", "list"] {{- end -}} diff --git a/kubernetes/rbac.yaml b/kubernetes/rbac.yaml index 1432caab4..079964360 100644 --- a/kubernetes/rbac.yaml +++ b/kubernetes/rbac.yaml @@ -17,6 +17,9 @@ rules: - apiGroups: [""] resources: ["pods/eviction"] verbs: ["create"] +- apiGroups: ["scheduling.k8s.io"] + resources: ["priorityclasses"] + verbs: ["get", "watch", "list"] --- apiVersion: v1 kind: ServiceAccount diff --git a/pkg/api/types.go b/pkg/api/types.go index fb3becbea..dd74c94ff 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -52,8 +52,8 @@ type Namespaces struct { } // Besides Namespaces only one of its members may be specified -// TODO(jchaloup): move Namespaces to individual strategies once the policy -// version is bumped to v1alpha2 +// TODO(jchaloup): move Namespaces, ThresholdPriority and ThresholdPriorityClassName to individual strategies +// once the policy version is bumped to v1alpha2 type StrategyParameters struct { NodeResourceUtilizationThresholds *NodeResourceUtilizationThresholds NodeAffinityType []string @@ -61,6 +61,8 @@ type StrategyParameters struct { MaxPodLifeTimeSeconds *uint RemoveDuplicates *RemoveDuplicates Namespaces Namespaces + ThresholdPriority *int32 + ThresholdPriorityClassName string } type Percentage float64 diff --git a/pkg/api/v1alpha1/types.go
b/pkg/api/v1alpha1/types.go index aa1f60c8d..d672d9bea 100644 --- a/pkg/api/v1alpha1/types.go +++ b/pkg/api/v1alpha1/types.go @@ -51,7 +51,7 @@ type Namespaces struct { Exclude []string `json:"exclude"` } -// Besides Namespaces only one of its members may be specified +// Besides Namespaces, ThresholdPriority and ThresholdPriorityClassName, only one of its members may be specified type StrategyParameters struct { NodeResourceUtilizationThresholds *NodeResourceUtilizationThresholds `json:"nodeResourceUtilizationThresholds,omitempty"` NodeAffinityType []string `json:"nodeAffinityType,omitempty"` @@ -59,6 +59,8 @@ type StrategyParameters struct { MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"` RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"` Namespaces Namespaces `json:"namespaces"` + ThresholdPriority *int32 `json:"thresholdPriority"` + ThresholdPriorityClassName string `json:"thresholdPriorityClassName"` } type Percentage float64 diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go index d2a58a2eb..e0f0cfc77 100644 --- a/pkg/api/v1alpha1/zz_generated.conversion.go +++ b/pkg/api/v1alpha1/zz_generated.conversion.go @@ -249,6 +249,8 @@ func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *Strat if err := Convert_v1alpha1_Namespaces_To_api_Namespaces(&in.Namespaces, &out.Namespaces, s); err != nil { return err } + out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority)) + out.ThresholdPriorityClassName = in.ThresholdPriorityClassName return nil } @@ -266,6 +268,8 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S if err := Convert_api_Namespaces_To_v1alpha1_Namespaces(&in.Namespaces, &out.Namespaces, s); err != nil { return err } + out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority)) + out.ThresholdPriorityClassName = in.ThresholdPriorityClassName return nil } diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index f9707bb1d..33321a5cd 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -243,6 +243,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { (*in).DeepCopyInto(*out) } in.Namespaces.DeepCopyInto(&out.Namespaces) + if in.ThresholdPriority != nil { + in, out := &in.ThresholdPriority, &out.ThresholdPriority + *out = new(int32) + **out = **in + } return } diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go index b8d8dc574..b337e6ae6 100644 --- a/pkg/api/zz_generated.deepcopy.go +++ b/pkg/api/zz_generated.deepcopy.go @@ -243,6 +243,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { (*in).DeepCopyInto(*out) } in.Namespaces.DeepCopyInto(&out.Namespaces) + if in.ThresholdPriority != nil { + in, out := &in.ThresholdPriority, &out.ThresholdPriority + *out = new(int32) + **out = **in + } return } diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go index 2c3acf2cb..7a5c71234 100644 --- a/pkg/descheduler/evictions/evictions.go +++ b/pkg/descheduler/evictions/evictions.go @@ -78,12 +78,16 @@ func NewPodEvictor( } // IsEvictable checks if a pod is evictable or not.
-func (pe *PodEvictor) IsEvictable(pod *v1.Pod) bool { +func (pe *PodEvictor) IsEvictable(pod *v1.Pod, thresholdPriority int32) bool { checkErrs := []error{} if IsCriticalPod(pod) { checkErrs = append(checkErrs, fmt.Errorf("pod is critical")) } + if !IsPodEvictableBasedOnPriority(pod, thresholdPriority) { + checkErrs = append(checkErrs, fmt.Errorf("pod is not evictable due to its priority")) + } + ownerRefList := podutil.OwnerRef(pod) if IsDaemonsetPod(ownerRefList) { checkErrs = append(checkErrs, fmt.Errorf("pod is a DaemonSet pod")) @@ -216,3 +220,8 @@ func IsPodWithLocalStorage(pod *v1.Pod) bool { return false } + +// IsPodEvictableBasedOnPriority checks if the given pod is evictable based on priority resolved from pod Spec. +func IsPodEvictableBasedOnPriority(pod *v1.Pod, priority int32) bool { + return pod.Spec.Priority == nil || (*pod.Spec.Priority < priority && *pod.Spec.Priority < utils.SystemCriticalPriority) +} diff --git a/pkg/descheduler/evictions/evictions_test.go b/pkg/descheduler/evictions/evictions_test.go index 52c9d3f5f..81f3155f5 100644 --- a/pkg/descheduler/evictions/evictions_test.go +++ b/pkg/descheduler/evictions/evictions_test.go @@ -71,10 +71,13 @@ func TestEvictPod(t *testing.T) { func TestIsEvictable(t *testing.T) { n1 := test.BuildTestNode("node1", 1000, 2000, 13, nil) + lowPriority := int32(800) + highPriority := int32(900) type testCase struct { pod *v1.Pod runBefore func(*v1.Pod) evictLocalStoragePods bool + priorityThreshold *int32 result bool } @@ -179,6 +182,7 @@ func TestIsEvictable(t *testing.T) { }, { pod: test.BuildTestPod("p10", 400, 0, n1.Name, nil), runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = test.GetMirrorPodAnnotation() }, evictLocalStoragePods: false, @@ -186,6 +190,7 @@ func TestIsEvictable(t *testing.T) { }, { pod: test.BuildTestPod("p11", 400, 0, n1.Name, nil), runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = test.GetMirrorPodAnnotation() pod.Annotations["descheduler.alpha.kubernetes.io/evict"] = "true" }, @@ -194,6 +199,7 @@ func TestIsEvictable(t *testing.T) { }, { pod: test.BuildTestPod("p12", 400, 0, n1.Name, nil), runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() priority := utils.SystemCriticalPriority pod.Spec.Priority = &priority }, @@ -202,6 +208,7 @@ func TestIsEvictable(t *testing.T) { }, { pod: test.BuildTestPod("p13", 400, 0, n1.Name, nil), runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() priority := utils.SystemCriticalPriority pod.Spec.Priority = &priority pod.Annotations = map[string]string{ @@ -210,6 +217,25 @@ func TestIsEvictable(t *testing.T) { }, evictLocalStoragePods: false, result: true, + }, { + pod: test.BuildTestPod("p14", 400, 0, n1.Name, nil), + runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + pod.Spec.Priority = &highPriority + }, + evictLocalStoragePods: false, + priorityThreshold: &lowPriority, + result: false, + }, { + pod: test.BuildTestPod("p15", 400, 0, n1.Name, nil), + runBefore: func(pod *v1.Pod) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} + pod.Spec.Priority = &highPriority + }, + evictLocalStoragePods: false, + priorityThreshold: &lowPriority, + result: true, }, } @@ -218,7 +244,11 @@ func TestIsEvictable(t 
*testing.T) { podEvictor := &PodEvictor{ evictLocalStoragePods: test.evictLocalStoragePods, } - result := podEvictor.IsEvictable(test.pod) + testPriorityThreshold := utils.SystemCriticalPriority + if test.priorityThreshold != nil { + testPriorityThreshold = *test.priorityThreshold + } + result := podEvictor.IsEvictable(test.pod, testPriorityThreshold) if result != test.result { t.Errorf("IsEvictable should return for pod %s %t, but it returns %t", test.pod.Name, test.result, result) } @@ -233,10 +263,6 @@ func TestPodTypes(t *testing.T) { p2 := test.BuildTestPod("p2", 400, 0, n1.Name, nil) p3 := test.BuildTestPod("p3", 400, 0, n1.Name, nil) p4 := test.BuildTestPod("p4", 400, 0, n1.Name, nil) - p5 := test.BuildTestPod("p5", 400, 0, n1.Name, nil) - p6 := test.BuildTestPod("p6", 400, 0, n1.Name, nil) - - p6.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() p1.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList() // The following 4 pods won't get evicted. @@ -257,18 +283,9 @@ func TestPodTypes(t *testing.T) { } // A Mirror Pod. p4.Annotations = test.GetMirrorPodAnnotation() - // A Critical Pod. - p5.Namespace = "kube-system" - priority := utils.SystemCriticalPriority - p5.Spec.Priority = &priority - systemCriticalPriority := utils.SystemCriticalPriority - p5.Spec.Priority = &systemCriticalPriority if !IsMirrorPod(p4) { t.Errorf("Expected p4 to be a mirror pod.") } - if !IsCriticalPod(p5) { - t.Errorf("Expected p5 to be a critical pod.") - } if !IsPodWithLocalStorage(p3) { t.Errorf("Expected p3 to be a pod with local storage.") } diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go index 4d485e3c8..7aabd4208 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/descheduler/strategies/duplicates.go @@ -18,6 +18,7 @@ package strategies import ( "context" + "fmt" "reflect" "sort" "strings" @@ -31,8 +32,20 @@ import ( "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/utils" ) +func validateRemoveDuplicatePodsParams(params *api.StrategyParameters) error { + if params == nil { + return nil + } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } + + return nil +} + // RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same // namespace, and have at least one container with the same image. 
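(Reviewer note: a minimal, standalone sketch of how the new priority gate behaves, mirroring the `IsPodEvictableBasedOnPriority` helper and the p14/p15 test cases above. The pod values, the locally copied constant, and the `main` wrapper are illustrative assumptions, not part of this change.)

```
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// Local copy of utils.SystemCriticalPriority from pkg/utils/priority.go.
const systemCriticalPriority = 2 * int32(1000000000)

// Same predicate as the IsPodEvictableBasedOnPriority helper added in
// pkg/descheduler/evictions/evictions.go.
func isPodEvictableBasedOnPriority(pod *v1.Pod, priority int32) bool {
	return pod.Spec.Priority == nil || (*pod.Spec.Priority < priority && *pod.Spec.Priority < systemCriticalPriority)
}

func main() {
	threshold := int32(800) // e.g. resolved from thresholdPriority in the policy
	low, high := int32(500), int32(1000)

	noPriority := &v1.Pod{}                               // no priority set
	lowPod := &v1.Pod{Spec: v1.PodSpec{Priority: &low}}   // below the threshold
	highPod := &v1.Pod{Spec: v1.PodSpec{Priority: &high}} // at or above the threshold

	fmt.Println(isPodEvictableBasedOnPriority(noPriority, threshold)) // true: pods without a priority stay evictable
	fmt.Println(isPodEvictableBasedOnPriority(lowPod, threshold))     // true: 500 < 800
	fmt.Println(isPodEvictableBasedOnPriority(highPod, threshold))    // false: 1000 >= 800
}
```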
@@ -44,9 +57,21 @@ func RemoveDuplicatePods( nodes []*v1.Node, podEvictor *evictions.PodEvictor, ) { + if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil { + klog.V(1).Info(err) + return + } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } + for _, node := range nodes { klog.V(1).Infof("Processing node: %#v", node.Name) - pods, err := podutil.ListPodsOnANode(ctx, client, node, podutil.WithFilter(podEvictor.IsEvictable)) + pods, err := podutil.ListPodsOnANode(ctx, client, node, podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, thresholdPriority) + })) if err != nil { klog.Errorf("error listing evictable pods on node %s: %+v", node.Name, err) continue diff --git a/pkg/descheduler/strategies/lownodeutilization.go b/pkg/descheduler/strategies/lownodeutilization.go index 788775946..2a58101bf 100644 --- a/pkg/descheduler/strategies/lownodeutilization.go +++ b/pkg/descheduler/strategies/lownodeutilization.go @@ -50,13 +50,28 @@ const ( MaxResourcePercentage = 100 ) +func validateLowNodeUtilizationParams(params *api.StrategyParameters) error { + if params == nil || params.NodeResourceUtilizationThresholds == nil { + return fmt.Errorf("NodeResourceUtilizationThresholds not set") + } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } + + return nil +} + // LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used // to calculate nodes' utilization and not the actual resource usage. func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) { - // todo: move to config validation? // TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params? 
- if strategy.Params == nil || strategy.Params.NodeResourceUtilizationThresholds == nil { - klog.V(1).Infof("NodeResourceUtilizationThresholds not set") + if err := validateLowNodeUtilizationParams(strategy.Params); err != nil { + klog.V(1).Info(err) + return + } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) return } @@ -116,7 +131,8 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg targetNodes, lowNodes, targetThresholds, - podEvictor) + podEvictor, + thresholdPriority) klog.V(1).Infof("Total number of pods evicted: %v", podEvictor.TotalEvicted()) } @@ -196,6 +212,7 @@ func evictPodsFromTargetNodes( targetNodes, lowNodes []NodeUsageMap, targetThresholds api.ResourceThresholds, podEvictor *evictions.PodEvictor, + thresholdPriority int32, ) { sortNodesByUsage(targetNodes) @@ -232,7 +249,7 @@ func evictPodsFromTargetNodes( } klog.V(3).Infof("evicting pods from node %#v with usage: %#v", node.node.Name, node.usage) - nonRemovablePods, removablePods := classifyPods(node.allPods, podEvictor) + nonRemovablePods, removablePods := classifyPods(node.allPods, podEvictor, thresholdPriority) klog.V(2).Infof("allPods:%v, nonRemovablePods:%v, removablePods:%v", len(node.allPods), len(nonRemovablePods), len(removablePods)) if len(removablePods) == 0 { @@ -393,11 +410,11 @@ func nodeUtilization(node *v1.Node, pods []*v1.Pod) api.ResourceThresholds { } } -func classifyPods(pods []*v1.Pod, evictor *evictions.PodEvictor) ([]*v1.Pod, []*v1.Pod) { +func classifyPods(pods []*v1.Pod, evictor *evictions.PodEvictor, thresholdPriority int32) ([]*v1.Pod, []*v1.Pod) { var nonRemovablePods, removablePods []*v1.Pod for _, pod := range pods { - if !evictor.IsEvictable(pod) { + if !evictor.IsEvictable(pod, thresholdPriority) { nonRemovablePods = append(nonRemovablePods, pod) } else { removablePods = append(removablePods, pod) diff --git a/pkg/descheduler/strategies/node_affinity.go b/pkg/descheduler/strategies/node_affinity.go index 5454ce947..e4d5eac5a 100644 --- a/pkg/descheduler/strategies/node_affinity.go +++ b/pkg/descheduler/strategies/node_affinity.go @@ -28,6 +28,7 @@ import ( "sigs.k8s.io/descheduler/pkg/descheduler/evictions" nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/utils" ) func validatePodsViolatingNodeAffinityParams(params *api.StrategyParameters) error { @@ -38,6 +39,9 @@ func validatePodsViolatingNodeAffinityParams(params *api.StrategyParameters) err if len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { return fmt.Errorf("only one of Include/Exclude namespaces can be set") } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } return nil } @@ -48,6 +52,12 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter klog.V(1).Info(err) return } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } + for _, nodeAffinity := range strategy.Params.NodeAffinityType { klog.V(2).Infof("Executing for nodeAffinityType: %v", nodeAffinity) @@ -61,7 +71,7 @@ func RemovePodsViolatingNodeAffinity(ctx 
context.Context, client clientset.Inter client, node, podutil.WithFilter(func(pod *v1.Pod) bool { - return podEvictor.IsEvictable(pod) && + return podEvictor.IsEvictable(pod, thresholdPriority) && !nodeutil.PodFitsCurrentNode(pod, node) && nodeutil.PodFitsAnyNode(pod, nodes) }), diff --git a/pkg/descheduler/strategies/node_taint.go b/pkg/descheduler/strategies/node_taint.go index 529b6b6b3..2a94e0e5e 100644 --- a/pkg/descheduler/strategies/node_taint.go +++ b/pkg/descheduler/strategies/node_taint.go @@ -39,6 +39,9 @@ func validateRemovePodsViolatingNodeTaintsParams(params *api.StrategyParameters) if len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { return fmt.Errorf("only one of Include/Exclude namespaces can be set") } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } return nil } @@ -53,6 +56,11 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa if strategy.Params != nil { namespaces = strategy.Params.Namespaces } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } for _, node := range nodes { klog.V(1).Infof("Processing node: %#v\n", node.Name) @@ -60,7 +68,9 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa ctx, client, node, - podutil.WithFilter(podEvictor.IsEvictable), + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, thresholdPriority) + }), podutil.WithNamespaces(namespaces.Include), podutil.WithoutNamespaces(namespaces.Exclude), ) diff --git a/pkg/descheduler/strategies/pod_antiaffinity.go b/pkg/descheduler/strategies/pod_antiaffinity.go index d82d17075..ddc7a2911 100644 --- a/pkg/descheduler/strategies/pod_antiaffinity.go +++ b/pkg/descheduler/strategies/pod_antiaffinity.go @@ -40,6 +40,9 @@ func validateRemovePodsViolatingInterPodAntiAffinityParams(params *api.StrategyP if len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { return fmt.Errorf("only one of Include/Exclude namespaces can be set") } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } return nil } @@ -54,6 +57,11 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients if strategy.Params != nil { namespaces = strategy.Params.Namespaces } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } for _, node := range nodes { klog.V(1).Infof("Processing node: %#v\n", node.Name) @@ -61,7 +69,9 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients ctx, client, node, - podutil.WithFilter(podEvictor.IsEvictable), + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, thresholdPriority) + }), podutil.WithNamespaces(namespaces.Include), podutil.WithoutNamespaces(namespaces.Exclude), ) diff --git a/pkg/descheduler/strategies/pod_lifetime.go b/pkg/descheduler/strategies/pod_lifetime.go index a214dc3cf..16cdb09a2 100644 --- a/pkg/descheduler/strategies/pod_lifetime.go +++ b/pkg/descheduler/strategies/pod_lifetime.go @@ -28,6 +28,7 @@ 
import ( "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/utils" ) func validatePodLifeTimeParams(params *api.StrategyParameters) error { @@ -39,6 +40,9 @@ func validatePodLifeTimeParams(params *api.StrategyParameters) error { if len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { return fmt.Errorf("only one of Include/Exclude namespaces can be set") } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } return nil } @@ -49,11 +53,16 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D klog.V(1).Info(err) return } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } for _, node := range nodes { klog.V(1).Infof("Processing node: %#v", node.Name) - pods := listOldPodsOnNode(ctx, client, node, strategy.Params, podEvictor) + pods := listOldPodsOnNode(ctx, client, node, strategy.Params, podEvictor, thresholdPriority) for _, pod := range pods { success, err := podEvictor.EvictPod(ctx, pod, node, "PodLifeTime") if success { @@ -69,12 +78,14 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D } } -func listOldPodsOnNode(ctx context.Context, client clientset.Interface, node *v1.Node, params *api.StrategyParameters, evictor *evictions.PodEvictor) []*v1.Pod { +func listOldPodsOnNode(ctx context.Context, client clientset.Interface, node *v1.Node, params *api.StrategyParameters, podEvictor *evictions.PodEvictor, thresholdPriority int32) []*v1.Pod { pods, err := podutil.ListPodsOnANode( ctx, client, node, - podutil.WithFilter(evictor.IsEvictable), + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, thresholdPriority) + }), podutil.WithNamespaces(params.Namespaces.Include), podutil.WithoutNamespaces(params.Namespaces.Exclude), ) diff --git a/pkg/descheduler/strategies/toomanyrestarts.go b/pkg/descheduler/strategies/toomanyrestarts.go index ae745b45f..81caf0b01 100644 --- a/pkg/descheduler/strategies/toomanyrestarts.go +++ b/pkg/descheduler/strategies/toomanyrestarts.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/utils" ) func validateRemovePodsHavingTooManyRestartsParams(params *api.StrategyParameters) error { @@ -38,6 +39,9 @@ func validateRemovePodsHavingTooManyRestartsParams(params *api.StrategyParameter if len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { return fmt.Errorf("only one of Include/Exclude namespaces can be set") } + if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { + return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") + } return nil } @@ -50,13 +54,21 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter klog.V(1).Info(err) return } + thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) + if err != nil { + klog.V(1).Infof("failed to get threshold priority from strategy's params: %#v", err) + return + } + for _, node := range nodes { 
klog.V(1).Infof("Processing node: %s", node.Name) pods, err := podutil.ListPodsOnANode( ctx, client, node, - podutil.WithFilter(podEvictor.IsEvictable), + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, thresholdPriority) + }), podutil.WithNamespaces(strategy.Params.Namespaces.Include), podutil.WithoutNamespaces(strategy.Params.Namespaces.Exclude), ) diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 7299491c3..0beaa6610 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -104,7 +104,7 @@ func GetPodSource(pod *v1.Pod) (string, error) { return "", fmt.Errorf("cannot get source of pod %q", pod.UID) } -// IsCriticalPod returns true if pod's priority is greater than or equal to SystemCriticalPriority. +// IsCriticalPod returns true if the pod is a static or mirror pod. func IsCriticalPod(pod *v1.Pod) bool { if IsStaticPod(pod) { return true @@ -112,17 +112,9 @@ func IsCriticalPod(pod *v1.Pod) bool { if IsMirrorPod(pod) { return true } - if pod.Spec.Priority != nil && IsCriticalPodBasedOnPriority(*pod.Spec.Priority) { - return true - } return false } -// IsCriticalPodBasedOnPriority checks if the given pod is a critical pod based on priority resolved from pod Spec. -func IsCriticalPodBasedOnPriority(priority int32) bool { - return priority >= SystemCriticalPriority -} - // PodRequestsAndLimits returns a dictionary of all defined resources summed up for all // containers of the pod. If PodOverhead feature is enabled, pod overhead is added to the // total container resource requests and to the total container limits which have a diff --git a/pkg/utils/priority.go b/pkg/utils/priority.go index a5a5737cf..709273dfc 100644 --- a/pkg/utils/priority.go +++ b/pkg/utils/priority.go @@ -1,9 +1,15 @@ package utils import ( + "context" + "fmt" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" + clientset "k8s.io/client-go/kubernetes" + "sigs.k8s.io/descheduler/pkg/api" ) const SystemCriticalPriority = 2 * int32(1000000000) @@ -33,3 +39,36 @@ func PodMatchesTermsNamespaceAndSelector(pod *v1.Pod, namespaces sets.String, se } return true } + +// GetPriorityFromPriorityClass gets priority from the given priority class. +// If no priority class is provided, it will return SystemCriticalPriority by default. +func GetPriorityFromPriorityClass(ctx context.Context, client clientset.Interface, name string) (int32, error) { + if name != "" { + priorityClass, err := client.SchedulingV1().PriorityClasses().Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return 0, err + } + return priorityClass.Value, nil + } + return SystemCriticalPriority, nil +} + +// GetPriorityFromStrategyParams gets priority from the given StrategyParameters. +// It will return SystemCriticalPriority by default. 
+func GetPriorityFromStrategyParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (priority int32, err error) { + if params == nil { + return SystemCriticalPriority, nil + } + if params.ThresholdPriority != nil { + priority = *params.ThresholdPriority + } else { + priority, err = GetPriorityFromPriorityClass(ctx, client, params.ThresholdPriorityClassName) + if err != nil { + return + } + } + if priority > SystemCriticalPriority { + return 0, fmt.Errorf("Priority threshold can't be greater than %d", SystemCriticalPriority) + } + return +} diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 2449298f0..4c63ff546 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -20,12 +20,14 @@ import ( "context" "math" "os" + "sigs.k8s.io/descheduler/pkg/utils" "sort" "strings" "testing" "time" v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -47,7 +49,7 @@ import ( "sigs.k8s.io/descheduler/pkg/descheduler/strategies" ) -func MakePodSpec() v1.PodSpec { +func MakePodSpec(priorityClassName string) v1.PodSpec { return v1.PodSpec{ Containers: []v1.Container{{ Name: "pause", @@ -65,11 +67,12 @@ func MakePodSpec() v1.PodSpec { }, }, }}, + PriorityClassName: priorityClassName, } } // RcByNameContainer returns a ReplicationControoler with specified name and container -func RcByNameContainer(name, namespace string, replicas int32, labels map[string]string, gracePeriod *int64) *v1.ReplicationController { +func RcByNameContainer(name, namespace string, replicas int32, labels map[string]string, gracePeriod *int64, priorityClassName string) *v1.ReplicationController { zeroGracePeriod := int64(0) // Add "name": name to the labels, overwriting if it exists. @@ -95,7 +98,7 @@ func RcByNameContainer(name, namespace string, replicas int32, labels map[string ObjectMeta: metav1.ObjectMeta{ Labels: labels, }, - Spec: MakePodSpec(), + Spec: MakePodSpec(priorityClassName), }, }, } @@ -176,7 +179,7 @@ func TestLowNodeUtilization(t *testing.T) { } defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) - rc := RcByNameContainer("test-rc-node-utilization", testNamespace.Name, int32(15), map[string]string{"test": "node-utilization"}, nil) + rc := RcByNameContainer("test-rc-node-utilization", testNamespace.Name, int32(15), map[string]string{"test": "node-utilization"}, nil, "") if _, err := clientSet.CoreV1().ReplicationControllers(rc.Namespace).Create(ctx, rc, metav1.CreateOptions{}); err != nil { t.Errorf("Error creating deployment %v", err) } @@ -185,7 +188,7 @@ func TestLowNodeUtilization(t *testing.T) { deleteRC(ctx, t, clientSet, rc) } -func runPodLifetimeStrategy(ctx context.Context, clientset clientset.Interface, nodeInformer coreinformers.NodeInformer, namespaces deschedulerapi.Namespaces) { +func runPodLifetimeStrategy(ctx context.Context, clientset clientset.Interface, nodeInformer coreinformers.NodeInformer, namespaces deschedulerapi.Namespaces, priorityClass string, priority *int32) { // Run descheduler. 
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientset) if err != nil || len(evictionPolicyGroupVersion) == 0 { @@ -204,8 +207,10 @@ func runPodLifetimeStrategy(ctx context.Context, clientset clientset.Interface, deschedulerapi.DeschedulerStrategy{ Enabled: true, Params: &deschedulerapi.StrategyParameters{ - MaxPodLifeTimeSeconds: &maxPodLifeTimeSeconds, - Namespaces: namespaces, + MaxPodLifeTimeSeconds: &maxPodLifeTimeSeconds, + Namespaces: namespaces, + ThresholdPriority: priority, + ThresholdPriorityClassName: priorityClass, }, }, nodes, @@ -257,7 +262,7 @@ func TestNamespaceConstraintsInclude(t *testing.T) { } defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) - rc := RcByNameContainer("test-rc-podlifetime", testNamespace.Name, 5, map[string]string{"test": "podlifetime-include"}, nil) + rc := RcByNameContainer("test-rc-podlifetime", testNamespace.Name, 5, map[string]string{"test": "podlifetime-include"}, nil, "") if _, err := clientSet.CoreV1().ReplicationControllers(rc.Namespace).Create(ctx, rc, metav1.CreateOptions{}); err != nil { t.Errorf("Error creating deployment %v", err) } @@ -283,7 +288,7 @@ func TestNamespaceConstraintsInclude(t *testing.T) { t.Logf("set the strategy to delete pods from %v namespace", rc.Namespace) runPodLifetimeStrategy(ctx, clientSet, nodeInformer, deschedulerapi.Namespaces{ Include: []string{rc.Namespace}, - }) + }, "", nil) // All pods are supposed to be deleted, wait until all the old pods are deleted if err := wait.PollImmediate(time.Second, 20*time.Second, func() (bool, error) { @@ -328,7 +333,7 @@ func TestNamespaceConstraintsExclude(t *testing.T) { } defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) - rc := RcByNameContainer("test-rc-podlifetime", testNamespace.Name, 5, map[string]string{"test": "podlifetime-exclude"}, nil) + rc := RcByNameContainer("test-rc-podlifetime", testNamespace.Name, 5, map[string]string{"test": "podlifetime-exclude"}, nil, "") if _, err := clientSet.CoreV1().ReplicationControllers(rc.Namespace).Create(ctx, rc, metav1.CreateOptions{}); err != nil { t.Errorf("Error creating deployment %v", err) } @@ -354,7 +359,7 @@ func TestNamespaceConstraintsExclude(t *testing.T) { t.Logf("set the strategy to delete pods from namespaces except the %v namespace", rc.Namespace) runPodLifetimeStrategy(ctx, clientSet, nodeInformer, deschedulerapi.Namespaces{ Exclude: []string{rc.Namespace}, - }) + }, "", nil) t.Logf("Waiting 10s") time.Sleep(10 * time.Second) @@ -373,6 +378,145 @@ func TestNamespaceConstraintsExclude(t *testing.T) { } } +func TestThresholdPriority(t *testing.T) { + testPriority(t, false) +} + +func TestThresholdPriorityClass(t *testing.T) { + testPriority(t, true) +} + +func testPriority(t *testing.T, isPriorityClass bool) { + var highPriority = int32(1000) + var lowPriority = int32(500) + ctx := context.Background() + + clientSet, nodeInformer, stopCh := initializeClient(t) + defer close(stopCh) + + testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}} + if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil { + t.Fatalf("Unable to create ns %v", testNamespace.Name) + } + defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) + + // create two priority classes + highPriorityClass := &schedulingv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name()) + 
"-highpriority"}, + Value: highPriority, + } + if _, err := clientSet.SchedulingV1().PriorityClasses().Create(ctx, highPriorityClass, metav1.CreateOptions{}); err != nil { + t.Fatalf("Error creating priorityclass %s: %v", highPriorityClass.Name, err) + } + defer clientSet.SchedulingV1().PriorityClasses().Delete(ctx, highPriorityClass.Name, metav1.DeleteOptions{}) + + lowPriorityClass := &schedulingv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name()) + "-lowpriority"}, + Value: lowPriority, + } + if _, err := clientSet.SchedulingV1().PriorityClasses().Create(ctx, lowPriorityClass, metav1.CreateOptions{}); err != nil { + t.Fatalf("Error creating priorityclass %s: %v", lowPriorityClass.Name, err) + } + defer clientSet.SchedulingV1().PriorityClasses().Delete(ctx, lowPriorityClass.Name, metav1.DeleteOptions{}) + + // create two RCs with different priority classes in the same namespace + rcHighPriority := RcByNameContainer("test-rc-podlifetime-highpriority", testNamespace.Name, 5, + map[string]string{"test": "podlifetime-highpriority"}, nil, highPriorityClass.Name) + if _, err := clientSet.CoreV1().ReplicationControllers(rcHighPriority.Namespace).Create(ctx, rcHighPriority, metav1.CreateOptions{}); err != nil { + t.Errorf("Error creating rc %s: %v", rcHighPriority.Name, err) + } + defer deleteRC(ctx, t, clientSet, rcHighPriority) + + rcLowPriority := RcByNameContainer("test-rc-podlifetime-lowpriority", testNamespace.Name, 5, + map[string]string{"test": "podlifetime-lowpriority"}, nil, lowPriorityClass.Name) + if _, err := clientSet.CoreV1().ReplicationControllers(rcLowPriority.Namespace).Create(ctx, rcLowPriority, metav1.CreateOptions{}); err != nil { + t.Errorf("Error creating rc %s: %v", rcLowPriority.Name, err) + } + defer deleteRC(ctx, t, clientSet, rcLowPriority) + + // wait for a while so all the pods are at least few seconds older + time.Sleep(5 * time.Second) + + // it's assumed all new pods are named differently from currently running -> no name collision + podListHighPriority, err := clientSet.CoreV1().Pods(rcHighPriority.Namespace).List( + ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcHighPriority.Spec.Template.Labels).String()}) + if err != nil { + t.Fatalf("Unable to list pods: %v", err) + } + podListLowPriority, err := clientSet.CoreV1().Pods(rcLowPriority.Namespace).List( + ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcLowPriority.Spec.Template.Labels).String()}) + if err != nil { + t.Fatalf("Unable to list pods: %v", err) + } + + if len(podListHighPriority.Items)+len(podListLowPriority.Items) != 10 { + t.Fatalf("Expected 10 replicas, got %v instead", len(podListHighPriority.Items)+len(podListLowPriority.Items)) + } + + expectReservePodNames := getPodNames(podListHighPriority.Items) + expectEvictPodNames := getPodNames(podListLowPriority.Items) + sort.Strings(expectReservePodNames) + sort.Strings(expectEvictPodNames) + t.Logf("Pods not expect to be evicted: %v, pods expect to be evicted: %v", expectReservePodNames, expectEvictPodNames) + + if isPriorityClass { + t.Logf("set the strategy to delete pods with priority lower than priority class %s", highPriorityClass.Name) + runPodLifetimeStrategy(ctx, clientSet, nodeInformer, deschedulerapi.Namespaces{}, highPriorityClass.Name, nil) + } else { + t.Logf("set the strategy to delete pods with priority lower than %d", highPriority) + runPodLifetimeStrategy(ctx, clientSet, nodeInformer, deschedulerapi.Namespaces{}, "", &highPriority) + } + + t.Logf("Waiting 10s") + 
time.Sleep(10 * time.Second) + // check if all pods with high priority class are not evicted + podListHighPriority, err = clientSet.CoreV1().Pods(rcHighPriority.Namespace).List( + ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcHighPriority.Spec.Template.Labels).String()}) + if err != nil { + t.Fatalf("Unable to list pods after running strategy: %v", err) + } + + excludePodNames := getPodNames(podListHighPriority.Items) + sort.Strings(excludePodNames) + t.Logf("Existing high priority pods: %v", excludePodNames) + + // validate no pods were deleted + if len(intersectStrings(expectReservePodNames, excludePodNames)) != 5 { + t.Fatalf("None of %v high priority pods are expected to be deleted", expectReservePodNames) + } + + //check if all pods with low priority class are evicted + if err := wait.PollImmediate(time.Second, 20*time.Second, func() (bool, error) { + podListLowPriority, err := clientSet.CoreV1().Pods(rcLowPriority.Namespace).List( + ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcLowPriority.Spec.Template.Labels).String()}) + if err != nil { + return false, nil + } + + includePodNames := getPodNames(podListLowPriority.Items) + // validate all pod were deleted + if len(intersectStrings(expectEvictPodNames, includePodNames)) > 0 { + t.Logf("Waiting until %v low priority pods get deleted", intersectStrings(expectEvictPodNames, includePodNames)) + // check if there's at least one pod not in Terminating state + for _, pod := range podListLowPriority.Items { + // In case podList contains newly created pods + if len(intersectStrings(expectEvictPodNames, []string{pod.Name})) == 0 { + continue + } + if pod.DeletionTimestamp == nil { + t.Logf("Pod %v not in terminating state", pod.Name) + return false, nil + } + } + t.Logf("All %v pods are terminating", intersectStrings(expectEvictPodNames, includePodNames)) + } + + return true, nil + }); err != nil { + t.Fatalf("Error waiting for pods to be deleted: %v", err) + } +} + func TestEvictAnnotation(t *testing.T) { ctx := context.Background() @@ -396,7 +540,7 @@ func TestEvictAnnotation(t *testing.T) { } defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{}) - rc := RcByNameContainer("test-rc-evict-annotation", testNamespace.Name, int32(15), map[string]string{"test": "annotation"}, nil) + rc := RcByNameContainer("test-rc-evict-annotation", testNamespace.Name, int32(15), map[string]string{"test": "annotation"}, nil, "") rc.Spec.Template.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} rc.Spec.Template.Spec.Volumes = []v1.Volume{ { @@ -509,7 +653,10 @@ func evictPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, continue } // List all the pods on the current Node - podsOnANode, err := podutil.ListPodsOnANode(ctx, clientSet, node, podutil.WithFilter(podEvictor.IsEvictable)) + podsOnANode, err := podutil.ListPodsOnANode(ctx, clientSet, node, + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, utils.SystemCriticalPriority) + })) if err != nil { t.Errorf("Error listing pods on a node %v", err) } @@ -521,7 +668,10 @@ func evictPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, } t.Log("Eviction of pods starting") startEndToEndForLowNodeUtilization(ctx, clientSet, nodeInformer, podEvictor) - podsOnleastUtilizedNode, err := podutil.ListPodsOnANode(ctx, clientSet, leastLoadedNode, podutil.WithFilter(podEvictor.IsEvictable)) + podsOnleastUtilizedNode, err := podutil.ListPodsOnANode(ctx, 
clientSet, leastLoadedNode, + podutil.WithFilter(func(pod *v1.Pod) bool { + return podEvictor.IsEvictable(pod, utils.SystemCriticalPriority) + })) if err != nil { t.Errorf("Error listing pods on a node %v", err) }
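(Reviewer note: a condensed sketch of the per-strategy wiring this patch repeats — validate the params, resolve the threshold with `utils.GetPriorityFromStrategyParams`, then filter pods through `IsEvictable` with that threshold. `exampleStrategy` is a hypothetical strategy used only for illustration; every helper it calls appears in the diff above, and logging is elided.)

```
package strategies

import (
	"context"

	v1 "k8s.io/api/core/v1"
	clientset "k8s.io/client-go/kubernetes"

	"sigs.k8s.io/descheduler/pkg/api"
	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
	"sigs.k8s.io/descheduler/pkg/utils"
)

// exampleStrategy is illustrative only; it shows the shape shared by
// RemoveDuplicatePods, LowNodeUtilization, PodLifeTime and the other strategies.
func exampleStrategy(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
	// 1. Reject policies that set both threshold fields (each strategy's
	//    validate*Params helper performs this check and logs the error).
	if strategy.Params != nil && strategy.Params.ThresholdPriority != nil && strategy.Params.ThresholdPriorityClassName != "" {
		return
	}

	// 2. Resolve the numeric threshold: ThresholdPriority if set, otherwise the
	//    value of ThresholdPriorityClassName, defaulting to SystemCriticalPriority.
	thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
	if err != nil {
		return
	}

	// 3. Only pods that pass IsEvictable with the resolved threshold are candidates.
	for _, node := range nodes {
		pods, err := podutil.ListPodsOnANode(ctx, client, node,
			podutil.WithFilter(func(pod *v1.Pod) bool {
				return podEvictor.IsEvictable(pod, thresholdPriority)
			}))
		if err != nil {
			continue
		}
		_ = pods // a real strategy applies its own eviction logic here
	}
}
```

Under this wiring, a pod with no priority set remains a candidate for eviction, and `GetPriorityFromStrategyParams` rejects any threshold greater than `SystemCriticalPriority`.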