diff --git a/README.md b/README.md
index 9d4236ac7..d103d0e4f 100644
--- a/README.md
+++ b/README.md
@@ -118,15 +118,16 @@ The Descheduler Policy is configurable and includes default strategy plugins tha
 
 These are top level keys in the Descheduler Policy that you can use to configure all evictions.
 
-| Name                               | type     | Default Value | Description                                                                                                                |
-|------------------------------------|--------|---------------|----------------------------------------------------------------------------------------------------------------------------|
-| `nodeSelector`                     | `string` | `nil`         | Limiting the nodes which are processed. Only used when `nodeFit`=`true` and only by the PreEvictionFilter Extension Point. |
-| `maxNoOfPodsToEvictPerNode`        | `int`    | `nil`         | Maximum number of pods evicted from each node (summed through all strategies).                                             |
-| `maxNoOfPodsToEvictPerNamespace`   | `int`    | `nil`         | Maximum number of pods evicted from each namespace (summed through all strategies).                                        |
-| `maxNoOfPodsToEvictTotal`          | `int`    | `nil`         | Maximum number of pods evicted per rescheduling cycle (summed through all strategies).                                     |
-| `metricsCollector`                 | `object` | `nil`         | Configures collection of metrics for actual resource utilization.                                                         |
-| `metricsCollector.enabled`         | `bool`   | `false`       | Enables Kubernetes [Metrics Server](https://kubernetes-sigs.github.io/metrics-server/) collection.                        |
-| `evictionFailureEventNotification` | `bool`   | `false`       | Enables eviction failure event notification.                                                                               |
+| Name                               | type     | Default Value | Description                                                                                                                  |
+|------------------------------------|----------|---------------|------------------------------------------------------------------------------------------------------------------------------|
+| `nodeSelector`                     | `string` | `nil`         | Limiting the nodes which are processed. Only used when `nodeFit`=`true` and only by the PreEvictionFilter Extension Point.   |
+| `maxNoOfPodsToEvictPerNode`        | `int`    | `nil`         | Maximum number of pods evicted from each node (summed through all strategies).                                               |
+| `maxNoOfPodsToEvictPerNamespace`   | `int`    | `nil`         | Maximum number of pods evicted from each namespace (summed through all strategies).                                          |
+| `maxNoOfPodsToEvictTotal`          | `int`    | `nil`         | Maximum number of pods evicted per rescheduling cycle (summed through all strategies).                                       |
+| `metricsCollector`                 | `object` | `nil`         | Configures collection of metrics for actual resource utilization.                                                            |
+| `metricsCollector.enabled`         | `bool`   | `false`       | Enables Kubernetes [Metrics Server](https://kubernetes-sigs.github.io/metrics-server/) collection.                           |
+| `evictionFailureEventNotification` | `bool`   | `false`       | Enables eviction failure event notification.                                                                                 |
+| `gracePeriodSeconds`               | `int`    | `nil`         | The duration in seconds before the evicted object should be deleted. Zero means delete immediately; if unset, the object's default grace period is used. |
 
 ### Evictor Plugin configuration (Default Evictor)
 
@@ -161,6 +162,7 @@ nodeSelector: "node=node1" # you don't need to set this, if not set all will be
 maxNoOfPodsToEvictPerNode: 5000 # you don't need to set this, unlimited if not set
 maxNoOfPodsToEvictPerNamespace: 5000 # you don't need to set this, unlimited if not set
 maxNoOfPodsToEvictTotal: 5000 # you don't need to set this, unlimited if not set
+gracePeriodSeconds: 60 # you don't need to set this, the object's default grace period is used if not set
 metricsCollector:
   enabled: true # you don't need to set this, metrics are not collected if not set
 profiles:
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 041c13e12..1f2ce2aeb 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -48,6 +48,12 @@ type DeschedulerPolicy struct {
 
 	// MetricsCollector configures collection of metrics about actual resource utilization
 	MetricsCollector MetricsCollector
+
+	// GracePeriodSeconds is the duration in seconds before the object
+	// should be deleted. The value must be a non-negative integer;
+	// zero indicates delete immediately. If nil, the default grace
+	// period for the specified type is used.
+	GracePeriodSeconds *int64
 }
 
 // Namespaces carries a list of included/excluded namespaces
diff --git a/pkg/api/v1alpha2/types.go b/pkg/api/v1alpha2/types.go
index 533d7792c..e67b54b53 100644
--- a/pkg/api/v1alpha2/types.go
+++ b/pkg/api/v1alpha2/types.go
@@ -47,6 +47,12 @@ type DeschedulerPolicy struct {
 
 	// MetricsCollector configures collection of metrics for actual resource utilization
 	MetricsCollector MetricsCollector `json:"metricsCollector,omitempty"`
+
+	// GracePeriodSeconds is the duration in seconds before the object
+	// should be deleted. The value must be a non-negative integer;
+	// zero indicates delete immediately. If nil, the default grace
+	// period for the specified type is used.
+ GracePeriodSeconds *int64 `json:"gracePeriodSeconds,omitempty"` } type DeschedulerProfile struct { diff --git a/pkg/api/v1alpha2/zz_generated.conversion.go b/pkg/api/v1alpha2/zz_generated.conversion.go index c10b643d8..61879b9e8 100644 --- a/pkg/api/v1alpha2/zz_generated.conversion.go +++ b/pkg/api/v1alpha2/zz_generated.conversion.go @@ -119,6 +119,7 @@ func autoConvert_v1alpha2_DeschedulerPolicy_To_api_DeschedulerPolicy(in *Desched if err := Convert_v1alpha2_MetricsCollector_To_api_MetricsCollector(&in.MetricsCollector, &out.MetricsCollector, s); err != nil { return err } + out.GracePeriodSeconds = (*int64)(unsafe.Pointer(in.GracePeriodSeconds)) return nil } @@ -142,6 +143,7 @@ func autoConvert_api_DeschedulerPolicy_To_v1alpha2_DeschedulerPolicy(in *api.Des if err := Convert_api_MetricsCollector_To_v1alpha2_MetricsCollector(&in.MetricsCollector, &out.MetricsCollector, s); err != nil { return err } + out.GracePeriodSeconds = (*int64)(unsafe.Pointer(in.GracePeriodSeconds)) return nil } diff --git a/pkg/api/v1alpha2/zz_generated.deepcopy.go b/pkg/api/v1alpha2/zz_generated.deepcopy.go index a468d0a6e..f6e56be59 100644 --- a/pkg/api/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha2/zz_generated.deepcopy.go @@ -62,6 +62,11 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) { **out = **in } out.MetricsCollector = in.MetricsCollector + if in.GracePeriodSeconds != nil { + in, out := &in.GracePeriodSeconds, &out.GracePeriodSeconds + *out = new(int64) + **out = **in + } return } diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go index c084fd76a..137fcefbd 100644 --- a/pkg/api/zz_generated.deepcopy.go +++ b/pkg/api/zz_generated.deepcopy.go @@ -62,6 +62,11 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) { **out = **in } out.MetricsCollector = in.MetricsCollector + if in.GracePeriodSeconds != nil { + in, out := &in.GracePeriodSeconds, &out.GracePeriodSeconds + *out = new(int64) + **out = **in + } return } diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index 75b0bd800..046822334 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -157,6 +157,7 @@ func newDescheduler(ctx context.Context, rs *options.DeschedulerServer, deschedu WithMaxPodsToEvictPerNamespace(deschedulerPolicy.MaxNoOfPodsToEvictPerNamespace). WithMaxPodsToEvictTotal(deschedulerPolicy.MaxNoOfPodsToEvictTotal). WithEvictionFailureEventNotification(deschedulerPolicy.EvictionFailureEventNotification). + WithGracePeriodSeconds(deschedulerPolicy.GracePeriodSeconds). WithDryRun(rs.DryRun). 
 		WithMetricsEnabled(!rs.DisableMetrics),
 	)
diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go
index df1060fdb..02c349d1d 100644
--- a/pkg/descheduler/evictions/evictions.go
+++ b/pkg/descheduler/evictions/evictions.go
@@ -214,6 +214,7 @@ type PodEvictor struct {
 	maxPodsToEvictPerNode      *uint
 	maxPodsToEvictPerNamespace *uint
 	maxPodsToEvictTotal        *uint
+	gracePeriodSeconds         *int64
 	nodePodCount               nodePodEvictedCount
 	namespacePodCount          namespacePodEvictCount
 	totalPodCount              uint
@@ -247,6 +248,7 @@ func NewPodEvictor(
 		maxPodsToEvictPerNode:      options.maxPodsToEvictPerNode,
 		maxPodsToEvictPerNamespace: options.maxPodsToEvictPerNamespace,
 		maxPodsToEvictTotal:        options.maxPodsToEvictTotal,
+		gracePeriodSeconds:         options.gracePeriodSeconds,
 		metricsEnabled:             options.metricsEnabled,
 		nodePodCount:               make(nodePodEvictedCount),
 		namespacePodCount:          make(namespacePodEvictCount),
@@ -563,7 +565,8 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
 
 // return (ignore, err)
 func (pe *PodEvictor) evictPod(ctx context.Context, pod *v1.Pod) (bool, error) {
-	deleteOptions := &metav1.DeleteOptions{}
-	// GracePeriodSeconds ?
+	deleteOptions := &metav1.DeleteOptions{
+		GracePeriodSeconds: pe.gracePeriodSeconds,
+	}
 	eviction := &policy.Eviction{
 		TypeMeta: metav1.TypeMeta{
diff --git a/pkg/descheduler/evictions/options.go b/pkg/descheduler/evictions/options.go
index b7c52b550..2646966d2 100644
--- a/pkg/descheduler/evictions/options.go
+++ b/pkg/descheduler/evictions/options.go
@@ -12,6 +12,7 @@ type Options struct {
 	maxPodsToEvictTotal              *uint
 	evictionFailureEventNotification bool
 	metricsEnabled                   bool
+	gracePeriodSeconds               *int64
 }
 
 // NewOptions returns an Options with default values.
@@ -46,6 +47,11 @@ func (o *Options) WithMaxPodsToEvictTotal(maxPodsToEvictTotal *uint) *Options {
 	return o
 }
 
+func (o *Options) WithGracePeriodSeconds(gracePeriodSeconds *int64) *Options {
+	o.gracePeriodSeconds = gracePeriodSeconds
+	return o
+}
+
 func (o *Options) WithMetricsEnabled(metricsEnabled bool) *Options {
 	o.metricsEnabled = metricsEnabled
 	return o
diff --git a/test/e2e/e2e_toomanyrestarts_test.go b/test/e2e/e2e_toomanyrestarts_test.go
index ba9f9d4d8..0dd5b9bf5 100644
--- a/test/e2e/e2e_toomanyrestarts_test.go
+++ b/test/e2e/e2e_toomanyrestarts_test.go
@@ -43,7 +43,7 @@ import (
 
 const deploymentReplicas = 4
 
-func tooManyRestartsPolicy(targetNamespace string, podRestartThresholds int32, includingInitContainers bool) *apiv1alpha2.DeschedulerPolicy {
+func tooManyRestartsPolicy(targetNamespace string, podRestartThresholds int32, includingInitContainers bool, gracePeriodSeconds int64) *apiv1alpha2.DeschedulerPolicy {
 	return &apiv1alpha2.DeschedulerPolicy{
 		Profiles: []apiv1alpha2.DeschedulerProfile{
 			{
@@ -84,6 +84,7 @@ func tooManyRestartsPolicy(targetNamespace string, podRestartThresholds int32, i
 				},
 			},
 		},
+		GracePeriodSeconds: &gracePeriodSeconds,
 	}
 }
 
@@ -127,16 +128,23 @@ func TestTooManyRestarts(t *testing.T) {
 	tests := []struct {
 		name                    string
 		policy                  *apiv1alpha2.DeschedulerPolicy
+		enableGracePeriod       bool
 		expectedEvictedPodCount uint
 	}{
 		{
 			name:                    "test-no-evictions",
-			policy:                  tooManyRestartsPolicy(testNamespace.Name, 10000, true),
+			policy:                  tooManyRestartsPolicy(testNamespace.Name, 10000, true, 0),
 			expectedEvictedPodCount: 0,
 		},
 		{
 			name:                    "test-one-evictions",
-			policy:                  tooManyRestartsPolicy(testNamespace.Name, 3, true),
+			policy:                  tooManyRestartsPolicy(testNamespace.Name, 3, true, 0),
 			expectedEvictedPodCount: 4,
 		},
+		{
+			name:                    "test-one-evictions-use-gracePeriodSeconds",
+			policy:                  tooManyRestartsPolicy(testNamespace.Name, 3, true, 60),
+			enableGracePeriod:       true,
+			expectedEvictedPodCount: 4,
+		},
 	}
@@ -197,8 +205,30 @@ func TestTooManyRestarts(t *testing.T) {
 			deschedulerPodName = deschedulerPods[0].Name
 		}
 
+		// Check that no pods are evicted while the eviction grace period is in effect.
+		if tc.enableGracePeriod {
+			// The policy sets gracePeriodSeconds to 60, so no pod should disappear
+			// before then. Watch for 55 seconds rather than the full 60 to leave
+			// headroom for timing skew and avoid unnecessary flake failures.
+			t.Logf("Waiting for grace period of %d seconds", 55)
+			if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 55*time.Second, true, func(ctx context.Context) (bool, error) {
+				currentRunNames := sets.NewString(getCurrentPodNames(ctx, clientSet, testNamespace.Name, t)...)
+				actualEvictedPod := preRunNames.Difference(currentRunNames)
+				actualEvictedPodCount := uint(actualEvictedPod.Len())
+				t.Logf("preRunNames: %v, currentRunNames: %v, actualEvictedPodCount: %v\n", preRunNames.List(), currentRunNames.List(), actualEvictedPodCount)
+				if actualEvictedPodCount > 0 {
+					t.Fatalf("Pods were evicted during grace period; expected 0, got %v", actualEvictedPodCount)
+				}
+				// Keep polling until the timeout expires; running out the clock
+				// means nothing was evicted, which is the expected outcome here.
+				return false, nil
+			}); err != nil && !wait.Interrupted(err) {
+				t.Fatalf("Error waiting during grace period: %v", err)
+			}
+		}
+
 		// Run RemovePodsHavingTooManyRestarts strategy
-		if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 20*time.Second, true, func(ctx context.Context) (bool, error) {
+		if err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 50*time.Second, true, func(ctx context.Context) (bool, error) {
 			currentRunNames := sets.NewString(getCurrentPodNames(ctx, clientSet, testNamespace.Name, t)...)
 			actualEvictedPod := preRunNames.Difference(currentRunNames)
 			actualEvictedPodCount := uint(actualEvictedPod.Len())
@@ -210,7 +240,7 @@ func TestTooManyRestarts(t *testing.T) {
 
 			return true, nil
 		}); err != nil {
-			t.Errorf("Error waiting for descheduler running: %v", err)
+			t.Fatalf("Error waiting for descheduler running: %v", err)
 		}
 		waitForTerminatingPodsToDisappear(ctx, t, clientSet, testNamespace.Name)
 	})
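
For reviewers who want to exercise the new knob end-to-end, here is a minimal policy that mirrors what the e2e test constructs. This is a sketch for illustration only; the profile name and the `RemovePodsHavingTooManyRestarts` arguments are chosen to match the test, not mandated by the patch:

```yaml
apiVersion: "descheduler/v1alpha2"
kind: "DeschedulerPolicy"
# New top-level field added by this patch: evicted pods are deleted with a
# 60s grace period instead of their object default.
gracePeriodSeconds: 60
profiles:
  - name: default
    pluginConfig:
      - name: "RemovePodsHavingTooManyRestarts"
        args:
          podRestartThreshold: 3
          includingInitContainers: true
    plugins:
      deschedule:
        enabled:
          - "RemovePodsHavingTooManyRestarts"
```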
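The plumbing above ultimately lands in `evictPod`, where the policy-level pointer is copied into the eviction's `DeleteOptions`. The pointer semantics are what make the `nil` default work; the following standalone sketch (the `newEviction` helper is hypothetical, not part of the patch) shows how the cases behave against the `policy/v1` Eviction API:

```go
package main

import (
	"fmt"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newEviction builds the same kind of object evictPod submits: the *int64 is
// passed through untouched, so nil defers to the pod's own
// terminationGracePeriodSeconds, 0 forces immediate deletion, and any
// positive value overrides the pod's default.
func newEviction(namespace, name string, gracePeriodSeconds *int64) *policyv1.Eviction {
	return &policyv1.Eviction{
		TypeMeta:   metav1.TypeMeta{APIVersion: "policy/v1", Kind: "Eviction"},
		ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name},
		DeleteOptions: &metav1.DeleteOptions{
			GracePeriodSeconds: gracePeriodSeconds,
		},
	}
}

func main() {
	sixty := int64(60)
	// nil pointer: the API server falls back to the object's default grace period.
	fmt.Println(newEviction("default", "pod-a", nil).DeleteOptions.GracePeriodSeconds)
	// Explicit value: 60s grace period, matching the e2e test's policy.
	fmt.Println(*newEviction("default", "pod-b", &sixty).DeleteOptions.GracePeriodSeconds)
}
```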