diff --git a/README.md b/README.md
index 6d7709b27..b035e92a0 100644
--- a/README.md
+++ b/README.md
@@ -405,6 +405,12 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho
 so that they can be recreated in appropriately utilized nodes. The strategy will abort
 if any number of `underutilized nodes` or `appropriately utilized nodes` is zero.
 
+To control pod eviction from underutilized nodes, use the `evictionModes`
+array. A lenient policy, which evicts pods regardless of their resource
+requests, is the default. To enable a stricter policy that evicts only pods
+with resource requests defined for the provided threshold resources, add the
+`OnlyThresholdingResources` option to the `evictionModes` configuration.
+
 **NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
 This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
 design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
@@ -417,8 +423,15 @@ actual usage metrics. Implementing metrics-based descheduling is currently TODO
 |---|---|
 |`thresholds`|map(string:int)|
 |`numberOfNodes`|int|
+|`evictionModes`|list(string)|
 |`evictableNamespaces`|(see [namespace filtering](#namespace-filtering))|
+
+**Supported Eviction Modes:**
+
+|Name|Description|
+|---|---|
+|`OnlyThresholdingResources`|Evict only pods that have resource requests defined for the provided threshold resources.|
 
 **Example:**
 
 ```yaml
@@ -437,6 +450,8 @@ profiles:
           exclude:
           - "kube-system"
           - "namespace1"
+        evictionModes:
+        - "OnlyThresholdingResources"
     plugins:
       balance:
         enabled:
diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization.go b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
index 55ea9d032..e2baa2037 100644
--- a/pkg/framework/plugins/nodeutilization/highnodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
@@ -19,6 +19,7 @@ package nodeutilization
 import (
 	"context"
 	"fmt"
+	"slices"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -74,9 +75,26 @@ func NewHighNodeUtilization(
 		highThresholds[rname] = MaxResourcePercentage
 	}
 
+	// get the resource names for which we have a threshold. this is
+	// later used when determining if we are going to evict a pod.
+	resourceThresholds := getResourceNames(args.Thresholds)
+
+	// by default we evict pods from the underutilized nodes even if
+	// they don't define a request for a given threshold resource. this
+	// works most of the time and there is a use case for it. when the
+	// restrict mode is used we also evaluate if the pod has a request
+	// for any of the resources the user has provided as thresholds.
+	filters := []podutil.FilterFunc{handle.Evictor().Filter}
+	if slices.Contains(args.EvictionModes, EvictionModeOnlyThresholdingResources) {
+		filters = append(
+			filters,
+			withResourceRequestForAny(resourceThresholds...),
+		)
+	}
+
 	podFilter, err := podutil.
 		NewOptions().
-		WithFilter(handle.Evictor().Filter).
+		WithFilter(podutil.WrapFilterFuncs(filters...)).
 		BuildFilterFunc()
 	if err != nil {
 		return nil, fmt.Errorf("error initializing pod filter function: %v", err)
@@ -87,7 +105,7 @@ func NewHighNodeUtilization(
 	// all we consider the basic resources (cpu, memory, pods).
 	resourceNames := uniquifyResourceNames(
 		append(
-			getResourceNames(args.Thresholds),
+			resourceThresholds,
 			v1.ResourceCPU,
 			v1.ResourceMemory,
 			v1.ResourcePods,
diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
index f436f4516..15903012b 100644
--- a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
+++ b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
@@ -48,6 +48,7 @@ func TestHighNodeUtilization(t *testing.T) {
 	testCases := []struct {
 		name                string
 		thresholds          api.ResourceThresholds
+		evictionModes       []EvictionMode
 		nodes               []*v1.Node
 		pods                []*v1.Pod
 		expectedPodsEvicted uint
@@ -433,6 +434,53 @@ func TestHighNodeUtilization(t *testing.T) {
 			},
 			expectedPodsEvicted: 0,
 		},
+		{
+			name: "with extended resource threshold and no extended resource pods",
+			thresholds: api.ResourceThresholds{
+				extendedResource: 40,
+			},
+			evictionModes: []EvictionMode{EvictionModeOnlyThresholdingResources},
+			nodes: []*v1.Node{
+				test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
+					test.SetNodeExtendedResource(node, extendedResource, 10)
+				}),
+				test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
+					test.SetNodeExtendedResource(node, extendedResource, 10)
+				}),
+				test.BuildTestNode(n3NodeName, 4000, 3000, 10, func(node *v1.Node) {
+					test.SetNodeExtendedResource(node, extendedResource, 10)
+				}),
+			},
+			pods: []*v1.Pod{
+				// pods on node1 have the extended resource
+				// request set and they put the node in the
+				// over-utilization range.
+				test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
+					test.SetRSOwnerRef(pod)
+					test.SetPodExtendedResourceRequest(pod, extendedResource, 3)
+				}),
+				test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
+					test.SetRSOwnerRef(pod)
+					test.SetPodExtendedResourceRequest(pod, extendedResource, 3)
+				}),
+				// pods on the other nodes must not be evicted
+				// because they do not have the extended
+				// resource defined in their requests.
+ test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + }), + test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + }), + test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + }), + test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) { + test.SetRSOwnerRef(pod) + }), + }, + expectedPodsEvicted: 0, + }, } for _, testCase := range testCases { @@ -474,10 +522,13 @@ func TestHighNodeUtilization(t *testing.T) { }) } - plugin, err := NewHighNodeUtilization(&HighNodeUtilizationArgs{ - Thresholds: testCase.thresholds, - }, - handle) + plugin, err := NewHighNodeUtilization( + &HighNodeUtilizationArgs{ + Thresholds: testCase.thresholds, + EvictionModes: testCase.evictionModes, + }, + handle, + ) if err != nil { t.Fatalf("Unable to initialize the plugin: %v", err) } diff --git a/pkg/framework/plugins/nodeutilization/nodeutilization.go b/pkg/framework/plugins/nodeutilization/nodeutilization.go index 705cbc32f..6c9744024 100644 --- a/pkg/framework/plugins/nodeutilization/nodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/nodeutilization.go @@ -32,6 +32,7 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" + "sigs.k8s.io/descheduler/pkg/descheduler/pod" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" @@ -752,3 +753,19 @@ func assessAvailableResourceInNodes( return available, nil } + +// withResourceRequestForAny returns a filter function that checks if a pod +// has a resource request specified for any of the given resources names. +func withResourceRequestForAny(names ...v1.ResourceName) pod.FilterFunc { + return func(pod *v1.Pod) bool { + all := append(pod.Spec.Containers, pod.Spec.InitContainers...) + for _, name := range names { + for _, container := range all { + if _, ok := container.Resources.Requests[name]; ok { + return true + } + } + } + return false + } +} diff --git a/pkg/framework/plugins/nodeutilization/types.go b/pkg/framework/plugins/nodeutilization/types.go index e49434d9d..5f6e94799 100644 --- a/pkg/framework/plugins/nodeutilization/types.go +++ b/pkg/framework/plugins/nodeutilization/types.go @@ -18,6 +18,18 @@ import ( "sigs.k8s.io/descheduler/pkg/api" ) +// EvictionMode describe a mode of eviction. See the list below for the +// available modes. +type EvictionMode string + +const ( + // EvictionModeOnlyThresholdingResources makes the descheduler evict + // only pods that have a resource request defined for any of the user + // provided thresholds. If the pod does not request the resource, it + // will not be evicted. + EvictionModeOnlyThresholdingResources EvictionMode = "OnlyThresholdingResources" +) + // +k8s:deepcopy-gen=true // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -48,6 +60,13 @@ type HighNodeUtilizationArgs struct { Thresholds api.ResourceThresholds `json:"thresholds"` NumberOfNodes int `json:"numberOfNodes,omitempty"` + // EvictionModes is a set of modes to be taken into account when the + // descheduler evicts pods. For example the mode + // `OnlyThresholdingResources` can be used to make sure the descheduler + // only evicts pods who have resource requests for the defined + // thresholds. 
+	EvictionModes []EvictionMode `json:"evictionModes,omitempty"`
+
 	// Naming this one differently since namespaces are still
 	// considered while considering resources used by pods
 	// but then filtered out before eviction
diff --git a/pkg/framework/plugins/nodeutilization/validation.go b/pkg/framework/plugins/nodeutilization/validation.go
index 1392dda26..6ac48b72a 100644
--- a/pkg/framework/plugins/nodeutilization/validation.go
+++ b/pkg/framework/plugins/nodeutilization/validation.go
@@ -30,6 +30,26 @@ func ValidateHighNodeUtilizationArgs(obj runtime.Object) error {
 	if err != nil {
 		return err
 	}
+
+	// make sure we know about the eviction modes defined by the user.
+	return validateEvictionModes(args.EvictionModes)
+}
+
+// validateEvictionModes checks if the eviction modes are valid/known
+// to the descheduler.
+func validateEvictionModes(modes []EvictionMode) error {
+	// a set of valid modes keeps this code easy to extend when new
+	// modes are added in the future.
+	validModes := map[EvictionMode]bool{
+		EvictionModeOnlyThresholdingResources: true,
+	}
+
+	for _, mode := range modes {
+		if validModes[mode] {
+			continue
+		}
+		return fmt.Errorf("invalid eviction mode %s", mode)
+	}
 
 	return nil
 }
diff --git a/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go b/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
index 9a1d9c5c7..7f84492b9 100644
--- a/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
+++ b/pkg/framework/plugins/nodeutilization/zz_generated.deepcopy.go
@@ -37,6 +37,11 @@ func (in *HighNodeUtilizationArgs) DeepCopyInto(out *HighNodeUtilizationArgs) {
 			(*out)[key] = val
 		}
 	}
+	if in.EvictionModes != nil {
+		in, out := &in.EvictionModes, &out.EvictionModes
+		*out = make([]EvictionMode, len(*in))
+		copy(*out, *in)
+	}
 	if in.EvictableNamespaces != nil {
 		in, out := &in.EvictableNamespaces, &out.EvictableNamespaces
 		*out = new(api.Namespaces)
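
For reviewers who want to see the new filter in isolation, below is a minimal, self-contained sketch of its semantics. This is an illustration, not part of the patch: `example.com/gpu` is a hypothetical extended resource and the `main` harness is invented for the demo; the helper body simply mirrors the `withResourceRequestForAny` function added to nodeutilization.go above.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// withResourceRequestForAny mirrors the helper added in this patch: it
// builds a filter that accepts a pod only when at least one of its
// containers (regular or init) requests one of the given resource names.
func withResourceRequestForAny(names ...v1.ResourceName) func(*v1.Pod) bool {
	return func(p *v1.Pod) bool {
		all := append(p.Spec.Containers, p.Spec.InitContainers...)
		for _, name := range names {
			for _, container := range all {
				if _, ok := container.Resources.Requests[name]; ok {
					return true
				}
			}
		}
		return false
	}
}

func main() {
	// hypothetical extended resource, used only for this illustration.
	gpu := v1.ResourceName("example.com/gpu")
	filter := withResourceRequestForAny(gpu)

	// a pod that requests the threshold resource...
	requesting := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Name: "app",
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{gpu: resource.MustParse("1")},
		},
	}}}}
	// ...and one that does not.
	notRequesting := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Name: "app",
	}}}}

	// under OnlyThresholdingResources, only the first pod remains an
	// eviction candidate; the second is filtered out.
	fmt.Println(filter(requesting), filter(notRequesting)) // true false
}
```

Note that in the plugin itself this filter is composed with the evictor's own filter via `podutil.WrapFilterFuncs`, so enabling the mode can only narrow, never widen, the set of evictable pods.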