
feat: introduce strict eviction policy

With the strict eviction policy, the descheduler only evicts a pod if the pod
contains a request for the resource named by the given threshold. For example,
when using a threshold for an extended resource called `example.com/gpu`, only
pods that request such a resource will be evicted.
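
For illustration, a minimal sketch of how the new knob could be wired up from Go, assuming the plugin package at `sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization`; the `example.com/gpu` name and the 40 percent threshold are made up:

```go
package main

import (
	"fmt"

	"sigs.k8s.io/descheduler/pkg/api"
	"sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization"
)

func main() {
	// Threshold on an extended resource; with the strict mode enabled only
	// pods that request "example.com/gpu" become eviction candidates.
	args := &nodeutilization.HighNodeUtilizationArgs{
		Thresholds: api.ResourceThresholds{
			"example.com/gpu": 40,
		},
		EvictionModes: []nodeutilization.EvictionMode{
			nodeutilization.EvictionModeOnlyThresholdingResources,
		},
	}
	fmt.Printf("%+v\n", args)
}
```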
Ricardo Maraschini
2025-04-08 19:33:23 +02:00
parent cca28f7bbe
commit 35a7178df6
7 changed files with 149 additions and 6 deletions


@@ -405,6 +405,12 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho
so that they can be recreated in appropriately utilized nodes.
The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero.
To control pod eviction from underutilized nodes, use the `evictionModes`
array. A lenient policy, which evicts pods regardless of their resource
requests, is the default. To enable a stricter policy that only evicts pods
with resource requests defined for the provided threshold resources, add the
option `OnlyThresholdingResources` to the `evictionModes` configuration.
**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
@@ -417,8 +423,15 @@ actual usage metrics. Implementing metrics-based descheduling is currently TODO
|---|---|
|`thresholds`|map(string:int)|
|`numberOfNodes`|int|
|`evictionModes`|list(string)|
|`evictableNamespaces`|(see [namespace filtering](#namespace-filtering))|
**Supported Eviction Modes:**
|Name|Description|
|---|---|
|`OnlyThresholdingResources`|Evict only pods that have resource requests defined for the provided threshold resources.|
**Example:**
```yaml
@@ -437,6 +450,8 @@ profiles:
          exclude:
          - "kube-system"
          - "namespace1"
        evictionModes:
        - "OnlyThresholdingResources"
    plugins:
      balance:
        enabled:


@@ -19,6 +19,7 @@ package nodeutilization
import (
"context"
"fmt"
"slices"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -74,9 +75,26 @@ func NewHighNodeUtilization(
highThresholds[rname] = MaxResourcePercentage
}
// Get the resource names for which we have a threshold. This is
// later used when determining whether we are going to evict a pod.
resourceThresholds := getResourceNames(args.Thresholds)
// By default we evict pods from the underutilized nodes even if they
// don't define a request for a given threshold. This works most of the
// time and there is a use case for it. When the strict mode is enabled
// we evaluate whether the pod has a request for any of the resources
// the user has provided as thresholds.
filters := []podutil.FilterFunc{handle.Evictor().Filter}
if slices.Contains(args.EvictionModes, EvictionModeOnlyThresholdingResources) {
filters = append(
filters,
withResourceRequestForAny(resourceThresholds...),
)
}
podFilter, err := podutil.
NewOptions().
WithFilter(handle.Evictor().Filter).
WithFilter(podutil.WrapFilterFuncs(filters...)).
BuildFilterFunc()
if err != nil {
return nil, fmt.Errorf("error initializing pod filter function: %v", err)
@@ -87,7 +105,7 @@ func NewHighNodeUtilization(
// all we consider the basic resources (cpu, memory, pods).
resourceNames := uniquifyResourceNames(
append(
getResourceNames(args.Thresholds),
resourceThresholds,
v1.ResourceCPU,
v1.ResourceMemory,
v1.ResourcePods,

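The wiring above relies on `podutil.WrapFilterFuncs` AND-ing the evictor's own filter with the new threshold-resource check. A small standalone sketch of that composition, with two made-up filters standing in for the real ones:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
)

func main() {
	// Two illustrative filters: one mimicking the evictor's own checks, one
	// mimicking the strict-mode check for an extended resource request.
	hasOwner := func(pod *v1.Pod) bool { return len(pod.OwnerReferences) > 0 }
	requestsGPU := func(pod *v1.Pod) bool {
		for _, c := range pod.Spec.Containers {
			if _, ok := c.Resources.Requests["example.com/gpu"]; ok {
				return true
			}
		}
		return false
	}

	// WrapFilterFuncs returns a filter that passes only if every wrapped
	// filter passes, which is why strict mode can simply append its check.
	combined := podutil.WrapFilterFuncs(hasOwner, requestsGPU)
	fmt.Println(combined(&v1.Pod{})) // false: neither filter matches an empty pod
}
```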

@@ -48,6 +48,7 @@ func TestHighNodeUtilization(t *testing.T) {
testCases := []struct {
name string
thresholds api.ResourceThresholds
evictionModes []EvictionMode
nodes []*v1.Node
pods []*v1.Pod
expectedPodsEvicted uint
@@ -433,6 +434,53 @@ func TestHighNodeUtilization(t *testing.T) {
},
expectedPodsEvicted: 0,
},
{
name: "with extended resource threshold and no extended resource pods",
thresholds: api.ResourceThresholds{
extendedResource: 40,
},
evictionModes: []EvictionMode{EvictionModeOnlyThresholdingResources},
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 10)
}),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 10)
}),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 10)
}),
},
pods: []*v1.Pod{
// pods on node1 have the extended resource
// request set and they put the node in the
// over utilization range.
test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 3)
}),
test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 3)
}),
// pods on the other nodes must not be evicted
// because they do not have the extended
// resource defined in their requests.
test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
},
expectedPodsEvicted: 0,
},
}
for _, testCase := range testCases {
@@ -474,10 +522,13 @@ func TestHighNodeUtilization(t *testing.T) {
})
}
plugin, err := NewHighNodeUtilization(&HighNodeUtilizationArgs{
Thresholds: testCase.thresholds,
},
handle)
plugin, err := NewHighNodeUtilization(
&HighNodeUtilizationArgs{
Thresholds: testCase.thresholds,
EvictionModes: testCase.evictionModes,
},
handle,
)
if err != nil {
t.Fatalf("Unable to initialize the plugin: %v", err)
}


@@ -32,6 +32,7 @@ import (
"k8s.io/utils/ptr"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
"sigs.k8s.io/descheduler/pkg/descheduler/pod"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer"
frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
@@ -752,3 +753,19 @@ func assessAvailableResourceInNodes(
return available, nil
}
// withResourceRequestForAny returns a filter function that checks if a pod
// has a resource request specified for any of the given resource names.
func withResourceRequestForAny(names ...v1.ResourceName) pod.FilterFunc {
return func(pod *v1.Pod) bool {
all := append(pod.Spec.Containers, pod.Spec.InitContainers...)
for _, name := range names {
for _, container := range all {
if _, ok := container.Resources.Requests[name]; ok {
return true
}
}
}
return false
}
}
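A quick sketch of the filter's behavior; since `withResourceRequestForAny` is unexported it would have to live in the same package, and the test name here is made up:

```go
package nodeutilization

import (
	"testing"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func TestWithResourceRequestForAnySketch(t *testing.T) {
	filter := withResourceRequestForAny("example.com/gpu")

	// A pod whose container requests the extended resource passes the filter.
	requesting := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{"example.com/gpu": resource.MustParse("1")},
		},
	}}}}
	if !filter(requesting) {
		t.Error("expected pod requesting example.com/gpu to pass the filter")
	}

	// A pod without any such request is filtered out (and thus not evicted).
	plain := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{}}}}
	if filter(plain) {
		t.Error("expected pod without the request to be filtered out")
	}
}
```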


@@ -18,6 +18,18 @@ import (
"sigs.k8s.io/descheduler/pkg/api"
)
// EvictionMode describes a mode of eviction. See the list below for the
// available modes.
type EvictionMode string
const (
// EvictionModeOnlyThresholdingResources makes the descheduler evict
// only pods that have a resource request defined for any of the
// user-provided thresholds. If the pod does not request the resource,
// it will not be evicted.
EvictionModeOnlyThresholdingResources EvictionMode = "OnlyThresholdingResources"
)
// +k8s:deepcopy-gen=true
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -48,6 +60,13 @@ type HighNodeUtilizationArgs struct {
Thresholds api.ResourceThresholds `json:"thresholds"`
NumberOfNodes int `json:"numberOfNodes,omitempty"`
// EvictionModes is a set of modes to be taken into account when the
// descheduler evicts pods. For example, the mode
// `OnlyThresholdingResources` can be used to make sure the descheduler
// only evicts pods that have resource requests for the defined
// thresholds.
EvictionModes []EvictionMode `json:"evictionModes,omitempty"`
// Naming this one differently since namespaces are still
// considered while considering resources used by pods
// but then filtered out before eviction


@@ -30,7 +30,25 @@ func ValidateHighNodeUtilizationArgs(obj runtime.Object) error {
if err != nil {
return err
}
// make sure we know about the eviction modes defined by the user.
return validateEvictionModes(args.EvictionModes)
}
// validateEvictionModes checks if the eviction modes are valid/known
// to the descheduler.
func validateEvictionModes(modes []EvictionMode) error {
// we are using this approach to make the code more extensible
// in the future.
validModes := map[EvictionMode]bool{
EvictionModeOnlyThresholdingResources: true,
}
for _, mode := range modes {
if validModes[mode] {
continue
}
return fmt.Errorf("invalid eviction mode %s", mode)
}
return nil
}
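Similarly, a sketch of what the validator accepts and rejects; `validateEvictionModes` is unexported as well, so this would also sit in the plugin package (test name is illustrative):

```go
package nodeutilization

import "testing"

func TestValidateEvictionModesSketch(t *testing.T) {
	// The only known mode today validates cleanly.
	if err := validateEvictionModes([]EvictionMode{EvictionModeOnlyThresholdingResources}); err != nil {
		t.Errorf("expected known mode to validate, got: %v", err)
	}

	// Anything else is rejected with an "invalid eviction mode" error.
	if err := validateEvictionModes([]EvictionMode{"NotARealMode"}); err == nil {
		t.Error("expected unknown mode to be rejected")
	}
}
```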


@@ -37,6 +37,11 @@ func (in *HighNodeUtilizationArgs) DeepCopyInto(out *HighNodeUtilizationArgs) {
(*out)[key] = val
}
}
if in.EvictionModes != nil {
in, out := &in.EvictionModes, &out.EvictionModes
*out = make([]EvictionMode, len(*in))
copy(*out, *in)
}
if in.EvictableNamespaces != nil {
in, out := &in.EvictableNamespaces, &out.EvictableNamespaces
*out = new(api.Namespaces)