1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 13:29:11 +01:00

Compare commits

..

2 Commits

Author SHA1 Message Date
Mike Dame
052d7e7cb0 (TEST): update helm chart 2020-07-22 14:21:36 -04:00
Mike Dame
6c5afc327b (TEST): change helm workflow branches 2020-07-22 14:20:59 -04:00
4178 changed files with 180157 additions and 656243 deletions

View File

@@ -3,6 +3,7 @@ name: Release Charts
on:
push:
branches:
- master
- release-*
jobs:
@@ -11,21 +12,20 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Configure Git
run: |
git config user.name "$GITHUB_ACTOR"
git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
- name: Install Helm
uses: azure/setup-helm@v1
with:
version: v3.4.0
- name: Fetch history
run: git fetch --prune --unshallow
- name: Add dependency chart repos
run: |
helm repo add stable https://kubernetes-charts.storage.googleapis.com/
- name: Run chart-releaser
uses: helm/chart-releaser-action@v1.1.0
uses: helm/chart-releaser-action@v1.0.0-rc.2
env:
CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
CR_RELEASE_NAME_TEMPLATE: "descheduler-helm-chart-{{ .Version }}"

2
.gitignore vendored
View File

@@ -3,5 +3,3 @@ _tmp/
vendordiff.patch
.idea/
*.code-workspace
.vscode/
kind

View File

@@ -11,19 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM golang:1.16.7
FROM golang:1.14.4
WORKDIR /go/src/sigs.k8s.io/descheduler
COPY . .
ARG ARCH
ARG VERSION
RUN VERSION=${VERSION} make build.$ARCH
RUN make
FROM scratch
MAINTAINER Kubernetes SIG Scheduling <kubernetes-sig-scheduling@googlegroups.com>
USER 1000
MAINTAINER Avesh Agarwal <avesh.ncsu@gmail.com>
COPY --from=0 /go/src/sigs.k8s.io/descheduler/_output/bin/descheduler /bin/descheduler

View File

@@ -13,9 +13,7 @@
# limitations under the License.
FROM scratch
MAINTAINER Kubernetes SIG Scheduling <kubernetes-sig-scheduling@googlegroups.com>
USER 1000
MAINTAINER Avesh Agarwal <avesh.ncsu@gmail.com>
COPY _output/bin/descheduler /bin/descheduler

View File

@@ -14,18 +14,16 @@
.PHONY: test
# VERSION is based on a date stamp plus the last commit
VERSION?=v$(shell date +%Y%m%d)-$(shell git describe --tags --match "v*")
BRANCH?=$(shell git branch --show-current)
SHA1?=$(shell git rev-parse HEAD)
# VERSION is currently based on the last commit
VERSION?=$(shell git describe --tags)
COMMIT=$(shell git rev-parse HEAD)
BUILD=$(shell date +%FT%T%z)
LDFLAG_LOCATION=sigs.k8s.io/descheduler/pkg/version
ARCHS = amd64 arm arm64
LDFLAG_LOCATION=sigs.k8s.io/descheduler/cmd/descheduler/app
LDFLAGS=-ldflags "-X ${LDFLAG_LOCATION}.version=${VERSION} -X ${LDFLAG_LOCATION}.buildDate=${BUILD} -X ${LDFLAG_LOCATION}.gitbranch=${BRANCH} -X ${LDFLAG_LOCATION}.gitsha1=${SHA1}"
LDFLAGS=-ldflags "-X ${LDFLAG_LOCATION}.version=${VERSION} -X ${LDFLAG_LOCATION}.buildDate=${BUILD} -X ${LDFLAG_LOCATION}.gitCommit=${COMMIT}"
GOLANGCI_VERSION := v1.30.0
HAS_GOLANGCI := $(shell ls _output/bin/golangci-lint 2> /dev/null)
GOLANGCI_VERSION := v1.15.0
HAS_GOLANGCI := $(shell ls _output/bin/golangci-lint)
# REGISTRY is the container registry to push
# into. The default is to push to the staging
@@ -43,65 +41,30 @@ IMAGE_GCLOUD:=$(REGISTRY)/descheduler:$(VERSION)
# In the future binaries can be uploaded to
# GCS bucket gs://k8s-staging-descheduler.
HAS_HELM := $(shell which helm 2> /dev/null)
HAS_HELM := $(shell which helm)
all: build
build:
CGO_ENABLED=0 go build ${LDFLAGS} -o _output/bin/descheduler sigs.k8s.io/descheduler/cmd/descheduler
build.amd64:
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build ${LDFLAGS} -o _output/bin/descheduler sigs.k8s.io/descheduler/cmd/descheduler
build.arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GOARM=7 go build ${LDFLAGS} -o _output/bin/descheduler sigs.k8s.io/descheduler/cmd/descheduler
build.arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build ${LDFLAGS} -o _output/bin/descheduler sigs.k8s.io/descheduler/cmd/descheduler
dev-image: build
docker build -f Dockerfile.dev -t $(IMAGE) .
image:
docker build --build-arg VERSION="$(VERSION)" --build-arg ARCH="amd64" -t $(IMAGE) .
docker build -t $(IMAGE) .
image.amd64:
docker build --build-arg VERSION="$(VERSION)" --build-arg ARCH="amd64" -t $(IMAGE)-amd64 .
image.arm:
docker build --build-arg VERSION="$(VERSION)" --build-arg ARCH="arm" -t $(IMAGE)-arm .
image.arm64:
docker build --build-arg VERSION="$(VERSION)" --build-arg ARCH="arm64" -t $(IMAGE)-arm64 .
push: image
push-container-to-gcloud: image
gcloud auth configure-docker
docker tag $(IMAGE) $(IMAGE_GCLOUD)
docker push $(IMAGE_GCLOUD)
push-all: image.amd64 image.arm image.arm64
gcloud auth configure-docker
for arch in $(ARCHS); do \
docker tag $(IMAGE)-$${arch} $(IMAGE_GCLOUD)-$${arch} ;\
docker push $(IMAGE_GCLOUD)-$${arch} ;\
done
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create $(IMAGE_GCLOUD) $(addprefix --amend $(IMAGE_GCLOUD)-, $(ARCHS))
for arch in $(ARCHS); do \
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest annotate --arch $${arch} $(IMAGE_GCLOUD) $(IMAGE_GCLOUD)-$${arch} ;\
done
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push $(IMAGE_GCLOUD) ;\
push: push-container-to-gcloud
clean:
rm -rf _output
rm -rf _tmp
verify: verify-govet verify-spelling verify-gofmt verify-vendor lint lint-chart verify-toc verify-gen
verify-govet:
./hack/verify-govet.sh
verify-spelling:
./hack/verify-spelling.sh
verify: verify-gofmt verify-vendor lint lint-chart
verify-gofmt:
./hack/verify-gofmt.sh
@@ -109,9 +72,6 @@ verify-gofmt:
verify-vendor:
./hack/verify-vendor.sh
verify-toc:
./hack/verify-toc.sh
test-unit:
./test/run-unit-tests.sh
@@ -122,12 +82,6 @@ gen:
./hack/update-generated-conversions.sh
./hack/update-generated-deep-copies.sh
./hack/update-generated-defaulters.sh
./hack/update-toc.sh
verify-gen:
./hack/verify-conversions.sh
./hack/verify-deep-copies.sh
./hack/verify-defaulters.sh
lint:
ifndef HAS_GOLANGCI
@@ -135,13 +89,8 @@ ifndef HAS_GOLANGCI
endif
./_output/bin/golangci-lint run
lint-chart: ensure-helm-install
helm lint ./charts/descheduler
test-helm: ensure-helm-install
./test/run-helm-tests.sh
ensure-helm-install:
lint-chart:
ifndef HAS_HELM
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 && chmod 700 ./get_helm.sh && ./get_helm.sh
endif
endif
helm lint ./charts/descheduler

10
OWNERS
View File

@@ -1,7 +1,8 @@
approvers:
- aveshagarwal
- k82cn
- ravisantoshgudimetla
- damemi
- ingvagabund
- seanmalloy
reviewers:
- aveshagarwal
- k82cn
@@ -9,8 +10,3 @@ reviewers:
- damemi
- seanmalloy
- ingvagabund
- lixiang233
emeritus_approvers:
- aveshagarwal
- k82cn
- ravisantoshgudimetla

617
README.md
View File

@@ -1,8 +1,9 @@
[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/descheduler)](https://goreportcard.com/report/sigs.k8s.io/descheduler)
![Release Charts](https://github.com/kubernetes-sigs/descheduler/workflows/Release%20Charts/badge.svg)
[![Go Report Card](https://goreportcard.com/badge/kubernetes-sigs/descheduler)](https://goreportcard.com/report/sigs.k8s.io/descheduler)
# Descheduler for Kubernetes
## Introduction
Scheduling in Kubernetes is the process of binding pending pods to nodes, and is performed by
a component of Kubernetes called kube-scheduler. The scheduler's decisions, whether or where a
pod can or can not be scheduled, are guided by its configurable policy which comprises of set of
@@ -22,45 +23,9 @@ Descheduler, based on its policy, finds pods that can be moved and evicts them.
note, in current implementation, descheduler does not schedule replacement of evicted pods
but relies on the default scheduler for that.
Table of Contents
=================
<!-- toc -->
- [Quick Start](#quick-start)
- [Run As A Job](#run-as-a-job)
- [Run As A CronJob](#run-as-a-cronjob)
- [Run As A Deployment](#run-as-a-deployment)
- [Install Using Helm](#install-using-helm)
- [Install Using Kustomize](#install-using-kustomize)
- [User Guide](#user-guide)
- [Policy and Strategies](#policy-and-strategies)
- [RemoveDuplicates](#removeduplicates)
- [LowNodeUtilization](#lownodeutilization)
- [HighNodeUtilization](#highnodeutilization)
- [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity)
- [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity)
- [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints)
- [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint)
- [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts)
- [PodLifeTime](#podlifetime)
- [RemoveFailedPods](#removefailedpods)
- [Filter Pods](#filter-pods)
- [Namespace filtering](#namespace-filtering)
- [Priority filtering](#priority-filtering)
- [Label filtering](#label-filtering)
- [Node Fit filtering](#node-fit-filtering)
- [Pod Evictions](#pod-evictions)
- [Pod Disruption Budget (PDB)](#pod-disruption-budget-pdb)
- [Metrics](#metrics)
- [Compatibility Matrix](#compatibility-matrix)
- [Getting Involved and Contributing](#getting-involved-and-contributing)
- [Communicating With Contributors](#communicating-with-contributors)
- [Roadmap](#roadmap)
- [Code of conduct](#code-of-conduct)
<!-- /toc -->
## Quick Start
The descheduler can be run as a `Job`, `CronJob`, or `Deployment` inside of a k8s cluster. It has the
The descheduler can be run as a Job or CronJob inside of a k8s cluster. It has the
advantage of being able to be run multiple times without needing user intervention.
The descheduler pod is run as a critical pod in the `kube-system` namespace to avoid
being evicted by itself or by the kubelet.
@@ -68,52 +33,17 @@ being evicted by itself or by the kubelet.
### Run As A Job
```
kubectl create -f kubernetes/base/rbac.yaml
kubectl create -f kubernetes/base/configmap.yaml
kubectl create -f kubernetes/job/job.yaml
kubectl create -f kubernetes/rbac.yaml
kubectl create -f kubernetes/configmap.yaml
kubectl create -f kubernetes/job.yaml
```
### Run As A CronJob
```
kubectl create -f kubernetes/base/rbac.yaml
kubectl create -f kubernetes/base/configmap.yaml
kubectl create -f kubernetes/cronjob/cronjob.yaml
```
### Run As A Deployment
```
kubectl create -f kubernetes/base/rbac.yaml
kubectl create -f kubernetes/base/configmap.yaml
kubectl create -f kubernetes/deployment/deployment.yaml
```
### Install Using Helm
Starting with release v0.18.0 there is an official helm chart that can be used to install the
descheduler. See the [helm chart README](https://github.com/kubernetes-sigs/descheduler/blob/master/charts/descheduler/README.md) for detailed instructions.
The descheduler helm chart is also listed on the [artifact hub](https://artifacthub.io/packages/helm/descheduler/descheduler).
### Install Using Kustomize
You can use kustomize to install descheduler.
See the [resources | Kustomize](https://kubectl.docs.kubernetes.io/references/kustomize/resource/) for detailed instructions.
Run As A Job
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/job?ref=v0.22.1' | kubectl apply -f -
```
Run As A CronJob
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/cronjob?ref=v0.22.1' | kubectl apply -f -
```
Run As A Deployment
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/deployment?ref=v0.22.1' | kubectl apply -f -
kubectl create -f kubernetes/rbac.yaml
kubectl create -f kubernetes/configmap.yaml
kubectl create -f kubernetes/cronjob.yaml
```
## User Guide
@@ -122,65 +52,25 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory.
## Policy and Strategies
Descheduler's policy is configurable and includes strategies that can be enabled or disabled. By default, all strategies are enabled.
The policy includes a common configuration that applies to all the strategies:
| Name | Default Value | Description |
|------|---------------|-------------|
| `nodeSelector` | `nil` | limiting the nodes which are processed |
| `evictLocalStoragePods` | `false` | allows eviction of pods with local storage |
| `evictSystemCriticalPods` | `false` | [Warning: Will evict Kubernetes system pods] allows eviction of pods with any priority, including system pods like kube-dns |
| `ignorePvcPods` | `false` | set whether PVC pods should be evicted or ignored |
| `maxNoOfPodsToEvictPerNode` | `nil` | maximum number of pods evicted from each node (summed through all strategies) |
As part of the policy, the parameters associated with each strategy can be configured.
See each strategy for details on available parameters.
**Policy:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
nodeSelector: prod=dev
evictLocalStoragePods: true
evictSystemCriticalPods: true
maxNoOfPodsToEvictPerNode: 40
ignorePvcPods: false
strategies:
...
```
The following diagram provides a visualization of most of the strategies to help
categorize how strategies fit together.
![Strategies diagram](strategies_diagram.png)
Descheduler's policy is configurable and includes strategies that can be enabled or disabled.
Seven strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`,
`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsHavingTooManyRestarts`, and `PodLifeTime`
are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too.
By default, all strategies are enabled.
### RemoveDuplicates
This strategy makes sure that there is only one pod associated with a ReplicaSet (RS),
ReplicationController (RC), StatefulSet, or Job running on the same node. If there are more,
This strategy makes sure that there is only one pod associated with a Replica Set (RS),
Replication Controller (RC), Deployment, or Job running on the same node. If there are more,
those duplicate pods are evicted for better spreading of pods in a cluster. This issue could happen
if some nodes went down due to whatever reasons, and pods on them were moved to other nodes leading to
more than one pod associated with a RS or RC, for example, running on the same node. Once the failed nodes
are ready again, this strategy could be enabled to evict those duplicate pods.
It provides one optional parameter, `excludeOwnerKinds`, which is a list of OwnerRef `Kind`s. If a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction. Note that
pods created by Deployments are considered for eviction by this strategy. The `excludeOwnerKinds` parameter
should include `ReplicaSet` to have pods created by Deployments excluded.
It provides one optional parameter, `ExcludeOwnerKinds`, which is a list of OwnerRef `Kind`s. If a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction.
**Parameters:**
|Name|Type|
|---|---|
|`excludeOwnerKinds`|list(string)|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -199,37 +89,20 @@ in the hope that recreation of evicted pods will be scheduled on these underutil
parameters of this strategy are configured under `nodeResourceUtilizationThresholds`.
The under utilization of nodes is determined by a configurable threshold `thresholds`. The threshold
`thresholds` can be configured for cpu, memory, number of pods, and extended resources in terms of percentage (the percentage is
calculated as the current resources requested on the node vs [total allocatable](https://kubernetes.io/docs/concepts/architecture/nodes/#capacity).
For pods, this means the number of pods on the node as a fraction of the pod capacity set for that node).
If a node's usage is below threshold for all (cpu, memory, number of pods and extended resources), the node is considered underutilized.
`thresholds` can be configured for cpu, memory, and number of pods in terms of percentage. If a node's
usage is below threshold for all (cpu, memory, and number of pods), the node is considered underutilized.
Currently, pods request resource requirements are considered for computing node resource utilization.
There is another configurable threshold, `targetThresholds`, that is used to compute those potential nodes
from where pods could be evicted. If a node's usage is above targetThreshold for any (cpu, memory, number of pods, or extended resources),
from where pods could be evicted. If a node's usage is above targetThreshold for any (cpu, memory, or number of pods),
the node is considered over utilized. Any node between the thresholds, `thresholds` and `targetThresholds` is
considered appropriately utilized and is not considered for eviction. The threshold, `targetThresholds`,
can be configured for cpu, memory, and number of pods too in terms of percentage.
These thresholds, `thresholds` and `targetThresholds`, could be tuned as per your cluster requirements. Note that this
strategy evicts pods from `overutilized nodes` (those with usage above `targetThresholds`) to `underutilized nodes`
(those with usage below `thresholds`), it will abort if any number of `underutilized nodes` or `overutilized nodes` is zero.
These thresholds, `thresholds` and `targetThresholds`, could be tuned as per your cluster requirements.
Here is an example of a policy for this strategy:
**Parameters:**
|Name|Type|
|---|---|
|`thresholds`|map(string:int)|
|`targetThresholds`|map(string:int)|
|`numberOfNodes`|int|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -248,104 +121,34 @@ strategies:
```
Policy should pass the following validation checks:
* Three basic native types of resources are supported: `cpu`, `memory` and `pods`.
If any of these resource types is not specified, all its thresholds default to 100% to avoid nodes going from underutilized to overutilized.
* Extended resources are supported. For example, resource type `nvidia.com/gpu` is specified for GPU node utilization. Extended resources are optional,
and will not be used to compute node's usage if it's not specified in `thresholds` and `targetThresholds` explicitly.
* Only three types of resources are supported: `cpu`, `memory` and `pods`.
* `thresholds` or `targetThresholds` can not be nil and they must configure exactly the same types of resources.
* The valid range of the resource's percentage value is \[0, 100\]
* Percentage value of `thresholds` can not be greater than `targetThresholds` for the same resource.
If any of the resource types is not specified, all its thresholds default to 100% to avoid nodes going
from underutilized to overutilized.
There is another parameter associated with the `LowNodeUtilization` strategy, called `numberOfNodes`.
This parameter can be configured to activate the strategy only when the number of under utilized nodes
are above the configured value. This could be helpful in large clusters where a few nodes could go
under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero.
### HighNodeUtilization
This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be
scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help
trigger down scaling of under utilized nodes.
This strategy **must** be used with the scheduler strategy `MostRequestedPriority`. The parameters of this strategy are
configured under `nodeResourceUtilizationThresholds`.
The under utilization of nodes is determined by a configurable threshold `thresholds`. The threshold
`thresholds` can be configured for cpu, memory, number of pods, and extended resources in terms of percentage. The percentage is
calculated as the current resources requested on the node vs [total allocatable](https://kubernetes.io/docs/concepts/architecture/nodes/#capacity).
For pods, this means the number of pods on the node as a fraction of the pod capacity set for that node.
If a node's usage is below threshold for all (cpu, memory, number of pods and extended resources), the node is considered underutilized.
Currently, pods request resource requirements are considered for computing node resource utilization.
Any node above `thresholds` is considered appropriately utilized and is not considered for eviction.
The `thresholds` param could be tuned as per your cluster requirements. Note that this
strategy evicts pods from `underutilized nodes` (those with usage below `thresholds`)
so that they can be recreated in appropriately utilized nodes.
The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero.
**Parameters:**
|Name|Type|
|---|---|
|`thresholds`|map(string:int)|
|`numberOfNodes`|int|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"HighNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"cpu" : 20
"memory": 20
"pods": 20
```
Policy should pass the following validation checks:
* Three basic native types of resources are supported: `cpu`, `memory` and `pods`. If any of these resource types is not specified, all its thresholds default to 100%.
* Extended resources are supported. For example, resource type `nvidia.com/gpu` is specified for GPU node utilization. Extended resources are optional, and will not be used to compute node's usage if it's not specified in `thresholds` explicitly.
* `thresholds` can not be nil.
* The valid range of the resource's percentage value is \[0, 100\]
There is another parameter associated with the `HighNodeUtilization` strategy, called `numberOfNodes`.
This parameter can be configured to activate the strategy only when the number of under utilized nodes
is above the configured value. This could be helpful in large clusters where a few nodes could go
under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero.
### RemovePodsViolatingInterPodAntiAffinity
This strategy makes sure that pods violating interpod anti-affinity are removed from nodes. For example,
if there is podA on a node and podB and podC (running on the same node) have anti-affinity rules which prohibit
them to run on the same node, then podA will be evicted from the node so that podB and podC could run. This
issue could happen, when the anti-affinity rules for podB and podC are created when they are already running on
node.
node. Currently, there are no parameters associated with this strategy. To disable this strategy, the
policy should look like:
**Parameters:**
|Name|Type|
|---|---|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingInterPodAntiAffinity":
enabled: true
enabled: false
```
### RemovePodsViolatingNodeAffinity
@@ -366,20 +169,9 @@ of scheduling. Over time nodeA stops to satisfy the rule. When the strategy gets
executed and there is another node available that satisfies the node affinity rule,
podA gets evicted from nodeA.
**Parameters:**
The policy file should look like:
|Name|Type|
|---|---|
|`nodeAffinityType`|list(string)|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -395,21 +187,9 @@ strategies:
This strategy makes sure that pods violating NoSchedule taints on nodes are removed. For example there is a
pod "podA" with a toleration to tolerate a taint ``key=value:NoSchedule`` scheduled and running on the tainted
node. If the node's taint is subsequently updated/removed, taint is no longer satisfied by its pods' tolerations
and will be evicted.
and will be evicted. The policy file should look like:
**Parameters:**
|Name|Type|
|---|---|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
````yaml
````
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -417,63 +197,11 @@ strategies:
enabled: true
````
### RemovePodsViolatingTopologySpreadConstraint
This strategy makes sure that pods violating [topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/)
are evicted from nodes. Specifically, it tries to evict the minimum number of pods required to balance topology domains to within each constraint's `maxSkew`.
This strategy requires k8s version 1.18 at a minimum.
By default, this strategy only deals with hard constraints, setting parameter `includeSoftConstraints` to `true` will
include soft constraints.
Strategy parameter `labelSelector` is not utilized when balancing topology domains and is only applied during eviction to determine if the pod can be evicted.
**Parameters:**
|Name|Type|
|---|---|
|`includeSoftConstraints`|bool|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
params:
includeSoftConstraints: false
```
### RemovePodsHavingTooManyRestarts
This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that
can't get the volume/disk attached to the instance, then the pod should be re-scheduled to other nodes. Its parameters
include `podRestartThreshold`, which is the number of restarts at which a pod should be evicted, and `includingInitContainers`,
which determines whether init container restarts should be factored into that calculation.
|`labelSelector`|(see [label filtering](#label-filtering))|
This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that can't get the volume/disk attached to the instance, then the pod should be re-scheduled to other nodes.
**Parameters:**
|Name|Type|
|---|---|
|`podRestartThreshold`|int|
|`includingInitContainers`|bool|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -487,266 +215,31 @@ strategies:
### PodLifeTime
This strategy evicts pods that are older than `maxPodLifeTimeSeconds`.
This strategy evicts pods that are older than `.strategies.PodLifeTime.params.maxPodLifeTimeSeconds` The policy
file should look like:
You can also specify `podStatusPhases` to `only` evict pods with specific `StatusPhases`, currently this parameter is limited
to `Running` and `Pending`.
**Parameters:**
|Name|Type|
|---|---|
|`maxPodLifeTimeSeconds`|int|
|`podStatusPhases`|list(string)|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
**Example:**
```yaml
````
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
podStatusPhases:
- "Pending"
```
### RemoveFailedPods
This strategy evicts pods that are in failed status phase.
You can provide an optional parameter to filter by failed `reasons`.
`reasons` can be expanded to include reasons of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`.
You can specify an optional parameter `minPodLifeTimeSeconds` to evict pods that are older than specified seconds.
Lastly, you can specify the optional parameter `excludeOwnerKinds` and if a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction.
**Parameters:**
|Name|Type|
|---|---|
|`minPodLifeTimeSeconds`|uint|
|`excludeOwnerKinds`|list(string)|
|`reasons`|list(string)|
|`includingInitContainers`|bool|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemoveFailedPods":
enabled: true
params:
failedPods:
reasons:
- "NodeAffinity"
includingInitContainers: true
excludeOwnerKinds:
- "Job"
minPodLifeTimeSeconds: 3600
```
## Filter Pods
### Namespace filtering
The following strategies accept a `namespaces` parameter which allows to specify a list of including, resp. excluding namespaces:
* `PodLifeTime`
* `RemovePodsHavingTooManyRestarts`
* `RemovePodsViolatingNodeTaints`
* `RemovePodsViolatingNodeAffinity`
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemoveDuplicates`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemoveFailedPods`
For example:
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
namespaces:
include:
- "namespace1"
- "namespace2"
```
In the examples `PodLifeTime` gets executed only over `namespace1` and `namespace2`.
The similar holds for `exclude` field:
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
namespaces:
exclude:
- "namespace1"
- "namespace2"
```
The strategy gets executed over all namespaces but `namespace1` and `namespace2`.
It's not allowed to compute `include` with `exclude` field.
### Priority filtering
All strategies are able to configure a priority threshold, only pods under the threshold can be evicted. You can
specify this threshold by setting `thresholdPriorityClassName`(setting the threshold to the value of the given
priority class) or `thresholdPriority`(directly setting the threshold) parameters. By default, this threshold
is set to the value of `system-cluster-critical` priority class.
Note: Setting `evictSystemCriticalPods` to true disables priority filtering entirely.
E.g.
Setting `thresholdPriority`
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
thresholdPriority: 10000
```
Setting `thresholdPriorityClassName`
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
thresholdPriorityClassName: "priorityclass1"
```
Note that you can't configure both `thresholdPriority` and `thresholdPriorityClassName`, if the given priority class
does not exist, descheduler won't create it and will throw an error.
### Label filtering
The following strategies can configure a [standard kubernetes labelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#labelselector-v1-meta)
to filter pods by their labels:
* `PodLifeTime`
* `RemovePodsHavingTooManyRestarts`
* `RemovePodsViolatingNodeTaints`
* `RemovePodsViolatingNodeAffinity`
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemoveFailedPods`
This allows running strategies among pods the descheduler is interested in.
For example:
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 86400
labelSelector:
matchLabels:
component: redis
matchExpressions:
- {key: tier, operator: In, values: [cache]}
- {key: environment, operator: NotIn, values: [dev]}
```
### Node Fit filtering
The following strategies accept a `nodeFit` boolean parameter which can optimize descheduling:
* `RemoveDuplicates`
* `LowNodeUtilization`
* `HighNodeUtilization`
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemovePodsViolatingNodeAffinity`
* `RemovePodsViolatingNodeTaints`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemovePodsHavingTooManyRestarts`
* `RemoveFailedPods`
If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. If a pod cannot be rescheduled to another node, it will not be evicted. Currently the following criteria are considered when setting `nodeFit` to `true`:
- A `nodeSelector` on the pod
- Any `Tolerations` on the pod and any `Taints` on the other nodes
- `nodeAffinity` on the pod
- Whether any of the other nodes are marked as `unschedulable`
E.g.
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"cpu" : 20
"memory": 20
"pods": 20
targetThresholds:
"cpu" : 50
"memory": 50
"pods": 50
nodeFit: true
```
Note that node fit filtering references the current pod spec, and not that of it's owner.
Thus, if the pod is owned by a ReplicationController (and that ReplicationController was modified recently),
the pod may be running with an outdated spec, which the descheduler will reference when determining node fit.
This is expected behavior as the descheduler is a "best-effort" mechanism.
Using Deployments instead of ReplicationControllers provides an automated rollout of pod spec changes, therefore ensuring that the descheduler has an up-to-date view of the cluster state.
````
## Pod Evictions
When the descheduler decides to evict pods from a node, it employs the following general mechanism:
* [Critical pods](https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/) (with priorityClassName set to system-cluster-critical or system-node-critical) are never evicted (unless `evictSystemCriticalPods: true` is set).
* Pods (static or mirrored pods or stand alone pods) not part of an ReplicationController, ReplicaSet(Deployment), StatefulSet, or Job are
* [Critical pods](https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/) (with priorityClassName set to system-cluster-critical or system-node-critical) are never evicted.
* Pods (static or mirrored pods or stand alone pods) not part of an RC, RS, Deployment or Job are
never evicted because these pods won't be recreated.
* Pods associated with DaemonSets are never evicted.
* Pods with local storage are never evicted (unless `evictLocalStoragePods: true` is set).
* Pods with PVCs are evicted (unless `ignorePvcPods: true` is set).
* Pods with local storage are never evicted.
* In `LowNodeUtilization` and `RemovePodsViolatingInterPodAntiAffinity`, pods are evicted by their priority from low to high, and if they have same priority,
best effort pods are evicted before burstable and guaranteed pods.
* All types of pods with the annotation `descheduler.alpha.kubernetes.io/evict` are eligible for eviction. This
* All types of pods with the annotation descheduler.alpha.kubernetes.io/evict are evicted. This
annotation is used to override checks which prevent eviction and users can select which pod is evicted.
Users should know how and if the pod will be recreated.
@@ -757,16 +250,6 @@ Setting `--v=4` or greater on the Descheduler will log all reasons why any pod i
Pods subject to a Pod Disruption Budget(PDB) are not evicted if descheduling violates its PDB. The pods
are evicted by using the eviction subresource to handle PDB.
## Metrics
| name | type | description |
|-------|-------|----------------|
| build_info | gauge | constant 1 |
| pods_evicted | CounterVec | total number of pods evicted |
The metrics are served through https://localhost:10258/metrics by default.
The address and port can be changed by setting `--binding-address` and `--secure-port` flags.
## Compatibility Matrix
The below compatibility matrix shows the k8s client package(client-go, apimachinery, etc) versions that descheduler
is compiled with. At this time descheduler does not have a hard dependency to a specific k8s release. However a
@@ -778,10 +261,6 @@ packages that it is compiled with.
Descheduler | Supported Kubernetes Version
-------------|-----------------------------
v0.22 | v1.22
v0.21 | v1.21
v0.20 | v1.20
v0.19 | v1.19
v0.18 | v1.18
v0.10 | v1.17
v0.4-v0.9 | v1.9+

View File

@@ -1,7 +1,7 @@
apiVersion: v1
name: descheduler
version: 0.22.1
appVersion: 0.22.1
name: descheduler-helm-chart
version: 0.18.999
appVersion: 0.18.0
description: Descheduler for Kubernetes is used to rebalance clusters by evicting pods that can potentially be scheduled on better nodes. In the current implementation, descheduler does not schedule replacement of evicted pods but relies on the default scheduler for that.
keywords:
- kubernetes
@@ -12,5 +12,5 @@ icon: https://kubernetes.io/images/favicon.png
sources:
- https://github.com/kubernetes-sigs/descheduler
maintainers:
- name: Kubernetes SIG Scheduling
email: kubernetes-sig-scheduling@googlegroups.com
- name: stevehipwell
email: steve.hipwell@github.com

View File

@@ -6,12 +6,12 @@
```shell
helm repo add descheduler https://kubernetes-sigs.github.io/descheduler/
helm install my-release --namespace kube-system descheduler/descheduler
$ helm install descheduler/descheduler --name my-release
```
## Introduction
This chart bootstraps a [descheduler](https://github.com/kubernetes-sigs/descheduler/) cron job on a [Kubernetes](http://kubernetes.io) cluster using the [Helm](https://helm.sh) package manager.
This chart bootstraps a [desheduler](https://github.com/kubernetes-sigs/descheduler/) cron job on a [Kubernetes](http://kubernetes.io) cluster using the [Helm](https://helm.sh) package manager.
## Prerequisites
@@ -22,7 +22,7 @@ This chart bootstraps a [descheduler](https://github.com/kubernetes-sigs/desched
To install the chart with the release name `my-release`:
```shell
helm install --namespace kube-system my-release descheduler/descheduler
helm install --name my-release descheduler/descheduler
```
The command deploys _descheduler_ on the Kubernetes cluster in the default configuration. The [configuration](#configuration) section lists the parameters that can be configured during installation.
@@ -43,28 +43,17 @@ The command removes all the Kubernetes components associated with the chart and
The following table lists the configurable parameters of the _descheduler_ chart and their default values.
| Parameter | Description | Default |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------ |
| `kind` | Use as CronJob or Deployment | `CronJob` |
| `image.repository` | Docker repository to use | `k8s.gcr.io/descheduler/descheduler` |
| `image.tag` | Docker tag to use | `v[chart appVersion]` |
| `image.pullPolicy` | Docker image pull policy | `IfNotPresent` |
| `imagePullSecrets` | Docker repository secrets | `[]` |
| `nameOverride` | String to partially override `descheduler.fullname` template (will prepend the release name) | `""` |
| `fullnameOverride` | String to fully override `descheduler.fullname` template | `""` |
| `cronJobApiVersion` | CronJob API Group Version | `"batch/v1"` |
| `schedule` | The cron schedule to run the _descheduler_ job on | `"*/2 * * * *"` |
| `startingDeadlineSeconds` | If set, configure `startingDeadlineSeconds` for the _descheduler_ job | `nil` |
| `successfulJobsHistoryLimit` | If set, configure `successfulJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `failedJobsHistoryLimit` | If set, configure `failedJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `deschedulingInterval` | If using kind:Deployment, sets time between consecutive descheduler executions. | `5m` |
| `cmdOptions` | The options to pass to the _descheduler_ command | _see values.yaml_ |
| `deschedulerPolicy.strategies` | The _descheduler_ strategies to apply | _see values.yaml_ |
| `priorityClassName` | The name of the priority class to add to pods | `system-cluster-critical` |
| `rbac.create` | If `true`, create & use RBAC resources | `true` |
| `podSecurityPolicy.create` | If `true`, create PodSecurityPolicy | `true` |
| `resources` | Descheduler container CPU and memory requests/limits | _see values.yaml_ |
| `serviceAccount.create` | If `true`, create a service account for the cron job | `true` |
| `serviceAccount.name` | The name of the service account to use, if not set and create is true a name is generated using the fullname template | `nil` |
| `nodeSelector` | Node selectors to run the descheduler cronjob on specific nodes | `nil` |
| `tolerations` | tolerations to run the descheduler cronjob on specific nodes | `nil` |
| Parameter | Description | Default |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ |
| `image.repository` | Docker repository to use | `us.gcr.io/k8s-artifacts-prod/descheduler/descheduler` |
| `image.tag` | Docker tag to use | `v[chart appVersion]` |
| `image.pullPolicy` | Docker image pull policy | `IfNotPresent` |
| `nameOverride` | String to partially override `descheduler.fullname` template (will prepend the release name) | `""` |
| `fullnameOverride` | String to fully override `descheduler.fullname` template | `""` |
| `schedule` | The cron schedule to run the _descheduler_ job on | `"*/2 * * * *"` |
| `cmdOptions` | The options to pass to the _descheduler_ command | _see values.yaml_ |
| `deschedulerPolicy.strategies` | The _descheduler_ strategies to apply | _see values.yaml_ |
| `priorityClassName` | The name of the priority class to add to pods | `system-cluster-critical` |
| `rbac.create` | If `true`, create & use RBAC resources | `true` |
| `serviceAccount.create` | If `true`, create a service account for the cron job | `true` |
| `serviceAccount.name` | The name of the service account to use, if not set and create is true a name is generated using the fullname template | `nil` |

View File

@@ -44,14 +44,6 @@ app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}
{{/*
Selector labels
*/}}
{{- define "descheduler.selectorLabels" -}}
app.kubernetes.io/name: {{ include "descheduler.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
{{/*
Create the name of the service account to use
*/}}

View File

@@ -12,23 +12,10 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: ["scheduling.k8s.io"]
resources: ["priorityclasses"]
verbs: ["get", "watch", "list"]
{{- if .Values.podSecurityPolicy.create }}
- apiGroups: ['policy']
resources: ['podsecuritypolicies']
verbs: ['use']
resourceNames:
- {{ template "descheduler.fullname" . }}
{{- end }}
{{- end -}}

View File

@@ -1,5 +1,4 @@
{{- if eq .Values.kind "CronJob" }}
apiVersion: {{ .Values.cronJobApiVersion | default "batch/v1" }}
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: {{ template "descheduler.fullname" . }}
@@ -8,15 +7,6 @@ metadata:
spec:
schedule: {{ .Values.schedule | quote }}
concurrencyPolicy: "Forbid"
{{- if .Values.startingDeadlineSeconds }}
startingDeadlineSeconds: {{ .Values.startingDeadlineSeconds }}
{{- end }}
{{- if .Values.successfulJobsHistoryLimit }}
successfulJobsHistoryLimit: {{ .Values.successfulJobsHistoryLimit }}
{{- end }}
{{- if .Values.failedJobsHistoryLimit }}
failedJobsHistoryLimit: {{ .Values.failedJobsHistoryLimit }}
{{- end }}
jobTemplate:
spec:
template:
@@ -28,28 +18,17 @@ spec:
{{- .Values.podAnnotations | toYaml | nindent 12 }}
{{- end }}
labels:
{{- include "descheduler.selectorLabels" . | nindent 12 }}
app.kubernetes.io/name: {{ include "descheduler.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- if .Values.podLabels }}
{{- .Values.podLabels | toYaml | nindent 12 }}
{{- end }}
spec:
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
serviceAccountName: {{ template "descheduler.serviceAccountName" . }}
restartPolicy: "Never"
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 10 }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}"
@@ -65,16 +44,6 @@ spec:
- {{ $value | quote }}
{{- end }}
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 16 }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
@@ -82,4 +51,3 @@ spec:
- name: policy-volume
configMap:
name: {{ template "descheduler.fullname" . }}
{{- end }}

View File

@@ -1,75 +0,0 @@
{{- if eq .Values.kind "Deployment" }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ template "descheduler.fullname" . }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
spec:
replicas: 1
selector:
matchLabels:
{{- include "descheduler.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "descheduler.selectorLabels" . | nindent 8 }}
{{- if .Values.podLabels }}
{{- .Values.podLabels | toYaml | nindent 8 }}
{{- end }}
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
{{- if .Values.podAnnotations }}
{{- .Values.podAnnotations | toYaml | nindent 8 }}
{{- end }}
spec:
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
serviceAccountName: {{ template "descheduler.serviceAccountName" . }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- "/bin/descheduler"
args:
- "--policy-config-file"
- "/policy-dir/policy.yaml"
- "--descheduling-interval"
- {{ required "deschedulingInterval required for running as Deployment" .Values.deschedulingInterval }}
{{- range $key, $value := .Values.cmdOptions }}
- {{ printf "--%s" $key | quote }}
{{- if $value }}
- {{ $value | quote }}
{{- end }}
{{- end }}
ports:
- containerPort: 10258
protocol: TCP
resources:
{{- toYaml .Values.resources | nindent 12 }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
volumes:
- name: policy-volume
configMap:
name: {{ template "descheduler.fullname" . }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}

View File

@@ -1,38 +0,0 @@
{{- if .Values.podSecurityPolicy.create -}}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: {{ template "descheduler.fullname" . }}
annotations:
seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'docker/default,runtime/default'
seccomp.security.alpha.kubernetes.io/defaultProfileName: 'runtime/default'
spec:
privileged: false
allowPrivilegeEscalation: false
requiredDropCapabilities:
- ALL
volumes:
- 'configMap'
- 'secret'
hostNetwork: false
hostIPC: false
hostPID: false
runAsUser:
rule: 'MustRunAs'
ranges:
- min: 1
max: 65535
seLinux:
rule: 'RunAsAny'
supplementalGroups:
rule: 'MustRunAs'
ranges:
- min: 1
max: 65535
fsGroup:
rule: 'MustRunAs'
ranges:
- min: 1
max: 65535
readOnlyRootFilesystem: true
{{- end -}}

View File

@@ -1,29 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
name: descheduler-test-pod
annotations:
"helm.sh/hook": test
spec:
restartPolicy: Never
serviceAccountName: descheduler-ci
containers:
- name: descheduler-test-container
image: alpine:latest
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- All
privileged: false
runAsNonRoot: false
command: ["/bin/ash"]
args:
- -c
- >-
apk --no-cache add curl &&
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl &&
chmod +x ./kubectl &&
mv ./kubectl /usr/local/bin/kubectl &&
/usr/local/bin/kubectl get pods --namespace kube-system --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" | grep "descheduler" | grep "Completed"

View File

@@ -2,36 +2,16 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# CronJob or Deployment
kind: CronJob
image:
repository: k8s.gcr.io/descheduler/descheduler
repository: us.gcr.io/k8s-artifacts-prod/descheduler/descheduler
# Overrides the image tag whose default is the chart version
tag: ""
pullPolicy: IfNotPresent
imagePullSecrets: []
resources:
requests:
cpu: 500m
memory: 256Mi
# limits:
# cpu: 100m
# memory: 128Mi
nameOverride: ""
fullnameOverride: ""
cronJobApiVersion: "batch/v1" # Use "batch/v1beta1" for k8s version < 1.21.0. TODO(@7i) remove with 1.23 release
schedule: "*/2 * * * *"
#startingDeadlineSeconds: 200
#successfulJobsHistoryLimit: 1
#failedJobsHistoryLimit: 1
# Required when running as a Deployment
deschedulingInterval: 5m
cmdOptions:
v: 3
@@ -48,8 +28,8 @@ deschedulerPolicy:
RemovePodsViolatingNodeAffinity:
enabled: true
params:
nodeAffinityType:
- requiredDuringSchedulingIgnoredDuringExecution
nodeAffinityType:
- requiredDuringSchedulingIgnoredDuringExecution
RemovePodsViolatingInterPodAntiAffinity:
enabled: true
LowNodeUtilization:
@@ -67,23 +47,10 @@ deschedulerPolicy:
priorityClassName: system-cluster-critical
nodeSelector: {}
# foo: bar
tolerations: []
# - key: 'management'
# operator: 'Equal'
# value: 'tool'
# effect: 'NoSchedule'
rbac:
# Specifies whether RBAC resources should be created
create: true
podSecurityPolicy:
# Specifies whether PodSecurityPolicy should be created.
create: true
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true

View File

@@ -14,7 +14,7 @@ steps:
- VERSION=$_GIT_TAG
- BASE_REF=$_PULL_BASE_REF
args:
- push-all
- push
substitutions:
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
# can be used as a substitution

View File

@@ -18,80 +18,44 @@ limitations under the License.
package options
import (
"github.com/spf13/pflag"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
apiserveroptions "k8s.io/apiserver/pkg/server/options"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/component-base/logs"
// install the componentconfig api so we get its defaulting and conversion functions
"sigs.k8s.io/descheduler/pkg/apis/componentconfig"
"sigs.k8s.io/descheduler/pkg/apis/componentconfig/v1alpha1"
deschedulerscheme "sigs.k8s.io/descheduler/pkg/descheduler/scheme"
)
const (
DefaultDeschedulerPort = 10258
"github.com/spf13/pflag"
)
// DeschedulerServer configuration
type DeschedulerServer struct {
componentconfig.DeschedulerConfiguration
Client clientset.Interface
Logs *logs.Options
SecureServing *apiserveroptions.SecureServingOptionsWithLoopback
DisableMetrics bool
Client clientset.Interface
}
// NewDeschedulerServer creates a new DeschedulerServer with default parameters
func NewDeschedulerServer() (*DeschedulerServer, error) {
cfg, err := newDefaultComponentConfig()
if err != nil {
return nil, err
}
secureServing := apiserveroptions.NewSecureServingOptions().WithLoopback()
secureServing.BindPort = DefaultDeschedulerPort
return &DeschedulerServer{
DeschedulerConfiguration: *cfg,
Logs: logs.NewOptions(),
SecureServing: secureServing,
}, nil
}
// Validation checks for DeschedulerServer.
func (s *DeschedulerServer) Validate() error {
var errs []error
errs = append(errs, s.Logs.Validate()...)
return utilerrors.NewAggregate(errs)
}
func newDefaultComponentConfig() (*componentconfig.DeschedulerConfiguration, error) {
versionedCfg := v1alpha1.DeschedulerConfiguration{}
deschedulerscheme.Scheme.Default(&versionedCfg)
func NewDeschedulerServer() *DeschedulerServer {
versioned := v1alpha1.DeschedulerConfiguration{}
deschedulerscheme.Scheme.Default(&versioned)
cfg := componentconfig.DeschedulerConfiguration{}
if err := deschedulerscheme.Scheme.Convert(&versionedCfg, &cfg, nil); err != nil {
return nil, err
deschedulerscheme.Scheme.Convert(versioned, &cfg, nil)
s := DeschedulerServer{
DeschedulerConfiguration: cfg,
}
return &cfg, nil
return &s
}
// AddFlags adds flags for a specific SchedulerServer to the specified FlagSet
func (rs *DeschedulerServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&rs.Logging.Format, "logging-format", "text", `Sets the log format. Permitted formats: "text", "json". Non-default formats don't honor these flags: --add-dir-header, --alsologtostderr, --log-backtrace-at, --log-dir, --log-file, --log-file-max-size, --logtostderr, --skip-headers, --skip-log-headers, --stderrthreshold, --log-flush-frequency.\nNon-default choices are currently alpha and subject to change without warning.`)
fs.DurationVar(&rs.DeschedulingInterval, "descheduling-interval", rs.DeschedulingInterval, "Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.")
fs.StringVar(&rs.KubeconfigFile, "kubeconfig", rs.KubeconfigFile, "File with kube configuration.")
fs.StringVar(&rs.PolicyConfigFile, "policy-config-file", rs.PolicyConfigFile, "File with descheduler policy configuration.")
fs.BoolVar(&rs.DryRun, "dry-run", rs.DryRun, "execute descheduler in dry run mode.")
// node-selector query causes descheduler to run only on nodes that matches the node labels in the query
fs.StringVar(&rs.NodeSelector, "node-selector", rs.NodeSelector, "DEPRECATED: selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)")
fs.StringVar(&rs.NodeSelector, "node-selector", rs.NodeSelector, "Selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)")
// max-no-pods-to-evict limits the maximum number of pods to be evicted per node by descheduler.
fs.IntVar(&rs.MaxNoOfPodsToEvictPerNode, "max-pods-to-evict-per-node", rs.MaxNoOfPodsToEvictPerNode, "DEPRECATED: limits the maximum number of pods to be evicted per node by descheduler")
fs.IntVar(&rs.MaxNoOfPodsToEvictPerNode, "max-pods-to-evict-per-node", rs.MaxNoOfPodsToEvictPerNode, "Limits the maximum number of pods to be evicted per node by descheduler")
// evict-local-storage-pods allows eviction of pods that are using local storage. This is false by default.
fs.BoolVar(&rs.EvictLocalStoragePods, "evict-local-storage-pods", rs.EvictLocalStoragePods, "DEPRECATED: enables evicting pods using local storage by descheduler")
fs.BoolVar(&rs.DisableMetrics, "disable-metrics", rs.DisableMetrics, "Disables metrics. The metrics are by default served through https://localhost:10258/metrics. Secure address, resp. port can be changed through --bind-address, resp. --secure-port flags.")
rs.SecureServing.AddFlags(fs)
fs.BoolVar(&rs.EvictLocalStoragePods, "evict-local-storage-pods", rs.EvictLocalStoragePods, "Enables evicting pods using local storage by descheduler")
}

View File

@@ -18,7 +18,6 @@ limitations under the License.
package app
import (
"context"
"flag"
"io"
@@ -27,61 +26,29 @@ import (
"github.com/spf13/cobra"
apiserver "k8s.io/apiserver/pkg/server"
"k8s.io/apiserver/pkg/server/mux"
restclient "k8s.io/client-go/rest"
aflag "k8s.io/component-base/cli/flag"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/component-base/logs"
"k8s.io/klog/v2"
)
// NewDeschedulerCommand creates a *cobra.Command object with default parameters
func NewDeschedulerCommand(out io.Writer) *cobra.Command {
s, err := options.NewDeschedulerServer()
if err != nil {
klog.ErrorS(err, "unable to initialize server")
}
s := options.NewDeschedulerServer()
cmd := &cobra.Command{
Use: "descheduler",
Short: "descheduler",
Long: `The descheduler evicts pods which may be bound to less desired nodes`,
Run: func(cmd *cobra.Command, args []string) {
s.Logs.Config.Format = s.Logging.Format
s.Logs.Apply()
// LoopbackClientConfig is a config for a privileged loopback connection
var LoopbackClientConfig *restclient.Config
var SecureServing *apiserver.SecureServingInfo
if err := s.SecureServing.ApplyTo(&SecureServing, &LoopbackClientConfig); err != nil {
klog.ErrorS(err, "failed to apply secure server configuration")
return
}
if err := s.Validate(); err != nil {
klog.ErrorS(err, "failed to validate server configuration")
return
}
if !s.DisableMetrics {
ctx := context.TODO()
pathRecorderMux := mux.NewPathRecorderMux("descheduler")
pathRecorderMux.Handle("/metrics", legacyregistry.HandlerWithReset())
if _, err := SecureServing.Serve(pathRecorderMux, 0, ctx.Done()); err != nil {
klog.Fatalf("failed to start secure server: %v", err)
return
}
}
logs.InitLogs()
defer logs.FlushLogs()
err := Run(s)
if err != nil {
klog.ErrorS(err, "descheduler server")
klog.Errorf("%v", err)
}
},
}
cmd.SetOut(out)
cmd.SetOutput(out)
flags := cmd.Flags()
flags.SetNormalizeFunc(aflag.WordSepNormalizeFunc)
flags.AddGoFlagSet(flag.CommandLine)

View File

@@ -18,19 +18,70 @@ package app
import (
"fmt"
"runtime"
"strings"
"github.com/spf13/cobra"
"sigs.k8s.io/descheduler/pkg/version"
)
var (
// gitCommit is a constant representing the source version that
// generated this build. It should be set during build via -ldflags.
gitCommit string
// version is a constant representing the version tag that
// generated this build. It should be set during build via -ldflags.
version string
// buildDate in ISO8601 format, output of $(date -u +'%Y-%m-%dT%H:%M:%SZ')
//It should be set during build via -ldflags.
buildDate string
)
// Info holds the information related to descheduler app version.
type Info struct {
Major string `json:"major"`
Minor string `json:"minor"`
GitCommit string `json:"gitCommit"`
GitVersion string `json:"gitVersion"`
BuildDate string `json:"buildDate"`
GoVersion string `json:"goVersion"`
Compiler string `json:"compiler"`
Platform string `json:"platform"`
}
// Get returns the overall codebase version. It's for detecting
// what code a binary was built from.
func Get() Info {
majorVersion, minorVersion := splitVersion(version)
return Info{
Major: majorVersion,
Minor: minorVersion,
GitCommit: gitCommit,
GitVersion: version,
BuildDate: buildDate,
GoVersion: runtime.Version(),
Compiler: runtime.Compiler,
Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH),
}
}
func NewVersionCommand() *cobra.Command {
var versionCmd = &cobra.Command{
Use: "version",
Short: "Version of descheduler",
Long: `Prints the version of descheduler.`,
Run: func(cmd *cobra.Command, args []string) {
fmt.Printf("Descheduler version %+v\n", version.Get())
fmt.Printf("Descheduler version %+v\n", Get())
},
}
return versionCmd
}
// splitVersion splits the git version to generate major and minor versions needed.
func splitVersion(version string) (string, string) {
if version == "" {
return "", ""
}
// A sample version would be of form v0.1.0-7-ge884046, so split at first '.' and
// then return 0 and 1+(+ appended to follow semver convention) for major and minor versions.
return strings.Trim(strings.Split(version, ".")[0], "v"), strings.Split(version, ".")[1] + "+"
}

View File

@@ -17,9 +17,10 @@ limitations under the License.
package main
import (
"flag"
"fmt"
"k8s.io/component-base/logs"
"os"
"sigs.k8s.io/descheduler/cmd/descheduler/app"
)
@@ -27,10 +28,7 @@ func main() {
out := os.Stdout
cmd := app.NewDeschedulerCommand(out)
cmd.AddCommand(app.NewVersionCommand())
logs.InitLogs()
defer logs.FlushLogs()
flag.CommandLine.Parse([]string{})
if err := cmd.Execute(); err != nil {
fmt.Println(err)
os.Exit(1)

View File

@@ -3,10 +3,10 @@
## Required Tools
- [Git](https://git-scm.com/downloads)
- [Go 1.16+](https://golang.org/dl/)
- [Go 1.14+](https://golang.org/dl/)
- [Docker](https://docs.docker.com/install/)
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl)
- [kind v0.10.0+](https://kind.sigs.k8s.io/)
- [kind](https://kind.sigs.k8s.io/)
## Build and Run
@@ -34,23 +34,9 @@ GOOS=linux make dev-image
kind create cluster --config hack/kind_config.yaml
kind load docker-image <image name>
kind get kubeconfig > /tmp/admin.conf
export KUBECONFIG=/tmp/admin.conf
make test-unit
make test-e2e
```
## Run Helm Tests
Run the helm test for a particular descheduler release by setting below variables,
```
HELM_IMAGE_REPO="descheduler"
HELM_IMAGE_TAG="helm-test"
HELM_CHART_LOCATION="./charts/descheduler"
```
The helm tests runs as part of descheduler CI. But, to run it manually from the descheduler root,
```
make test-helm
```
### Miscellaneous
See the [hack directory](https://github.com/kubernetes-sigs/descheduler/tree/master/hack) for additional tools and scripts used for developing the descheduler.
See the [hack directory](https://github.com/kubernetes-sigs/descheduler/tree/master/hack) for additional tools and scripts used for developing the descheduler.

View File

@@ -7,9 +7,9 @@
1. Make sure your repo is clean by git's standards
2. Create a release branch `git checkout -b release-1.18` (not required for patch releases)
3. Push the release branch to the descheuler repo and ensure branch protection is enabled (not required for patch releases)
4. Tag the repository from the `master` branch (from the `release-1.18` branch for a patch release) and push the tag `VERSION=v0.18.0 git tag -m $VERSION $VERSION; git push origin $VERSION`
4. Tag the repository and push the tag `VERSION=v0.18.0 git tag -m $VERSION $VERSION; git push origin $VERSION`
5. Publish a draft release using the tag you just created
6. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/main/k8s.gcr.io#image-promoter)
6. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/master/k8s.gcr.io#image-promoter)
7. Publish release
8. Email `kubernetes-sig-scheduling@googlegroups.com` to announce the release
@@ -18,20 +18,15 @@
1. Make sure your repo is clean by git's standards
2. Create a release branch `git checkout -b release-1.18` (not required for patch releases)
3. Push the release branch to the descheuler repo and ensure branch protection is enabled (not required for patch releases)
4. Tag the repository from the `master` branch (from the `release-1.18` branch for a patch release) and push the tag `VERSION=v0.18.0 git tag -m $VERSION $VERSION; git push origin $VERSION`
4. Tag the repository and push the tag `VERSION=v0.18.0 git tag -m $VERSION $VERSION; git push origin $VERSION`
5. Checkout the tag you just created and make sure your repo is clean by git's standards `git checkout $VERSION`
6. Build and push the container image to the staging registry `VERSION=$VERSION make push-all`
6. Build and push the container image to the staging registry `VERSION=$VERSION make push`
7. Publish a draft release using the tag you just created
8. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/main/k8s.gcr.io#image-promoter)
8. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/master/k8s.gcr.io#image-promoter)
9. Publish release
10. Email `kubernetes-sig-scheduling@googlegroups.com` to announce the release
### Notes
It's important to create the tag on the master branch after creating the release-* branch so that the [Helm releaser-action](#helm-chart) can work.
It compares the changes in the action-triggering branch to the latest tag on that branch, so if you tag before creating the new branch there
will be nothing to compare and it will fail (creating a new release branch usually involves no code changes). For this same reason, you should
also tag patch releases (on the release-* branch) *after* pushing changes (if those changes involve bumping the Helm chart version).
See [post-descheduler-push-images dashboard](https://testgrid.k8s.io/sig-scheduling#post-descheduler-push-images) for staging registry image build job status.
View the descheduler staging registry using [this URL](https://console.cloud.google.com/gcr/images/k8s-staging-descheduler/GLOBAL/descheduler) in a web browser
@@ -58,17 +53,17 @@ docker pull gcr.io/k8s-staging-descheduler/descheduler:v20200206-0.9.0-94-ge2a23
```
## Helm Chart
Helm chart releases are managed by a separate set of git tags that are prefixed with `descheduler-helm-chart-*`. Example git tag name is `descheduler-helm-chart-0.18.0`.
Released versions of the helm charts are stored in the `gh-pages` branch of this repo. The [chart-releaser-action GitHub Action](https://github.com/helm/chart-releaser-action)
is setup to build and push the helm charts to the `gh-pages` branch when changes are pushed to a `release-*` branch.
Helm chart releases are managed by a separate set of git tags that are prefixed with `chart-*`. Example git tag name is `chart-0.18.0`. Released versions of the
helm charts are stored in the `gh-pages` branch of this repo. The [chart-releaser-action GitHub Action](https://github.com/helm/chart-releaser-action) is setup to
build and push the helm charts to the `gh-pages` branch when a `chart-*` git tag is created.
The major and minor version of the chart matches the descheduler major and minor versions. For example descheduler helm chart version helm-descheduler-chart-0.18.0 corresponds
The major and minor version of the chart matches the descheduler major and minor versions. For example descheduler helm chart version chart-0.18.0 corresponds
to descheduler version v0.18.0. The patch version of the descheduler helm chart and the patcher version of the descheduler will not necessarily match. The patch
version of the descheduler helm chart is used to version changes specific to the helm chart.
1. Merge all helm chart changes into the master branch before the release is tagged/cut
1. Merge all helm chart changes into the appropriate release branch(i.e. release-1.18)
1. Ensure that `appVersion` in file `charts/descheduler/Chart.yaml` matches the descheduler version(no `v` prefix)
2. Ensure that `version` in file `charts/descheduler/Chart.yaml` has been incremented. This is the chart version.
2. Make sure your repo is clean by git's standards
3. Follow the release-branch or patch release tagging pattern from the above section.
4. Verify the new helm artifact has been successfully pushed to the `gh-pages` branch
3. Create the tag and push it `git checkout release-1.18; CHART_VERSION=chart-0.18.0; git tag $CHART_VERSION; git push origin $CHART_VERSION`
4. Verify the new helm artifact has been successfully pushed to the `gh-pages` branch

View File

@@ -1,25 +1,9 @@
# User Guide
Starting with descheduler release v0.10.0 container images are available in the official k8s container registry.
Descheduler Version | Container Image | Architectures |
------------------- |-----------------------------------------------------|-------------------------|
v0.22.1 | k8s.gcr.io/descheduler/descheduler:v0.22.1 | AMD64<br>ARM64<br>ARMv7 |
v0.22.0 | k8s.gcr.io/descheduler/descheduler:v0.22.0 | AMD64<br>ARM64<br>ARMv7 |
v0.21.0 | k8s.gcr.io/descheduler/descheduler:v0.21.0 | AMD64<br>ARM64<br>ARMv7 |
v0.20.0 | k8s.gcr.io/descheduler/descheduler:v0.20.0 | AMD64<br>ARM64 |
v0.19.0 | k8s.gcr.io/descheduler/descheduler:v0.19.0 | AMD64 |
v0.18.0 | k8s.gcr.io/descheduler/descheduler:v0.18.0 | AMD64 |
v0.10.0 | k8s.gcr.io/descheduler/descheduler:v0.10.0 | AMD64 |
Note that multi-arch container images cannot be pulled by [kind](https://kind.sigs.k8s.io) from a registry. Therefore
starting with descheduler release v0.20.0 use the below process to download the official descheduler
image into a kind cluster.
```
kind create cluster
docker pull k8s.gcr.io/descheduler/descheduler:v0.20.0
kind load docker-image k8s.gcr.io/descheduler/descheduler:v0.20.0
```
Starting with descheduler release v0.10.0 container images are available in these container registries.
* `asia.gcr.io/k8s-artifacts-prod/descheduler/descheduler`
* `eu.gcr.io/k8s-artifacts-prod/descheduler/descheduler`
* `us.gcr.io/k8s-artifacts-prod/descheduler/descheduler`
## Policy Configuration Examples
The [examples](https://github.com/kubernetes-sigs/descheduler/tree/master/examples) directory has descheduler policy configuration examples.
@@ -39,11 +23,11 @@ Available Commands:
version Version of descheduler
Flags:
--add-dir-header If true, adds the file directory to the header of the log messages
--add-dir-header If true, adds the file directory to the header
--alsologtostderr log to standard error as well as files
--descheduling-interval duration Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.
--dry-run execute descheduler in dry run mode.
--evict-local-storage-pods DEPRECATED: enables evicting pods using local storage by descheduler
--evict-local-storage-pods Enables evicting pods using local storage by descheduler
-h, --help help for descheduler
--kubeconfig string File with kube configuration.
--log-backtrace-at traceLocation when logging hits line file:N, emit a stack trace (default :0)
@@ -52,8 +36,8 @@ Flags:
--log-file-max-size uint Defines the maximum size a log file can grow to. Unit is megabytes. If the value is 0, the maximum file size is unlimited. (default 1800)
--log-flush-frequency duration Maximum number of seconds between log flushes (default 5s)
--logtostderr log to standard error instead of files (default true)
--max-pods-to-evict-per-node int DEPRECATED: limits the maximum number of pods to be evicted per node by descheduler
--node-selector string DEPRECATED: selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)
--max-pods-to-evict-per-node int Limits the maximum number of pods to be evicted per node by descheduler
--node-selector string Selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)
--policy-config-file string File with descheduler policy configuration.
--skip-headers If true, avoid header prefixes in the log messages
--skip-log-headers If true, avoid headers when opening log files
@@ -87,57 +71,20 @@ This policy configuration file ensures that pods created more than 7 days ago ar
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: false
"RemoveDuplicates":
enabled: false
"RemovePodsViolatingInterPodAntiAffinity":
enabled: false
"RemovePodsViolatingNodeAffinity":
enabled: false
"RemovePodsViolatingNodeTaints":
enabled: false
"RemovePodsHavingTooManyRestarts":
enabled: false
"PodLifeTime":
enabled: true
params:
maxPodLifeTimeSeconds: 604800 # pods run for a maximum of 7 days
```
### Balance Cluster By Node Memory Utilization
If your cluster has been running for a long period of time, you may find that the resource utilization is not very
balanced. The following two strategies can be used to rebalance your cluster based on `cpu`, `memory`
or `number of pods`.
#### Balance high utilization nodes
Using `LowNodeUtilization`, descheduler will rebalance the cluster based on memory by evicting pods
from nodes with memory utilization over 70% to nodes with memory utilization below 20%.
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"memory": 20
targetThresholds:
"memory": 70
```
#### Balance low utilization nodes
Using `HighNodeUtilization`, descheduler will rebalance the cluster based on memory by evicting pods
from nodes with memory utilization lower than 20%. This should be used along with scheduler strategy `MostRequestedPriority`.
The evicted pods will be compacted into minimal set of nodes.
```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"HighNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"memory": 20
```
### Autoheal Node Problems
Descheduler's `RemovePodsViolatingNodeTaints` strategy can be combined with
[Node Problem Detector](https://github.com/kubernetes/node-problem-detector/) and
[Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) to automatically remove
Nodes which have problems. Node Problem Detector can detect specific Node problems and taint any Nodes which have those
problems. The Descheduler will then deschedule workloads from those Nodes. Finally, if the descheduled Node's resource
allocation falls below the Cluster Autoscaler's scale down threshold, the Node will become a scale down candidate
and can be removed by Cluster Autoscaler. These three components form an autohealing cycle for Node problems.

View File

@@ -1,14 +0,0 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemoveFailedPods":
enabled: true
params:
failedPods:
reasons:
- "OutOfcpu"
- "CreateContainerConfigError"
includingInitContainers: true
excludeOwnerKinds:
- "Job"
minPodLifeTimeSeconds: 3600 # 1 hour

View File

@@ -1,9 +0,0 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"HighNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"memory": 20

View File

@@ -1,11 +0,0 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"memory": 20
targetThresholds:
"memory": 70

View File

@@ -1,8 +1,20 @@
---
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: false
"RemoveDuplicates":
enabled: false
"RemovePodsViolatingInterPodAntiAffinity":
enabled: false
"RemovePodsViolatingNodeAffinity":
enabled: false
"RemovePodsViolatingNodeTaints":
enabled: false
"RemovePodsHavingTooManyRestarts":
enabled: false
"PodLifeTime":
enabled: true
params:
podLifeTime:
maxPodLifeTimeSeconds: 604800 # 7 days
maxPodLifeTimeSeconds: 604800 # 7 days

View File

@@ -23,7 +23,3 @@ strategies:
podsHavingTooManyRestarts:
podRestartThreshold: 100
includingInitContainers: true
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
params:
includeSoftConstraints: true

View File

@@ -1,7 +0,0 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
params:
includeSoftConstraints: true # Include 'ScheduleAnyways' constraints

22
go.mod
View File

@@ -1,19 +1,15 @@
module sigs.k8s.io/descheduler
go 1.16
go 1.14
require (
github.com/client9/misspell v0.3.4
github.com/spf13/cobra v1.1.3
github.com/spf13/cobra v0.0.5
github.com/spf13/pflag v1.0.5
k8s.io/api v0.22.0
k8s.io/apimachinery v0.22.0
k8s.io/apiserver v0.22.0
k8s.io/client-go v0.22.0
k8s.io/code-generator v0.22.0
k8s.io/component-base v0.22.0
k8s.io/component-helpers v0.22.0
k8s.io/klog/v2 v2.9.0
k8s.io/kubectl v0.20.5
sigs.k8s.io/mdtoc v1.0.1
k8s.io/api v0.18.4
k8s.io/apimachinery v0.18.4
k8s.io/apiserver v0.18.4
k8s.io/client-go v0.18.4
k8s.io/code-generator v0.18.4
k8s.io/component-base v0.18.4
k8s.io/klog/v2 v2.0.0
)

795
go.sum

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +0,0 @@
BUILD
CHANGELOG
OWNERS
go.mod
go.sum
vendor/

View File

@@ -18,15 +18,15 @@ E2E_GCE_HOME=$DESCHEDULER_ROOT/hack/e2e-gce
create_cluster() {
echo "#################### Creating instances ##########################"
gcloud compute instances create descheduler-$master_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
gcloud compute instances create descheduler-$master_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
# Keeping the --zone here so as to make sure that e2e's can run locally.
echo "gcloud compute instances delete descheduler-$master_uuid --zone=us-east1-b --quiet" > $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node1_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
gcloud compute instances create descheduler-$node1_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node1_uuid --zone=us-east1-b --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node2_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node2_uuid --zone=us-east1-c --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node2_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node2_uuid --zone=us-east1-b --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
# Delete the firewall port created for master.
echo "gcloud compute firewall-rules delete kubeapiserver-$master_uuid --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
@@ -44,10 +44,10 @@ generate_kubeadm_instance_files() {
transfer_install_files() {
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_install.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node2_uuid:/tmp --zone=us-east1-c
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node2_uuid:/tmp --zone=us-east1-b
}
@@ -55,19 +55,19 @@ install_kube() {
# Docker installation.
gcloud compute ssh descheduler-$master_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
gcloud compute ssh descheduler-$node1_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-c
gcloud compute ssh descheduler-$node2_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
# kubeadm installation.
# 1. Transfer files to master, nodes.
transfer_install_files
# 2. Install kubeadm.
#TODO: Add rm /tmp/kubeadm_install.sh
# Open port for kube API server
gcloud compute firewall-rules create kubeapiserver-$master_uuid --allow tcp:6443 --source-tags=descheduler-$master_uuid --source-ranges=0.0.0.0/0 --description="Opening api server port"
gcloud compute firewall-rules create kubeapiserver-$master_uuid --allow tcp:6443 --source-tags=descheduler-$master_uuid --source-ranges=0.0.0.0/0 --description="Opening api server port"
gcloud compute ssh descheduler-$master_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
kubeadm_join_command=$(gcloud compute ssh descheduler-$master_uuid --command "sudo chmod 755 /tmp/kubeadm_install.sh; sudo /tmp/kubeadm_install.sh" --zone=us-east1-b|grep 'kubeadm join')
# Copy the kubeconfig file onto /tmp for e2e tests.
# Copy the kubeconfig file onto /tmp for e2e tests.
gcloud compute ssh descheduler-$master_uuid --command "sudo cp /etc/kubernetes/admin.conf /tmp; sudo chmod 777 /tmp/admin.conf" --zone=us-east1-b
gcloud compute scp descheduler-$master_uuid:/tmp/admin.conf /tmp/admin.conf --zone=us-east1-b
@@ -75,15 +75,16 @@ install_kube() {
gcloud compute ssh descheduler-$master_uuid --command "sudo kubectl apply -f https://raw.githubusercontent.com/cloudnativelabs/kube-router/master/daemonset/kubeadm-kuberouter.yaml --kubeconfig /etc/kubernetes/admin.conf" --zone=us-east1-b
echo $kubeadm_join_command > $E2E_GCE_HOME/kubeadm_join.sh
# Copy kubeadm_join to every node.
# Copy kubeadm_join to every node.
#TODO: Put these in a loop, so that extension becomes possible.
gcloud compute ssh descheduler-$node1_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute ssh descheduler-$node1_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-c
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node2_uuid:/tmp --zone=us-east1-c
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-c
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node2_uuid:/tmp --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-b
}

View File

@@ -1,18 +1,6 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
apiVersion: kind.sigs.k8s.io/v1alpha3
nodes:
- role: control-plane
- role: worker
kubeadmConfigPatches:
- |
kind: JoinConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "topology.kubernetes.io/zone=local-a"
- role: worker
kubeadmConfigPatches:
- |
kind: JoinConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "topology.kubernetes.io/zone=local-b"

View File

@@ -19,8 +19,4 @@ limitations under the License.
// This package imports things required by build scripts, to force `go mod` to see them as dependencies
package tools
import (
_ "github.com/client9/misspell/cmd/misspell"
_ "k8s.io/code-generator"
_ "sigs.k8s.io/mdtoc"
)
import _ "k8s.io/code-generator"

View File

@@ -23,7 +23,7 @@ DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
GO_VERSION=($(go version))
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16') ]]; then
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.2|go1.3|go1.4|go1.5|go1.6|go1.7|go1.8|go1.9|go1.10|go1.11|go1.12|go1.13|go1.14') ]]; then
echo "Unknown go version '${GO_VERSION[2]}', skipping gofmt."
exit 1
fi

View File

@@ -1,25 +0,0 @@
#!/bin/bash
# Copyright 2021 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
go build -o "${OS_OUTPUT_BINPATH}/mdtoc" "sigs.k8s.io/mdtoc"
${OS_OUTPUT_BINPATH}/mdtoc --inplace README.md

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
mkdir -p "${DESCHEDULER_ROOT}/_tmp"
_tmpdir="$(mktemp -d "${DESCHEDULER_ROOT}/_tmp/kube-verify.XXXXXX")"
_deschedulertmp="${_tmpdir}"
mkdir -p "${_deschedulertmp}"
git archive --format=tar --prefix=descheduler/ "$(git write-tree)" | (cd "${_deschedulertmp}" && tar xf -)
_deschedulertmp="${_deschedulertmp}/descheduler"
pushd "${_deschedulertmp}" > /dev/null 2>&1
go build -o "${OS_OUTPUT_BINPATH}/conversion-gen" "k8s.io/code-generator/cmd/conversion-gen"
${OS_OUTPUT_BINPATH}/conversion-gen \
--go-header-file "hack/boilerplate/boilerplate.go.txt" \
--input-dirs "./pkg/apis/componentconfig/v1alpha1,./pkg/api/v1alpha1" \
--output-file-base zz_generated.conversion
popd > /dev/null 2>&1
pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated output differs:" >&2
echo "${_out}" >&2
echo "Generated conversions verify failed. Please run ./hack/update-generated-conversions.sh"
exit 1
fi
popd > /dev/null 2>&1
echo "Generated conversions verified."

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
mkdir -p "${DESCHEDULER_ROOT}/_tmp"
_tmpdir="$(mktemp -d "${DESCHEDULER_ROOT}/_tmp/kube-verify.XXXXXX")"
_deschedulertmp="${_tmpdir}"
mkdir -p "${_deschedulertmp}"
git archive --format=tar --prefix=descheduler/ "$(git write-tree)" | (cd "${_deschedulertmp}" && tar xf -)
_deschedulertmp="${_deschedulertmp}/descheduler"
pushd "${_deschedulertmp}" > /dev/null 2>&1
go build -o "${OS_OUTPUT_BINPATH}/deepcopy-gen" "k8s.io/code-generator/cmd/deepcopy-gen"
${OS_OUTPUT_BINPATH}/deepcopy-gen \
--go-header-file "hack/boilerplate/boilerplate.go.txt" \
--input-dirs "./pkg/apis/componentconfig,./pkg/apis/componentconfig/v1alpha1,./pkg/api,./pkg/api/v1alpha1" \
--output-file-base zz_generated.deepcopy
popd > /dev/null 2>&1
pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated deep-copies output differs:" >&2
echo "${_out}" >&2
echo "Generated deep-copies verify failed. Please run ./hack/update-generated-deep-copies.sh"
exit 1
fi
popd > /dev/null 2>&1
echo "Generated deep-copies verified."

View File

@@ -1,35 +0,0 @@
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
_tmpdir="$(mktemp -d "${DESCHEDULER_ROOT}/_tmp/kube-verify.XXXXXX")"
_deschedulertmp="${_tmpdir}"
mkdir -p "${_deschedulertmp}"
git archive --format=tar --prefix=descheduler/ "$(git write-tree)" | (cd "${_deschedulertmp}" && tar xf -)
_deschedulertmp="${_deschedulertmp}/descheduler"
pushd "${_deschedulertmp}" > /dev/null 2>&1
go build -o "${OS_OUTPUT_BINPATH}/defaulter-gen" "k8s.io/code-generator/cmd/defaulter-gen"
${OS_OUTPUT_BINPATH}/defaulter-gen \
--go-header-file "hack/boilerplate/boilerplate.go.txt" \
--input-dirs "${PRJ_PREFIX}/pkg/apis/componentconfig/v1alpha1,${PRJ_PREFIX}/pkg/api/v1alpha1" \
--extra-peer-dirs "${PRJ_PREFIX}/pkg/apis/componentconfig/v1alpha1,${PRJ_PREFIX}/pkg/api/v1alpha1" \
--output-file-base zz_generated.defaults
popd > /dev/null 2>&1
pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated defaulters output differs:" >&2
echo "${_out}" >&2
echo "Generated defaulters verify failed. Please run ./hack/update-generated-defaulters.sh"
fi
popd > /dev/null 2>&1
echo "Generated Defaulters verified."

View File

@@ -23,7 +23,7 @@ DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
GO_VERSION=($(go version))
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16') ]]; then
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.2|go1.3|go1.4|go1.5|go1.6|go1.7|go1.8|go1.9|go1.10|go1.11|go1.12|go1.13|go1.14') ]]; then
echo "Unknown go version '${GO_VERSION[2]}', skipping gofmt."
exit 1
fi

View File

@@ -1,19 +0,0 @@
#!/bin/bash
# Copyright 2021 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
go vet ${OS_ROOT}/...

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env bash
# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script checks commonly misspelled English words in all files in the
# working directory by client9/misspell package.
# Usage: `hack/verify-spelling.sh`.
set -o errexit
set -o nounset
set -o pipefail
KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
export KUBE_ROOT
source "${KUBE_ROOT}/hack/lib/init.sh"
# Ensure that we find the binaries we build before anything else.
export GOBIN="${OS_OUTPUT_BINPATH}"
PATH="${GOBIN}:${PATH}"
# Install tools we need
pushd "${KUBE_ROOT}" >/dev/null
GO111MODULE=on go install github.com/client9/misspell/cmd/misspell
popd >/dev/null
# Spell checking
# All the skipping files are defined in hack/.spelling_failures
skipping_file="${KUBE_ROOT}/hack/.spelling_failures"
failing_packages=$(sed "s| | -e |g" "${skipping_file}")
git ls-files | grep -v -e "${failing_packages}" | xargs misspell -i "Creater,creater,ect" -error -o stderr

View File

@@ -1,29 +0,0 @@
#!/bin/bash
# Copyright 2021 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
go build -o "${OS_OUTPUT_BINPATH}/mdtoc" "sigs.k8s.io/mdtoc"
if ! ${OS_OUTPUT_BINPATH}/mdtoc --inplace --dryrun README.md
then
echo "ERROR: Changes detected to table of contents. Run ./hack/update-toc.sh" >&2
exit 1
fi

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- configmap.yaml
- rbac.yaml

View File

@@ -1,5 +1,5 @@
---
apiVersion: batch/v1 # for k8s version < 1.21.0, use batch/v1beta1
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: descheduler-cronjob
@@ -16,7 +16,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.22.1
image: us.gcr.io/k8s-artifacts-prod/descheduler/descheduler:v0.18.0
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
@@ -27,18 +27,6 @@ spec:
- "/policy-dir/policy.yaml"
- "--v"
- "3"
resources:
requests:
cpu: "500m"
memory: "256Mi"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
restartPolicy: "Never"
serviceAccountName: descheduler-sa
volumes:

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../base
- cronjob.yaml

View File

@@ -1,54 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: descheduler
namespace: kube-system
labels:
app: descheduler
spec:
replicas: 1
selector:
matchLabels:
app: descheduler
template:
metadata:
labels:
app: descheduler
spec:
priorityClassName: system-cluster-critical
serviceAccountName: descheduler-sa
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.22.1
imagePullPolicy: IfNotPresent
command:
- "/bin/descheduler"
args:
- "--policy-config-file"
- "/policy-dir/policy.yaml"
- "--descheduling-interval"
- "5m"
- "--v"
- "3"
ports:
- containerPort: 10258
protocol: TCP
resources:
requests:
cpu: 500m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
volumes:
- name: policy-volume
configMap:
name: descheduler-policy-configmap

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../base
- deployment.yaml

View File

@@ -14,7 +14,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.22.1
image: us.gcr.io/k8s-artifacts-prod/descheduler/descheduler:v0.18.0
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
@@ -25,18 +25,6 @@ spec:
- "/policy-dir/policy.yaml"
- "--v"
- "3"
resources:
requests:
cpu: "500m"
memory: "256Mi"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
restartPolicy: "Never"
serviceAccountName: descheduler-sa
volumes:

View File

@@ -1,6 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../base
- job.yaml

View File

@@ -3,6 +3,7 @@ kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: descheduler-cluster-role
namespace: kube-system
rules:
- apiGroups: [""]
resources: ["events"]
@@ -10,18 +11,12 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: ["scheduling.k8s.io"]
resources: ["priorityclasses"]
verbs: ["get", "watch", "list"]
---
apiVersion: v1
kind: ServiceAccount
@@ -33,6 +28,7 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: descheduler-cluster-role-binding
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole

View File

@@ -1,72 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"sigs.k8s.io/descheduler/pkg/version"
)
const (
// DeschedulerSubsystem - subsystem name used by descheduler
DeschedulerSubsystem = "descheduler"
)
var (
PodsEvicted = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: DeschedulerSubsystem,
Name: "pods_evicted",
Help: "Number of evicted pods, by the result, by the strategy, by the namespace. 'failed' result means a pod could not be evicted",
StabilityLevel: metrics.ALPHA,
}, []string{"result", "strategy", "namespace"})
buildInfo = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: DeschedulerSubsystem,
Name: "build_info",
Help: "Build info about descheduler, including Go version, Descheduler version, Git SHA, Git branch",
ConstLabels: map[string]string{"GoVersion": version.Get().GoVersion, "DeschedulerVersion": version.Get().GitVersion, "GitBranch": version.Get().GitBranch, "GitSha1": version.Get().GitSha1},
StabilityLevel: metrics.ALPHA,
},
)
metricsList = []metrics.Registerable{
PodsEvicted,
buildInfo,
}
)
var registerMetrics sync.Once
// Register all metrics.
func Register() {
// Register the metrics.
registerMetrics.Do(func() {
RegisterMetrics(metricsList...)
})
}
// RegisterMetrics registers a list of metrics.
func RegisterMetrics(extraMetrics ...metrics.Registerable) {
for _, metric := range extraMetrics {
legacyregistry.MustRegister(metric)
}
}

View File

@@ -19,6 +19,8 @@ package api
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/descheduler/pkg/descheduler/scheme"
)
var (
@@ -33,6 +35,12 @@ const GroupName = "descheduler"
// SchemeGroupVersion is group version used to register these objects
var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: runtime.APIVersionInternal}
func init() {
if err := addKnownTypes(scheme.Scheme); err != nil {
panic(err)
}
}
// Kind takes an unqualified kind and returns a Group qualified GroupKind
func Kind(kind string) schema.GroupKind {
return SchemeGroupVersion.WithKind(kind).GroupKind()

View File

@@ -28,21 +28,6 @@ type DeschedulerPolicy struct {
// Strategies
Strategies StrategyList
// NodeSelector for a set of nodes to operate over
NodeSelector *string
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods *bool
// EvictSystemCriticalPods allows eviction of pods of any priority (including Kubernetes system pods)
EvictSystemCriticalPods *bool
// IgnorePVCPods prevents pods with PVCs from being evicted.
IgnorePVCPods *bool
// MaxNoOfPodsToEvictPerNode restricts maximum of pods to be evicted per node.
MaxNoOfPodsToEvictPerNode *int
}
type StrategyName string
@@ -59,29 +44,13 @@ type DeschedulerStrategy struct {
Params *StrategyParameters
}
// Namespaces carries a list of included/excluded namespaces
// for which a given strategy is applicable
type Namespaces struct {
Include []string
Exclude []string
}
// Besides Namespaces only one of its members may be specified
// TODO(jchaloup): move Namespaces ThresholdPriority and ThresholdPriorityClassName to individual strategies
// once the policy version is bumped to v1alpha2
// Only one of its members may be specified
type StrategyParameters struct {
NodeResourceUtilizationThresholds *NodeResourceUtilizationThresholds
NodeAffinityType []string
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts
PodLifeTime *PodLifeTime
MaxPodLifeTimeSeconds *uint
RemoveDuplicates *RemoveDuplicates
FailedPods *FailedPods
IncludeSoftConstraints bool
Namespaces *Namespaces
ThresholdPriority *int32
ThresholdPriorityClassName string
LabelSelector *metav1.LabelSelector
NodeFit bool
}
type Percentage float64
@@ -101,15 +70,3 @@ type PodsHavingTooManyRestarts struct {
type RemoveDuplicates struct {
ExcludeOwnerKinds []string
}
type PodLifeTime struct {
MaxPodLifeTimeSeconds *uint
PodStatusPhases []string
}
type FailedPods struct {
ExcludeOwnerKinds []string
MinPodLifetimeSeconds *uint
Reasons []string
IncludingInitContainers bool
}

View File

@@ -28,21 +28,6 @@ type DeschedulerPolicy struct {
// Strategies
Strategies StrategyList `json:"strategies,omitempty"`
// NodeSelector for a set of nodes to operate over
NodeSelector *string `json:"nodeSelector,omitempty"`
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods *bool `json:"evictLocalStoragePods,omitempty"`
// EvictSystemCriticalPods allows eviction of pods of any priority (including Kubernetes system pods)
EvictSystemCriticalPods *bool `json:"evictSystemCriticalPods,omitempty"`
// IgnorePVCPods prevents pods with PVCs from being evicted.
IgnorePVCPods *bool `json:"ignorePvcPods,omitempty"`
// MaxNoOfPodsToEvictPerNode restricts maximum of pods to be evicted per node.
MaxNoOfPodsToEvictPerNode *int `json:"maxNoOfPodsToEvictPerNode,omitempty"`
}
type StrategyName string
@@ -59,27 +44,13 @@ type DeschedulerStrategy struct {
Params *StrategyParameters `json:"params,omitempty"`
}
// Namespaces carries a list of included/excluded namespaces
// for which a given strategy is applicable.
type Namespaces struct {
Include []string `json:"include"`
Exclude []string `json:"exclude"`
}
// Besides Namespaces ThresholdPriority and ThresholdPriorityClassName only one of its members may be specified
// Only one of its members may be specified
type StrategyParameters struct {
NodeResourceUtilizationThresholds *NodeResourceUtilizationThresholds `json:"nodeResourceUtilizationThresholds,omitempty"`
NodeAffinityType []string `json:"nodeAffinityType,omitempty"`
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts `json:"podsHavingTooManyRestarts,omitempty"`
PodLifeTime *PodLifeTime `json:"podLifeTime,omitempty"`
MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"`
RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"`
FailedPods *FailedPods `json:"failedPods,omitempty"`
IncludeSoftConstraints bool `json:"includeSoftConstraints"`
Namespaces *Namespaces `json:"namespaces"`
ThresholdPriority *int32 `json:"thresholdPriority"`
ThresholdPriorityClassName string `json:"thresholdPriorityClassName"`
LabelSelector *metav1.LabelSelector `json:"labelSelector"`
NodeFit bool `json:"nodeFit"`
}
type Percentage float64
@@ -99,15 +70,3 @@ type PodsHavingTooManyRestarts struct {
type RemoveDuplicates struct {
ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"`
}
type PodLifeTime struct {
MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"`
PodStatusPhases []string `json:"podStatusPhases,omitempty"`
}
type FailedPods struct {
ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"`
MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds,omitempty"`
Reasons []string `json:"reasons,omitempty"`
IncludingInitContainers bool `json:"includingInitContainers,omitempty"`
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ package v1alpha1
import (
unsafe "unsafe"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
conversion "k8s.io/apimachinery/pkg/conversion"
runtime "k8s.io/apimachinery/pkg/runtime"
api "sigs.k8s.io/descheduler/pkg/api"
@@ -56,26 +55,6 @@ func RegisterConversions(s *runtime.Scheme) error {
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*FailedPods)(nil), (*api.FailedPods)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_FailedPods_To_api_FailedPods(a.(*FailedPods), b.(*api.FailedPods), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*api.FailedPods)(nil), (*FailedPods)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_api_FailedPods_To_v1alpha1_FailedPods(a.(*api.FailedPods), b.(*FailedPods), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*Namespaces)(nil), (*api.Namespaces)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_Namespaces_To_api_Namespaces(a.(*Namespaces), b.(*api.Namespaces), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*api.Namespaces)(nil), (*Namespaces)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_api_Namespaces_To_v1alpha1_Namespaces(a.(*api.Namespaces), b.(*Namespaces), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*NodeResourceUtilizationThresholds)(nil), (*api.NodeResourceUtilizationThresholds)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtilizationThresholds(a.(*NodeResourceUtilizationThresholds), b.(*api.NodeResourceUtilizationThresholds), scope)
}); err != nil {
@@ -86,16 +65,6 @@ func RegisterConversions(s *runtime.Scheme) error {
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*PodLifeTime)(nil), (*api.PodLifeTime)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_PodLifeTime_To_api_PodLifeTime(a.(*PodLifeTime), b.(*api.PodLifeTime), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*api.PodLifeTime)(nil), (*PodLifeTime)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_api_PodLifeTime_To_v1alpha1_PodLifeTime(a.(*api.PodLifeTime), b.(*PodLifeTime), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*PodsHavingTooManyRestarts)(nil), (*api.PodsHavingTooManyRestarts)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(a.(*PodsHavingTooManyRestarts), b.(*api.PodsHavingTooManyRestarts), scope)
}); err != nil {
@@ -131,11 +100,6 @@ func RegisterConversions(s *runtime.Scheme) error {
func autoConvert_v1alpha1_DeschedulerPolicy_To_api_DeschedulerPolicy(in *DeschedulerPolicy, out *api.DeschedulerPolicy, s conversion.Scope) error {
out.Strategies = *(*api.StrategyList)(unsafe.Pointer(&in.Strategies))
out.NodeSelector = (*string)(unsafe.Pointer(in.NodeSelector))
out.EvictLocalStoragePods = (*bool)(unsafe.Pointer(in.EvictLocalStoragePods))
out.EvictSystemCriticalPods = (*bool)(unsafe.Pointer(in.EvictSystemCriticalPods))
out.IgnorePVCPods = (*bool)(unsafe.Pointer(in.IgnorePVCPods))
out.MaxNoOfPodsToEvictPerNode = (*int)(unsafe.Pointer(in.MaxNoOfPodsToEvictPerNode))
return nil
}
@@ -146,11 +110,6 @@ func Convert_v1alpha1_DeschedulerPolicy_To_api_DeschedulerPolicy(in *Descheduler
func autoConvert_api_DeschedulerPolicy_To_v1alpha1_DeschedulerPolicy(in *api.DeschedulerPolicy, out *DeschedulerPolicy, s conversion.Scope) error {
out.Strategies = *(*StrategyList)(unsafe.Pointer(&in.Strategies))
out.NodeSelector = (*string)(unsafe.Pointer(in.NodeSelector))
out.EvictLocalStoragePods = (*bool)(unsafe.Pointer(in.EvictLocalStoragePods))
out.EvictSystemCriticalPods = (*bool)(unsafe.Pointer(in.EvictSystemCriticalPods))
out.IgnorePVCPods = (*bool)(unsafe.Pointer(in.IgnorePVCPods))
out.MaxNoOfPodsToEvictPerNode = (*int)(unsafe.Pointer(in.MaxNoOfPodsToEvictPerNode))
return nil
}
@@ -183,54 +142,6 @@ func Convert_api_DeschedulerStrategy_To_v1alpha1_DeschedulerStrategy(in *api.Des
return autoConvert_api_DeschedulerStrategy_To_v1alpha1_DeschedulerStrategy(in, out, s)
}
func autoConvert_v1alpha1_FailedPods_To_api_FailedPods(in *FailedPods, out *api.FailedPods, s conversion.Scope) error {
out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds))
out.MinPodLifetimeSeconds = (*uint)(unsafe.Pointer(in.MinPodLifetimeSeconds))
out.Reasons = *(*[]string)(unsafe.Pointer(&in.Reasons))
out.IncludingInitContainers = in.IncludingInitContainers
return nil
}
// Convert_v1alpha1_FailedPods_To_api_FailedPods is an autogenerated conversion function.
func Convert_v1alpha1_FailedPods_To_api_FailedPods(in *FailedPods, out *api.FailedPods, s conversion.Scope) error {
return autoConvert_v1alpha1_FailedPods_To_api_FailedPods(in, out, s)
}
func autoConvert_api_FailedPods_To_v1alpha1_FailedPods(in *api.FailedPods, out *FailedPods, s conversion.Scope) error {
out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds))
out.MinPodLifetimeSeconds = (*uint)(unsafe.Pointer(in.MinPodLifetimeSeconds))
out.Reasons = *(*[]string)(unsafe.Pointer(&in.Reasons))
out.IncludingInitContainers = in.IncludingInitContainers
return nil
}
// Convert_api_FailedPods_To_v1alpha1_FailedPods is an autogenerated conversion function.
func Convert_api_FailedPods_To_v1alpha1_FailedPods(in *api.FailedPods, out *FailedPods, s conversion.Scope) error {
return autoConvert_api_FailedPods_To_v1alpha1_FailedPods(in, out, s)
}
func autoConvert_v1alpha1_Namespaces_To_api_Namespaces(in *Namespaces, out *api.Namespaces, s conversion.Scope) error {
out.Include = *(*[]string)(unsafe.Pointer(&in.Include))
out.Exclude = *(*[]string)(unsafe.Pointer(&in.Exclude))
return nil
}
// Convert_v1alpha1_Namespaces_To_api_Namespaces is an autogenerated conversion function.
func Convert_v1alpha1_Namespaces_To_api_Namespaces(in *Namespaces, out *api.Namespaces, s conversion.Scope) error {
return autoConvert_v1alpha1_Namespaces_To_api_Namespaces(in, out, s)
}
func autoConvert_api_Namespaces_To_v1alpha1_Namespaces(in *api.Namespaces, out *Namespaces, s conversion.Scope) error {
out.Include = *(*[]string)(unsafe.Pointer(&in.Include))
out.Exclude = *(*[]string)(unsafe.Pointer(&in.Exclude))
return nil
}
// Convert_api_Namespaces_To_v1alpha1_Namespaces is an autogenerated conversion function.
func Convert_api_Namespaces_To_v1alpha1_Namespaces(in *api.Namespaces, out *Namespaces, s conversion.Scope) error {
return autoConvert_api_Namespaces_To_v1alpha1_Namespaces(in, out, s)
}
func autoConvert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtilizationThresholds(in *NodeResourceUtilizationThresholds, out *api.NodeResourceUtilizationThresholds, s conversion.Scope) error {
out.Thresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.Thresholds))
out.TargetThresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.TargetThresholds))
@@ -255,28 +166,6 @@ func Convert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtili
return autoConvert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtilizationThresholds(in, out, s)
}
func autoConvert_v1alpha1_PodLifeTime_To_api_PodLifeTime(in *PodLifeTime, out *api.PodLifeTime, s conversion.Scope) error {
out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds))
out.PodStatusPhases = *(*[]string)(unsafe.Pointer(&in.PodStatusPhases))
return nil
}
// Convert_v1alpha1_PodLifeTime_To_api_PodLifeTime is an autogenerated conversion function.
func Convert_v1alpha1_PodLifeTime_To_api_PodLifeTime(in *PodLifeTime, out *api.PodLifeTime, s conversion.Scope) error {
return autoConvert_v1alpha1_PodLifeTime_To_api_PodLifeTime(in, out, s)
}
func autoConvert_api_PodLifeTime_To_v1alpha1_PodLifeTime(in *api.PodLifeTime, out *PodLifeTime, s conversion.Scope) error {
out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds))
out.PodStatusPhases = *(*[]string)(unsafe.Pointer(&in.PodStatusPhases))
return nil
}
// Convert_api_PodLifeTime_To_v1alpha1_PodLifeTime is an autogenerated conversion function.
func Convert_api_PodLifeTime_To_v1alpha1_PodLifeTime(in *api.PodLifeTime, out *PodLifeTime, s conversion.Scope) error {
return autoConvert_api_PodLifeTime_To_v1alpha1_PodLifeTime(in, out, s)
}
func autoConvert_v1alpha1_PodsHavingTooManyRestarts_To_api_PodsHavingTooManyRestarts(in *PodsHavingTooManyRestarts, out *api.PodsHavingTooManyRestarts, s conversion.Scope) error {
out.PodRestartThreshold = in.PodRestartThreshold
out.IncludingInitContainers = in.IncludingInitContainers
@@ -323,15 +212,8 @@ func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *Strat
out.NodeResourceUtilizationThresholds = (*api.NodeResourceUtilizationThresholds)(unsafe.Pointer(in.NodeResourceUtilizationThresholds))
out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType))
out.PodsHavingTooManyRestarts = (*api.PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts))
out.PodLifeTime = (*api.PodLifeTime)(unsafe.Pointer(in.PodLifeTime))
out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds))
out.RemoveDuplicates = (*api.RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates))
out.FailedPods = (*api.FailedPods)(unsafe.Pointer(in.FailedPods))
out.IncludeSoftConstraints = in.IncludeSoftConstraints
out.Namespaces = (*api.Namespaces)(unsafe.Pointer(in.Namespaces))
out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority))
out.ThresholdPriorityClassName = in.ThresholdPriorityClassName
out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector))
out.NodeFit = in.NodeFit
return nil
}
@@ -344,15 +226,8 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S
out.NodeResourceUtilizationThresholds = (*NodeResourceUtilizationThresholds)(unsafe.Pointer(in.NodeResourceUtilizationThresholds))
out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType))
out.PodsHavingTooManyRestarts = (*PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts))
out.PodLifeTime = (*PodLifeTime)(unsafe.Pointer(in.PodLifeTime))
out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds))
out.RemoveDuplicates = (*RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates))
out.FailedPods = (*FailedPods)(unsafe.Pointer(in.FailedPods))
out.IncludeSoftConstraints = in.IncludeSoftConstraints
out.Namespaces = (*Namespaces)(unsafe.Pointer(in.Namespaces))
out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority))
out.ThresholdPriorityClassName = in.ThresholdPriorityClassName
out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector))
out.NodeFit = in.NodeFit
return nil
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@ limitations under the License.
package v1alpha1
import (
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
@@ -36,31 +35,6 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
(*out)[key] = *val.DeepCopy()
}
}
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = new(string)
**out = **in
}
if in.EvictLocalStoragePods != nil {
in, out := &in.EvictLocalStoragePods, &out.EvictLocalStoragePods
*out = new(bool)
**out = **in
}
if in.EvictSystemCriticalPods != nil {
in, out := &in.EvictSystemCriticalPods, &out.EvictSystemCriticalPods
*out = new(bool)
**out = **in
}
if in.IgnorePVCPods != nil {
in, out := &in.IgnorePVCPods, &out.IgnorePVCPods
*out = new(bool)
**out = **in
}
if in.MaxNoOfPodsToEvictPerNode != nil {
in, out := &in.MaxNoOfPodsToEvictPerNode, &out.MaxNoOfPodsToEvictPerNode
*out = new(int)
**out = **in
}
return
}
@@ -103,63 +77,6 @@ func (in *DeschedulerStrategy) DeepCopy() *DeschedulerStrategy {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *FailedPods) DeepCopyInto(out *FailedPods) {
*out = *in
if in.ExcludeOwnerKinds != nil {
in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.MinPodLifetimeSeconds != nil {
in, out := &in.MinPodLifetimeSeconds, &out.MinPodLifetimeSeconds
*out = new(uint)
**out = **in
}
if in.Reasons != nil {
in, out := &in.Reasons, &out.Reasons
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedPods.
func (in *FailedPods) DeepCopy() *FailedPods {
if in == nil {
return nil
}
out := new(FailedPods)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Namespaces) DeepCopyInto(out *Namespaces) {
*out = *in
if in.Include != nil {
in, out := &in.Include, &out.Include
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Exclude != nil {
in, out := &in.Exclude, &out.Exclude
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Namespaces.
func (in *Namespaces) DeepCopy() *Namespaces {
if in == nil {
return nil
}
out := new(Namespaces)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeResourceUtilizationThresholds) DeepCopyInto(out *NodeResourceUtilizationThresholds) {
*out = *in
@@ -190,32 +107,6 @@ func (in *NodeResourceUtilizationThresholds) DeepCopy() *NodeResourceUtilization
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodLifeTime) DeepCopyInto(out *PodLifeTime) {
*out = *in
if in.MaxPodLifeTimeSeconds != nil {
in, out := &in.MaxPodLifeTimeSeconds, &out.MaxPodLifeTimeSeconds
*out = new(uint)
**out = **in
}
if in.PodStatusPhases != nil {
in, out := &in.PodStatusPhases, &out.PodStatusPhases
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodLifeTime.
func (in *PodLifeTime) DeepCopy() *PodLifeTime {
if in == nil {
return nil
}
out := new(PodLifeTime)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodsHavingTooManyRestarts) DeepCopyInto(out *PodsHavingTooManyRestarts) {
*out = *in
@@ -315,36 +206,16 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(PodsHavingTooManyRestarts)
**out = **in
}
if in.PodLifeTime != nil {
in, out := &in.PodLifeTime, &out.PodLifeTime
*out = new(PodLifeTime)
(*in).DeepCopyInto(*out)
if in.MaxPodLifeTimeSeconds != nil {
in, out := &in.MaxPodLifeTimeSeconds, &out.MaxPodLifeTimeSeconds
*out = new(uint)
**out = **in
}
if in.RemoveDuplicates != nil {
in, out := &in.RemoveDuplicates, &out.RemoveDuplicates
*out = new(RemoveDuplicates)
(*in).DeepCopyInto(*out)
}
if in.FailedPods != nil {
in, out := &in.FailedPods, &out.FailedPods
*out = new(FailedPods)
(*in).DeepCopyInto(*out)
}
if in.Namespaces != nil {
in, out := &in.Namespaces, &out.Namespaces
*out = new(Namespaces)
(*in).DeepCopyInto(*out)
}
if in.ThresholdPriority != nil {
in, out := &in.ThresholdPriority, &out.ThresholdPriority
*out = new(int32)
**out = **in
}
if in.LabelSelector != nil {
in, out := &in.LabelSelector, &out.LabelSelector
*out = new(v1.LabelSelector)
(*in).DeepCopyInto(*out)
}
return
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@ limitations under the License.
package api
import (
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
@@ -36,31 +35,6 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
(*out)[key] = *val.DeepCopy()
}
}
if in.NodeSelector != nil {
in, out := &in.NodeSelector, &out.NodeSelector
*out = new(string)
**out = **in
}
if in.EvictLocalStoragePods != nil {
in, out := &in.EvictLocalStoragePods, &out.EvictLocalStoragePods
*out = new(bool)
**out = **in
}
if in.EvictSystemCriticalPods != nil {
in, out := &in.EvictSystemCriticalPods, &out.EvictSystemCriticalPods
*out = new(bool)
**out = **in
}
if in.IgnorePVCPods != nil {
in, out := &in.IgnorePVCPods, &out.IgnorePVCPods
*out = new(bool)
**out = **in
}
if in.MaxNoOfPodsToEvictPerNode != nil {
in, out := &in.MaxNoOfPodsToEvictPerNode, &out.MaxNoOfPodsToEvictPerNode
*out = new(int)
**out = **in
}
return
}
@@ -103,63 +77,6 @@ func (in *DeschedulerStrategy) DeepCopy() *DeschedulerStrategy {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *FailedPods) DeepCopyInto(out *FailedPods) {
*out = *in
if in.ExcludeOwnerKinds != nil {
in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.MinPodLifetimeSeconds != nil {
in, out := &in.MinPodLifetimeSeconds, &out.MinPodLifetimeSeconds
*out = new(uint)
**out = **in
}
if in.Reasons != nil {
in, out := &in.Reasons, &out.Reasons
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedPods.
func (in *FailedPods) DeepCopy() *FailedPods {
if in == nil {
return nil
}
out := new(FailedPods)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Namespaces) DeepCopyInto(out *Namespaces) {
*out = *in
if in.Include != nil {
in, out := &in.Include, &out.Include
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Exclude != nil {
in, out := &in.Exclude, &out.Exclude
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Namespaces.
func (in *Namespaces) DeepCopy() *Namespaces {
if in == nil {
return nil
}
out := new(Namespaces)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeResourceUtilizationThresholds) DeepCopyInto(out *NodeResourceUtilizationThresholds) {
*out = *in
@@ -190,32 +107,6 @@ func (in *NodeResourceUtilizationThresholds) DeepCopy() *NodeResourceUtilization
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodLifeTime) DeepCopyInto(out *PodLifeTime) {
*out = *in
if in.MaxPodLifeTimeSeconds != nil {
in, out := &in.MaxPodLifeTimeSeconds, &out.MaxPodLifeTimeSeconds
*out = new(uint)
**out = **in
}
if in.PodStatusPhases != nil {
in, out := &in.PodStatusPhases, &out.PodStatusPhases
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodLifeTime.
func (in *PodLifeTime) DeepCopy() *PodLifeTime {
if in == nil {
return nil
}
out := new(PodLifeTime)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodsHavingTooManyRestarts) DeepCopyInto(out *PodsHavingTooManyRestarts) {
*out = *in
@@ -315,36 +206,16 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(PodsHavingTooManyRestarts)
**out = **in
}
if in.PodLifeTime != nil {
in, out := &in.PodLifeTime, &out.PodLifeTime
*out = new(PodLifeTime)
(*in).DeepCopyInto(*out)
if in.MaxPodLifeTimeSeconds != nil {
in, out := &in.MaxPodLifeTimeSeconds, &out.MaxPodLifeTimeSeconds
*out = new(uint)
**out = **in
}
if in.RemoveDuplicates != nil {
in, out := &in.RemoveDuplicates, &out.RemoveDuplicates
*out = new(RemoveDuplicates)
(*in).DeepCopyInto(*out)
}
if in.FailedPods != nil {
in, out := &in.FailedPods, &out.FailedPods
*out = new(FailedPods)
(*in).DeepCopyInto(*out)
}
if in.Namespaces != nil {
in, out := &in.Namespaces, &out.Namespaces
*out = new(Namespaces)
(*in).DeepCopyInto(*out)
}
if in.ThresholdPriority != nil {
in, out := &in.ThresholdPriority, &out.ThresholdPriority
*out = new(int32)
**out = **in
}
if in.LabelSelector != nil {
in, out := &in.LabelSelector, &out.LabelSelector
*out = new(v1.LabelSelector)
(*in).DeepCopyInto(*out)
}
return
}

View File

@@ -19,6 +19,8 @@ package componentconfig
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/descheduler/pkg/descheduler/scheme"
)
var (
@@ -32,6 +34,12 @@ const GroupName = "deschedulercomponentconfig"
// SchemeGroupVersion is group version used to register these objects
var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: runtime.APIVersionInternal}
func init() {
if err := addKnownTypes(scheme.Scheme); err != nil {
panic(err)
}
}
// Kind takes an unqualified kind and returns a Group qualified GroupKind
func Kind(kind string) schema.GroupKind {
return SchemeGroupVersion.WithKind(kind).GroupKind()

View File

@@ -20,7 +20,6 @@ import (
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
componentbaseconfig "k8s.io/component-base/config"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -49,11 +48,4 @@ type DeschedulerConfiguration struct {
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods bool
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool
// Logging specifies the options of logging.
// Refer [Logs Options](https://github.com/kubernetes/component-base/blob/master/logs/options.go) for more information.
Logging componentbaseconfig.LoggingConfiguration
}

View File

@@ -17,7 +17,6 @@ limitations under the License.
package v1alpha1
import (
componentbaseconfig "k8s.io/component-base/config"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -49,11 +48,4 @@ type DeschedulerConfiguration struct {
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods bool `json:"evictLocalStoragePods,omitempty"`
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool `json:"ignorePvcPods,omitempty"`
// Logging specifies the options of logging.
// Refer [Logs Options](https://github.com/kubernetes/component-base/blob/master/logs/options.go) for more information.
Logging componentbaseconfig.LoggingConfiguration `json:"logging,omitempty"`
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -56,8 +56,6 @@ func autoConvert_v1alpha1_DeschedulerConfiguration_To_componentconfig_Deschedule
out.NodeSelector = in.NodeSelector
out.MaxNoOfPodsToEvictPerNode = in.MaxNoOfPodsToEvictPerNode
out.EvictLocalStoragePods = in.EvictLocalStoragePods
out.IgnorePVCPods = in.IgnorePVCPods
out.Logging = in.Logging
return nil
}
@@ -74,8 +72,6 @@ func autoConvert_componentconfig_DeschedulerConfiguration_To_v1alpha1_Deschedule
out.NodeSelector = in.NodeSelector
out.MaxNoOfPodsToEvictPerNode = in.MaxNoOfPodsToEvictPerNode
out.EvictLocalStoragePods = in.EvictLocalStoragePods
out.IgnorePVCPods = in.IgnorePVCPods
out.Logging = in.Logging
return nil
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,7 +28,6 @@ import (
func (in *DeschedulerConfiguration) DeepCopyInto(out *DeschedulerConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.Logging = in.Logging
return
}

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -1,7 +1,7 @@
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,7 +28,6 @@ import (
func (in *DeschedulerConfiguration) DeepCopyInto(out *DeschedulerConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.Logging = in.Logging
return
}

View File

@@ -19,16 +19,14 @@ package descheduler
import (
"context"
"fmt"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/metrics"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/client"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
@@ -38,8 +36,6 @@ import (
)
func Run(rs *options.DeschedulerServer) error {
metrics.Register()
ctx := context.Background()
rsclient, err := client.CreateClient(rs.KubeconfigFile)
if err != nil {
@@ -73,57 +69,26 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
sharedInformerFactory.Start(stopChannel)
sharedInformerFactory.WaitForCacheSync(stopChannel)
strategyFuncs := map[api.StrategyName]strategyFunction{
"RemoveDuplicates": strategies.RemoveDuplicatePods,
"LowNodeUtilization": nodeutilization.LowNodeUtilization,
"HighNodeUtilization": nodeutilization.HighNodeUtilization,
"RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity,
"RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity,
"RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints,
"RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
"PodLifeTime": strategies.PodLifeTime,
"RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint,
"RemoveFailedPods": strategies.RemoveFailedPods,
}
nodeSelector := rs.NodeSelector
if deschedulerPolicy.NodeSelector != nil {
nodeSelector = *deschedulerPolicy.NodeSelector
}
evictLocalStoragePods := rs.EvictLocalStoragePods
if deschedulerPolicy.EvictLocalStoragePods != nil {
evictLocalStoragePods = *deschedulerPolicy.EvictLocalStoragePods
}
evictSystemCriticalPods := false
if deschedulerPolicy.EvictSystemCriticalPods != nil {
evictSystemCriticalPods = *deschedulerPolicy.EvictSystemCriticalPods
if evictSystemCriticalPods {
klog.V(1).InfoS("Warning: EvictSystemCriticalPods is set to True. This could cause eviction of Kubernetes system pods.")
}
}
ignorePvcPods := false
if deschedulerPolicy.IgnorePVCPods != nil {
ignorePvcPods = *deschedulerPolicy.IgnorePVCPods
}
maxNoOfPodsToEvictPerNode := rs.MaxNoOfPodsToEvictPerNode
if deschedulerPolicy.MaxNoOfPodsToEvictPerNode != nil {
maxNoOfPodsToEvictPerNode = *deschedulerPolicy.MaxNoOfPodsToEvictPerNode
strategyFuncs := map[string]strategyFunction{
"RemoveDuplicates": strategies.RemoveDuplicatePods,
"LowNodeUtilization": strategies.LowNodeUtilization,
"RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity,
"RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity,
"RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints,
"RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
"PodLifeTime": strategies.PodLifeTime,
}
wait.Until(func() {
nodes, err := nodeutil.ReadyNodes(ctx, rs.Client, nodeInformer, nodeSelector)
nodes, err := nodeutil.ReadyNodes(ctx, rs.Client, nodeInformer, rs.NodeSelector, stopChannel)
if err != nil {
klog.V(1).InfoS("Unable to get ready nodes", "err", err)
klog.V(1).Infof("Unable to get ready nodes: %v", err)
close(stopChannel)
return
}
if len(nodes) <= 1 {
klog.V(1).InfoS("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
klog.V(1).Infof("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
close(stopChannel)
return
}
@@ -132,25 +97,17 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
rs.Client,
evictionPolicyGroupVersion,
rs.DryRun,
maxNoOfPodsToEvictPerNode,
rs.MaxNoOfPodsToEvictPerNode,
nodes,
evictLocalStoragePods,
evictSystemCriticalPods,
ignorePvcPods,
rs.EvictLocalStoragePods,
)
for name, strategy := range deschedulerPolicy.Strategies {
if f, ok := strategyFuncs[name]; ok {
if strategy.Enabled {
f(ctx, rs.Client, strategy, nodes, podEvictor)
}
} else {
klog.ErrorS(fmt.Errorf("unknown strategy name"), "skipping strategy", "strategy", name)
for name, f := range strategyFuncs {
if strategy := deschedulerPolicy.Strategies[api.StrategyName(name)]; strategy.Enabled {
f(ctx, rs.Client, strategy, nodes, podEvictor)
}
}
klog.V(1).InfoS("Number of evicted pods", "totalEvicted", podEvictor.TotalEvicted())
// If there was no interval specified, send a signal to the stopChannel to end the wait.Until loop after 1 iteration
if rs.DeschedulingInterval.Seconds() == 0 {
close(stopChannel)

View File

@@ -7,7 +7,7 @@ import (
"testing"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
fakeclientset "k8s.io/client-go/kubernetes/fake"
@@ -38,27 +38,18 @@ func TestTaintsUpdated(t *testing.T) {
stopChannel := make(chan struct{})
defer close(stopChannel)
rs, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("Unable to initialize server: %v", err)
}
rs := options.NewDeschedulerServer()
rs.Client = client
rs.DeschedulingInterval = 100 * time.Millisecond
errChan := make(chan error, 1)
defer close(errChan)
go func() {
err := RunDeschedulerStrategies(ctx, rs, dp, "v1beta1", stopChannel)
errChan <- err
}()
select {
case err := <-errChan:
if err != nil {
t.Fatalf("Unable to run descheduler strategies: %v", err)
}
case <-time.After(300 * time.Millisecond):
// Wait for few cycles and then verify the only pod still exists
}
}()
// Wait for few cycles and then verify the only pod still exists
time.Sleep(300 * time.Millisecond)
pods, err := client.CoreV1().Pods(p1.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Unable to list pods: %v", err)

View File

@@ -25,15 +25,12 @@ import (
policy "k8s.io/api/policy/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/errors"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
clientcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/metrics"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
@@ -48,15 +45,12 @@ const (
type nodePodEvictedCount map[*v1.Node]int
type PodEvictor struct {
client clientset.Interface
nodes []*v1.Node
policyGroupVersion string
dryRun bool
maxPodsToEvictPerNode int
nodepodCount nodePodEvictedCount
evictLocalStoragePods bool
evictSystemCriticalPods bool
ignorePvcPods bool
client clientset.Interface
policyGroupVersion string
dryRun bool
maxPodsToEvictPerNode int
nodepodCount nodePodEvictedCount
evictLocalStoragePods bool
}
func NewPodEvictor(
@@ -66,8 +60,6 @@ func NewPodEvictor(
maxPodsToEvictPerNode int,
nodes []*v1.Node,
evictLocalStoragePods bool,
evictSystemCriticalPods bool,
ignorePvcPods bool,
) *PodEvictor {
var nodePodCount = make(nodePodEvictedCount)
for _, node := range nodes {
@@ -76,18 +68,46 @@ func NewPodEvictor(
}
return &PodEvictor{
client: client,
nodes: nodes,
policyGroupVersion: policyGroupVersion,
dryRun: dryRun,
maxPodsToEvictPerNode: maxPodsToEvictPerNode,
nodepodCount: nodePodCount,
evictLocalStoragePods: evictLocalStoragePods,
evictSystemCriticalPods: evictSystemCriticalPods,
ignorePvcPods: ignorePvcPods,
client: client,
policyGroupVersion: policyGroupVersion,
dryRun: dryRun,
maxPodsToEvictPerNode: maxPodsToEvictPerNode,
nodepodCount: nodePodCount,
evictLocalStoragePods: evictLocalStoragePods,
}
}
// IsEvictable checks if a pod is evictable or not.
func (pe *PodEvictor) IsEvictable(pod *v1.Pod) bool {
checkErrs := []error{}
if IsCriticalPod(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is critical"))
}
ownerRefList := podutil.OwnerRef(pod)
if IsDaemonsetPod(ownerRefList) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a DaemonSet pod"))
}
if len(ownerRefList) == 0 {
checkErrs = append(checkErrs, fmt.Errorf("pod does not have any ownerrefs"))
}
if !pe.evictLocalStoragePods && IsPodWithLocalStorage(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod has local storage and descheduler is not configured with --evict-local-storage-pods"))
}
if IsMirrorPod(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a mirror pod"))
}
if len(checkErrs) > 0 && !HaveEvictAnnotation(pod) {
klog.V(4).Infof("Pod %s in namespace %s is not evictable: Pod lacks an eviction annotation and fails the following checks: %v", pod.Name, pod.Namespace, errors.NewAggregate(checkErrs).Error())
return false
}
return true
}
// NodeEvicted gives a number of pods evicted for node
func (pe *PodEvictor) NodeEvicted(node *v1.Node) int {
return pe.nodepodCount[node]
@@ -105,35 +125,32 @@ func (pe *PodEvictor) TotalEvicted() int {
// EvictPod returns non-nil error only when evicting a pod on a node is not
// possible (due to maxPodsToEvictPerNode constraint). Success is true when the pod
// is evicted on the server side.
func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node, strategy string, reasons ...string) (bool, error) {
reason := strategy
func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node, reasons ...string) (bool, error) {
var reason string
if len(reasons) > 0 {
reason += " (" + strings.Join(reasons, ", ") + ")"
reason = " (" + strings.Join(reasons, ", ") + ")"
}
if pe.maxPodsToEvictPerNode > 0 && pe.nodepodCount[node]+1 > pe.maxPodsToEvictPerNode {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number reached", "strategy": strategy, "namespace": pod.Namespace}).Inc()
return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", pe.maxPodsToEvictPerNode, node.Name)
}
err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion, pe.dryRun)
if err != nil {
// err is used only for logging purposes
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", reason)
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace}).Inc()
klog.Errorf("Error evicting pod: %#v in namespace %#v%s: %#v", pod.Name, pod.Namespace, reason, err)
return false, nil
}
pe.nodepodCount[node]++
if pe.dryRun {
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", reason)
klog.V(1).Infof("Evicted pod in dry run mode: %#v in namespace %#v%s", pod.Name, pod.Namespace, reason)
} else {
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", reason)
klog.V(1).Infof("Evicted pod: %#v in namespace %#v%s", pod.Name, pod.Namespace, reason)
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartStructuredLogging(3)
eventBroadcaster.StartLogging(klog.V(3).Infof)
eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: pe.client.CoreV1().Events(pod.Namespace)})
r := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "sigs.k8s.io.descheduler"})
r.Event(pod, v1.EventTypeNormal, "Descheduled", fmt.Sprintf("pod evicted by sigs.k8s.io/descheduler%s", reason))
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace}).Inc()
}
return true, nil
}
@@ -166,142 +183,22 @@ func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, poli
return err
}
type Options struct {
priority *int32
nodeFit bool
labelSelector labels.Selector
func IsCriticalPod(pod *v1.Pod) bool {
return utils.IsCriticalPod(pod)
}
// WithPriorityThreshold sets a threshold for pod's priority class.
// Any pod whose priority class is lower is evictable.
func WithPriorityThreshold(priority int32) func(opts *Options) {
return func(opts *Options) {
var p int32 = priority
opts.priority = &p
}
}
// WithNodeFit sets whether or not to consider taints, node selectors,
// and pod affinity when evicting. A pod whose tolerations, node selectors,
// and affinity match a node other than the one it is currently running on
// is evictable.
func WithNodeFit(nodeFit bool) func(opts *Options) {
return func(opts *Options) {
opts.nodeFit = nodeFit
}
}
// WithLabelSelector sets whether or not to apply label filtering when evicting.
// Any pod matching the label selector is considered evictable.
func WithLabelSelector(labelSelector labels.Selector) func(opts *Options) {
return func(opts *Options) {
opts.labelSelector = labelSelector
}
}
type constraint func(pod *v1.Pod) error
type evictable struct {
constraints []constraint
}
// Evictable provides an implementation of IsEvictable(IsEvictable(pod *v1.Pod) bool).
// The method accepts a list of options which allow to extend constraints
// which decides when a pod is considered evictable.
func (pe *PodEvictor) Evictable(opts ...func(opts *Options)) *evictable {
options := &Options{}
for _, opt := range opts {
opt(options)
}
ev := &evictable{}
if !pe.evictSystemCriticalPods {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
// Moved from IsEvictable function to allow for disabling
if utils.IsCriticalPriorityPod(pod) {
return fmt.Errorf("pod has system critical priority")
}
return nil
})
if options.priority != nil {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if IsPodEvictableBasedOnPriority(pod, *options.priority) {
return nil
}
return fmt.Errorf("pod has higher priority than specified priority class threshold")
})
func IsDaemonsetPod(ownerRefList []metav1.OwnerReference) bool {
for _, ownerRef := range ownerRefList {
if ownerRef.Kind == "DaemonSet" {
return true
}
}
if !pe.evictLocalStoragePods {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if utils.IsPodWithLocalStorage(pod) {
return fmt.Errorf("pod has local storage and descheduler is not configured with evictLocalStoragePods")
}
return nil
})
}
if pe.ignorePvcPods {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if utils.IsPodWithPVC(pod) {
return fmt.Errorf("pod has a PVC and descheduler is configured to ignore PVC pods")
}
return nil
})
}
if options.nodeFit {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if !nodeutil.PodFitsAnyOtherNode(pod, pe.nodes) {
return fmt.Errorf("pod does not fit on any other node because of nodeSelector(s), Taint(s), or nodes marked as unschedulable")
}
return nil
})
}
if options.labelSelector != nil && !options.labelSelector.Empty() {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if !options.labelSelector.Matches(labels.Set(pod.Labels)) {
return fmt.Errorf("pod labels do not match the labelSelector filter in the policy parameter")
}
return nil
})
}
return ev
return false
}
// IsEvictable decides when a pod is evictable
func (ev *evictable) IsEvictable(pod *v1.Pod) bool {
checkErrs := []error{}
ownerRefList := podutil.OwnerRef(pod)
if utils.IsDaemonsetPod(ownerRefList) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a DaemonSet pod"))
}
if len(ownerRefList) == 0 {
checkErrs = append(checkErrs, fmt.Errorf("pod does not have any ownerrefs"))
}
if utils.IsMirrorPod(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a mirror pod"))
}
if utils.IsStaticPod(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a static pod"))
}
for _, c := range ev.constraints {
if err := c(pod); err != nil {
checkErrs = append(checkErrs, err)
}
}
if len(checkErrs) > 0 && !HaveEvictAnnotation(pod) {
klog.V(4).InfoS("Pod lacks an eviction annotation and fails the following checks", "pod", klog.KObj(pod), "checks", errors.NewAggregate(checkErrs).Error())
return false
}
return true
// IsMirrorPod checks whether the pod is a mirror pod.
func IsMirrorPod(pod *v1.Pod) bool {
return utils.IsMirrorPod(pod)
}
// HaveEvictAnnotation checks if the pod have evict annotation
@@ -310,7 +207,12 @@ func HaveEvictAnnotation(pod *v1.Pod) bool {
return found
}
// IsPodEvictableBasedOnPriority checks if the given pod is evictable based on priority resolved from pod Spec.
func IsPodEvictableBasedOnPriority(pod *v1.Pod, priority int32) bool {
return pod.Spec.Priority == nil || *pod.Spec.Priority < priority
func IsPodWithLocalStorage(pod *v1.Pod) bool {
for _, volume := range pod.Spec.Volumes {
if volume.HostPath != nil || volume.EmptyDir != nil {
return true
}
}
return false
}

View File

@@ -71,80 +71,47 @@ func TestEvictPod(t *testing.T) {
func TestIsEvictable(t *testing.T) {
n1 := test.BuildTestNode("node1", 1000, 2000, 13, nil)
lowPriority := int32(800)
highPriority := int32(900)
nodeTaintKey := "hardware"
nodeTaintValue := "gpu"
nodeLabelKey := "datacenter"
nodeLabelValue := "east"
type testCase struct {
pod *v1.Pod
nodes []*v1.Node
runBefore func(*v1.Pod, []*v1.Node)
evictLocalStoragePods bool
evictSystemCriticalPods bool
priorityThreshold *int32
nodeFit bool
result bool
pod *v1.Pod
runBefore func(*v1.Pod)
evictLocalStoragePods bool
result bool
}
testCases := []testCase{
{ // Normal pod eviction with normal ownerRefs
{
pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with normal ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p2", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with replicaSet ownerRefs
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p3", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with replicaSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p4", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with statefulSet ownerRefs
pod: test.BuildTestPod("p18", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with statefulSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p19", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted because it is bound to a PV and evictLocalStoragePods = false
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p5", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
@@ -157,12 +124,11 @@ func TestIsEvictable(t *testing.T) {
},
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = true
evictLocalStoragePods: false,
result: false,
}, {
pod: test.BuildTestPod("p6", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
@@ -175,12 +141,11 @@ func TestIsEvictable(t *testing.T) {
},
}
},
evictLocalStoragePods: true,
evictSystemCriticalPods: false,
result: true,
}, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = false, but it has scheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: true,
result: true,
}, {
pod: test.BuildTestPod("p7", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
@@ -194,243 +159,66 @@ func TestIsEvictable(t *testing.T) {
},
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it is part of a daemonSet
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p8", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it is part of a daemonSet, but it has scheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: false,
result: false,
}, {
pod: test.BuildTestPod("p9", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
runBefore: func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it is a mirror pod
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p10", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
runBefore: func(pod *v1.Pod) {
pod.Annotations = test.GetMirrorPodAnnotation()
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it is a mirror pod, but it has scheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: false,
result: false,
}, {
pod: test.BuildTestPod("p11", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
runBefore: func(pod *v1.Pod) {
pod.Annotations = test.GetMirrorPodAnnotation()
pod.Annotations["descheduler.alpha.kubernetes.io/evict"] = "true"
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it has system critical priority
evictLocalStoragePods: false,
result: true,
}, {
pod: test.BuildTestPod("p12", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
runBefore: func(pod *v1.Pod) {
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it has system critical priority, but it has scheduler.alpha.kubernetes.io/evict annotation
evictLocalStoragePods: false,
result: false,
}, {
pod: test.BuildTestPod("p13", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
runBefore: func(pod *v1.Pod) {
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
pod.Annotations = map[string]string{
"descheduler.alpha.kubernetes.io/evict": "true",
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it has a priority higher than the configured priority threshold
pod: test.BuildTestPod("p14", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
priorityThreshold: &lowPriority,
result: false,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p15", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true
pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
result: true,
}, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
result: true,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true
pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod with no tolerations running on normal node, all other nodes tainted
pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, { // Pod with correct tolerations running on normal node, all other nodes tainted
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.Tolerations = []v1.Toleration{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
}, { // Pod with incorrect node selector
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: "fail",
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, { // Pod with correct node selector
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
evictLocalStoragePods: false,
result: true,
},
}
for _, test := range testCases {
test.runBefore(test.pod, test.nodes)
nodes := append(test.nodes, n1)
test.runBefore(test.pod)
podEvictor := &PodEvictor{
evictLocalStoragePods: test.evictLocalStoragePods,
evictSystemCriticalPods: test.evictSystemCriticalPods,
nodes: nodes,
evictLocalStoragePods: test.evictLocalStoragePods,
}
evictable := podEvictor.Evictable()
var opts []func(opts *Options)
if test.priorityThreshold != nil {
opts = append(opts, WithPriorityThreshold(*test.priorityThreshold))
}
if test.nodeFit {
opts = append(opts, WithNodeFit(true))
}
evictable = podEvictor.Evictable(opts...)
result := evictable.IsEvictable(test.pod)
result := podEvictor.IsEvictable(test.pod)
if result != test.result {
t.Errorf("IsEvictable should return for pod %s %t, but it returns %t", test.pod.Name, test.result, result)
}
@@ -445,6 +233,10 @@ func TestPodTypes(t *testing.T) {
p2 := test.BuildTestPod("p2", 400, 0, n1.Name, nil)
p3 := test.BuildTestPod("p3", 400, 0, n1.Name, nil)
p4 := test.BuildTestPod("p4", 400, 0, n1.Name, nil)
p5 := test.BuildTestPod("p5", 400, 0, n1.Name, nil)
p6 := test.BuildTestPod("p6", 400, 0, n1.Name, nil)
p6.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
p1.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
// The following 4 pods won't get evicted.
@@ -465,18 +257,27 @@ func TestPodTypes(t *testing.T) {
}
// A Mirror Pod.
p4.Annotations = test.GetMirrorPodAnnotation()
if !utils.IsMirrorPod(p4) {
// A Critical Pod.
p5.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
p5.Spec.Priority = &priority
systemCriticalPriority := utils.SystemCriticalPriority
p5.Spec.Priority = &systemCriticalPriority
if !IsMirrorPod(p4) {
t.Errorf("Expected p4 to be a mirror pod.")
}
if !utils.IsPodWithLocalStorage(p3) {
if !IsCriticalPod(p5) {
t.Errorf("Expected p5 to be a critical pod.")
}
if !IsPodWithLocalStorage(p3) {
t.Errorf("Expected p3 to be a pod with local storage.")
}
ownerRefList := podutil.OwnerRef(p2)
if !utils.IsDaemonsetPod(ownerRefList) {
if !IsDaemonsetPod(ownerRefList) {
t.Errorf("Expected p2 to be a daemonset pod.")
}
ownerRefList = podutil.OwnerRef(p1)
if utils.IsDaemonsetPod(ownerRefList) || utils.IsPodWithLocalStorage(p1) || utils.IsCriticalPriorityPod(p1) || utils.IsMirrorPod(p1) || utils.IsStaticPod(p1) {
if IsDaemonsetPod(ownerRefList) || IsPodWithLocalStorage(p1) || IsCriticalPod(p1) || IsMirrorPod(p1) {
t.Errorf("Expected p1 to be a normal pod.")
}

View File

@@ -18,8 +18,7 @@ package node
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
coreinformers "k8s.io/client-go/informers/core/v1"
@@ -30,7 +29,7 @@ import (
// ReadyNodes returns ready nodes irrespective of whether they are
// schedulable or not.
func ReadyNodes(ctx context.Context, client clientset.Interface, nodeInformer coreinformers.NodeInformer, nodeSelector string) ([]*v1.Node, error) {
func ReadyNodes(ctx context.Context, client clientset.Interface, nodeInformer coreinformers.NodeInformer, nodeSelector string, stopChannel <-chan struct{}) ([]*v1.Node, error) {
ns, err := labels.Parse(nodeSelector)
if err != nil {
return []*v1.Node{}, err
@@ -43,7 +42,7 @@ func ReadyNodes(ctx context.Context, client clientset.Interface, nodeInformer co
}
if len(nodes) == 0 {
klog.V(2).InfoS("Node lister returned empty list, now fetch directly")
klog.V(2).Infof("node lister returned empty list, now fetch directly")
nItems, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: nodeSelector})
if err != nil {
@@ -78,56 +77,25 @@ func IsReady(node *v1.Node) bool {
// - NodeOutOfDisk condition status is ConditionFalse,
// - NodeNetworkUnavailable condition status is ConditionFalse.
if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
klog.V(1).InfoS("Ignoring node", "node", klog.KObj(node), "condition", cond.Type, "status", cond.Status)
klog.V(1).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
return false
} /*else if cond.Type == v1.NodeOutOfDisk && cond.Status != v1.ConditionFalse {
klog.V(4).InfoS("Ignoring node with condition status", "node", klog.KObj(node.Name), "condition", cond.Type, "status", cond.Status)
klog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
return false
} else if cond.Type == v1.NodeNetworkUnavailable && cond.Status != v1.ConditionFalse {
klog.V(4).InfoS("Ignoring node with condition status", "node", klog.KObj(node.Name), "condition", cond.Type, "status", cond.Status)
klog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
return false
}*/
}
// Ignore nodes that are marked unschedulable
/*if node.Spec.Unschedulable {
klog.V(4).InfoS("Ignoring node since it is unschedulable", "node", klog.KObj(node.Name))
klog.V(4).Infof("Ignoring node %v since it is unschedulable", node.Name)
return false
}*/
return true
}
// PodFitsAnyOtherNode checks if the given pod fits any of the given nodes, besides the node
// the pod is already running on. The node fit is based on multiple criteria, like, pod node selector
// matching the node label (including affinity), the taints on the node, and the node being schedulable or not.
func PodFitsAnyOtherNode(pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
// Skip node pod is already on
if node.Name == pod.Spec.NodeName {
continue
}
// Check node selector and required affinity
ok, err := utils.PodMatchNodeSelector(pod, node)
if err != nil || !ok {
continue
}
// Check taints (we only care about NoSchedule and NoExecute taints)
ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
})
if !ok {
continue
}
// Check if node is schedulable
if !IsNodeUnschedulable(node) {
klog.V(2).InfoS("Pod can possibly be scheduled on a different node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
}
}
return false
}
// IsNodeUnschedulable checks if the node is unschedulable. This is a helper function to check only in case of
// IsNodeUnschedulable checks if the node is unschedulable. This is helper function to check only in case of
// underutilized node so that they won't be accounted for.
func IsNodeUnschedulable(node *v1.Node) bool {
return node.Spec.Unschedulable
@@ -144,7 +112,7 @@ func PodFitsAnyNode(pod *v1.Pod, nodes []*v1.Node) bool {
continue
}
if !IsNodeUnschedulable(node) {
klog.V(2).InfoS("Pod can possibly be scheduled on a different node", "pod", klog.KObj(pod), "node", klog.KObj(node))
klog.V(2).Infof("Pod %v can possibly be scheduled on %v", pod.Name, node.Name)
return true
}
}
@@ -157,15 +125,15 @@ func PodFitsCurrentNode(pod *v1.Pod, node *v1.Node) bool {
ok, err := utils.PodMatchNodeSelector(pod, node)
if err != nil {
klog.ErrorS(err, "Failed to match node selector")
klog.Error(err)
return false
}
if !ok {
klog.V(2).InfoS("Pod does not fit on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
klog.V(2).Infof("Pod %v does not fit on node %v", pod.Name, node.Name)
return false
}
klog.V(2).InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
klog.V(2).Infof("Pod %v fits on node %v", pod.Name, node.Name)
return true
}

View File

@@ -20,7 +20,7 @@ import (
"context"
"testing"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
@@ -69,12 +69,12 @@ func TestReadyNodesWithNodeSelector(t *testing.T) {
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
nodeInformer := sharedInformerFactory.Core().V1().Nodes()
stopChannel := make(chan struct{})
stopChannel := make(chan struct{}, 0)
sharedInformerFactory.Start(stopChannel)
sharedInformerFactory.WaitForCacheSync(stopChannel)
defer close(stopChannel)
nodes, _ := ReadyNodes(ctx, fakeClient, nodeInformer, nodeSelector)
nodes, _ := ReadyNodes(ctx, fakeClient, nodeInformer, nodeSelector, nil)
if nodes[0].Name != "node1" {
t.Errorf("Expected node1, got %s", nodes[0].Name)
@@ -200,15 +200,10 @@ func TestPodFitsCurrentNode(t *testing.T) {
}
}
func TestPodFitsAnyOtherNode(t *testing.T) {
func TestPodFitsAnyNode(t *testing.T) {
nodeLabelKey := "kubernetes.io/desiredNode"
nodeLabelValue := "yes"
nodeTaintKey := "hardware"
nodeTaintValue := "gpu"
// Staging node has no scheduling restrictions, but the pod always starts here and PodFitsAnyOtherNode() doesn't take into account the node the pod is running on.
nodeNames := []string{"node1", "node2", "stagingNode"}
tests := []struct {
description string
@@ -217,12 +212,33 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
success bool
}{
{
description: "Pod fits another node matching node affinity",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
description: "Pod expected to fit one of the nodes",
pod: &v1.Pod{
Spec: v1.PodSpec{
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeLabelKey,
Operator: "In",
Values: []string{
nodeLabelValue,
},
},
},
},
},
},
},
},
},
},
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
@@ -230,27 +246,42 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: true,
},
{
description: "Pod expected to fit one of the nodes",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
description: "Pod expected to fit one of the nodes (error node first)",
pod: &v1.Pod{
Spec: v1.PodSpec{
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeLabelKey,
Operator: "In",
Values: []string{
nodeLabelValue,
},
},
},
},
},
},
},
},
},
},
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: "no",
},
@@ -258,27 +289,42 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: true,
},
{
description: "Pod expected to fit none of the nodes",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: &v1.Pod{
Spec: v1.PodSpec{
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeLabelKey,
Operator: "In",
Values: []string{
nodeLabelValue,
},
},
},
},
},
},
},
},
},
},
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: "unfit1",
},
@@ -286,27 +332,42 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "unfit2",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: false,
},
{
description: "Nodes are unschedulable but labels match, should fail",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: &v1.Pod{
Spec: v1.PodSpec{
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeLabelKey,
Operator: "In",
Values: []string{
nodeLabelValue,
},
},
},
},
},
},
},
},
},
},
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
@@ -317,136 +378,20 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, should pass",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: true,
},
{
description: "Both nodes are tained, should fail",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoExecute,
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
},
success: false,
},
}
for _, tc := range tests {
actual := PodFitsAnyOtherNode(tc.pod, tc.nodes)
actual := PodFitsAnyNode(tc.pod, tc.nodes)
if actual != tc.success {
t.Errorf("Test %#v failed", tc.description)
}
}
}
func createPodManifest(nodeName string, nodeSelectorKey string, nodeSelectorValue string) *v1.Pod {
return (&v1.Pod{
Spec: v1.PodSpec{
NodeName: nodeName,
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeSelectorKey,
Operator: "In",
Values: []string{
nodeSelectorValue,
},
},
},
},
},
},
},
},
},
})
}

View File

@@ -18,145 +18,34 @@ package pod
import (
"context"
"sort"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
"sigs.k8s.io/descheduler/pkg/utils"
"sort"
)
type Options struct {
filter func(pod *v1.Pod) bool
includedNamespaces []string
excludedNamespaces []string
labelSelector *metav1.LabelSelector
}
// WithFilter sets a pod filter.
// The filter function should return true if the pod should be returned from ListPodsOnANode
func WithFilter(filter func(pod *v1.Pod) bool) func(opts *Options) {
return func(opts *Options) {
opts.filter = filter
}
}
// WithNamespaces sets included namespaces
func WithNamespaces(namespaces []string) func(opts *Options) {
return func(opts *Options) {
opts.includedNamespaces = namespaces
}
}
// WithoutNamespaces sets excluded namespaces
func WithoutNamespaces(namespaces []string) func(opts *Options) {
return func(opts *Options) {
opts.excludedNamespaces = namespaces
}
}
// WithLabelSelector sets a pod label selector
func WithLabelSelector(labelSelector *metav1.LabelSelector) func(opts *Options) {
return func(opts *Options) {
opts.labelSelector = labelSelector
}
}
// ListPodsOnANode lists all of the pods on a node
// It also accepts an optional "filter" function which can be used to further limit the pods that are returned.
// (Usually this is podEvictor.Evictable().IsEvictable, in order to only list the evictable pods on a node, but can
// be used by strategies to extend it if there are further restrictions, such as with NodeAffinity).
func ListPodsOnANode(
ctx context.Context,
client clientset.Interface,
node *v1.Node,
opts ...func(opts *Options),
) ([]*v1.Pod, error) {
fieldSelectorString := "spec.nodeName=" + node.Name + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)
// (Usually this is podEvictor.IsEvictable, in order to only list the evictable pods on a node, but can
// be used by strategies to extend IsEvictable if there are further restrictions, such as with NodeAffinity).
// The filter function should return true if the pod should be returned from ListPodsOnANode
func ListPodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, filter func(pod *v1.Pod) bool) ([]*v1.Pod, error) {
fieldSelector, err := fields.ParseSelector("spec.nodeName=" + node.Name + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
if err != nil {
return []*v1.Pod{}, err
}
return ListPodsOnANodeWithFieldSelector(ctx, client, node, fieldSelectorString, opts...)
}
// ListPodsOnANodeWithFieldSelector lists all of the pods on a node using the filter selectors
func ListPodsOnANodeWithFieldSelector(
ctx context.Context,
client clientset.Interface,
node *v1.Node,
fieldSelectorString string,
opts ...func(opts *Options),
) ([]*v1.Pod, error) {
options := &Options{}
for _, opt := range opts {
opt(options)
podList, err := client.CoreV1().Pods(v1.NamespaceAll).List(ctx,
metav1.ListOptions{FieldSelector: fieldSelector.String()})
if err != nil {
return []*v1.Pod{}, err
}
pods := make([]*v1.Pod, 0)
labelSelectorString := ""
if options.labelSelector != nil {
selector, err := metav1.LabelSelectorAsSelector(options.labelSelector)
if err != nil {
return []*v1.Pod{}, err
}
labelSelectorString = selector.String()
}
if len(options.includedNamespaces) > 0 {
fieldSelector, err := fields.ParseSelector(fieldSelectorString)
if err != nil {
return []*v1.Pod{}, err
}
for _, namespace := range options.includedNamespaces {
podList, err := client.CoreV1().Pods(namespace).List(ctx,
metav1.ListOptions{
FieldSelector: fieldSelector.String(),
LabelSelector: labelSelectorString,
})
if err != nil {
return []*v1.Pod{}, err
}
for i := range podList.Items {
if options.filter != nil && !options.filter(&podList.Items[i]) {
continue
}
pods = append(pods, &podList.Items[i])
}
}
return pods, nil
}
if len(options.excludedNamespaces) > 0 {
for _, namespace := range options.excludedNamespaces {
fieldSelectorString += ",metadata.namespace!=" + namespace
}
}
fieldSelector, err := fields.ParseSelector(fieldSelectorString)
if err != nil {
return []*v1.Pod{}, err
}
// INFO(jchaloup): field selectors do not work properly with listers
// Once the descheduler switches to pod listers (through informers),
// We need to flip to client-side filtering.
podList, err := client.CoreV1().Pods(v1.NamespaceAll).List(ctx,
metav1.ListOptions{
FieldSelector: fieldSelector.String(),
LabelSelector: labelSelectorString,
})
if err != nil {
return []*v1.Pod{}, err
}
for i := range podList.Items {
// fake client does not support field selectors
// so let's filter based on the node name as well (quite cheap)
if podList.Items[i].Spec.NodeName != node.Name {
continue
}
if options.filter != nil && !options.filter(&podList.Items[i]) {
if filter != nil && !filter(&podList.Items[i]) {
continue
}
pods = append(pods, &podList.Items[i])

View File

@@ -24,7 +24,6 @@ import (
"testing"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
@@ -41,7 +40,6 @@ func TestListPodsOnANode(t *testing.T) {
name string
pods map[string][]v1.Pod
node *v1.Node
labelSelector *metav1.LabelSelector
expectedPodCount int
}{
{
@@ -54,33 +52,6 @@ func TestListPodsOnANode(t *testing.T) {
"n2": {*test.BuildTestPod("pod3", 100, 0, "n2", nil)},
},
node: test.BuildTestNode("n1", 2000, 3000, 10, nil),
labelSelector: nil,
expectedPodCount: 2,
},
{
name: "test listing pods with label selector",
pods: map[string][]v1.Pod{
"n1": {
*test.BuildTestPod("pod1", 100, 0, "n1", nil),
*test.BuildTestPod("pod2", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar"}
}),
*test.BuildTestPod("pod3", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar1"}
}),
},
"n2": {*test.BuildTestPod("pod4", 100, 0, "n2", nil)},
},
node: test.BuildTestNode("n1", 2000, 3000, 10, nil),
labelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: "foo",
Operator: metav1.LabelSelectorOpIn,
Values: []string{"bar", "bar1"},
},
},
},
expectedPodCount: 2,
},
}
@@ -96,7 +67,7 @@ func TestListPodsOnANode(t *testing.T) {
}
return true, nil, fmt.Errorf("Failed to list: %v", list)
})
pods, _ := ListPodsOnANode(context.TODO(), fakeClient, testCase.node, WithLabelSelector(testCase.labelSelector))
pods, _ := ListPodsOnANode(context.TODO(), fakeClient, testCase.node, nil)
if len(pods) != testCase.expectedPodCount {
t.Errorf("expected %v pods on node %v, got %+v", testCase.expectedPodCount, testCase.node.Name, len(pods))
}

View File

@@ -30,7 +30,7 @@ import (
func LoadPolicyConfig(policyConfigFile string) (*api.DeschedulerPolicy, error) {
if policyConfigFile == "" {
klog.V(1).InfoS("Policy config file not specified")
klog.V(1).Infof("policy config file not specified")
return nil, nil
}

View File

@@ -19,22 +19,9 @@ package scheme
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/api/v1alpha1"
"sigs.k8s.io/descheduler/pkg/apis/componentconfig"
componentconfigv1alpha1 "sigs.k8s.io/descheduler/pkg/apis/componentconfig/v1alpha1"
)
var (
Scheme = runtime.NewScheme()
Codecs = serializer.NewCodecFactory(Scheme)
)
func init() {
utilruntime.Must(api.AddToScheme(Scheme))
utilruntime.Must(v1alpha1.AddToScheme(Scheme))
utilruntime.Must(componentconfig.AddToScheme(Scheme))
utilruntime.Must(componentconfigv1alpha1.AddToScheme(Scheme))
}

View File

@@ -18,8 +18,6 @@ package strategies
import (
"context"
"fmt"
"math"
"reflect"
"sort"
"strings"
@@ -33,29 +31,8 @@ import (
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
func validateRemoveDuplicatePodsParams(params *api.StrategyParameters) error {
if params == nil {
return nil
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
type podOwner struct {
namespace, kind, name string
imagesHash string
}
// RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node.
// A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same
// namespace, and have at least one container with the same image.
@@ -67,49 +44,15 @@ func RemoveDuplicatePods(
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
) {
if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemoveDuplicatePods parameters")
return
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
var includedNamespaces, excludedNamespaces []string
if strategy.Params != nil && strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
duplicatePods := make(map[podOwner]map[string][]*v1.Pod)
ownerKeyOccurence := make(map[podOwner]int32)
nodeCount := 0
nodeMap := make(map[string]*v1.Node)
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
)
klog.V(1).Infof("Processing node: %#v", node.Name)
pods, err := podutil.ListPodsOnANode(ctx, client, node, podEvictor.IsEvictable)
if err != nil {
klog.ErrorS(err, "Error listing evictable pods on node", "node", klog.KObj(node))
klog.Errorf("error listing evictable pods on node %s: %+v", node.Name, err)
continue
}
nodeMap[node.Name] = node
nodeCount++
duplicatePods := make([]*v1.Pod, 0, len(pods))
// Each pod has a list of owners and a list of containers, and each container has 1 image spec.
// For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string.
// All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness.
@@ -126,29 +69,16 @@ func RemoveDuplicatePods(
duplicateKeysMap := map[string][][]string{}
for _, pod := range pods {
ownerRefList := podutil.OwnerRef(pod)
if hasExcludedOwnerRefKind(ownerRefList, strategy) || len(ownerRefList) == 0 {
if hasExcludedOwnerRefKind(ownerRefList, strategy) {
continue
}
podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers))
imageList := []string{}
for _, container := range pod.Spec.Containers {
imageList = append(imageList, container.Image)
}
sort.Strings(imageList)
imagesHash := strings.Join(imageList, "#")
for _, ownerRef := range ownerRefList {
ownerKey := podOwner{
namespace: pod.ObjectMeta.Namespace,
kind: ownerRef.Kind,
name: ownerRef.Name,
imagesHash: imagesHash,
}
ownerKeyOccurence[ownerKey] = ownerKeyOccurence[ownerKey] + 1
for _, image := range imageList {
for _, container := range pod.Spec.Containers {
// Namespace/Kind/Name should be unique for the cluster.
// We also consider the image, as 2 pods could have the same owner but serve different purposes
// So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod.
s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, image}, "/")
s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, container.Image}, "/")
podContainerKeys = append(podContainerKeys, s)
}
}
@@ -160,19 +90,7 @@ func RemoveDuplicatePods(
for _, keys := range existing {
if reflect.DeepEqual(keys, podContainerKeys) {
matched = true
klog.V(3).InfoS("Duplicate found", "pod", klog.KObj(pod))
for _, ownerRef := range ownerRefList {
ownerKey := podOwner{
namespace: pod.ObjectMeta.Namespace,
kind: ownerRef.Kind,
name: ownerRef.Name,
imagesHash: imagesHash,
}
if _, ok := duplicatePods[ownerKey]; !ok {
duplicatePods[ownerKey] = make(map[string][]*v1.Pod)
}
duplicatePods[ownerKey][node.Name] = append(duplicatePods[ownerKey][node.Name], pod)
}
duplicatePods = append(duplicatePods, pod)
break
}
}
@@ -185,103 +103,16 @@ func RemoveDuplicatePods(
duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys}
}
}
}
// 1. how many pods can be evicted to respect uniform placement of pods among viable nodes?
for ownerKey, podNodes := range duplicatePods {
targetNodes := getTargetNodes(podNodes, nodes)
klog.V(2).InfoS("Adjusting feasible nodes", "owner", ownerKey, "from", nodeCount, "to", len(targetNodes))
if len(targetNodes) < 2 {
klog.V(1).InfoS("Less than two feasible nodes for duplicates to land, skipping eviction", "owner", ownerKey)
continue
}
upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
for nodeName, pods := range podNodes {
klog.V(2).InfoS("Average occurrence per node", "node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
// list of duplicated pods does not contain the original referential pod
if len(pods)+1 > upperAvg {
// It's assumed all duplicated pods are in the same priority class
// TODO(jchaloup): check if the pod has a different node to lend to
for _, pod := range pods[upperAvg-1:] {
if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
}
}
for _, pod := range duplicatePods {
if _, err := podEvictor.EvictPod(ctx, pod, node, "RemoveDuplicatePods"); err != nil {
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}
}
}
func getNodeAffinityNodeSelector(pod *v1.Pod) *v1.NodeSelector {
if pod.Spec.Affinity == nil {
return nil
}
if pod.Spec.Affinity.NodeAffinity == nil {
return nil
}
return pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
}
func getTargetNodes(podNodes map[string][]*v1.Pod, nodes []*v1.Node) []*v1.Node {
// In order to reduce the number of pods processed, identify pods which have
// equal (tolerations, nodeselectors, node affinity) terms and considered them
// as identical. Identical pods wrt. (tolerations, nodeselectors, node affinity) terms
// will produce the same result when checking if a pod is feasible for a node.
// Thus, improving efficiency of processing pods marked for eviction.
// Collect all distinct pods which differ in at least taints, node affinity or node selector terms
distinctPods := map[*v1.Pod]struct{}{}
for _, pods := range podNodes {
for _, pod := range pods {
duplicated := false
for dp := range distinctPods {
if utils.TolerationsEqual(pod.Spec.Tolerations, dp.Spec.Tolerations) &&
utils.NodeSelectorsEqual(getNodeAffinityNodeSelector(pod), getNodeAffinityNodeSelector(dp)) &&
reflect.DeepEqual(pod.Spec.NodeSelector, dp.Spec.NodeSelector) {
duplicated = true
continue
}
}
if duplicated {
continue
}
distinctPods[pod] = struct{}{}
}
}
// For each distinct pod get a list of nodes where it can land
targetNodesMap := map[string]*v1.Node{}
for pod := range distinctPods {
matchingNodes := map[string]*v1.Node{}
for _, node := range nodes {
if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
}) {
continue
}
if match, err := utils.PodMatchNodeSelector(pod, node); err == nil && !match {
continue
}
matchingNodes[node.Name] = node
}
if len(matchingNodes) > 1 {
for nodeName := range matchingNodes {
targetNodesMap[nodeName] = matchingNodes[nodeName]
}
}
}
targetNodes := []*v1.Node{}
for _, node := range targetNodesMap {
targetNodes = append(targetNodes, node)
}
return targetNodes
}
func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool {
if strategy.Params == nil || strategy.Params.RemoveDuplicates == nil {
return false

View File

@@ -20,10 +20,8 @@ import (
"context"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
@@ -33,73 +31,35 @@ import (
"sigs.k8s.io/descheduler/test"
)
func buildTestPodWithImage(podName, node, image string) *v1.Pod {
pod := test.BuildTestPod(podName, 100, 0, node, test.SetRSOwnerRef)
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
Name: image,
Image: image,
})
return pod
}
func TestFindDuplicatePods(t *testing.T) {
ctx := context.Background()
// first setup pods
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: "hardware",
Value: "gpu",
Effect: v1.TaintEffectNoSchedule,
},
}
})
node4 := test.BuildTestNode("n4", 2000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"datacenter": "east",
}
})
node5 := test.BuildTestNode("n5", 2000, 3000, 10, func(node *v1.Node) {
node.Spec = v1.NodeSpec{
Unschedulable: true,
}
})
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
node := test.BuildTestNode("n1", 2000, 3000, 10, nil)
p1 := test.BuildTestPod("p1", 100, 0, node.Name, nil)
p1.Namespace = "dev"
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil)
p2.Namespace = "dev"
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil)
p3.Namespace = "dev"
p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil)
p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil)
p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil)
p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil)
p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil)
p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil)
p7.Namespace = "kube-system"
p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil)
p8 := test.BuildTestPod("p8", 100, 0, node.Name, nil)
p8.Namespace = "test"
p9 := test.BuildTestPod("p9", 100, 0, node1.Name, nil)
p9 := test.BuildTestPod("p9", 100, 0, node.Name, nil)
p9.Namespace = "test"
p10 := test.BuildTestPod("p10", 100, 0, node1.Name, nil)
p10 := test.BuildTestPod("p10", 100, 0, node.Name, nil)
p10.Namespace = "test"
p11 := test.BuildTestPod("p11", 100, 0, node1.Name, nil)
p11 := test.BuildTestPod("p11", 100, 0, node.Name, nil)
p11.Namespace = "different-images"
p12 := test.BuildTestPod("p12", 100, 0, node1.Name, nil)
p12 := test.BuildTestPod("p12", 100, 0, node.Name, nil)
p12.Namespace = "different-images"
p13 := test.BuildTestPod("p13", 100, 0, node1.Name, nil)
p13 := test.BuildTestPod("p13", 100, 0, node.Name, nil)
p13.Namespace = "different-images"
p14 := test.BuildTestPod("p14", 100, 0, node1.Name, nil)
p14 := test.BuildTestPod("p14", 100, 0, node.Name, nil)
p14.Namespace = "different-images"
p15 := test.BuildTestPod("p15", 100, 0, node1.Name, nil)
p15.Namespace = "node-fit"
p16 := test.BuildTestPod("NOT1", 100, 0, node1.Name, nil)
p16.Namespace = "node-fit"
p17 := test.BuildTestPod("NOT2", 100, 0, node1.Name, nil)
p17.Namespace = "node-fit"
p18 := test.BuildTestPod("TARGET", 100, 0, node1.Name, nil)
p18.Namespace = "node-fit"
// ### Evictable Pods ###
@@ -153,69 +113,45 @@ func TestFindDuplicatePods(t *testing.T) {
Image: "foo",
})
// ### Pods Evictable Based On Node Fit ###
ownerRef3 := test.GetReplicaSetOwnerRefList()
p15.ObjectMeta.OwnerReferences = ownerRef3
p16.ObjectMeta.OwnerReferences = ownerRef3
p17.ObjectMeta.OwnerReferences = ownerRef3
p18.ObjectMeta.OwnerReferences = ownerRef3
p15.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
p16.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
p17.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
testCases := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
strategy api.DeschedulerStrategy
}{
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 1 should be evicted.",
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 2 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 1,
expectedEvictedPodCount: 2,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}},
},
{
description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 1 should be evicted.",
description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 2 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p8, *p9, *p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 1,
expectedEvictedPodCount: 2,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 2,
expectedEvictedPodCount: 4,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.",
maxPodsToEvictPerNode: 2,
pods: []v1.Pod{*p4, *p5, *p6, *p7},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
@@ -223,15 +159,13 @@ func TestFindDuplicatePods(t *testing.T) {
description: "Test all Pods: 4 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 2,
expectedEvictedPodCount: 4,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods with the same owner but different images should not be evicted",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11, *p12},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
@@ -239,7 +173,6 @@ func TestFindDuplicatePods(t *testing.T) {
description: "Pods with multiple containers should not match themselves",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p13},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
@@ -247,466 +180,33 @@ func TestFindDuplicatePods(t *testing.T) {
description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11, *p13},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. Only node available has a taint, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet, all with a nodeSelector. Only node available has an incorrect node label, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p15, *p16, *p17},
nodes: []*v1.Node{node1, node4},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available is not schedulable, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node5},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: testCase.pods}, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
testCase.maxPodsToEvictPerNode,
testCase.nodes,
false,
false,
false,
)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != testCase.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)
}
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: testCase.pods}, nil
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
return true, node, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
testCase.maxPodsToEvictPerNode,
[]*v1.Node{node},
false,
)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node}, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != testCase.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)
}
}
}
func TestRemoveDuplicatesUniformly(t *testing.T) {
ctx := context.Background()
setRSOwnerRef2 := func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = []metav1.OwnerReference{
{Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-2"},
}
}
setTwoRSOwnerRef := func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = []metav1.OwnerReference{
{Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-1"},
{Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-2"},
}
}
setTolerationsK1 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
pod.Spec.Tolerations = []v1.Toleration{
{
Key: "k1",
Value: "v1",
Operator: v1.TolerationOpEqual,
Effect: v1.TaintEffectNoSchedule,
},
}
}
setTolerationsK2 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
pod.Spec.Tolerations = []v1.Toleration{
{
Key: "k2",
Value: "v2",
Operator: v1.TolerationOpEqual,
Effect: v1.TaintEffectNoSchedule,
},
}
}
setMasterNoScheduleTaint := func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Effect: v1.TaintEffectNoSchedule,
Key: "node-role.kubernetes.io/master",
},
}
}
setMasterNoScheduleLabel := func(node *v1.Node) {
if node.ObjectMeta.Labels == nil {
node.ObjectMeta.Labels = map[string]string{}
}
node.ObjectMeta.Labels["node-role.kubernetes.io/master"] = ""
}
setWorkerLabel := func(node *v1.Node) {
if node.ObjectMeta.Labels == nil {
node.ObjectMeta.Labels = map[string]string{}
}
node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k1"
node.ObjectMeta.Labels["node-role.kubernetes.io/worker"] = "k2"
}
setNotMasterNodeSelectorK1 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
pod.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "node-role.kubernetes.io/master",
Operator: v1.NodeSelectorOpDoesNotExist,
},
{
Key: "k1",
Operator: v1.NodeSelectorOpDoesNotExist,
},
},
},
},
},
},
}
}
setNotMasterNodeSelectorK2 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
pod.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "node-role.kubernetes.io/master",
Operator: v1.NodeSelectorOpDoesNotExist,
},
{
Key: "k2",
Operator: v1.NodeSelectorOpDoesNotExist,
},
},
},
},
},
},
}
}
setWorkerLabelSelectorK1 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
if pod.Spec.NodeSelector == nil {
pod.Spec.NodeSelector = map[string]string{}
}
pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k1"
}
setWorkerLabelSelectorK2 := func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
if pod.Spec.NodeSelector == nil {
pod.Spec.NodeSelector = map[string]string{}
}
pod.Spec.NodeSelector["node-role.kubernetes.io/worker"] = "k2"
}
testCases := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
strategy api.DeschedulerStrategy
}{
{
description: "Evict pods uniformly",
pods: []v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly with one node left out",
pods: []v1.Pod{
// (5,3,1) -> (4,4,1) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly with two replica sets",
pods: []v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p11", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p12", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p13", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p14", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p15", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p16", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p17", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p18", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p19", 100, 0, "n3", setTwoRSOwnerRef),
},
expectedEvictedPodCount: 4,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly with two owner references",
pods: []v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p11", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p12", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p13", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p14", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p15", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p16", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p17", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p18", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p19", 100, 0, "n3", test.SetRSOwnerRef),
// (1,3,5) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p21", 100, 0, "n1", setRSOwnerRef2),
*test.BuildTestPod("p22", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p23", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p24", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p25", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p26", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p27", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p28", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p29", 100, 0, "n3", setRSOwnerRef2),
},
expectedEvictedPodCount: 4,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods with number of pods less than nodes",
pods: []v1.Pod{
// (2,0,0) -> (1,1,0) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods with number of pods less than nodes, but ignore different pods with the same ownerref",
pods: []v1.Pod{
// (1, 0, 0) for "bar","baz" images -> no eviction, even with a matching ownerKey
// (2, 0, 0) for "foo" image -> (1,1,0) - 1 eviction
// In this case the only "real" duplicates are p1 and p4, so one of those should be evicted
*buildTestPodWithImage("p1", "n1", "foo"),
*buildTestPodWithImage("p2", "n1", "bar"),
*buildTestPodWithImage("p3", "n1", "baz"),
*buildTestPodWithImage("p4", "n1", "foo"),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods with a single pod with three nodes",
pods: []v1.Pod{
// (2,0,0) -> (1,1,0) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 0,
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, nil),
test.BuildTestNode("n2", 2000, 3000, 10, nil),
test.BuildTestNode("n3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly respecting taints",
pods: []v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setTolerationsK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setTolerationsK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setTolerationsK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setTolerationsK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setTolerationsK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setTolerationsK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
test.BuildTestNode("worker1", 2000, 3000, 10, nil),
test.BuildTestNode("worker2", 2000, 3000, 10, nil),
test.BuildTestNode("worker3", 2000, 3000, 10, nil),
test.BuildTestNode("master1", 2000, 3000, 10, setMasterNoScheduleTaint),
test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleTaint),
test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleTaint),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly respecting RequiredDuringSchedulingIgnoredDuringExecution node affinity",
pods: []v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setNotMasterNodeSelectorK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
test.BuildTestNode("worker1", 2000, 3000, 10, nil),
test.BuildTestNode("worker2", 2000, 3000, 10, nil),
test.BuildTestNode("worker3", 2000, 3000, 10, nil),
test.BuildTestNode("master1", 2000, 3000, 10, setMasterNoScheduleLabel),
test.BuildTestNode("master2", 2000, 3000, 10, setMasterNoScheduleLabel),
test.BuildTestNode("master3", 2000, 3000, 10, setMasterNoScheduleLabel),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly respecting node selector",
pods: []v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
test.BuildTestNode("worker1", 2000, 3000, 10, setWorkerLabel),
test.BuildTestNode("worker2", 2000, 3000, 10, setWorkerLabel),
test.BuildTestNode("worker3", 2000, 3000, 10, setWorkerLabel),
test.BuildTestNode("master1", 2000, 3000, 10, nil),
test.BuildTestNode("master2", 2000, 3000, 10, nil),
test.BuildTestNode("master3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
{
description: "Evict pods uniformly respecting node selector with zero target nodes",
pods: []v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
},
expectedEvictedPodCount: 0,
nodes: []*v1.Node{
test.BuildTestNode("worker1", 2000, 3000, 10, nil),
test.BuildTestNode("worker2", 2000, 3000, 10, nil),
test.BuildTestNode("worker3", 2000, 3000, 10, nil),
test.BuildTestNode("master1", 2000, 3000, 10, nil),
test.BuildTestNode("master2", 2000, 3000, 10, nil),
test.BuildTestNode("master3", 2000, 3000, 10, nil),
},
strategy: api.DeschedulerStrategy{},
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: testCase.pods}, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
testCase.maxPodsToEvictPerNode,
testCase.nodes,
false,
false,
false,
)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != testCase.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)
}
})
}
}

View File

@@ -1,171 +0,0 @@
package strategies
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/sets"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/validation"
)
// validatedFailedPodsStrategyParams contains validated strategy parameters
type validatedFailedPodsStrategyParams struct {
validation.ValidatedStrategyParams
includingInitContainers bool
reasons sets.String
excludeOwnerKinds sets.String
minPodLifetimeSeconds *uint
}
// RemoveFailedPods removes Pods that are in failed status phase.
func RemoveFailedPods(
ctx context.Context,
client clientset.Interface,
strategy api.DeschedulerStrategy,
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
) {
strategyParams, err := validateAndParseRemoveFailedPodsParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Invalid RemoveFailedPods parameters")
return
}
evictable := podEvictor.Evictable(
evictions.WithPriorityThreshold(strategyParams.ThresholdPriority),
evictions.WithNodeFit(strategyParams.NodeFit),
evictions.WithLabelSelector(strategyParams.LabelSelector),
)
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
labelSelector = strategy.Params.LabelSelector
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
fieldSelectorString := "spec.nodeName=" + node.Name + ",status.phase=" + string(v1.PodFailed)
pods, err := podutil.ListPodsOnANodeWithFieldSelector(
ctx,
client,
node,
fieldSelectorString,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(strategyParams.IncludedNamespaces.UnsortedList()),
podutil.WithoutNamespaces(strategyParams.ExcludedNamespaces.UnsortedList()),
podutil.WithLabelSelector(labelSelector),
)
if err != nil {
klog.ErrorS(err, "Error listing a nodes failed pods", "node", klog.KObj(node))
continue
}
for i, pod := range pods {
if err = validateFailedPodShouldEvict(pod, *strategyParams); err != nil {
klog.V(4).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(pod))
continue
}
if _, err = podEvictor.EvictPod(ctx, pods[i], node, "FailedPod"); err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
}
}
}
}
func validateAndParseRemoveFailedPodsParams(
ctx context.Context,
client clientset.Interface,
params *api.StrategyParameters,
) (*validatedFailedPodsStrategyParams, error) {
if params == nil {
return &validatedFailedPodsStrategyParams{
ValidatedStrategyParams: validation.DefaultValidatedStrategyParams(),
}, nil
}
strategyParams, err := validation.ValidateAndParseStrategyParams(ctx, client, params)
if err != nil {
return nil, err
}
var reasons, excludeOwnerKinds sets.String
var includingInitContainers bool
var minPodLifetimeSeconds *uint
if params.FailedPods != nil {
reasons = sets.NewString(params.FailedPods.Reasons...)
includingInitContainers = params.FailedPods.IncludingInitContainers
excludeOwnerKinds = sets.NewString(params.FailedPods.ExcludeOwnerKinds...)
minPodLifetimeSeconds = params.FailedPods.MinPodLifetimeSeconds
}
return &validatedFailedPodsStrategyParams{
ValidatedStrategyParams: *strategyParams,
includingInitContainers: includingInitContainers,
reasons: reasons,
excludeOwnerKinds: excludeOwnerKinds,
minPodLifetimeSeconds: minPodLifetimeSeconds,
}, nil
}
// validateFailedPodShouldEvict looks at strategy params settings to see if the Pod
// should be evicted given the params in the PodFailed policy.
func validateFailedPodShouldEvict(pod *v1.Pod, strategyParams validatedFailedPodsStrategyParams) error {
var errs []error
if strategyParams.minPodLifetimeSeconds != nil {
podAgeSeconds := uint(metav1.Now().Sub(pod.GetCreationTimestamp().Local()).Seconds())
if podAgeSeconds < *strategyParams.minPodLifetimeSeconds {
errs = append(errs, fmt.Errorf("pod does not exceed the min age seconds of %d", *strategyParams.minPodLifetimeSeconds))
}
}
if len(strategyParams.excludeOwnerKinds) > 0 {
ownerRefList := podutil.OwnerRef(pod)
for _, owner := range ownerRefList {
if strategyParams.excludeOwnerKinds.Has(owner.Kind) {
errs = append(errs, fmt.Errorf("pod's owner kind of %s is excluded", owner.Kind))
}
}
}
if len(strategyParams.reasons) > 0 {
reasons := getFailedContainerStatusReasons(pod.Status.ContainerStatuses)
if strategyParams.includingInitContainers {
reasons = append(reasons, getFailedContainerStatusReasons(pod.Status.InitContainerStatuses)...)
}
if !strategyParams.reasons.HasAny(reasons...) {
errs = append(errs, fmt.Errorf("pod does not match any of the reasons"))
}
}
return utilerrors.NewAggregate(errs)
}
func getFailedContainerStatusReasons(containerStatuses []v1.ContainerStatus) []string {
reasons := make([]string, 0)
for _, containerStatus := range containerStatuses {
if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason != "" {
reasons = append(reasons, containerStatus.State.Waiting.Reason)
}
if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.Reason != "" {
reasons = append(reasons, containerStatus.State.Terminated.Reason)
}
}
return reasons
}

View File

@@ -1,282 +0,0 @@
package strategies
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/test"
)
var (
OneHourInSeconds uint = 3600
)
func TestRemoveFailedPods(t *testing.T) {
ctx := context.Background()
createStrategy := func(enabled, includingInitContainers bool, reasons, excludeKinds []string, minAgeSeconds *uint, nodeFit bool) api.DeschedulerStrategy {
return api.DeschedulerStrategy{
Enabled: enabled,
Params: &api.StrategyParameters{
FailedPods: &api.FailedPods{
Reasons: reasons,
IncludingInitContainers: includingInitContainers,
ExcludeOwnerKinds: excludeKinds,
MinPodLifetimeSeconds: minAgeSeconds,
},
NodeFit: nodeFit,
},
}
}
tests := []struct {
description string
nodes []*v1.Node
strategy api.DeschedulerStrategy
expectedEvictedPodCount int
pods []v1.Pod
}{
{
description: "default empty strategy, 0 failures, 0 evictions",
strategy: api.DeschedulerStrategy{},
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{}, // no pods come back with field selector phase=Failed
},
{
description: "0 failures, 0 evictions",
strategy: createStrategy(true, false, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{}, // no pods come back with field selector phase=Failed
},
{
description: "1 container terminated with reason NodeAffinity, 1 eviction",
strategy: createStrategy(true, false, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}),
},
},
{
description: "1 init container terminated with reason NodeAffinity, 1 eviction",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil),
},
},
{
description: "1 init container waiting with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "CreateContainerConfigError"},
}, nil),
},
},
{
description: "2 init container waiting with reason CreateContainerConfigError, 2 nodes, 2 evictions",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{
test.BuildTestNode("node1", 2000, 3000, 10, nil),
test.BuildTestNode("node2", 2000, 3000, 10, nil),
},
expectedEvictedPodCount: 2,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}, nil),
buildTestPod("p2", "node2", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}, nil),
},
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}),
},
},
{
description: "include reason=CreateContainerConfigError+NodeAffinity, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}),
},
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}),
},
},
{
description: "include init container=false, 1 init container waiting with reason CreateContainerConfigError, 0 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "CreateContainerConfigError"},
}, nil),
},
},
{
description: "lifetime 1 hour, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, nil, nil, &OneHourInSeconds, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}),
},
},
{
description: "nodeFit=true, 1 unschedulable node, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, nil, nil, nil, true),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, func(node *v1.Node) {
node.Spec.Unschedulable = true
})},
expectedEvictedPodCount: 0,
pods: []v1.Pod{
buildTestPod("p1", "node1", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}),
},
},
{
description: "excluded owner kind=ReplicaSet, 1 init container terminated with owner kind=ReplicaSet, 0 eviction",
strategy: createStrategy(true, true, nil, []string{"ReplicaSet"}, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil),
},
},
{
description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 eviction",
strategy: createStrategy(true, true, nil, []string{"DaemonSet"}, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []v1.Pod{
buildTestPod("p1", "node1", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil),
},
},
}
for _, tc := range tests {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: tc.pods}, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
100,
tc.nodes,
false,
false,
false,
)
RemoveFailedPods(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
}
}
func TestValidRemoveFailedPodsParams(t *testing.T) {
ctx := context.Background()
fakeClient := &fake.Clientset{}
testCases := []struct {
name string
params *api.StrategyParameters
}{
{name: "validate nil params", params: nil},
{name: "validate empty params", params: &api.StrategyParameters{}},
{name: "validate reasons params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
Reasons: []string{"CreateContainerConfigError"},
}}},
{name: "validate includingInitContainers params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
IncludingInitContainers: true,
}}},
{name: "validate excludeOwnerKinds params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
ExcludeOwnerKinds: []string{"Job"},
}}},
{name: "validate excludeOwnerKinds params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
MinPodLifetimeSeconds: &OneHourInSeconds,
}}},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
params, err := validateAndParseRemoveFailedPodsParams(ctx, fakeClient, tc.params)
if err != nil {
t.Errorf("strategy params should be valid but got err: %v", err.Error())
}
if params == nil {
t.Errorf("strategy params should return a ValidatedFailedPodsStrategyParams but got nil")
}
})
}
}
func buildTestPod(podName, nodeName string, initContainerState, containerState *v1.ContainerState) v1.Pod {
pod := test.BuildTestPod(podName, 1, 1, nodeName, func(p *v1.Pod) {
ps := v1.PodStatus{}
if initContainerState != nil {
ps.InitContainerStatuses = []v1.ContainerStatus{{State: *initContainerState}}
ps.Phase = v1.PodFailed
}
if containerState != nil {
ps.ContainerStatuses = []v1.ContainerStatus{{State: *containerState}}
ps.Phase = v1.PodFailed
}
p.Status = ps
})
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
pod.ObjectMeta.SetCreationTimestamp(metav1.Now())
return *pod
}

View File

@@ -0,0 +1,408 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package strategies
import (
"context"
"fmt"
"sort"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
// NodeUsageMap stores a node's info, pods on it and its resource usage
type NodeUsageMap struct {
node *v1.Node
usage api.ResourceThresholds
allPods []*v1.Pod
}
// NodePodsMap is a set of (node, pods) pairs
type NodePodsMap map[*v1.Node][]*v1.Pod
const (
// MinResourcePercentage is the minimum value of a resource's percentage
MinResourcePercentage = 0
// MaxResourcePercentage is the maximum value of a resource's percentage
MaxResourcePercentage = 100
)
// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used
// to calculate nodes' utilization and not the actual resource usage.
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
// todo: move to config validation?
// TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params?
if strategy.Params == nil || strategy.Params.NodeResourceUtilizationThresholds == nil {
klog.V(1).Infof("NodeResourceUtilizationThresholds not set")
return
}
thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if err := validateStrategyConfig(thresholds, targetThresholds); err != nil {
klog.Errorf("LowNodeUtilization config is not valid: %v", err)
return
}
// check if Pods/CPU/Mem are set, if not, set them to 100
if _, ok := thresholds[v1.ResourcePods]; !ok {
thresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceCPU]; !ok {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceMemory]; !ok {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
}
npm := createNodePodsMap(ctx, client, nodes)
lowNodes, targetNodes := classifyNodes(npm, thresholds, targetThresholds)
klog.V(1).Infof("Criteria for a node under utilization: CPU: %v, Mem: %v, Pods: %v",
thresholds[v1.ResourceCPU], thresholds[v1.ResourceMemory], thresholds[v1.ResourcePods])
if len(lowNodes) == 0 {
klog.V(1).Infof("No node is underutilized, nothing to do here, you might tune your thresholds further")
return
}
klog.V(1).Infof("Total number of underutilized nodes: %v", len(lowNodes))
if len(lowNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).Infof("number of nodes underutilized (%v) is less than NumberOfNodes (%v), nothing to do here", len(lowNodes), strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
return
}
if len(lowNodes) == len(nodes) {
klog.V(1).Infof("all nodes are underutilized, nothing to do here")
return
}
if len(targetNodes) == 0 {
klog.V(1).Infof("all nodes are under target utilization, nothing to do here")
return
}
klog.V(1).Infof("Criteria for a node above target utilization: CPU: %v, Mem: %v, Pods: %v",
targetThresholds[v1.ResourceCPU], targetThresholds[v1.ResourceMemory], targetThresholds[v1.ResourcePods])
klog.V(1).Infof("Total number of nodes above target utilization: %v", len(targetNodes))
evictPodsFromTargetNodes(
ctx,
targetNodes,
lowNodes,
targetThresholds,
podEvictor)
klog.V(1).Infof("Total number of pods evicted: %v", podEvictor.TotalEvicted())
}
// validateStrategyConfig checks if the strategy's config is valid
func validateStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
// validate thresholds and targetThresholds config
if err := validateThresholds(thresholds); err != nil {
return fmt.Errorf("thresholds config is not valid: %v", err)
}
if err := validateThresholds(targetThresholds); err != nil {
return fmt.Errorf("targetThresholds config is not valid: %v", err)
}
// validate if thresholds and targetThresholds have same resources configured
if len(thresholds) != len(targetThresholds) {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
}
for resourceName, value := range thresholds {
if targetValue, ok := targetThresholds[resourceName]; !ok {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
} else if value > targetValue {
return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName)
}
}
return nil
}
// validateThresholds checks if thresholds have valid resource name and resource percentage configured
func validateThresholds(thresholds api.ResourceThresholds) error {
if thresholds == nil || len(thresholds) == 0 {
return fmt.Errorf("no resource threshold is configured")
}
for name, percent := range thresholds {
switch name {
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods:
if percent < MinResourcePercentage || percent > MaxResourcePercentage {
return fmt.Errorf("%v threshold not in [%v, %v] range", name, MinResourcePercentage, MaxResourcePercentage)
}
default:
return fmt.Errorf("only cpu, memory, or pods thresholds can be specified")
}
}
return nil
}
// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
// low and high thresholds, it is simply ignored.
func classifyNodes(npm NodePodsMap, thresholds api.ResourceThresholds, targetThresholds api.ResourceThresholds) ([]NodeUsageMap, []NodeUsageMap) {
lowNodes, targetNodes := []NodeUsageMap{}, []NodeUsageMap{}
for node, pods := range npm {
usage := nodeUtilization(node, pods)
nuMap := NodeUsageMap{
node: node,
usage: usage,
allPods: pods,
}
// Check if node is underutilized and if we can schedule pods on it.
if !nodeutil.IsNodeUnschedulable(node) && isNodeWithLowUtilization(usage, thresholds) {
klog.V(2).Infof("Node %#v is under utilized with usage: %#v", node.Name, usage)
lowNodes = append(lowNodes, nuMap)
} else if isNodeAboveTargetUtilization(usage, targetThresholds) {
klog.V(2).Infof("Node %#v is over utilized with usage: %#v", node.Name, usage)
targetNodes = append(targetNodes, nuMap)
} else {
klog.V(2).Infof("Node %#v is appropriately utilized with usage: %#v", node.Name, usage)
}
}
return lowNodes, targetNodes
}
// evictPodsFromTargetNodes evicts pods based on priority, if all the pods on the node have priority, if not
// evicts them based on QoS as fallback option.
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromTargetNodes(
ctx context.Context,
targetNodes, lowNodes []NodeUsageMap,
targetThresholds api.ResourceThresholds,
podEvictor *evictions.PodEvictor,
) {
sortNodesByUsage(targetNodes)
// upper bound on total number of pods/cpu/memory to be moved
var totalPods, totalCPU, totalMem float64
var taintsOfLowNodes = make(map[string][]v1.Taint, len(lowNodes))
for _, node := range lowNodes {
taintsOfLowNodes[node.node.Name] = node.node.Spec.Taints
nodeCapacity := node.node.Status.Capacity
if len(node.node.Status.Allocatable) > 0 {
nodeCapacity = node.node.Status.Allocatable
}
// totalPods to be moved
podsPercentage := targetThresholds[v1.ResourcePods] - node.usage[v1.ResourcePods]
totalPods += ((float64(podsPercentage) * float64(nodeCapacity.Pods().Value())) / 100)
// totalCPU capacity to be moved
cpuPercentage := targetThresholds[v1.ResourceCPU] - node.usage[v1.ResourceCPU]
totalCPU += ((float64(cpuPercentage) * float64(nodeCapacity.Cpu().MilliValue())) / 100)
// totalMem capacity to be moved
memPercentage := targetThresholds[v1.ResourceMemory] - node.usage[v1.ResourceMemory]
totalMem += ((float64(memPercentage) * float64(nodeCapacity.Memory().Value())) / 100)
}
klog.V(1).Infof("Total capacity to be moved: CPU:%v, Mem:%v, Pods:%v", totalCPU, totalMem, totalPods)
klog.V(1).Infof("********Number of pods evicted from each node:***********")
for _, node := range targetNodes {
nodeCapacity := node.node.Status.Capacity
if len(node.node.Status.Allocatable) > 0 {
nodeCapacity = node.node.Status.Allocatable
}
klog.V(3).Infof("evicting pods from node %#v with usage: %#v", node.node.Name, node.usage)
nonRemovablePods, removablePods := classifyPods(node.allPods, podEvictor)
klog.V(2).Infof("allPods:%v, nonRemovablePods:%v, removablePods:%v", len(node.allPods), len(nonRemovablePods), len(removablePods))
if len(removablePods) == 0 {
klog.V(1).Infof("no removable pods on node %#v, try next node", node.node.Name)
continue
}
klog.V(1).Infof("evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
// sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
evictPods(ctx, removablePods, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, taintsOfLowNodes, podEvictor, node.node)
klog.V(1).Infof("%v pods evicted from node %#v with usage %v", podEvictor.NodeEvicted(node.node), node.node.Name, node.usage)
}
}
func evictPods(
ctx context.Context,
inputPods []*v1.Pod,
targetThresholds api.ResourceThresholds,
nodeCapacity v1.ResourceList,
nodeUsage api.ResourceThresholds,
totalPods *float64,
totalCPU *float64,
totalMem *float64,
taintsOfLowNodes map[string][]v1.Taint,
podEvictor *evictions.PodEvictor,
node *v1.Node) {
// stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved
if isNodeAboveTargetUtilization(nodeUsage, targetThresholds) && *totalPods > 0 && *totalCPU > 0 && *totalMem > 0 {
onePodPercentage := api.Percentage((float64(1) * 100) / float64(nodeCapacity.Pods().Value()))
for _, pod := range inputPods {
if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
klog.V(3).Infof("Skipping eviction for Pod: %#v, doesn't tolerate node taint", pod.Name)
continue
}
cUsage := utils.GetResourceRequest(pod, v1.ResourceCPU)
mUsage := utils.GetResourceRequest(pod, v1.ResourceMemory)
success, err := podEvictor.EvictPod(ctx, pod, node, "LowNodeUtilization")
if err != nil {
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
if success {
klog.V(3).Infof("Evicted pod: %#v", pod.Name)
// update remaining pods
nodeUsage[v1.ResourcePods] -= onePodPercentage
*totalPods--
// update remaining cpu
*totalCPU -= float64(cUsage)
nodeUsage[v1.ResourceCPU] -= api.Percentage((float64(cUsage) * 100) / float64(nodeCapacity.Cpu().MilliValue()))
// update remaining memory
*totalMem -= float64(mUsage)
nodeUsage[v1.ResourceMemory] -= api.Percentage(float64(mUsage) / float64(nodeCapacity.Memory().Value()) * 100)
klog.V(3).Infof("updated node usage: %#v", nodeUsage)
// check if node utilization drops below target threshold or any required capacity (cpu, memory, pods) is moved
if !isNodeAboveTargetUtilization(nodeUsage, targetThresholds) || *totalPods <= 0 || *totalCPU <= 0 || *totalMem <= 0 {
break
}
}
}
}
}
// sortNodesByUsage sorts nodes based on usage in descending order
func sortNodesByUsage(nodes []NodeUsageMap) {
sort.Slice(nodes, func(i, j int) bool {
var ti, tj api.Percentage
for name, value := range nodes[i].usage {
if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
ti += value
}
}
for name, value := range nodes[j].usage {
if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
tj += value
}
}
// To return sorted in descending order
return ti > tj
})
}
// createNodePodsMap returns nodepodsmap with evictable pods on node.
func createNodePodsMap(ctx context.Context, client clientset.Interface, nodes []*v1.Node) NodePodsMap {
npm := NodePodsMap{}
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(ctx, client, node, nil)
if err != nil {
klog.Warningf("node %s will not be processed, error in accessing its pods (%#v)", node.Name, err)
} else {
npm[node] = pods
}
}
return npm
}
// isNodeAboveTargetUtilization checks if a node is overutilized
func isNodeAboveTargetUtilization(nodeThresholds api.ResourceThresholds, thresholds api.ResourceThresholds) bool {
for name, nodeValue := range nodeThresholds {
if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
if value, ok := thresholds[name]; !ok {
continue
} else if nodeValue > value {
return true
}
}
}
return false
}
// isNodeWithLowUtilization checks if a node is underutilized
func isNodeWithLowUtilization(nodeThresholds api.ResourceThresholds, thresholds api.ResourceThresholds) bool {
for name, nodeValue := range nodeThresholds {
if name == v1.ResourceCPU || name == v1.ResourceMemory || name == v1.ResourcePods {
if value, ok := thresholds[name]; !ok {
continue
} else if nodeValue > value {
return false
}
}
}
return true
}
func nodeUtilization(node *v1.Node, pods []*v1.Pod) api.ResourceThresholds {
totalReqs := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: {},
v1.ResourceMemory: {},
}
for _, pod := range pods {
req, _ := utils.PodRequestsAndLimits(pod)
for name, quantity := range req {
if name == v1.ResourceCPU || name == v1.ResourceMemory {
// As Quantity.Add says: Add adds the provided y quantity to the current value. If the current value is zero,
// the format of the quantity will be updated to the format of y.
totalReqs[name].Add(quantity)
}
}
}
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
totalPods := len(pods)
return api.ResourceThresholds{
v1.ResourceCPU: api.Percentage((float64(totalReqs[v1.ResourceCPU].MilliValue()) * 100) / float64(nodeCapacity.Cpu().MilliValue())),
v1.ResourceMemory: api.Percentage(float64(totalReqs[v1.ResourceMemory].Value()) / float64(nodeCapacity.Memory().Value()) * 100),
v1.ResourcePods: api.Percentage((float64(totalPods) * 100) / float64(nodeCapacity.Pods().Value())),
}
}
func classifyPods(pods []*v1.Pod, evictor *evictions.PodEvictor) ([]*v1.Pod, []*v1.Pod) {
var nonRemovablePods, removablePods []*v1.Pod
for _, pod := range pods {
if !evictor.IsEvictable(pod) {
nonRemovablePods = append(nonRemovablePods, pod)
} else {
removablePods = append(removablePods, pod)
}
}
return nonRemovablePods, removablePods
}

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
package strategies
import (
"context"
@@ -23,10 +23,13 @@ import (
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/policy/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
@@ -35,16 +38,17 @@ import (
"sigs.k8s.io/descheduler/test"
)
var (
lowPriority = int32(0)
highPriority = int32(10000)
)
func TestLowNodeUtilization(t *testing.T) {
ctx := context.Background()
n1NodeName := "n1"
n2NodeName := "n2"
n3NodeName := "n3"
nodeSelectorKey := "datacenter"
nodeSelectorValue := "west"
notMatchingNodeSelectorValue := "east"
testCases := []struct {
name string
thresholds, targetThresholds api.ResourceThresholds
@@ -103,7 +107,7 @@ func TestLowNodeUtilization(t *testing.T) {
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
@@ -162,7 +166,7 @@ func TestLowNodeUtilization(t *testing.T) {
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
@@ -221,7 +225,7 @@ func TestLowNodeUtilization(t *testing.T) {
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 2100, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 2100, n1NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
@@ -300,7 +304,7 @@ func TestLowNodeUtilization(t *testing.T) {
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
@@ -376,7 +380,7 @@ func TestLowNodeUtilization(t *testing.T) {
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
@@ -385,353 +389,6 @@ func TestLowNodeUtilization(t *testing.T) {
expectedPodsEvicted: 4,
evictedPods: []string{"p1", "p2", "p4", "p5"},
},
{
name: "with extended resource",
thresholds: api.ResourceThresholds{
v1.ResourcePods: 30,
extendedResource: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourcePods: 50,
extendedResource: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 0, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with extended resource.
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p2", 0, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p3", 0, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p4", 0, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p5", 0, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p6", 0, 0, n1NodeName, func(pod *v1.Pod) {
test.SetNormalOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p7", 0, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p8", 0, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 0, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n3NodeName: {},
},
maxPodsToEvictPerNode: 0,
// 4 pods available for eviction based on v1.ResourcePods, only 3 pods can be evicted before extended resource is depleted
expectedPodsEvicted: 3,
},
{
name: "with extended resource in some of nodes",
thresholds: api.ResourceThresholds{
v1.ResourcePods: 30,
extendedResource: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourcePods: 50,
extendedResource: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 0, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with extended resource.
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {},
},
maxPodsToEvictPerNode: 0,
// 0 pods available for eviction because there's no enough extended resource in node2
expectedPodsEvicted: 0,
},
{
name: "without priorities, but only other node is unschedulable",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
{
name: "without priorities, but only other node doesn't match pod node selector for p4 and p5",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: notMatchingNodeSelectorValue,
}
}),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p4", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetNormalOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
*test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetNormalOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
*test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 3,
},
{
name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: notMatchingNodeSelectorValue,
}
}),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p4", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with affinity to run in the "west" datacenter upon scheduling
test.SetNormalOwnerRef(pod)
pod.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeSelectorKey,
Operator: "In",
Values: []string{nodeSelectorValue},
},
},
},
},
},
},
}
}),
*test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with affinity to run in the "west" datacenter upon scheduling
test.SetNormalOwnerRef(pod)
pod.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeSelectorKey,
Operator: "In",
Values: []string{nodeSelectorValue},
},
},
},
},
},
},
}
}),
*test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {Items: []v1.Pod{
*test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
}},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 3,
},
}
for _, test := range testCases {
@@ -786,13 +443,11 @@ func TestLowNodeUtilization(t *testing.T) {
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
test.maxPodsToEvictPerNode,
nodes,
false,
false,
false,
)
strategy := api.DeschedulerStrategy{
@@ -802,7 +457,6 @@ func TestLowNodeUtilization(t *testing.T) {
Thresholds: test.thresholds,
TargetThresholds: test.targetThresholds,
},
NodeFit: true,
},
}
LowNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor)
@@ -818,7 +472,7 @@ func TestLowNodeUtilization(t *testing.T) {
}
}
func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) {
func TestValidateStrategyConfig(t *testing.T) {
tests := []struct {
name string
thresholds api.ResourceThresholds
@@ -838,6 +492,19 @@ func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) {
errInfo: fmt.Errorf("thresholds config is not valid: %v", fmt.Errorf(
"%v threshold not in [%v, %v] range", v1.ResourceMemory, MinResourcePercentage, MaxResourcePercentage)),
},
{
name: "passing invalid targetThresholds",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
"resourceInvalid": 80,
},
errInfo: fmt.Errorf("targetThresholds config is not valid: %v",
fmt.Errorf("only cpu, memory, or pods thresholds can be specified")),
},
{
name: "thresholds and targetThresholds configured different num of resources",
thresholds: api.ResourceThresholds{
@@ -875,60 +542,6 @@ func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) {
},
errInfo: fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", v1.ResourceCPU),
},
{
name: "only thresholds configured extended resource",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
extendedResource: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("thresholds and targetThresholds configured different resources"),
},
{
name: "only targetThresholds configured extended resource",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
extendedResource: 80,
},
errInfo: fmt.Errorf("thresholds and targetThresholds configured different resources"),
},
{
name: "thresholds and targetThresholds configured different extended resources",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
extendedResource: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
"example.com/bar": 80,
},
errInfo: fmt.Errorf("thresholds and targetThresholds configured different resources"),
},
{
name: "thresholds' extended resource config value is greater than targetThresholds'",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
extendedResource: 90,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
extendedResource: 20,
},
errInfo: fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", extendedResource),
},
{
name: "passing valid strategy config",
thresholds: api.ResourceThresholds{
@@ -941,38 +554,147 @@ func TestValidateLowNodeUtilizationStrategyConfig(t *testing.T) {
},
errInfo: nil,
},
{
name: "passing valid strategy config with extended resource",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
extendedResource: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
extendedResource: 80,
},
errInfo: nil,
},
}
for _, testCase := range tests {
validateErr := validateLowUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds)
validateErr := validateStrategyConfig(testCase.thresholds, testCase.targetThresholds)
if validateErr == nil || testCase.errInfo == nil {
if validateErr != testCase.errInfo {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead",
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v\nto be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
} else if validateErr.Error() != testCase.errInfo.Error() {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead",
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v\nto be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
}
}
func TestLowNodeUtilizationWithTaints(t *testing.T) {
func TestValidateThresholds(t *testing.T) {
tests := []struct {
name string
input api.ResourceThresholds
errInfo error
}{
{
name: "passing nil map for threshold",
input: nil,
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing no threshold",
input: api.ResourceThresholds{},
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing unsupported resource name",
input: api.ResourceThresholds{
v1.ResourceCPU: 40,
v1.ResourceStorage: 25.5,
},
errInfo: fmt.Errorf("only cpu, memory, or pods thresholds can be specified"),
},
{
name: "passing invalid resource name",
input: api.ResourceThresholds{
v1.ResourceCPU: 40,
"coolResource": 42.0,
},
errInfo: fmt.Errorf("only cpu, memory, or pods thresholds can be specified"),
},
{
name: "passing invalid resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 110,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("%v threshold not in [%v, %v] range", v1.ResourceCPU, MinResourcePercentage, MaxResourcePercentage),
},
{
name: "passing a valid threshold with max and min resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 100,
v1.ResourceMemory: 0,
},
errInfo: nil,
},
{
name: "passing a valid threshold with only cpu",
input: api.ResourceThresholds{
v1.ResourceCPU: 80,
},
errInfo: nil,
},
{
name: "passing a valid threshold with cpu, memory and pods",
input: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 30,
v1.ResourcePods: 40,
},
errInfo: nil,
},
}
for _, test := range tests {
validateErr := validateThresholds(test.input)
if validateErr == nil || test.errInfo == nil {
if validateErr != test.errInfo {
t.Errorf("expected validity of threshold: %#v\nto be %v but got %v instead", test.input, test.errInfo, validateErr)
}
} else if validateErr.Error() != test.errInfo.Error() {
t.Errorf("expected validity of threshold: %#v\nto be %v but got %v instead", test.input, test.errInfo, validateErr)
}
}
}
func newFake(objects ...runtime.Object) *core.Fake {
scheme := runtime.NewScheme()
codecs := serializer.NewCodecFactory(scheme)
fake.AddToScheme(scheme)
o := core.NewObjectTracker(scheme, codecs.UniversalDecoder())
for _, obj := range objects {
if err := o.Add(obj); err != nil {
panic(err)
}
}
fakePtr := core.Fake{}
fakePtr.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
objs, err := o.List(
schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"},
schema.GroupVersionKind{Group: "", Version: "v1", Kind: "Pod"},
action.GetNamespace(),
)
if err != nil {
return true, nil, err
}
obj := &v1.PodList{
Items: []v1.Pod{},
}
for _, pod := range objs.(*v1.PodList).Items {
podFieldSet := fields.Set(map[string]string{
"spec.nodeName": pod.Spec.NodeName,
"status.phase": string(pod.Status.Phase),
})
match := action.(core.ListAction).GetListRestrictions().Fields.Matches(podFieldSet)
if !match {
continue
}
obj.Items = append(obj.Items, *pod.DeepCopy())
}
return true, obj, nil
})
fakePtr.AddReactor("*", "*", core.ObjectReaction(o))
fakePtr.AddWatchReactor("*", core.DefaultWatchReactor(watch.NewFake(), nil))
return &fakePtr
}
func TestWithTaints(t *testing.T) {
ctx := context.Background()
strategy := api.DeschedulerStrategy{
Enabled: true,
@@ -985,7 +707,6 @@ func TestLowNodeUtilizationWithTaints(t *testing.T) {
v1.ResourcePods: 70,
},
},
NodeFit: true,
},
}
@@ -1082,23 +803,29 @@ func TestLowNodeUtilizationWithTaints(t *testing.T) {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
fakePtr := newFake(objs...)
var evictionCounter int
fakePtr.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
if action.GetSubresource() != "eviction" || action.GetResource().Resource != "pods" {
return false, nil, nil
}
evictionCounter++
return true, nil, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
&fake.Clientset{Fake: *fakePtr},
"policy/v1",
false,
item.evictionsExpected,
item.nodes,
false,
false,
false,
)
LowNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor)
LowNodeUtilization(ctx, &fake.Clientset{Fake: *fakePtr}, strategy, item.nodes, podEvictor)
if item.evictionsExpected != podEvictor.TotalEvicted() {
t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, podEvictor.TotalEvicted())
if item.evictionsExpected != evictionCounter {
t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, evictionCounter)
}
})
}

View File

@@ -18,7 +18,6 @@ package strategies
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
@@ -28,86 +27,44 @@ import (
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
func validatePodsViolatingNodeAffinityParams(params *api.StrategyParameters) error {
if params == nil || len(params.NodeAffinityType) == 0 {
return fmt.Errorf("NodeAffinityType is empty")
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// RemovePodsViolatingNodeAffinity evicts pods on nodes which violate node affinity
func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validatePodsViolatingNodeAffinityParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingNodeAffinity parameters")
if strategy.Params == nil {
klog.V(1).Infof("NodeAffinityType not set")
return
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
var includedNamespaces, excludedNamespaces []string
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
for _, nodeAffinity := range strategy.Params.NodeAffinityType {
klog.V(2).InfoS("Executing for nodeAffinityType", "nodeAffinity", nodeAffinity)
klog.V(2).Infof("Executing for nodeAffinityType: %v", nodeAffinity)
switch nodeAffinity {
case "requiredDuringSchedulingIgnoredDuringExecution":
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
klog.V(1).Infof("Processing node: %#v\n", node.Name)
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(func(pod *v1.Pod) bool {
return evictable.IsEvictable(pod) &&
!nodeutil.PodFitsCurrentNode(pod, node) &&
nodeutil.PodFitsAnyNode(pod, nodes)
}),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(strategy.Params.LabelSelector),
)
pods, err := podutil.ListPodsOnANode(ctx, client, node, func(pod *v1.Pod) bool {
return podEvictor.IsEvictable(pod) &&
!nodeutil.PodFitsCurrentNode(pod, node) &&
nodeutil.PodFitsAnyNode(pod, nodes)
})
if err != nil {
klog.ErrorS(err, "Failed to get pods", "node", klog.KObj(node))
klog.Errorf("failed to get pods from %v: %v", node.Name, err)
}
for _, pod := range pods {
if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
klog.V(1).Infof("Evicting pod: %v", pod.Name)
if _, err := podEvictor.EvictPod(ctx, pod, node, "NodeAffinity"); err != nil {
klog.ErrorS(err, "Error evicting pod")
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}
}
}
default:
klog.ErrorS(nil, "Invalid nodeAffinityType", "nodeAffinity", nodeAffinity)
klog.Errorf("invalid nodeAffinityType: %v", nodeAffinity)
}
}
klog.V(1).Infof("Evicted %v pods", podEvictor.TotalEvicted())
}

View File

@@ -20,8 +20,7 @@ import (
"context"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
@@ -41,16 +40,6 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
},
}
requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy := api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
NodeAffinityType: []string{
"requiredDuringSchedulingIgnoredDuringExecution",
},
NodeFit: true,
},
}
nodeLabelKey := "kubernetes.io/desiredNode"
nodeLabelValue := "yes"
nodeWithLabels := test.BuildTestNode("nodeWithLabels", 2000, 3000, 10, nil)
@@ -148,19 +137,11 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
{
description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict",
expectedEvictedPodCount: 0,
strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels),
nodes: []*v1.Node{nodeWithoutLabels, unschedulableNodeWithLabels},
maxPodsToEvictPerNode: 0,
},
{
description: "Pod is scheduled on node without matching labels, and node where pod fits is available, should evict",
expectedEvictedPodCount: 0,
strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy,
pods: addPodsToNode(nodeWithoutLabels),
nodes: []*v1.Node{nodeWithLabels, unschedulableNodeWithLabels},
maxPodsToEvictPerNode: 1,
},
}
for _, tc := range tests {
@@ -172,13 +153,11 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
false,
false,
false,
)
RemovePodsViolatingNodeAffinity(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)

View File

@@ -18,7 +18,6 @@ package strategies
import (
"context"
"fmt"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
@@ -26,68 +25,15 @@ import (
"sigs.k8s.io/descheduler/pkg/utils"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
)
func validateRemovePodsViolatingNodeTaintsParams(params *api.StrategyParameters) error {
if params == nil {
return nil
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// RemovePodsViolatingNodeTaints evicts pods on the node which violate NoSchedule Taints on nodes
func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validateRemovePodsViolatingNodeTaintsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingNodeTaints parameters")
return
}
var includedNamespaces, excludedNamespaces []string
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
labelSelector = strategy.Params.LabelSelector
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
klog.V(1).Infof("Processing node: %#v\n", node.Name)
pods, err := podutil.ListPodsOnANode(ctx, client, node, podEvictor.IsEvictable)
if err != nil {
//no pods evicted as error encountered retrieving evictable Pods
return
@@ -99,9 +45,9 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
node.Spec.Taints,
func(taint *v1.Taint) bool { return taint.Effect == v1.TaintEffectNoSchedule },
) {
klog.V(2).InfoS("Not all taints with NoSchedule effect are tolerated after update for pod on node", "pod", klog.KObj(pods[i]), "node", klog.KObj(node))
klog.V(2).Infof("Not all taints with NoSchedule effect are tolerated after update for pod %v on node %v", pods[i].Name, node.Name)
if _, err := podEvictor.EvictPod(ctx, pods[i], node, "NodeTaint"); err != nil {
klog.ErrorS(err, "Error evicting pod")
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}

View File

@@ -5,8 +5,7 @@ import (
"fmt"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
@@ -51,17 +50,6 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
node1 = addTaintsToNode(node2, "testingTaint", "testing", []int{1})
node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"datacenter": "east",
}
})
node4 := test.BuildTestNode("n4", 2000, 3000, 10, func(node *v1.Node) {
node.Spec = v1.NodeSpec{
Unschedulable: true,
}
})
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
@@ -81,15 +69,12 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
p10 := test.BuildTestPod("p10", 100, 0, node2.Name, nil)
p11 := test.BuildTestPod("p11", 100, 0, node2.Name, nil)
p11.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
p12 := test.BuildTestPod("p11", 100, 0, node2.Name, nil)
p12.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
// The following 4 pods won't get evicted.
// A Critical Pod.
p7.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
p7.Spec.Priority = &priority
p7.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
// A daemonset.
p8.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
@@ -112,19 +97,13 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
p3 = addTolerationToPod(p3, "testTaint", "test", 1)
p4 = addTolerationToPod(p4, "testTaintX", "testX", 1)
p12.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
tests := []struct {
description string
nodes []*v1.Node
pods []v1.Pod
evictLocalStoragePods bool
evictSystemCriticalPods bool
maxPodsToEvictPerNode int
expectedEvictedPodCount int
nodeFit bool
}{
{
@@ -132,7 +111,6 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p2 gets evicted
},
@@ -141,7 +119,6 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
pods: []v1.Pod{*p1, *p3, *p4},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p4 gets evicted
},
@@ -150,7 +127,6 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
pods: []v1.Pod{*p1, *p5, *p6},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 1,
expectedEvictedPodCount: 1, //p5 or p6 gets evicted
},
@@ -159,66 +135,24 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
pods: []v1.Pod{*p7, *p8, *p9, *p10},
nodes: []*v1.Node{node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //nothing is evicted
expectedEvictedPodCount: 0,
},
{
description: "Critical pods except storage pods not tolerating node taint should not be evicted",
pods: []v1.Pod{*p7, *p8, *p9, *p10},
nodes: []*v1.Node{node2},
evictLocalStoragePods: true,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p9 gets evicted
expectedEvictedPodCount: 1,
},
{
description: "Critical and non critical pods, only non critical pods not tolerating node taint should be evicted",
pods: []v1.Pod{*p7, *p8, *p10, *p11},
nodes: []*v1.Node{node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p11 gets evicted
},
{
description: "Critical and non critical pods, pods not tolerating node taint should be evicted even if they are critical",
pods: []v1.Pod{*p2, *p7, *p9, *p10},
nodes: []*v1.Node{node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 2, //p2 and p7 are evicted
},
{
description: "Pod p2 doesn't tolerate taint on it's node, but also doesn't tolerate taints on other nodes",
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
},
{
description: "Pod p12 doesn't tolerate taint on it's node, but other nodes don't match it's selector",
pods: []v1.Pod{*p1, *p3, *p12},
nodes: []*v1.Node{node1, node3},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
},
{
description: "Pod p2 doesn't tolerate taint on it's node, but other nodes are unschedulable",
pods: []v1.Pod{*p1, *p2, *p3},
nodes: []*v1.Node{node1, node4},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
expectedEvictedPodCount: 1,
},
}
@@ -232,22 +166,14 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
tc.evictLocalStoragePods,
tc.evictSystemCriticalPods,
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: tc.nodeFit,
},
}
RemovePodsViolatingNodeTaints(ctx, fakeClient, strategy, tc.nodes, podEvictor)
RemovePodsViolatingNodeTaints(ctx, fakeClient, api.DeschedulerStrategy{}, tc.nodes, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, Unexpected no of pods evicted: pods evicted: %d, expected: %d", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount)

View File

@@ -1,164 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
"sigs.k8s.io/descheduler/pkg/utils"
)
// HighNodeUtilization evicts pods from under utilized nodes so that scheduler can schedule according to its strategy.
// Note that CPU/Memory requests are used to calculate nodes' utilization and not the actual resource usage.
func HighNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validateNodeUtilizationParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid HighNodeUtilization parameters")
return
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if err := validateHighUtilizationStrategyConfig(thresholds, targetThresholds); err != nil {
klog.ErrorS(err, "HighNodeUtilization config is not valid")
return
}
targetThresholds = make(api.ResourceThresholds)
setDefaultForThresholds(thresholds, targetThresholds)
resourceNames := getResourceNames(targetThresholds)
sourceNodes, highNodes := classifyNodes(
getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames),
func(node *v1.Node, usage NodeUsage) bool {
return isNodeWithLowUtilization(usage)
},
func(node *v1.Node, usage NodeUsage) bool {
if nodeutil.IsNodeUnschedulable(node) {
klog.V(2).InfoS("Node is unschedulable", "node", klog.KObj(node))
return false
}
return !isNodeWithLowUtilization(usage)
})
// log message in one line
keysAndValues := []interface{}{
"CPU", thresholds[v1.ResourceCPU],
"Mem", thresholds[v1.ResourceMemory],
"Pods", thresholds[v1.ResourcePods],
}
for name := range thresholds {
if !isBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(thresholds[name]))
}
}
klog.V(1).InfoS("Criteria for a node below target utilization", keysAndValues...)
klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(sourceNodes))
if len(sourceNodes) == 0 {
klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further")
return
}
if len(sourceNodes) <= strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(sourceNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
return
}
if len(sourceNodes) == len(nodes) {
klog.V(1).InfoS("All nodes are underutilized, nothing to do here")
return
}
if len(highNodes) == 0 {
klog.V(1).InfoS("No node is available to schedule the pods, nothing to do here")
return
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
// stop if the total available usage has dropped to zero - no more pods can be scheduled
continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
for name := range totalAvailableUsage {
if totalAvailableUsage[name].CmpInt64(0) < 1 {
return false
}
}
return true
}
evictPodsFromSourceNodes(
ctx,
sourceNodes,
highNodes,
podEvictor,
evictable.IsEvictable,
resourceNames,
"HighNodeUtilization",
continueEvictionCond)
}
func validateHighUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
if targetThresholds != nil {
return fmt.Errorf("targetThresholds is not applicable for HighNodeUtilization")
}
if err := validateThresholds(thresholds); err != nil {
return fmt.Errorf("thresholds config is not valid: %v", err)
}
return nil
}
func setDefaultForThresholds(thresholds, targetThresholds api.ResourceThresholds) {
// check if Pods/CPU/Mem are set, if not, set them to 100
if _, ok := thresholds[v1.ResourcePods]; !ok {
thresholds[v1.ResourcePods] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceCPU]; !ok {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceMemory]; !ok {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
}
// Default targetThreshold resource values to 100
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
for name := range thresholds {
if !isBasicResource(name) {
targetThresholds[name] = MaxResourcePercentage
}
}
}

View File

@@ -1,784 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"context"
"fmt"
"strings"
"testing"
v1 "k8s.io/api/core/v1"
"k8s.io/api/policy/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
func TestHighNodeUtilization(t *testing.T) {
ctx := context.Background()
n1NodeName := "n1"
n2NodeName := "n2"
n3NodeName := "n3"
nodeSelectorKey := "datacenter"
nodeSelectorValue := "west"
testCases := []struct {
name string
thresholds api.ResourceThresholds
nodes map[string]*v1.Node
pods map[string]*v1.PodList
maxPodsToEvictPerNode int
expectedPodsEvicted int
evictedPods []string
}{
{
name: "no node below threshold usage",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourcePods: 20,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
{
name: "no evictable pods",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 40,
v1.ResourcePods: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetDSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetDSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
{
name: "no node to schedule evicted pods",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourcePods: 20,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These can't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These can't be evicted.
*test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
{
name: "without priorities",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p7"},
},
{
name: "without priorities stop when resource capacity is depleted",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 2000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
},
{
name: "with priorities",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, lowPriority)
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, highPriority)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetDSOwnerRef),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
},
{
name: "without priorities evicting best-effort pods only",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 3000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 3000, 3000, 5, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 3000, 3000, 10, test.SetNodeUnschedulable),
},
// All pods are assumed to be burstable (test.BuildTestNode always sets both cpu/memory resource requests to some value)
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.MakeBestEffortPod(pod)
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
},
{
name: "with extended resource",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
extendedResource: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
// These won't be evicted
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetDSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p2"},
},
{
name: "with extended resource in some of nodes",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 40,
extendedResource: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
//These won't be evicted
*test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 500, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
{
name: "Other node match pod node selector",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
},
{
name: "Other node does not match pod node selector",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
},
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
list := action.(core.ListAction)
fieldString := list.GetListRestrictions().Fields.String()
if strings.Contains(fieldString, n1NodeName) {
return true, test.pods[n1NodeName], nil
}
if strings.Contains(fieldString, n2NodeName) {
return true, test.pods[n2NodeName], nil
}
if strings.Contains(fieldString, n3NodeName) {
return true, test.pods[n3NodeName], nil
}
return true, nil, fmt.Errorf("Failed to list: %v", list)
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
getAction := action.(core.GetAction)
if node, exists := test.nodes[getAction.GetName()]; exists {
return true, node, nil
}
return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName())
})
podsForEviction := make(map[string]struct{})
for _, pod := range test.evictedPods {
podsForEviction[pod] = struct{}{}
}
evictionFailed := false
if len(test.evictedPods) > 0 {
fakeClient.Fake.AddReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
getAction := action.(core.CreateAction)
obj := getAction.GetObject()
if eviction, ok := obj.(*v1beta1.Eviction); ok {
if _, exists := podsForEviction[eviction.Name]; exists {
return true, obj, nil
}
evictionFailed = true
return true, nil, fmt.Errorf("pod %q was unexpectedly evicted", eviction.Name)
}
return true, obj, nil
})
}
var nodes []*v1.Node
for _, node := range test.nodes {
nodes = append(nodes, node)
}
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
test.maxPodsToEvictPerNode,
nodes,
false,
false,
false,
)
strategy := api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
Thresholds: test.thresholds,
},
NodeFit: true,
},
}
HighNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if test.expectedPodsEvicted != podsEvicted {
t.Errorf("Expected %#v pods to be evicted but %#v got evicted", test.expectedPodsEvicted, podsEvicted)
}
if evictionFailed {
t.Errorf("Pod evictions failed unexpectedly")
}
})
}
}
func TestValidateHighNodeUtilizationStrategyConfig(t *testing.T) {
tests := []struct {
name string
thresholds api.ResourceThresholds
targetThresholds api.ResourceThresholds
errInfo error
}{
{
name: "passing target thresholds",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 20,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("targetThresholds is not applicable for HighNodeUtilization"),
},
{
name: "passing empty thresholds",
thresholds: api.ResourceThresholds{},
errInfo: fmt.Errorf("thresholds config is not valid: no resource threshold is configured"),
},
{
name: "passing invalid thresholds",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 120,
},
errInfo: fmt.Errorf("thresholds config is not valid: %v", fmt.Errorf(
"%v threshold not in [%v, %v] range", v1.ResourceMemory, MinResourcePercentage, MaxResourcePercentage)),
},
{
name: "passing valid strategy config",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
},
errInfo: nil,
},
{
name: "passing valid strategy config with extended resource",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 80,
v1.ResourceMemory: 80,
extendedResource: 80,
},
errInfo: nil,
},
}
for _, testCase := range tests {
validateErr := validateHighUtilizationStrategyConfig(testCase.thresholds, testCase.targetThresholds)
if validateErr == nil || testCase.errInfo == nil {
if validateErr != testCase.errInfo {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
} else if validateErr.Error() != testCase.errInfo.Error() {
t.Errorf("expected validity of strategy config: thresholds %#v targetThresholds %#v to be %v but got %v instead",
testCase.thresholds, testCase.targetThresholds, testCase.errInfo, validateErr)
}
}
}
func TestHighNodeUtilizationWithTaints(t *testing.T) {
ctx := context.Background()
strategy := api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
Thresholds: api.ResourceThresholds{
v1.ResourceCPU: 40,
},
},
},
}
n1 := test.BuildTestNode("n1", 1000, 3000, 10, nil)
n2 := test.BuildTestNode("n2", 1000, 3000, 10, nil)
n3 := test.BuildTestNode("n3", 1000, 3000, 10, nil)
n3withTaints := n3.DeepCopy()
n3withTaints.Spec.Taints = []v1.Taint{
{
Key: "key",
Value: "value",
Effect: v1.TaintEffectNoSchedule,
},
}
podThatToleratesTaint := test.BuildTestPod("tolerate_pod", 200, 0, n1.Name, test.SetRSOwnerRef)
podThatToleratesTaint.Spec.Tolerations = []v1.Toleration{
{
Key: "key",
Value: "value",
},
}
tests := []struct {
name string
nodes []*v1.Node
pods []*v1.Pod
evictionsExpected int
}{
{
name: "No taints",
nodes: []*v1.Node{n1, n2, n3},
pods: []*v1.Pod{
//Node 1 pods
test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef),
test.BuildTestPod(fmt.Sprintf("pod_2_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef),
test.BuildTestPod(fmt.Sprintf("pod_3_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef),
// Node 2 pods
test.BuildTestPod(fmt.Sprintf("pod_4_%s", n2.Name), 200, 0, n2.Name, test.SetRSOwnerRef),
},
evictionsExpected: 1,
},
{
name: "No pod tolerates node taint",
nodes: []*v1.Node{n1, n3withTaints},
pods: []*v1.Pod{
//Node 1 pods
test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name, test.SetRSOwnerRef),
// Node 3 pods
test.BuildTestPod(fmt.Sprintf("pod_2_%s", n3withTaints.Name), 200, 0, n3withTaints.Name, test.SetRSOwnerRef),
},
evictionsExpected: 0,
},
{
name: "Pod which tolerates node taint",
nodes: []*v1.Node{n1, n3withTaints},
pods: []*v1.Pod{
//Node 1 pods
test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 100, 0, n1.Name, test.SetRSOwnerRef),
podThatToleratesTaint,
// Node 3 pods
test.BuildTestPod(fmt.Sprintf("pod_9_%s", n3withTaints.Name), 500, 0, n3withTaints.Name, test.SetRSOwnerRef),
},
evictionsExpected: 1,
},
}
for _, item := range tests {
t.Run(item.name, func(t *testing.T) {
var objs []runtime.Object
for _, node := range item.nodes {
objs = append(objs, node)
}
for _, pod := range item.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
podEvictor := evictions.NewPodEvictor(
fakeClient,
"policy/v1",
false,
item.evictionsExpected,
item.nodes,
false,
false,
false,
)
HighNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor)
if item.evictionsExpected != podEvictor.TotalEvicted() {
t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, podEvictor.TotalEvicted())
}
})
}
}

View File

@@ -1,188 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
"sigs.k8s.io/descheduler/pkg/utils"
)
// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used
// to calculate nodes' utilization and not the actual resource usage.
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
// TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params?
if err := validateNodeUtilizationParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid LowNodeUtilization parameters")
return
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds); err != nil {
klog.ErrorS(err, "LowNodeUtilization config is not valid")
return
}
// check if Pods/CPU/Mem are set, if not, set them to 100
if _, ok := thresholds[v1.ResourcePods]; !ok {
thresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceCPU]; !ok {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
}
if _, ok := thresholds[v1.ResourceMemory]; !ok {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
}
resourceNames := getResourceNames(thresholds)
lowNodes, sourceNodes := classifyNodes(
getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames),
// The node has to be schedulable (to be able to move workload there)
func(node *v1.Node, usage NodeUsage) bool {
if nodeutil.IsNodeUnschedulable(node) {
klog.V(2).InfoS("Node is unschedulable, thus not considered as underutilized", "node", klog.KObj(node))
return false
}
return isNodeWithLowUtilization(usage)
},
func(node *v1.Node, usage NodeUsage) bool {
return isNodeAboveTargetUtilization(usage)
},
)
// log message in one line
keysAndValues := []interface{}{
"CPU", thresholds[v1.ResourceCPU],
"Mem", thresholds[v1.ResourceMemory],
"Pods", thresholds[v1.ResourcePods],
}
for name := range thresholds {
if !isBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(thresholds[name]))
}
}
klog.V(1).InfoS("Criteria for a node under utilization", keysAndValues...)
klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(lowNodes))
// log message in one line
keysAndValues = []interface{}{
"CPU", targetThresholds[v1.ResourceCPU],
"Mem", targetThresholds[v1.ResourceMemory],
"Pods", targetThresholds[v1.ResourcePods],
}
for name := range targetThresholds {
if !isBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(targetThresholds[name]))
}
}
klog.V(1).InfoS("Criteria for a node above target utilization", keysAndValues...)
klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(sourceNodes))
if len(lowNodes) == 0 {
klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further")
return
}
if len(lowNodes) <= strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
return
}
if len(lowNodes) == len(nodes) {
klog.V(1).InfoS("All nodes are underutilized, nothing to do here")
return
}
if len(sourceNodes) == 0 {
klog.V(1).InfoS("All nodes are under target utilization, nothing to do here")
return
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
// stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved
continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
if !isNodeAboveTargetUtilization(nodeUsage) {
return false
}
for name := range totalAvailableUsage {
if totalAvailableUsage[name].CmpInt64(0) < 1 {
return false
}
}
return true
}
evictPodsFromSourceNodes(
ctx,
sourceNodes,
lowNodes,
podEvictor,
evictable.IsEvictable,
resourceNames,
"LowNodeUtilization",
continueEvictionCond)
klog.V(1).InfoS("Total number of pods evicted", "evictedPods", podEvictor.TotalEvicted())
}
// validateLowUtilizationStrategyConfig checks if the strategy's config is valid
func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
// validate thresholds and targetThresholds config
if err := validateThresholds(thresholds); err != nil {
return fmt.Errorf("thresholds config is not valid: %v", err)
}
if err := validateThresholds(targetThresholds); err != nil {
return fmt.Errorf("targetThresholds config is not valid: %v", err)
}
// validate if thresholds and targetThresholds have same resources configured
if len(thresholds) != len(targetThresholds) {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
}
for resourceName, value := range thresholds {
if targetValue, ok := targetThresholds[resourceName]; !ok {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
} else if value > targetValue {
return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName)
}
}
return nil
}

View File

@@ -1,406 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sort"
)
// NodeUsage stores a node's info, pods on it, thresholds and its resource usage
type NodeUsage struct {
node *v1.Node
usage map[v1.ResourceName]*resource.Quantity
allPods []*v1.Pod
lowResourceThreshold map[v1.ResourceName]*resource.Quantity
highResourceThreshold map[v1.ResourceName]*resource.Quantity
}
type continueEvictionCond func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool
// NodePodsMap is a set of (node, pods) pairs
type NodePodsMap map[*v1.Node][]*v1.Pod
const (
// MinResourcePercentage is the minimum value of a resource's percentage
MinResourcePercentage = 0
// MaxResourcePercentage is the maximum value of a resource's percentage
MaxResourcePercentage = 100
)
func validateNodeUtilizationParams(params *api.StrategyParameters) error {
if params == nil || params.NodeResourceUtilizationThresholds == nil {
return fmt.Errorf("NodeResourceUtilizationThresholds not set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// validateThresholds checks if thresholds have valid resource name and resource percentage configured
func validateThresholds(thresholds api.ResourceThresholds) error {
if thresholds == nil || len(thresholds) == 0 {
return fmt.Errorf("no resource threshold is configured")
}
for name, percent := range thresholds {
if percent < MinResourcePercentage || percent > MaxResourcePercentage {
return fmt.Errorf("%v threshold not in [%v, %v] range", name, MinResourcePercentage, MaxResourcePercentage)
}
}
return nil
}
func getNodeUsage(
ctx context.Context,
client clientset.Interface,
nodes []*v1.Node,
lowThreshold, highThreshold api.ResourceThresholds,
resourceNames []v1.ResourceName,
) []NodeUsage {
var nodeUsageList []NodeUsage
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(ctx, client, node)
if err != nil {
klog.V(2).InfoS("Node will not be processed, error accessing its pods", "node", klog.KObj(node), "err", err)
continue
}
// A threshold is in percentages but in <0;100> interval.
// Performing `threshold * 0.01` will convert <0;100> interval into <0;1>.
// Multiplying it with capacity will give fraction of the capacity corresponding to the given high/low resource threshold in Quantity units.
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
lowResourceThreshold := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(int64(float64(lowThreshold[v1.ResourceCPU])*float64(nodeCapacity.Cpu().MilliValue())*0.01), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(int64(float64(lowThreshold[v1.ResourceMemory])*float64(nodeCapacity.Memory().Value())*0.01), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(float64(lowThreshold[v1.ResourcePods])*float64(nodeCapacity.Pods().Value())*0.01), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
cap := nodeCapacity[name]
lowResourceThreshold[name] = resource.NewQuantity(int64(float64(lowThreshold[name])*float64(cap.Value())*0.01), resource.DecimalSI)
}
}
highResourceThreshold := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(int64(float64(highThreshold[v1.ResourceCPU])*float64(nodeCapacity.Cpu().MilliValue())*0.01), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(int64(float64(highThreshold[v1.ResourceMemory])*float64(nodeCapacity.Memory().Value())*0.01), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(float64(highThreshold[v1.ResourcePods])*float64(nodeCapacity.Pods().Value())*0.01), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
cap := nodeCapacity[name]
highResourceThreshold[name] = resource.NewQuantity(int64(float64(highThreshold[name])*float64(cap.Value())*0.01), resource.DecimalSI)
}
}
nodeUsageList = append(nodeUsageList, NodeUsage{
node: node,
usage: nodeUtilization(node, pods, resourceNames),
allPods: pods,
lowResourceThreshold: lowResourceThreshold,
highResourceThreshold: highResourceThreshold,
})
}
return nodeUsageList
}
func resourceUsagePercentages(nodeUsage NodeUsage) map[v1.ResourceName]float64 {
nodeCapacity := nodeUsage.node.Status.Capacity
if len(nodeUsage.node.Status.Allocatable) > 0 {
nodeCapacity = nodeUsage.node.Status.Allocatable
}
resourceUsagePercentage := map[v1.ResourceName]float64{}
for resourceName, resourceUsage := range nodeUsage.usage {
cap := nodeCapacity[resourceName]
if !cap.IsZero() {
resourceUsagePercentage[resourceName] = 100 * float64(resourceUsage.MilliValue()) / float64(cap.MilliValue())
}
}
return resourceUsagePercentage
}
// classifyNodes classifies the nodes into low-utilization or high-utilization nodes. If a node lies between
// low and high thresholds, it is simply ignored.
func classifyNodes(
nodeUsages []NodeUsage,
lowThresholdFilter, highThresholdFilter func(node *v1.Node, usage NodeUsage) bool,
) ([]NodeUsage, []NodeUsage) {
lowNodes, highNodes := []NodeUsage{}, []NodeUsage{}
for _, nodeUsage := range nodeUsages {
if lowThresholdFilter(nodeUsage.node, nodeUsage) {
klog.V(2).InfoS("Node is underutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
lowNodes = append(lowNodes, nodeUsage)
} else if highThresholdFilter(nodeUsage.node, nodeUsage) {
klog.V(2).InfoS("Node is overutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
highNodes = append(highNodes, nodeUsage)
} else {
klog.V(2).InfoS("Node is appropriately utilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
}
}
return lowNodes, highNodes
}
// evictPodsFromSourceNodes evicts pods based on priority, if all the pods on the node have priority, if not
// evicts them based on QoS as fallback option.
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromSourceNodes(
ctx context.Context,
sourceNodes, destinationNodes []NodeUsage,
podEvictor *evictions.PodEvictor,
podFilter func(pod *v1.Pod) bool,
resourceNames []v1.ResourceName,
strategy string,
continueEviction continueEvictionCond,
) {
sortNodesByUsage(sourceNodes)
// upper bound on total number of pods/cpu/memory and optional extended resources to be moved
totalAvailableUsage := map[v1.ResourceName]*resource.Quantity{
v1.ResourcePods: {},
v1.ResourceCPU: {},
v1.ResourceMemory: {},
}
var taintsOfDestinationNodes = make(map[string][]v1.Taint, len(destinationNodes))
for _, node := range destinationNodes {
taintsOfDestinationNodes[node.node.Name] = node.node.Spec.Taints
for _, name := range resourceNames {
if _, ok := totalAvailableUsage[name]; !ok {
totalAvailableUsage[name] = resource.NewQuantity(0, resource.DecimalSI)
}
totalAvailableUsage[name].Add(*node.highResourceThreshold[name])
totalAvailableUsage[name].Sub(*node.usage[name])
}
}
// log message in one line
keysAndValues := []interface{}{
"CPU", totalAvailableUsage[v1.ResourceCPU].MilliValue(),
"Mem", totalAvailableUsage[v1.ResourceMemory].Value(),
"Pods", totalAvailableUsage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !isBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
klog.V(1).InfoS("Total capacity to be moved", keysAndValues...)
for _, node := range sourceNodes {
klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage)
nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter)
klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods))
if len(removablePods) == 0 {
klog.V(1).InfoS("No removable pods on node, try next node", "node", klog.KObj(node.node))
continue
}
klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
// sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
evictPods(ctx, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, strategy, continueEviction)
klog.V(1).InfoS("Evicted pods from node", "node", klog.KObj(node.node), "evictedPods", podEvictor.NodeEvicted(node.node), "usage", node.usage)
}
}
func evictPods(
ctx context.Context,
inputPods []*v1.Pod,
nodeUsage NodeUsage,
totalAvailableUsage map[v1.ResourceName]*resource.Quantity,
taintsOfLowNodes map[string][]v1.Taint,
podEvictor *evictions.PodEvictor,
strategy string,
continueEviction continueEvictionCond,
) {
if continueEviction(nodeUsage, totalAvailableUsage) {
for _, pod := range inputPods {
if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod))
continue
}
success, err := podEvictor.EvictPod(ctx, pod, nodeUsage.node, strategy)
if err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
}
if success {
klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod), "err", err)
for name := range totalAvailableUsage {
if name == v1.ResourcePods {
nodeUsage.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
totalAvailableUsage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
} else {
quantity := utils.GetResourceRequestQuantity(pod, name)
nodeUsage.usage[name].Sub(quantity)
totalAvailableUsage[name].Sub(quantity)
}
}
keysAndValues := []interface{}{
"node", nodeUsage.node.Name,
"CPU", nodeUsage.usage[v1.ResourceCPU].MilliValue(),
"Mem", nodeUsage.usage[v1.ResourceMemory].Value(),
"Pods", nodeUsage.usage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !isBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
klog.V(3).InfoS("Updated node usage", keysAndValues...)
// check if pods can be still evicted
if !continueEviction(nodeUsage, totalAvailableUsage) {
break
}
}
}
}
}
// sortNodesByUsage sorts nodes based on usage in descending order
func sortNodesByUsage(nodes []NodeUsage) {
sort.Slice(nodes, func(i, j int) bool {
ti := nodes[i].usage[v1.ResourceMemory].Value() + nodes[i].usage[v1.ResourceCPU].MilliValue() + nodes[i].usage[v1.ResourcePods].Value()
tj := nodes[j].usage[v1.ResourceMemory].Value() + nodes[j].usage[v1.ResourceCPU].MilliValue() + nodes[j].usage[v1.ResourcePods].Value()
// extended resources
for name := range nodes[i].usage {
if !isBasicResource(name) {
ti = ti + nodes[i].usage[name].Value()
tj = tj + nodes[j].usage[name].Value()
}
}
// To return sorted in descending order
return ti > tj
})
}
// isNodeAboveTargetUtilization checks if a node is overutilized
// At least one resource has to be above the high threshold
func isNodeAboveTargetUtilization(usage NodeUsage) bool {
for name, nodeValue := range usage.usage {
// usage.highResourceThreshold[name] < nodeValue
if usage.highResourceThreshold[name].Cmp(*nodeValue) == -1 {
return true
}
}
return false
}
// isNodeWithLowUtilization checks if a node is underutilized
// All resources have to be below the low threshold
func isNodeWithLowUtilization(usage NodeUsage) bool {
for name, nodeValue := range usage.usage {
// usage.lowResourceThreshold[name] < nodeValue
if usage.lowResourceThreshold[name].Cmp(*nodeValue) == -1 {
return false
}
}
return true
}
// getResourceNames returns list of resource names in resource thresholds
func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
resourceNames := make([]v1.ResourceName, 0, len(thresholds))
for name := range thresholds {
resourceNames = append(resourceNames, name)
}
return resourceNames
}
// isBasicResource checks if resource is basic native.
func isBasicResource(name v1.ResourceName) bool {
switch name {
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods:
return true
default:
return false
}
}
func nodeUtilization(node *v1.Node, pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
totalReqs := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(len(pods)), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
totalReqs[name] = resource.NewQuantity(0, resource.DecimalSI)
}
}
for _, pod := range pods {
req, _ := utils.PodRequestsAndLimits(pod)
for _, name := range resourceNames {
quantity, ok := req[name]
if ok && name != v1.ResourcePods {
// As Quantity.Add says: Add adds the provided y quantity to the current value. If the current value is zero,
// the format of the quantity will be updated to the format of y.
totalReqs[name].Add(quantity)
}
}
}
return totalReqs
}
func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
var nonRemovablePods, removablePods []*v1.Pod
for _, pod := range pods {
if !filter(pod) {
nonRemovablePods = append(nonRemovablePods, pod)
} else {
removablePods = append(removablePods, pod)
}
}
return nonRemovablePods, removablePods
}

View File

@@ -1,158 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeutilization
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"math"
"sigs.k8s.io/descheduler/pkg/api"
"testing"
)
var (
lowPriority = int32(0)
highPriority = int32(10000)
extendedResource = v1.ResourceName("example.com/foo")
)
func TestValidateThresholds(t *testing.T) {
tests := []struct {
name string
input api.ResourceThresholds
errInfo error
}{
{
name: "passing nil map for threshold",
input: nil,
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing no threshold",
input: api.ResourceThresholds{},
errInfo: fmt.Errorf("no resource threshold is configured"),
},
{
name: "passing extended resource name other than cpu/memory/pods",
input: api.ResourceThresholds{
v1.ResourceCPU: 40,
extendedResource: 50,
},
errInfo: nil,
},
{
name: "passing invalid resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 110,
v1.ResourceMemory: 80,
},
errInfo: fmt.Errorf("%v threshold not in [%v, %v] range", v1.ResourceCPU, MinResourcePercentage, MaxResourcePercentage),
},
{
name: "passing a valid threshold with max and min resource value",
input: api.ResourceThresholds{
v1.ResourceCPU: 100,
v1.ResourceMemory: 0,
},
errInfo: nil,
},
{
name: "passing a valid threshold with only cpu",
input: api.ResourceThresholds{
v1.ResourceCPU: 80,
},
errInfo: nil,
},
{
name: "passing a valid threshold with cpu, memory and pods",
input: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 30,
v1.ResourcePods: 40,
},
errInfo: nil,
},
{
name: "passing a valid threshold with only extended resource",
input: api.ResourceThresholds{
extendedResource: 80,
},
errInfo: nil,
},
{
name: "passing a valid threshold with cpu, memory, pods and extended resource",
input: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 30,
v1.ResourcePods: 40,
extendedResource: 50,
},
errInfo: nil,
},
}
for _, test := range tests {
validateErr := validateThresholds(test.input)
if validateErr == nil || test.errInfo == nil {
if validateErr != test.errInfo {
t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", test.input, test.errInfo, validateErr)
}
} else if validateErr.Error() != test.errInfo.Error() {
t.Errorf("expected validity of threshold: %#v to be %v but got %v instead", test.input, test.errInfo, validateErr)
}
}
}
func TestResourceUsagePercentages(t *testing.T) {
resourceUsagePercentage := resourceUsagePercentages(NodeUsage{
node: &v1.Node{
Status: v1.NodeStatus{
Capacity: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
},
},
usage: map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(1220, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(11, resource.BinarySI),
},
})
expectedUsageInIntPercentage := map[v1.ResourceName]float64{
v1.ResourceCPU: 63,
v1.ResourceMemory: 90,
v1.ResourcePods: 37,
}
for resourceName, percentage := range expectedUsageInIntPercentage {
if math.Floor(resourceUsagePercentage[resourceName]) != percentage {
t.Errorf("Incorrect percentange computation, expected %v, got math.Floor(%v) instead", percentage, resourceUsagePercentage[resourceName])
}
}
t.Logf("resourceUsagePercentage: %#v\n", resourceUsagePercentage)
}

View File

@@ -18,75 +18,22 @@ package strategies
import (
"context"
"fmt"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
v1 "k8s.io/api/core/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
)
func validateRemovePodsViolatingInterPodAntiAffinityParams(params *api.StrategyParameters) error {
if params == nil {
return nil
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// RemovePodsViolatingInterPodAntiAffinity evicts pods on the node which are having a pod affinity rules.
func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validateRemovePodsViolatingInterPodAntiAffinityParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingInterPodAntiAffinity parameters")
return
}
var includedNamespaces, excludedNamespaces []string
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
labelSelector = strategy.Params.LabelSelector
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
klog.V(1).Infof("Processing node: %#v\n", node.Name)
pods, err := podutil.ListPodsOnANode(ctx, client, node, podEvictor.IsEvictable)
if err != nil {
return
}
@@ -94,10 +41,10 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
podutil.SortPodsBasedOnPriorityLowToHigh(pods)
totalPods := len(pods)
for i := 0; i < totalPods; i++ {
if checkPodsWithAntiAffinityExist(pods[i], pods) && evictable.IsEvictable(pods[i]) {
if checkPodsWithAntiAffinityExist(pods[i], pods) {
success, err := podEvictor.EvictPod(ctx, pods[i], node, "InterPodAntiAffinity")
if err != nil {
klog.ErrorS(err, "Error evicting pod")
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
@@ -122,7 +69,7 @@ func checkPodsWithAntiAffinityExist(pod *v1.Pod, pods []*v1.Pod) bool {
namespaces := utils.GetNamespacesFromPodAffinityTerm(pod, &term)
selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
if err != nil {
klog.ErrorS(err, "Unable to convert LabelSelector into Selector")
klog.Infof("%v", err)
return false
}
for _, existingPod := range pods {

View File

@@ -20,51 +20,30 @@ import (
"context"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
func TestPodAntiAffinity(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("n2", 2000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"datacenter": "east",
}
})
node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) {
node.Spec = v1.NodeSpec{
Unschedulable: true,
}
})
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil)
p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil)
p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil)
p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil)
criticalPriority := utils.SystemCriticalPriority
nonEvictablePod := test.BuildTestPod("non-evict", 100, 0, node1.Name, func(pod *v1.Pod) {
pod.Spec.Priority = &criticalPriority
})
node := test.BuildTestNode("n1", 2000, 3000, 10, nil)
p1 := test.BuildTestPod("p1", 100, 0, node.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil)
p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil)
p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil)
p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil)
p2.Labels = map[string]string{"foo": "bar"}
p5.Labels = map[string]string{"foo": "bar"}
p6.Labels = map[string]string{"foo": "bar"}
p7.Labels = map[string]string{"foo1": "bar1"}
nonEvictablePod.Labels = map[string]string{"foo": "bar"}
test.SetNormalOwnerRef(p1)
test.SetNormalOwnerRef(p2)
test.SetNormalOwnerRef(p3)
@@ -86,70 +65,30 @@ func TestPodAntiAffinity(t *testing.T) {
test.SetPodPriority(p6, 50)
test.SetPodPriority(p7, 0)
// Set pod node selectors
p8.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
tests := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
expectedEvictedPodCount int
nodeFit bool
nodes []*v1.Node
}{
{
description: "Maximum pods to evict - 0",
maxPodsToEvictPerNode: 0,
pods: []v1.Pod{*p1, *p2, *p3, *p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
},
{
description: "Maximum pods to evict - 3",
maxPodsToEvictPerNode: 3,
pods: []v1.Pod{*p1, *p2, *p3, *p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
},
{
description: "Evict only 1 pod after sorting",
maxPodsToEvictPerNode: 0,
pods: []v1.Pod{*p5, *p6, *p7},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p1, *nonEvictablePod},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p1, *nonEvictablePod},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Won't evict pods because node selectors don't match available nodes",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p8, *nonEvictablePod},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
nodeFit: true,
},
{
description: "Won't evict pods because only other node is not schedulable",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p8, *nonEvictablePod},
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
nodeFit: true,
},
}
for _, test := range tests {
@@ -159,26 +98,19 @@ func TestPodAntiAffinity(t *testing.T) {
return true, &v1.PodList{Items: test.pods}, nil
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
return true, node1, nil
return true, node, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
test.maxPodsToEvictPerNode,
test.nodes,
false,
false,
[]*v1.Node{node},
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: test.nodeFit,
},
}
RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, strategy, test.nodes, podEvictor)
RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, api.DeschedulerStrategy{}, []*v1.Node{node}, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != test.expectedEvictedPodCount {
t.Errorf("Unexpected no of pods evicted: pods evicted: %d, expected: %d", podsEvicted, test.expectedEvictedPodCount)

View File

@@ -18,121 +18,51 @@ package strategies
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1meta "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
func validatePodLifeTimeParams(params *api.StrategyParameters) error {
if params == nil || params.PodLifeTime == nil || params.PodLifeTime.MaxPodLifeTimeSeconds == nil {
return fmt.Errorf("MaxPodLifeTimeSeconds not set")
}
if params.PodLifeTime.PodStatusPhases != nil {
for _, phase := range params.PodLifeTime.PodStatusPhases {
if phase != string(v1.PodPending) && phase != string(v1.PodRunning) {
return fmt.Errorf("only Pending and Running phases are supported in PodLifeTime")
}
}
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// PodLifeTime evicts pods on nodes that were created more than strategy.Params.MaxPodLifeTimeSeconds seconds ago.
func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validatePodLifeTimeParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid PodLifeTime parameters")
if strategy.Params == nil || strategy.Params.MaxPodLifeTimeSeconds == nil {
klog.V(1).Infof("MaxPodLifeTimeSeconds not set")
return
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
var includedNamespaces, excludedNamespaces []string
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
filter := evictable.IsEvictable
if strategy.Params.PodLifeTime.PodStatusPhases != nil {
filter = func(pod *v1.Pod) bool {
for _, phase := range strategy.Params.PodLifeTime.PodStatusPhases {
if string(pod.Status.Phase) == phase {
return evictable.IsEvictable(pod)
}
}
return false
}
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods := listOldPodsOnNode(ctx, client, node, includedNamespaces, excludedNamespaces, strategy.Params.LabelSelector, *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds, filter)
klog.V(1).Infof("Processing node: %#v", node.Name)
pods := listOldPodsOnNode(ctx, client, node, *strategy.Params.MaxPodLifeTimeSeconds, podEvictor)
for _, pod := range pods {
success, err := podEvictor.EvictPod(ctx, pod, node, "PodLifeTime")
if success {
klog.V(1).InfoS("Evicted pod because it exceeded its lifetime", "pod", klog.KObj(pod), "maxPodLifeTime", *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds)
klog.V(1).Infof("Evicted pod: %#v because it was created more than %v seconds ago", pod.Name, *strategy.Params.MaxPodLifeTimeSeconds)
}
if err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}
}
}
func listOldPodsOnNode(
ctx context.Context,
client clientset.Interface,
node *v1.Node,
includedNamespaces, excludedNamespaces []string,
labelSelector *metav1.LabelSelector,
maxPodLifeTimeSeconds uint,
filter func(pod *v1.Pod) bool,
) []*v1.Pod {
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(filter),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
func listOldPodsOnNode(ctx context.Context, client clientset.Interface, node *v1.Node, maxAge uint, evictor *evictions.PodEvictor) []*v1.Pod {
pods, err := podutil.ListPodsOnANode(ctx, client, node, evictor.IsEvictable)
if err != nil {
return nil
}
var oldPods []*v1.Pod
for _, pod := range pods {
podAgeSeconds := uint(metav1.Now().Sub(pod.GetCreationTimestamp().Local()).Seconds())
if podAgeSeconds > maxPodLifeTimeSeconds {
podAgeSeconds := uint(v1meta.Now().Sub(pod.GetCreationTimestamp().Local()).Seconds())
if podAgeSeconds > maxAge {
oldPods = append(oldPods, pod)
}
}

View File

@@ -22,7 +22,6 @@ import (
"time"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
@@ -34,15 +33,15 @@ import (
func TestPodLifeTime(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node := test.BuildTestNode("n1", 2000, 3000, 10, nil)
olderPodCreationTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC))
newerPodCreationTime := metav1.NewTime(time.Now())
// Setup pods, one should be evicted
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p1 := test.BuildTestPod("p1", 100, 0, node.Name, nil)
p1.Namespace = "dev"
p1.ObjectMeta.CreationTimestamp = newerPodCreationTime
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil)
p2.Namespace = "dev"
p2.ObjectMeta.CreationTimestamp = olderPodCreationTime
@@ -51,10 +50,10 @@ func TestPodLifeTime(t *testing.T) {
p2.ObjectMeta.OwnerReferences = ownerRef1
// Setup pods, zero should be evicted
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil)
p3.Namespace = "dev"
p3.ObjectMeta.CreationTimestamp = newerPodCreationTime
p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil)
p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil)
p4.Namespace = "dev"
p4.ObjectMeta.CreationTimestamp = newerPodCreationTime
@@ -63,10 +62,10 @@ func TestPodLifeTime(t *testing.T) {
p4.ObjectMeta.OwnerReferences = ownerRef2
// Setup pods, one should be evicted
p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil)
p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil)
p5.Namespace = "dev"
p5.ObjectMeta.CreationTimestamp = newerPodCreationTime
p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil)
p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil)
p6.Namespace = "dev"
p6.ObjectMeta.CreationTimestamp = metav1.NewTime(time.Now().Add(time.Second * 605))
@@ -75,10 +74,10 @@ func TestPodLifeTime(t *testing.T) {
p6.ObjectMeta.OwnerReferences = ownerRef3
// Setup pods, zero should be evicted
p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil)
p7.Namespace = "dev"
p7.ObjectMeta.CreationTimestamp = newerPodCreationTime
p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil)
p8 := test.BuildTestPod("p8", 100, 0, node.Name, nil)
p8.Namespace = "dev"
p8.ObjectMeta.CreationTimestamp = metav1.NewTime(time.Now().Add(time.Second * 595))
@@ -86,66 +85,24 @@ func TestPodLifeTime(t *testing.T) {
p5.ObjectMeta.OwnerReferences = ownerRef4
p6.ObjectMeta.OwnerReferences = ownerRef4
// Setup two old pods with different status phases
p9 := test.BuildTestPod("p9", 100, 0, node1.Name, nil)
p9.Namespace = "dev"
p9.ObjectMeta.CreationTimestamp = olderPodCreationTime
p10 := test.BuildTestPod("p10", 100, 0, node1.Name, nil)
p10.Namespace = "dev"
p10.ObjectMeta.CreationTimestamp = olderPodCreationTime
p9.Status.Phase = "Pending"
p10.Status.Phase = "Running"
p9.ObjectMeta.OwnerReferences = ownerRef1
p10.ObjectMeta.OwnerReferences = ownerRef1
p11 := test.BuildTestPod("p11", 100, 0, node1.Name, func(pod *v1.Pod) {
pod.Spec.Volumes = []v1.Volume{
{
Name: "pvc", VolumeSource: v1.VolumeSource{
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "foo"},
},
},
}
pod.Namespace = "dev"
pod.ObjectMeta.CreationTimestamp = olderPodCreationTime
pod.ObjectMeta.OwnerReferences = ownerRef1
})
// Setup two old pods with different labels
p12 := test.BuildTestPod("p12", 100, 0, node1.Name, nil)
p12.Namespace = "dev"
p12.ObjectMeta.CreationTimestamp = olderPodCreationTime
p13 := test.BuildTestPod("p13", 100, 0, node1.Name, nil)
p13.Namespace = "dev"
p13.ObjectMeta.CreationTimestamp = olderPodCreationTime
p12.ObjectMeta.Labels = map[string]string{"foo": "bar"}
p13.ObjectMeta.Labels = map[string]string{"foo": "bar1"}
p12.ObjectMeta.OwnerReferences = ownerRef1
p13.ObjectMeta.OwnerReferences = ownerRef1
var maxLifeTime uint = 600
testCases := []struct {
description string
strategy api.DeschedulerStrategy
maxPodsToEvictPerNode int
pods []v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
ignorePvcPods bool
}{
{
description: "Two pods in the `dev` Namespace, 1 is new and 1 very is old. 1 should be evicted.",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
MaxPodLifeTimeSeconds: &maxLifeTime,
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
@@ -153,12 +110,11 @@ func TestPodLifeTime(t *testing.T) {
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
MaxPodLifeTimeSeconds: &maxLifeTime,
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p3, *p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
{
@@ -166,12 +122,11 @@ func TestPodLifeTime(t *testing.T) {
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
MaxPodLifeTimeSeconds: &maxLifeTime,
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p5, *p6},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
@@ -179,94 +134,33 @@ func TestPodLifeTime(t *testing.T) {
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
MaxPodLifeTimeSeconds: &maxLifeTime,
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p7, *p8},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
{
description: "Two old pods with different status phases. 1 should be evicted.",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{
MaxPodLifeTimeSeconds: &maxLifeTime,
PodStatusPhases: []string{"Pending"},
},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p9, *p10},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "does not evict pvc pods with ignorePvcPods set to true",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
ignorePvcPods: true,
},
{
description: "evicts pvc pods with ignorePvcPods set to false (or unset)",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Two old pods with different labels, 1 selected by labelSelector",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{"foo": "bar"},
},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p12, *p13},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
}
for _, tc := range testCases {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: tc.pods}, nil
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
return true, node, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
[]*v1.Node{node},
false,
false,
tc.ignorePvcPods,
)
PodLifeTime(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
PodLifeTime(ctx, fakeClient, tc.strategy, []*v1.Node{node}, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != tc.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.description, tc.expectedEvictedPodCount, podsEvicted)

View File

@@ -18,7 +18,6 @@ package strategies
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
@@ -27,66 +26,21 @@ import (
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
func validateRemovePodsHavingTooManyRestartsParams(params *api.StrategyParameters) error {
if params == nil || params.PodsHavingTooManyRestarts == nil || params.PodsHavingTooManyRestarts.PodRestartThreshold < 1 {
return fmt.Errorf("PodsHavingTooManyRestarts threshold not set")
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
return nil
}
// RemovePodsHavingTooManyRestarts removes the pods that have too many restarts on node.
// There are too many cases leading this issue: Volume mount failed, app error due to nodes' different settings.
// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages.
func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
if err := validateRemovePodsHavingTooManyRestartsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsHavingTooManyRestarts parameters")
if strategy.Params == nil || strategy.Params.PodsHavingTooManyRestarts == nil || strategy.Params.PodsHavingTooManyRestarts.PodRestartThreshold < 1 {
klog.V(1).Infof("PodsHavingTooManyRestarts thresholds not set")
return
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
return
}
var includedNamespaces, excludedNamespaces []string
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
}
nodeFit := false
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(strategy.Params.LabelSelector),
)
klog.V(1).Infof("Processing node: %s", node.Name)
pods, err := podutil.ListPodsOnANode(ctx, client, node, podEvictor.IsEvictable)
if err != nil {
klog.ErrorS(err, "Error listing a nodes pods", "node", klog.KObj(node))
klog.Errorf("Error when list pods at node %s", node.Name)
continue
}
@@ -100,7 +54,7 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
continue
}
if _, err := podEvictor.EvictPod(ctx, pods[i], node, "TooManyRestarts"); err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}

View File

@@ -22,8 +22,7 @@ import (
"fmt"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
@@ -82,26 +81,7 @@ func initPods(node *v1.Node) []v1.Pod {
func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("node1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("node2", 2000, 3000, 10, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: "hardware",
Value: "gpu",
Effect: v1.TaintEffectNoSchedule,
},
}
})
node3 := test.BuildTestNode("node3", 2000, 3000, 10, func(node *v1.Node) {
node.Spec = v1.NodeSpec{
Unschedulable: true,
}
})
pods := initPods(node1)
createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32, nodeFit bool) api.DeschedulerStrategy {
createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32) api.DeschedulerStrategy {
return api.DeschedulerStrategy{
Enabled: enabled,
Params: &api.StrategyParameters{
@@ -109,98 +89,76 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
PodRestartThreshold: restartThresholds,
IncludingInitContainers: includingInitContainers,
},
NodeFit: nodeFit,
},
}
}
tests := []struct {
description string
nodes []*v1.Node
pods []v1.Pod
strategy api.DeschedulerStrategy
expectedEvictedPodCount int
maxPodsToEvictPerNode int
}{
{
description: "All pods have total restarts under threshold, no pod evictions",
strategy: createStrategy(true, true, 10000, false),
nodes: []*v1.Node{node1},
strategy: createStrategy(true, true, 10000),
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 0,
},
{
description: "Some pods have total restarts bigger than threshold",
strategy: createStrategy(true, true, 1, false),
nodes: []*v1.Node{node1},
strategy: createStrategy(true, true, 1),
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions",
strategy: createStrategy(true, true, 1*25, false),
nodes: []*v1.Node{node1},
description: "Nine pods have total restarts equals threshold(includingInitContainers=true), 6 pods evictions",
strategy: createStrategy(true, true, 1*25),
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 5 pod evictions",
strategy: createStrategy(true, false, 1*25, false),
nodes: []*v1.Node{node1},
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 5 pods evictions",
strategy: createStrategy(true, false, 1*25),
expectedEvictedPodCount: 5,
maxPodsToEvictPerNode: 0,
},
{
description: "All pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions",
strategy: createStrategy(true, true, 1*20, false),
nodes: []*v1.Node{node1},
description: "All pods have total restarts equals threshold(includingInitContainers=true), 6 pods evictions",
strategy: createStrategy(true, true, 1*20),
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 6 pod evictions",
strategy: createStrategy(true, false, 1*20, false),
nodes: []*v1.Node{node1},
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 6 pods evictions",
strategy: createStrategy(true, false, 1*20),
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Five pods have total restarts bigger than threshold(includingInitContainers=true), but only 1 pod eviction",
strategy: createStrategy(true, true, 5*25+1, false),
nodes: []*v1.Node{node1},
strategy: createStrategy(true, true, 5*25+1),
expectedEvictedPodCount: 1,
maxPodsToEvictPerNode: 0,
},
{
description: "Five pods have total restarts bigger than threshold(includingInitContainers=false), but only 1 pod eviction",
strategy: createStrategy(true, false, 5*20+1, false),
nodes: []*v1.Node{node1},
strategy: createStrategy(true, false, 5*20+1),
expectedEvictedPodCount: 1,
maxPodsToEvictPerNode: 0,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3), 3 pod evictions",
strategy: createStrategy(true, true, 1, false),
nodes: []*v1.Node{node1},
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3), 3 pods evictions",
strategy: createStrategy(true, true, 1),
expectedEvictedPodCount: 3,
maxPodsToEvictPerNode: 3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is tained, 0 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is not schedulable, 0 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 3,
},
}
for _, tc := range tests {
node := test.BuildTestNode("node1", 2000, 3000, 10, nil)
pods := initPods(node)
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
@@ -209,16 +167,14 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
"v1",
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
false,
false,
[]*v1.Node{node},
false,
)
RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, []*v1.Node{node}, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)

Some files were not shown because too many files have changed in this diff Show More