1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 21:31:18 +01:00

Compare commits

..

198 Commits

Author SHA1 Message Date
Kubernetes Prow Robot
9c110c4004 Merge pull request #791 from JaneLiuL/master
Bump to k8s 1.24.0
2022-05-12 12:06:33 -07:00
Kubernetes Prow Robot
0eddf7f108 Merge pull request #792 from pravarag/update-docs-1.24
Update Docs and Manifests for v0.24.0
2022-05-12 11:31:15 -07:00
Kubernetes Prow Robot
3c8d6c4d53 Merge pull request #795 from damemi/update-e2e
Update e2e test versions
2022-05-12 09:29:14 -07:00
Mike Dame
6e84d0a6ba React to removal of offensive language
https://github.com/kubernetes/kubeadm/issues/2200 went into effect in 1.24, so
e2es broke without the update.
2022-05-12 15:35:07 +00:00
Mike Dame
fb1df468ad golint fix 2022-05-12 14:21:34 +00:00
Mike Dame
ac4d576df8 Update e2e test versions 2022-05-12 14:16:53 +00:00
Pravar Agrawal
314ad65b04 Update docs and manifests for v0.24.0 2022-05-04 22:08:49 +05:30
JaneLiuL
969a618933 Bump to k8s 1.24.0 2022-05-04 10:17:47 +08:00
Kubernetes Prow Robot
028f205e8c Merge pull request #790 from ingvagabund/636
Added request considerations to NodeFit Feature [#636 follow up]
2022-05-03 19:09:16 -07:00
Jan Chaloupka
3eca2782d4 Addressing review comments
Both LowNode and HighNode utilization strategies evict only as many pods
as there's free resources on other nodes. Thus, the resource fit test
is always true by definition.
2022-04-28 18:54:54 +02:00
RyanDevlin
16eb9063b6 NodeFit parameter now considers pod requests 2022-04-28 10:16:52 +02:00
Kubernetes Prow Robot
eac3b4b54a Merge pull request #788 from ryan4yin/master
fix: incorrect yaml indentation in readme
2022-04-26 06:46:53 -07:00
Ryan Yin
d08cea731a fix: incorrect indentation 2022-04-26 06:05:12 +08:00
Kubernetes Prow Robot
0fc5ba9316 Merge pull request #787 from JaneLiuL/master
bump to k8s 1.24-rc.0
2022-04-25 12:05:43 -07:00
JaneLiuL
ecbd10afe2 bump to k8s 1.24-rc.0 2022-04-21 09:11:04 +08:00
Kubernetes Prow Robot
e5ed0540f2 Merge pull request #779 from pravarag/user-docs-typo
Fix missing param in user-guide for PodLifeTime strategy
2022-04-11 01:44:06 -07:00
Pravar Agrawal
4e972a7602 fix missing param in user-guide 2022-04-07 10:02:26 +05:30
Kubernetes Prow Robot
ae20b5b034 Merge pull request #732 from eminaktas/feature/metric-scape
feat: Add metric scrape configs in Helm Chart
2022-03-30 07:06:27 -07:00
Kubernetes Prow Robot
406e3ed5b3 Merge pull request #771 from dineshbhor/fix-highnodeutilization-node-sorting
Sort nodes in ascending order for HighNodeUtilization
2022-03-29 02:58:47 -07:00
dineshbhor
7589aaf00b Sort nodes in ascending order for HighNodeUtilization 2022-03-29 17:54:18 +09:00
eminaktas
ca90b53913 feat: Add metric scrape configs in Helm Chart
Signed-off-by: eminaktas <emin.aktas@trendyol.com>
2022-03-28 23:41:56 +03:00
Kubernetes Prow Robot
238eebeaca Merge pull request #722 from Dentrax/feature/leaderelection
feat(leaderelection): impl leader election for HA Deployment
2022-03-28 09:39:23 -07:00
Kubernetes Prow Robot
cf59d08193 Merge pull request #751 from HelmutLety/redo_#473
feat: Add DeviationThreshold Paramter for LowNodeUtilization, (Previous attempt - #473 )
2022-03-28 03:53:24 -07:00
HelmutLety
2ea65e69dc feat(LowNodeUtilization): useDeviationThresholds, redo of #473
[751]: normalize Percentage in nodeutilization and clean the tests
2022-03-28 12:35:01 +02:00
Kubernetes Prow Robot
7f6a2a69b0 Merge pull request #777 from JacobHenner/support-taint-exclusions
Add RemovePodsViolatingNodeTaints taint exclusion
2022-03-28 02:47:23 -07:00
Jacob Henner
ac3362149b Add RemovePodsViolatingNodeTaints taint exclusion
Add taint exclusion to RemovePodsViolatingNodeTaints. This permits node
taints to be ignored by allowing users to specify ignored taint keys or
ignored taint key=value pairs.
2022-03-27 13:48:40 -04:00
Furkan
0a52af9ab8 feat(leaderelection): impl leader election
Signed-off-by: Furkan <furkan.turkal@trendyol.com>
Signed-off-by: eminaktas <eminaktas34@gmail.com>
Co-authored-by: Emin <emin.aktas@trendyol.com>
Co-authored-by: Yasin <yasintaha.erol@trendyol.com>
2022-03-25 14:33:14 +03:00
Kubernetes Prow Robot
07bbdc61c4 Merge pull request #762 from ingvagabund/nodeutilization-refactor
Promote NodeUsage to NodeInfo, evaluate thresholds separately
2022-03-15 17:33:48 -07:00
Kubernetes Prow Robot
17595fdcfc Merge pull request #764 from ingvagabund/taints-prefer-no-scheduler
RemovePodsViolatingNodeTaints: optionally include PreferNoSchedule taint
2022-03-14 17:36:10 -07:00
Jan Chaloupka
285523f0d9 RemovePodsViolatingNodeTaints: optionally include PreferNoSchedule taint 2022-03-14 16:46:03 +01:00
Kubernetes Prow Robot
c55a897599 Merge pull request #759 from JaneLiuL/master
OWNERS: add janeliul as a reviewer
2022-03-11 10:29:07 -08:00
Jan Chaloupka
52ff50f2d1 Promote NodeUsage to NodeInfo, evaluate thresholds separately 2022-03-11 13:52:37 +01:00
Jan Chaloupka
8ebf3fb323 nodeutilization: move node resource threshold value computation under a separate function 2022-03-11 12:46:11 +01:00
Kubernetes Prow Robot
0e0ae8df90 Merge pull request #761 from ingvagabund/TestTooManyRestarts-II
[e2e] TestTooManyRestarts: check if container status is set before accessing
2022-03-11 02:29:06 -08:00
Jan Chaloupka
bd3daa82d3 [e2e] TestTooManyRestarts: check if container status is set before accessing 2022-03-11 10:35:49 +01:00
Kubernetes Prow Robot
60a15f0392 Merge pull request #760 from ingvagabund/TestTooManyRestarts
[e2e] TestTooManyRestarts: check err and len before accessing pod items
2022-03-11 01:09:07 -08:00
Jan Chaloupka
d98cb84568 [e2e] TestTooManyRestarts: check err and len before accessing pod items 2022-03-11 09:45:05 +01:00
Kubernetes Prow Robot
6ab01eca63 Merge pull request #758 from hiroyaonoe/add-doc-about-max-no-of-pods-to-evict-per-namespace-policy
Update docs for maxNoOfPodsToEvictPerNamespace
2022-03-10 11:25:21 -08:00
Kubernetes Prow Robot
584ac2d604 Merge pull request #757 from prune998/prune/taint-logs
add conflicting taint to the logs
2022-03-10 05:37:35 -08:00
prune
448dc4784c add conflicting taint to the logs
log when count mismatch


simplified logic to log blocking taints
2022-03-10 08:05:42 -05:00
JaneLiuL
3ca77e7a3d OWNERS: add janeliul as a reviewer 2022-03-08 07:48:11 +08:00
Hiroya Onoe
01e7015b97 Update docs for maxNoOfPodsToEvictPerNamespace 2022-03-07 16:21:04 +09:00
Kubernetes Prow Robot
fd5a8c7d78 Merge pull request #739 from JaneLiuL/master
Share links to all descheduler ehnacements proposals in the project repo
2022-03-02 09:55:14 -08:00
Kubernetes Prow Robot
43148ecd0c Merge pull request #740 from JaneLiuL/doc-npd
fix doc about NPD description
2022-03-01 09:59:55 -08:00
Kubernetes Prow Robot
16501978dc Merge pull request #748 from damemi/update-v0.23.1
Update manifests and doc for v0.23.1
2022-03-01 07:47:46 -08:00
Mike Dame
1b4e48b006 Update manifests and doc for v0.23.1 2022-02-28 19:06:50 +00:00
Kubernetes Prow Robot
da6a3e063f Merge pull request #744 from antonio-te/master
Update golang image
2022-02-28 10:41:46 -08:00
Antonio Gurgel
5784c0cc04 Update golang image
1.17.3 is affected by CVE-2021-44716.
2022-02-28 07:22:26 -08:00
JaneLiuL
254a3a9ec1 Share links to all descheduler ehnacements proposals in the project repository 2022-02-26 12:27:35 +08:00
JaneLiuL
328c695141 fix doc about NPD description 2022-02-26 12:23:33 +08:00
Kubernetes Prow Robot
3ab0268c5a Merge pull request #733 from JaneLiuL/master
remove MostRequestedPriority from doc since already deprecated
2022-02-24 04:32:32 -08:00
Jane Liu L
cd8dbdd1e2 remove MostRequestedPriority from doc since already deprecated 2022-02-24 09:00:36 +08:00
Kubernetes Prow Robot
54c50c5390 Merge pull request #731 from jklaw90/fix-ctx-cron
Bugfix: Cronjob ctx cancel
2022-02-22 11:35:18 -08:00
Julian Lawrence
a2cbc25397 updated to handle cronjob flow 2022-02-22 08:52:06 -08:00
Kubernetes Prow Robot
bd81f6436e Merge pull request #708 from damemi/utilization-values-readme
Clarify resource calculations in NodeUtilization strategy Readmes
2022-02-22 04:47:46 -08:00
Kubernetes Prow Robot
30be19b04e Merge pull request #715 from eminaktas/values-fix
fix: Remove deprecated parameters from cmdOptions and add the parameters under policy
2022-02-18 05:08:23 -08:00
Kubernetes Prow Robot
3c251fb09d Merge pull request #726 from jklaw90/log-eviction-node
Eviction Logs
2022-02-15 04:08:03 -08:00
Julian Lawrence
224e2b078f updated logs to help with debugging 2022-02-14 18:27:53 -08:00
Kubernetes Prow Robot
dd80d60f4f Merge pull request #716 from eminaktas/imagepullsecret
fix: add imagePullSecrets for deployment resource
2022-02-14 05:27:29 -08:00
Kubernetes Prow Robot
e88837a349 Merge pull request #704 from ingvagabund/update-chart-readme
Update charts README to reflect the new parameters
2022-02-11 14:23:46 -08:00
Kubernetes Prow Robot
5901f8af1b Merge pull request #697 from a7i/code-reviewer
OWNERS: add a7i as a reviewer
2022-02-11 08:14:23 -08:00
Kubernetes Prow Robot
0d1704a192 Merge pull request #717 from JaneLiuL/release-1.23.1
[release-1.23.1] Update helm chart version to v0.23.1
2022-02-08 04:34:54 -08:00
JaneLiuL
c5878b18c6 Update helm chart version to v0.23.1 2022-02-08 20:21:57 +08:00
emin.aktas
ff1954b32e fix: add imagePullSecrets for deployment resource
Signed-off-by: emin.aktas <eminaktas34@gmail.com>
Co-authored-by: yasintahaerol <yasintahaerol@gmail.com>
Co-authored-by: Dentrax <furkan.turkal@trendyol.com>
2022-02-07 18:05:18 +03:00
emin.aktas
4c8040bbaf fix: Remove deprecated parameters from cmdOptions and add the parameters under policy 2022-02-07 15:14:55 +03:00
Kubernetes Prow Robot
deaa314492 Merge pull request #712 from JaneLiuL/helm
fix helmchart fail to watch namespace issue
2022-02-06 10:36:51 -08:00
Jane Liu L
9c653a2274 fix helmchart fail to watch namespace issue 2022-02-04 18:34:21 +08:00
Kubernetes Prow Robot
8d37557743 Merge pull request #709 from damemi/update-helm-23
Update helm chart version to v0.23
2022-02-03 12:10:58 -08:00
Mike Dame
5081ad84b5 Update helm chart version to v0.23 2022-02-03 14:57:18 -05:00
Mike Dame
c51c066cd1 Clarify resource calculations in NodeUtilization strategy Readmes
This adds text explaining the resource calculation in LowNodeUtilization and HighNodeUtilization
2022-01-30 12:59:47 -05:00
Kubernetes Prow Robot
afb1d75ce1 Merge pull request #660 from martin-magakian/features/add_affinity_option
Adding 'affinity' support to run 'descheduler' in CronJob or Deployment
2022-01-27 05:56:27 -08:00
Jan Chaloupka
90e6174fdd Update charts README to reflect the new parameters 2022-01-27 14:46:15 +01:00
Kubernetes Prow Robot
8e3ef9a6b3 Merge pull request #694 from sharkannon/master
Updates to include annotations to the service account
2022-01-27 05:42:26 -08:00
Kubernetes Prow Robot
778a18c550 Merge pull request #700 from jklaw90/root-ctx
Use the root context cancellation
2022-01-27 05:08:25 -08:00
Julian Lawrence
1a98a566b3 adding cancelation from sigint sigterm 2022-01-25 00:10:09 -08:00
Kubernetes Prow Robot
a643c619c9 Merge pull request #699 from ingvagabund/evict-pods-report-metrics-indendent-of-the-dry-mode
Evictor: report successful eviction independently of the dry-mode
2022-01-20 14:16:29 -08:00
Jan Chaloupka
203388ff1a Evictor: report successful eviction independently of the dry-mode
Dry mode currently does not report metrics when the eviction succeeds
2022-01-20 21:23:19 +01:00
Kubernetes Prow Robot
2844f80a35 Merge pull request #677 from ingvagabund/accumulated-eviction
Use a fake client when evicting pods by individual strategies to accumulate the evictions
2022-01-20 08:15:52 -08:00
Jan Chaloupka
901a16ecbc Do not collect the metrics when the metrics server is not enabled 2022-01-20 17:04:15 +01:00
Jan Chaloupka
e0f086ff85 Use a fake client when evicting pods by individual strategies to accumulate the evictions
Currently, when the descheduler is running with the --dry-run on, no strategy actually
evicts a pod so every strategy always starts with a complete list of
pods. E.g. when the PodLifeTime strategy evicts few pods, the RemoveDuplicatePods
strategy still takes into account even the pods eliminated by the PodLifeTime
strategy. Which does not correspond to the real case scenarios as the
same pod can be evicted multiple times. Instead, use a fake client and
evict/delete the pods from its cache so the strategies evict each pod
at most once as it would be normally done in a real cluster.
2022-01-20 17:04:05 +01:00
Amir Alavi
0251935268 OWNERS: add a7i as a reviewer 2022-01-18 09:14:44 -05:00
Stephen Herd
8752a28025 Merge branch 'kubernetes-sigs-master' 2022-01-13 12:52:36 -08:00
Stephen Herd
24884c7568 Rebase from master 2022-01-13 12:52:06 -08:00
Kubernetes Prow Robot
175f648045 Merge pull request #695 from a7i/liveness-template
make livenessprobe consistent across manifests
2022-01-12 13:37:40 -08:00
Amir Alavi
f50a3fa119 make livenessprobe consistent across manifests; make helm chart configurable via values.yaml 2022-01-12 11:49:17 -05:00
Kubernetes Prow Robot
551eced42a Merge pull request #688 from babygoat/evict-failed-without-ownerrefs
feat: support eviction of failed bare pods
2022-01-11 12:31:15 -08:00
Stephen Herd
3635a8171c Updates to include annotations to the service account, needed for things such as Workload Identity in Google Cloud 2022-01-11 11:55:05 -08:00
Kubernetes Prow Robot
796f347305 Merge pull request #692 from jklaw90/sliding-until
NonSlidingUntil for deployment
2022-01-11 06:21:16 -08:00
Kubernetes Prow Robot
13abbe7f09 Merge pull request #693 from developer-guy/patch-1
Update NOTES.txt
2022-01-10 05:11:13 -08:00
Kubernetes Prow Robot
e4df54d2d1 Merge pull request #685 from JaneLiuL/master
add  liveness probe
2022-01-10 04:29:12 -08:00
Jane Liu L
c38f617e40 add liveness probe 2022-01-10 09:56:53 +08:00
Kubernetes Prow Robot
e6551564c4 Merge pull request #691 from RyanDevlin/waitForNodes
Eliminated race condition in E2E tests
2022-01-07 06:16:30 -08:00
Batuhan Apaydın
3a991dd50c Update NOTES.txt
Signed-off-by: Batuhan Apaydın <batuhan.apaydin@trendyol.com>
Co-authored-by: Furkan Türkal <furkan.turkal@trendyol.com>
Co-authored-by: Emin Aktaş <emin.aktas@trendyol.com>
Co-authored-by: Necatican Yıldırım <necatican.yildirim@trendyol.com>
Co-authored-by: Fatih Sarhan <fatih.sarhan@trendyol.com>
2022-01-07 13:42:00 +03:00
Julian Lawrence
77cb406052 updated until -> sliding until 2022-01-06 12:55:10 -08:00
RyanDevlin
921a5680ab Eliminated race condition in E2E tests 2022-01-06 09:36:13 -05:00
babygoat
1529180d70 feat: support eviction of failed bare pods
This patch adds the policy(evictFailedBarePods) to allow the failed
pods without ownerReferences to be evicted. For backward compatibility,
disable the policy by default. Address #644.
2022-01-06 01:07:41 +08:00
Kubernetes Prow Robot
2d9143d129 Merge pull request #687 from jklaw90/error-comment
Comment update for metrics
2022-01-04 06:52:52 -08:00
Kubernetes Prow Robot
e9c0833b6f Merge pull request #689 from ingvagabund/run-hack-update-generated-conversions-sh
run ./hack/update-* scripts
2022-01-04 06:34:52 -08:00
Jan Chaloupka
8462cf56d7 run ./hack/update-* scripts 2022-01-04 09:37:01 +01:00
Julian Lawrence
a60d6a527d updated comment to reflect actual value 2021-12-29 10:56:10 -08:00
Kubernetes Prow Robot
2b23694704 Merge pull request #682 from jklaw90/chart-labels
commonLabels value for chart
2021-12-26 06:15:15 -08:00
Julian Lawrence
d0a95bee2f fixed default value for common labels 2021-12-20 08:24:30 -08:00
Julian Lawrence
57a910f5d1 adding commonLabels value 2021-12-18 23:31:52 -08:00
Kubernetes Prow Robot
ccaedde183 Merge pull request #661 from kirecek/enhc/include-pod-reason
Add pod.Status.Reason to the list of reasons
2021-12-17 13:11:55 -08:00
Erik Jankovič
2020642b6f chore: add pod.Status.Reason to the list of reasons
Signed-off-by: Erik Jankovič <erik.jankovic@gmail.com>
2021-12-17 18:37:53 +01:00
Kubernetes Prow Robot
96ff5d2dd9 Merge pull request #680 from ingvagabund/klog-output-stdout
Set the klog output to stdout by default
2021-12-16 05:31:18 -08:00
Jan Chaloupka
d8718d7db3 Set the klog output to stdout by default
Also, one needs to set --logtostderr=false to properly log into the stdout
2021-12-16 11:22:40 +01:00
Kubernetes Prow Robot
1e5165ba9f Merge pull request #670 from autumn0207/improve_pod_eviction_metrics
Add node name label to the counter metric for evicted pods
2021-12-16 01:49:18 -08:00
autumn0207
8e74f8bd77 improve pod eviction metrics 2021-12-16 17:06:22 +08:00
Kubernetes Prow Robot
2424928019 Merge pull request #667 from damemi/1.23-rc.0
bump: k8s to 1.23
2021-12-15 06:56:20 -08:00
Jan Chaloupka
e6314d2c7e Init the klog directly
Since 3948cb8d1b (diff-465167b08358906be13f9641d4798c6e8ad0790395e045af8ace4d08223fa922R78)
the klog verbosity level gets always overriden.
2021-12-15 09:23:20 -05:00
Kubernetes Prow Robot
271ee3c7e3 Merge pull request #678 from a7i/golangci-fix
fix: install golangci using from the golangci repo
2021-12-15 02:20:19 -08:00
Amir Alavi
e58686c142 fix: install golangci using from the golangci repo 2021-12-14 13:18:19 -05:00
Kubernetes Prow Robot
0b2c10d6ce Merge pull request #673 from Garrybest/pr_pod_cache
list pods assigned to a node by pod informer cache
2021-12-14 01:32:04 -08:00
Garrybest
cac3b9185b reform all test files
Signed-off-by: Garrybest <garrybest@foxmail.com>
2021-12-11 19:43:16 +08:00
Mike Dame
94888e653c Move klog initialization to cli.Run() 2021-12-10 12:00:11 -05:00
Mike Dame
936578b238 Update k8s version in helm test 2021-12-10 10:14:47 -05:00
Mike Dame
4fa7bf978c run hack/update-generated-deep-copies.sh 2021-12-10 10:02:39 -05:00
Mike Dame
2f7c496944 React to 1.23 bump
Logging validation functions changed in upstream commit
54ecfcdac8.
This uses the new function name.
2021-12-10 10:02:26 -05:00
Mike Dame
5fe3ca86ff bump: k8s to 1.23 2021-12-10 10:02:14 -05:00
Garrybest
0ff8ecb41e reform all strategies by using getPodsAssignedToNode
Signed-off-by: Garrybest <garrybest@foxmail.com>
2021-12-10 19:28:51 +08:00
Garrybest
08ed129a07 reform ListPodsOnANode by using pod informer and indexer
Signed-off-by: Garrybest <garrybest@foxmail.com>
2021-12-10 19:25:20 +08:00
Kubernetes Prow Robot
49ad197dfc Merge pull request #658 from JaneLiuL/master
Add maxNoOfPodsToEvictPerNamespace policy
2021-12-03 01:50:27 -08:00
Kubernetes Prow Robot
82201d0e48 Add maxNoOfPodsToEvictPerNamespace policy 2021-12-03 10:58:37 +08:00
Kubernetes Prow Robot
2b95332e8c Merge pull request #665 from spiffxp/use-k8s-infra-for-gcb-image
images: use k8s-staging-test-infra/gcb-docker-gcloud
2021-11-30 13:59:01 -08:00
Aaron Crickenberger
e8ed62e540 images: use k8s-staging-test-infra/gcb-docker-gcloud 2021-11-30 13:12:18 -08:00
Kubernetes Prow Robot
e5725de7bb Merge pull request #664 from stpabhi/dev
fix typo minPodLifeTimeSeconds
2021-11-30 08:10:56 -08:00
Abhilash Pallerlamudi
c47e811937 fix typo minPodLifeTimeSeconds
Signed-off-by: Abhilash Pallerlamudi <stp.abhi@gmail.com>
2021-11-29 17:51:40 -08:00
Kubernetes Prow Robot
e0bac4c371 Merge pull request #662 from ingvagabund/drop-deprecated-flags
Drop deprecated flags
2021-11-29 08:43:23 -08:00
Jan Chaloupka
73a7adf572 Drop deprecated flags 2021-11-29 17:12:59 +01:00
Kubernetes Prow Robot
5cf381a817 Merge pull request #663 from ingvagabund/bump-go-to-1.17
Bump go version in go.mod to go1.17
2021-11-29 08:01:23 -08:00
Jan Chaloupka
4603182320 Bump go version in go.mod to go1.17 2021-11-29 16:49:35 +01:00
Martin Magakian
ad207775ff Adding 'affinity' support to run 'descheduler' in CronJob or Deployment 2021-11-18 11:08:36 +01:00
Kubernetes Prow Robot
d0f11a41c0 Merge pull request #639 from JaneLiuL/master
Ignore Pods With Deletion Timestamp
2021-11-15 08:44:49 -08:00
Jane Liu L
c7524705b3 Ignore Pods With Deletion Timestamp 2021-11-10 09:32:11 +08:00
Kubernetes Prow Robot
50f9513cbb Merge pull request #642 from wking/clarify-RemovePodsHavingTooManyRestarts
README: Clarify podRestartThreshold applying to the sum over containers
2021-10-13 03:15:49 -07:00
W. Trevor King
6fd80ba29c README: Clarify podRestartThreshold applying to the sum over containers
calcContainerRestarts sums over containers.  The new language makes
that clear, avoiding potential confusion vs. an altenative that looked
for pods where a single container had passed the configured threshold.
For example, with three containers with 50 restarts and a threshold of
100, the actual "sum over containers" logic makes that pod a candidate
for descheduling, but the "largest single container restart count"
hypothetical would not have made it a candidate.

Also shifts labelSelector into the parameter table, because when it
was added in 29ade13ce7 (README and e2e-testcase add for
labelSelector, 2021-03-02, #510), it landed a few lines too high.
2021-10-07 14:51:26 -07:00
Kubernetes Prow Robot
5b557941fa Merge pull request #627 from JaneLiuL/master
Add E2E test case cover duplicatepods strategy
2021-10-01 00:01:22 -07:00
Kubernetes Prow Robot
c6229934a0 Merge pull request #637 from KohlsTechnology/helm-suspend-docs
Document suspend helm chart configuration option
2021-09-30 23:47:22 -07:00
Sean Malloy
ed28eaeccc Document suspend helm chart configuration option 2021-09-30 23:30:10 -05:00
Kubernetes Prow Robot
3be910c238 Merge pull request #621 from uthark/oatamanenko/deleted
Ignore pods being deleted
2021-09-30 21:21:22 -07:00
Kubernetes Prow Robot
d96dd6da2d Merge pull request #632 from a7i/amir/failedpods-crash
RemoveFailedPods: guard against nil descheduler strategy (e.g. in case of default that loads all strategies)
2021-09-29 01:02:49 -07:00
Amir Alavi
f7c26ef41f e2e tests for RemoveFailedPods strategy
Fix priority class default
2021-09-26 20:39:32 -04:00
Jane Liu L
57ad9cc91b Add E2E test case cover tooManyRestarts strategy 2021-09-26 09:10:17 +08:00
Kubernetes Prow Robot
926339594d Merge pull request #622 from yutachaos/feature/added_suspend_parameter
Added support for cronjob suspend
2021-09-22 09:18:01 -07:00
Amir Alavi
1ba53ad68c e2e TestTopologySpreadConstraint: ensure pods are running before checking for topology spread across domains 2021-09-20 18:18:47 -04:00
Amir Alavi
6eb37ce079 RemoveFailedPods: guard against nil descheduler strategy (e.g. in case of default that loads all strategies) 2021-09-20 11:20:54 -04:00
Kubernetes Prow Robot
54d660eee0 Merge pull request #629 from chenkaiyue/fix-node-affinity-test
fix duplicate code in node_affinity_test.go
2021-09-16 01:59:46 -07:00
yutachaos
cf219fbfae Added helm chart suspend parameter
Signed-off-by: yutachaos <18604471+yutachaos@users.noreply.github.com>
2021-09-16 14:32:09 +09:00
kaiyuechen
d1d9ea0c48 fix duplicate code in node_affinity_test.go 2021-09-16 10:39:52 +08:00
Oleg Atamanenko
4448d9c670 Ignore pods being deleted 2021-09-15 00:05:51 -07:00
Kubernetes Prow Robot
3909f3acae Merge pull request #623 from damemi/release-1.22
Update Helm chart version to 0.22.0
2021-09-08 13:13:56 -07:00
Mike Dame
9f1274f3ab Update Helm chart version to 0.22.0 2021-09-08 16:00:56 -04:00
Kubernetes Prow Robot
e6926e11ea Merge pull request #617 from KohlsTechnology/docs-1.22
Update Docs and Manifests for v0.22.0
2021-08-31 09:31:37 -07:00
Sean Malloy
16228c9dd1 Update Docs and Manifests for v0.22.0
* Added v0.22 references to README
* Update k8s manifests with v0.22.0 references
* Added table with list of supported architectures by release
2021-08-31 00:24:18 -05:00
Kubernetes Prow Robot
57dabbca5c Merge pull request #597 from a7i/amir/e2e-topology
Add e2e tests for TopologySpreadConstraint
2021-08-30 22:00:28 -07:00
Kubernetes Prow Robot
04439c6e64 Merge pull request #610 from a7i/failed-pods
Introduce RemoveFailedPods strategy
2021-08-30 11:30:09 -07:00
Amir Alavi
0e0e688fe8 Introduce RemoveFailedPods strategy 2021-08-30 14:17:52 -04:00
Kubernetes Prow Robot
0603de4353 Merge pull request #613 from derdanne/derdanne/highnodeutilization-readme-591
nodeutilization strategy: "numberOfNodes" should work as documented
2021-08-24 08:29:14 -07:00
Kubernetes Prow Robot
ea911db6dc Merge pull request #615 from KohlsTechnology/bump-1.22
Bump To k8s 1.22.0
2021-08-19 07:07:24 -07:00
Sean Malloy
c079c7aaae Bump To k8s 1.22.0 2021-08-19 00:40:38 -05:00
Kubernetes Prow Robot
5420988a28 Merge pull request #614 from KohlsTechnology/bump-go-version
Bump To Go 1.16.7
2021-08-18 02:10:08 -07:00
Sean Malloy
b56a1ab80a Bump To Go 1.16.7
The main k/k repo was updated to Go 1.16.7 for k8s
v1.22.0 release. See below PR for reference.

https://github.com/kubernetes/kubernetes/pull/104200
2021-08-17 23:47:45 -05:00
Daniel Klockenkämper
a6b34c1130 nodeutilization strategy: "numberOfNodes" should work as documented 2021-08-17 13:29:07 +00:00
Amir Alavi
0de8002b7d Update gce scripts to spread nodes over 2 zones 2021-08-16 22:41:37 -04:00
Amir Alavi
84d648ff60 Add e2e tests for TopologySpreadConstraint 2021-08-16 22:39:31 -04:00
Kubernetes Prow Robot
6ad6f6fce5 Merge pull request #592 from chrisjohnson00/issue-591
docs: adding clarification to HighNodeUtilization's purpose
2021-08-12 21:38:22 -07:00
Kubernetes Prow Robot
b7100ad871 Merge pull request #582 from praxist/helm-deployment
Update helm chart for running as deployment
2021-08-12 07:29:47 -07:00
Matthew Leung
38c0f1c639 Update helm chart for running as deployment 2021-08-11 11:56:14 -07:00
Kubernetes Prow Robot
64d7901d82 Merge pull request #602 from a7i/balance-domains-aboveavg
TopologySpreadConstraint: advance above avg index when at ideal average
2021-08-10 05:47:25 -07:00
Kubernetes Prow Robot
ab1015e5fa Merge pull request #599 from a7i/amir/update-gce-images
Update GCE images to Ubuntu 18.04 LTS
2021-08-10 05:47:18 -07:00
Kubernetes Prow Robot
1753bf4422 Merge pull request #611 from a7i/docs-remove-redundant-strategies
Update README to remove redundant list of strategies
2021-08-02 08:11:22 -07:00
Amir Alavi
7cb44dca27 Update README to remove redundant list of strategies 2021-07-30 16:35:00 -04:00
Amir Alavi
2ec4b8b674 Update GCE images to Ubuntu 18.04 LTS 2021-07-30 16:14:20 -04:00
Kubernetes Prow Robot
0d0633488d Merge pull request #598 from jayonlau/clenup
Clean up extra spaces
2021-07-30 11:11:37 -07:00
Kubernetes Prow Robot
f3b3853d9d Merge pull request #562 from wsscc2021/helm-chart-tolerations
Add tolerations to cronjob in helm chart
2021-07-30 10:43:38 -07:00
Kubernetes Prow Robot
e550e5e22a Merge pull request #600 from a7i/topology-example
Add example for RemovePodsViolatingTopologySpreadConstraint
2021-07-28 05:37:34 -07:00
Kubernetes Prow Robot
2bf37ff495 Merge pull request #608 from a7i/master
Remove kind binary from repo
2021-07-28 01:13:35 -07:00
Amir Alavi
fa84ec6774 Remove kind binary from repo and add to gitignore 2021-07-27 08:28:55 -04:00
Kubernetes Prow Robot
e18e0416b1 Merge pull request #607 from a7i/fix-helm-test
Place bash shebang at the top of the script + Ensure Helm installed for run-helm-tests
2021-07-27 00:40:45 -07:00
Amir Alavi
34282162f8 Wait for job to start/finish in helm-test script 2021-07-26 23:53:36 -04:00
Amir Alavi
7a043d31be Ensure helm is installed and move shebang to top of file 2021-07-26 16:16:20 -04:00
Amir Alavi
b0e5d64bd7 TopologySpreadConstraint: advance above avg index when at ideal average when balancing domains 2021-07-13 22:55:10 -04:00
Amir Alavi
1c9ac2daee Add example for RemovePodsViolatingTopologySpreadConstraint 2021-07-09 22:56:59 -04:00
jayonlau
c6b67e8a6f Clean up extra spaces
Clean up extra spaces, although these errors are not important, they affect the code specification.
2021-07-09 15:40:35 +08:00
Kubernetes Prow Robot
2e4873d103 Merge pull request #594 from ikarldasan/master-to-main
Rename master to main
2021-07-06 05:50:18 -07:00
Karl Dasan
3e483c4d85 Rename master to main 2021-07-01 13:48:50 +05:30
Kubernetes Prow Robot
032ea70380 Merge pull request #549 from pravarag/verify-defaulters-gen
Add verify scripts for defaulters generator
2021-06-30 04:05:03 -07:00
Pravar Agrawal
df84dc4548 add verify script for defaulters gen 2021-06-30 09:41:43 +05:30
Chris Johnson
d4fa83f8bc chore: PR feedback 2021-06-25 12:30:28 -07:00
Chris Johnson
448dbceadd docs: adding clarification to HighNodeUtilization's purpose 2021-06-24 16:20:21 -07:00
Kubernetes Prow Robot
b83b064992 Merge pull request #589 from a7i/remove-kind-bin
Remove kind binary from repo
2021-06-16 00:19:59 -07:00
Amir Alavi
133a0049e3 Remove kind binary from repo 2021-06-15 18:29:06 -04:00
Kubernetes Prow Robot
50b1c1337d Merge pull request #561 from mekza/support_image_pull_secrets
Add support for private registry creds
2021-06-10 23:28:58 -07:00
Martin-Zack Mekkaoui
d5deed44ca Add support for private registry creds 2021-06-09 22:11:37 +02:00
Kubernetes Prow Robot
0f785b9530 Merge pull request #584 from damemi/update-go-badge
Update Go report card badge
2021-06-08 10:48:08 -07:00
Mike Dame
eb1f0ecc14 Update Go report card badge 2021-06-08 13:39:53 -04:00
Donggeun Lee
4ba48b018c Add tolerations to cronjob in helm chart 2021-05-04 19:56:08 +09:00
2785 changed files with 248129 additions and 90340 deletions

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@ vendordiff.patch
.idea/
*.code-workspace
.vscode/
kind

View File

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM golang:1.16.1
FROM golang:1.17.7
WORKDIR /go/src/sigs.k8s.io/descheduler
COPY . .

View File

@@ -24,8 +24,8 @@ ARCHS = amd64 arm arm64
LDFLAGS=-ldflags "-X ${LDFLAG_LOCATION}.version=${VERSION} -X ${LDFLAG_LOCATION}.buildDate=${BUILD} -X ${LDFLAG_LOCATION}.gitbranch=${BRANCH} -X ${LDFLAG_LOCATION}.gitsha1=${SHA1}"
GOLANGCI_VERSION := v1.30.0
HAS_GOLANGCI := $(shell ls _output/bin/golangci-lint)
GOLANGCI_VERSION := v1.46.1
HAS_GOLANGCI := $(shell ls _output/bin/golangci-lint 2> /dev/null)
# REGISTRY is the container registry to push
# into. The default is to push to the staging
@@ -43,7 +43,7 @@ IMAGE_GCLOUD:=$(REGISTRY)/descheduler:$(VERSION)
# In the future binaries can be uploaded to
# GCS bucket gs://k8s-staging-descheduler.
HAS_HELM := $(shell which helm)
HAS_HELM := $(shell which helm 2> /dev/null)
all: build
@@ -127,18 +127,21 @@ gen:
verify-gen:
./hack/verify-conversions.sh
./hack/verify-deep-copies.sh
./hack/verify-defaulters.sh
lint:
ifndef HAS_GOLANGCI
curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b ./_output/bin ${GOLANGCI_VERSION}
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b ./_output/bin ${GOLANGCI_VERSION}
endif
./_output/bin/golangci-lint run
lint-chart:
lint-chart: ensure-helm-install
helm lint ./charts/descheduler
test-helm: ensure-helm-install
./test/run-helm-tests.sh
ensure-helm-install:
ifndef HAS_HELM
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 && chmod 700 ./get_helm.sh && ./get_helm.sh
endif
helm lint ./charts/descheduler
test-helm:
./test/run-helm-tests.sh

2
OWNERS
View File

@@ -10,6 +10,8 @@ reviewers:
- seanmalloy
- ingvagabund
- lixiang233
- a7i
- janeliul
emeritus_approvers:
- aveshagarwal
- k82cn

212
README.md
View File

@@ -1,4 +1,4 @@
[![Go Report Card](https://goreportcard.com/badge/kubernetes-sigs/descheduler)](https://goreportcard.com/report/sigs.k8s.io/descheduler)
[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/descheduler)](https://goreportcard.com/report/sigs.k8s.io/descheduler)
![Release Charts](https://github.com/kubernetes-sigs/descheduler/workflows/Release%20Charts/badge.svg)
# Descheduler for Kubernetes
@@ -42,6 +42,7 @@ Table of Contents
- [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint)
- [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts)
- [PodLifeTime](#podlifetime)
- [RemoveFailedPods](#removefailedpods)
- [Filter Pods](#filter-pods)
- [Namespace filtering](#namespace-filtering)
- [Priority filtering](#priority-filtering)
@@ -49,6 +50,8 @@ Table of Contents
- [Node Fit filtering](#node-fit-filtering)
- [Pod Evictions](#pod-evictions)
- [Pod Disruption Budget (PDB)](#pod-disruption-budget-pdb)
- [High Availability](#high-availability)
- [Configure HA Mode](#configure-ha-mode)
- [Metrics](#metrics)
- [Compatibility Matrix](#compatibility-matrix)
- [Getting Involved and Contributing](#getting-involved-and-contributing)
@@ -102,17 +105,17 @@ See the [resources | Kustomize](https://kubectl.docs.kubernetes.io/references/ku
Run As A Job
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/job?ref=v0.21.0' | kubectl apply -f -
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/job?ref=v0.24.0' | kubectl apply -f -
```
Run As A CronJob
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/cronjob?ref=v0.21.0' | kubectl apply -f -
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/cronjob?ref=v0.24.0' | kubectl apply -f -
```
Run As A Deployment
```
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/deployment?ref=v0.21.0' | kubectl apply -f -
kustomize build 'github.com/kubernetes-sigs/descheduler/kubernetes/deployment?ref=v0.24.0' | kubectl apply -f -
```
## User Guide
@@ -121,36 +124,29 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory.
## Policy and Strategies
Descheduler's policy is configurable and includes strategies that can be enabled or disabled.
Nine strategies
1. `RemoveDuplicates`
2. `LowNodeUtilization`
3. `HighNodeUtilization`
4. `RemovePodsViolatingInterPodAntiAffinity`
5. `RemovePodsViolatingNodeAffinity`
6. `RemovePodsViolatingNodeTaints`
7. `RemovePodsViolatingTopologySpreadConstraint`
8. `RemovePodsHavingTooManyRestarts`
9. `PodLifeTime`
are currently implemented. As part of the policy, the
parameters associated with the strategies can be configured too. By default, all strategies are enabled.
Descheduler's policy is configurable and includes strategies that can be enabled or disabled. By default, all strategies are enabled.
The following diagram provides a visualization of most of the strategies to help
categorize how strategies fit together.
The policy includes a common configuration that applies to all the strategies:
| Name | Default Value | Description |
|------|---------------|-------------|
| `nodeSelector` | `nil` | limiting the nodes which are processed |
| `evictLocalStoragePods` | `false` | allows eviction of pods with local storage |
| `evictSystemCriticalPods` | `false` | [Warning: Will evict Kubernetes system pods] allows eviction of pods with any priority, including system pods like kube-dns |
| `ignorePvcPods` | `false` | set whether PVC pods should be evicted or ignored |
| `maxNoOfPodsToEvictPerNode` | `nil` | maximum number of pods evicted from each node (summed through all strategies) |
| `maxNoOfPodsToEvictPerNamespace` | `nil` | maximum number of pods evicted from each namespace (summed through all strategies) |
| `evictFailedBarePods` | `false` | allow eviction of pods without owner references and in failed phase |
![Strategies diagram](strategies_diagram.png)
As part of the policy, the parameters associated with each strategy can be configured.
See each strategy for details on available parameters.
The policy also includes common configuration for all the strategies:
- `nodeSelector` - limiting the nodes which are processed
- `evictLocalStoragePods` - allows eviction of pods with local storage
- `evictSystemCriticalPods` - [Warning: Will evict Kubernetes system pods] allows eviction of pods with any priority, including system pods like kube-dns
- `ignorePvcPods` - set whether PVC pods should be evicted or ignored (defaults to `false`)
- `maxNoOfPodsToEvictPerNode` - maximum number of pods evicted from each node (summed through all strategies)
**Policy:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
nodeSelector: prod=dev
evictFailedBarePods: false
evictLocalStoragePods: true
evictSystemCriticalPods: true
maxNoOfPodsToEvictPerNode: 40
@@ -159,6 +155,11 @@ strategies:
...
```
The following diagram provides a visualization of most of the strategies to help
categorize how strategies fit together.
![Strategies diagram](strategies_diagram.png)
### RemoveDuplicates
This strategy makes sure that there is only one pod associated with a ReplicaSet (RS),
@@ -220,6 +221,17 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you
strategy evicts pods from `overutilized nodes` (those with usage above `targetThresholds`) to `underutilized nodes`
(those with usage below `thresholds`), it will abort if any number of `underutilized nodes` or `overutilized nodes` is zero.
Additionally, the strategy accepts a `useDeviationThresholds` parameter.
If that parameter is set to `true`, the thresholds are considered as percentage deviations from mean resource usage.
`thresholds` will be deducted from the mean among all nodes and `targetThresholds` will be added to the mean.
A resource consumption above (resp. below) this window is considered as overutilization (resp. underutilization).
**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
like `kubectl top`) may differ from the calculated consumption, due to these components reporting
actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project.
**Parameters:**
|Name|Type|
@@ -227,6 +239,7 @@ strategy evicts pods from `overutilized nodes` (those with usage above `targetTh
|`thresholds`|map(string:int)|
|`targetThresholds`|map(string:int)|
|`numberOfNodes`|int|
|`useDeviationThresholds`|bool|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
@@ -267,9 +280,11 @@ under utilized frequently or for a short period of time. By default, `numberOfNo
### HighNodeUtilization
This strategy finds nodes that are under utilized and evicts pods in the hope that these pods will be scheduled compactly into fewer nodes.
This strategy **must** be used with the
scheduler strategy `MostRequestedPriority`. The parameters of this strategy are configured under `nodeResourceUtilizationThresholds`.
This strategy finds nodes that are under utilized and evicts pods from the nodes in the hope that these pods will be
scheduled compactly into fewer nodes. Used in conjunction with node auto-scaling, this strategy is intended to help
trigger down scaling of under utilized nodes.
This strategy **must** be used with the scheduler scoring strategy `MostAllocated`. The parameters of this strategy are
configured under `nodeResourceUtilizationThresholds`.
The under utilization of nodes is determined by a configurable threshold `thresholds`. The threshold
`thresholds` can be configured for cpu, memory, number of pods, and extended resources in terms of percentage. The percentage is
@@ -285,6 +300,12 @@ strategy evicts pods from `underutilized nodes` (those with usage below `thresho
so that they can be recreated in appropriately utilized nodes.
The strategy will abort if any number of `underutilized nodes` or `appropriately utilized nodes` is zero.
**NOTE:** Node resource consumption is determined by the requests and limits of pods, not actual usage.
This approach is chosen in order to maintain consistency with the kube-scheduler, which follows the same
design for scheduling pods onto nodes. This means that resource usage as reported by Kubelet (or commands
like `kubectl top`) may differ from the calculated consumption, due to these components reporting
actual usage metrics. Implementing metrics-based descheduling is currently TODO for the project.
**Parameters:**
|Name|Type|
@@ -399,10 +420,17 @@ pod "podA" with a toleration to tolerate a taint ``key=value:NoSchedule`` schedu
node. If the node's taint is subsequently updated/removed, taint is no longer satisfied by its pods' tolerations
and will be evicted.
Node taints can be excluded from consideration by specifying a list of excludedTaints. If a node taint key **or**
key=value matches an excludedTaints entry, the taint will be ignored.
For example, excludedTaints entry "dedicated" would match all taints with key "dedicated", regardless of value.
excludedTaints entry "dedicated=special-user" would match taints with key "dedicated" and value "special-user".
**Parameters:**
|Name|Type|
|---|---|
|`excludedTaints`|list(string)|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
@@ -417,6 +445,10 @@ kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingNodeTaints":
enabled: true
params:
excludedTaints:
- dedicated=special-user # exclude taints with key "dedicated" and value "special-user"
- reserved # exclude all taints with key "reserved"
````
### RemovePodsViolatingTopologySpreadConstraint
@@ -458,9 +490,9 @@ strategies:
This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that
can't get the volume/disk attached to the instance, then the pod should be re-scheduled to other nodes. Its parameters
include `podRestartThreshold`, which is the number of restarts at which a pod should be evicted, and `includingInitContainers`,
which determines whether init container restarts should be factored into that calculation.
|`labelSelector`|(see [label filtering](#label-filtering))|
include `podRestartThreshold`, which is the number of restarts (summed over all eligible containers) at which a pod
should be evicted, and `includingInitContainers`, which determines whether init container restarts should be factored
into that calculation.
**Parameters:**
@@ -471,6 +503,7 @@ which determines whether init container restarts should be factored into that ca
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
@@ -520,6 +553,47 @@ strategies:
- "Pending"
```
### RemoveFailedPods
This strategy evicts pods that are in failed status phase.
You can provide an optional parameter to filter by failed `reasons`.
`reasons` can be expanded to include reasons of InitContainers as well by setting the optional parameter `includingInitContainers` to `true`.
You can specify an optional parameter `minPodLifetimeSeconds` to evict pods that are older than specified seconds.
Lastly, you can specify the optional parameter `excludeOwnerKinds` and if a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction.
**Parameters:**
|Name|Type|
|---|---|
|`minPodLifetimeSeconds`|uint|
|`excludeOwnerKinds`|list(string)|
|`reasons`|list(string)|
|`includingInitContainers`|bool|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
|`labelSelector`|(see [label filtering](#label-filtering))|
|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))|
**Example:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemoveFailedPods":
enabled: true
params:
failedPods:
reasons:
- "NodeAffinity"
includingInitContainers: true
excludeOwnerKinds:
- "Job"
minPodLifetimeSeconds: 3600
```
## Filter Pods
### Namespace filtering
@@ -532,6 +606,7 @@ The following strategies accept a `namespaces` parameter which allows to specify
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemoveDuplicates`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemoveFailedPods`
For example:
@@ -614,7 +689,7 @@ does not exist, descheduler won't create it and will throw an error.
### Label filtering
The following strategies can configure a [standard kubernetes labelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.21/#labelselector-v1-meta)
The following strategies can configure a [standard kubernetes labelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#labelselector-v1-meta)
to filter pods by their labels:
* `PodLifeTime`
@@ -623,6 +698,7 @@ to filter pods by their labels:
* `RemovePodsViolatingNodeAffinity`
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemoveFailedPods`
This allows running strategies among pods the descheduler is interested in.
@@ -657,11 +733,13 @@ The following strategies accept a `nodeFit` boolean parameter which can optimize
* `RemovePodsViolatingNodeTaints`
* `RemovePodsViolatingTopologySpreadConstraint`
* `RemovePodsHavingTooManyRestarts`
* `RemoveFailedPods`
If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. If a pod cannot be rescheduled to another node, it will not be evicted. Currently the following criteria are considered when setting `nodeFit` to `true`:
- A `nodeSelector` on the pod
- Any `Tolerations` on the pod and any `Taints` on the other nodes
- Any `tolerations` on the pod and any `taints` on the other nodes
- `nodeAffinity` on the pod
- Resource `requests` made by the pod and the resources available on other nodes
- Whether any of the other nodes are marked as `unschedulable`
E.g.
@@ -671,17 +749,17 @@ apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"LowNodeUtilization":
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"cpu" : 20
"memory": 20
"pods": 20
targetThresholds:
"cpu" : 50
"memory": 50
"pods": 50
enabled: true
params:
nodeResourceUtilizationThresholds:
thresholds:
"cpu": 20
"memory": 20
"pods": 20
targetThresholds:
"cpu": 50
"memory": 50
"pods": 50
nodeFit: true
```
@@ -697,8 +775,8 @@ Using Deployments instead of ReplicationControllers provides an automated rollou
When the descheduler decides to evict pods from a node, it employs the following general mechanism:
* [Critical pods](https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/) (with priorityClassName set to system-cluster-critical or system-node-critical) are never evicted (unless `evictSystemCriticalPods: true` is set).
* Pods (static or mirrored pods or stand alone pods) not part of an ReplicationController, ReplicaSet(Deployment), StatefulSet, or Job are
never evicted because these pods won't be recreated.
* Pods (static or mirrored pods or standalone pods) not part of an ReplicationController, ReplicaSet(Deployment), StatefulSet, or Job are
never evicted because these pods won't be recreated. (Standalone pods in failed status phase can be evicted by setting `evictFailedBarePods: true`)
* Pods associated with DaemonSets are never evicted.
* Pods with local storage are never evicted (unless `evictLocalStoragePods: true` is set).
* Pods with PVCs are evicted (unless `ignorePvcPods: true` is set).
@@ -707,6 +785,7 @@ best effort pods are evicted before burstable and guaranteed pods.
* All types of pods with the annotation `descheduler.alpha.kubernetes.io/evict` are eligible for eviction. This
annotation is used to override checks which prevent eviction and users can select which pod is evicted.
Users should know how and if the pod will be recreated.
* Pods with a non-nil DeletionTimestamp are not evicted by default.
Setting `--v=4` or greater on the Descheduler will log all reasons why any pod is not evictable.
@@ -715,6 +794,23 @@ Setting `--v=4` or greater on the Descheduler will log all reasons why any pod i
Pods subject to a Pod Disruption Budget(PDB) are not evicted if descheduling violates its PDB. The pods
are evicted by using the eviction subresource to handle PDB.
## High Availability
In High Availability mode, Descheduler starts [leader election](https://github.com/kubernetes/client-go/tree/master/tools/leaderelection) process in Kubernetes. You can activate HA mode
if you choose to deploy your application as Deployment.
Deployment starts with 1 replica by default. If you want to use more than 1 replica, you must consider
enable High Availability mode since we don't want to run descheduler pods simultaneously.
### Configure HA Mode
The leader election process can be enabled by setting `--leader-elect` in the CLI. You can also set
`--set=leaderElection.enabled=true` flag if you are using Helm.
To get best results from HA mode some additional configurations might require:
* Configure a [podAntiAffinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node) rule if you want to schedule onto a node only if that node is in the same zone as at least one already-running descheduler
* Set the replica count greater than 1
## Metrics
| name | type | description |
@@ -734,16 +830,18 @@ v0.18 should work with k8s v1.18, v1.17, and v1.16.
Starting with descheduler release v0.18 the minor version of descheduler matches the minor version of the k8s client
packages that it is compiled with.
Descheduler | Supported Kubernetes Version
-------------|-----------------------------
v0.21 | v1.21
v0.20 | v1.20
v0.19 | v1.19
v0.18 | v1.18
v0.10 | v1.17
v0.4-v0.9 | v1.9+
v0.1-v0.3 | v1.7-v1.8
| Descheduler | Supported Kubernetes Version |
|-------------|------------------------------|
| v0.24 | v1.24 |
| v0.23 | v1.23 |
| v0.22 | v1.22 |
| v0.21 | v1.21 |
| v0.20 | v1.20 |
| v0.19 | v1.19 |
| v0.18 | v1.18 |
| v0.10 | v1.17 |
| v0.4-v0.9 | v1.9+ |
| v0.1-v0.3 | v1.7-v1.8 |
## Getting Involved and Contributing

View File

@@ -1,7 +1,7 @@
apiVersion: v1
name: descheduler
version: 0.21.0
appVersion: 0.21.0
version: 0.23.1
appVersion: 0.23.0
description: Descheduler for Kubernetes is used to rebalance clusters by evicting pods that can potentially be scheduled on better nodes. In the current implementation, descheduler does not schedule replacement of evicted pods but relies on the default scheduler for that.
keywords:
- kubernetes

View File

@@ -43,24 +43,44 @@ The command removes all the Kubernetes components associated with the chart and
The following table lists the configurable parameters of the _descheduler_ chart and their default values.
| Parameter | Description | Default |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------ |
| `image.repository` | Docker repository to use | `k8s.gcr.io/descheduler/descheduler` |
| `image.tag` | Docker tag to use | `v[chart appVersion]` |
| `image.pullPolicy` | Docker image pull policy | `IfNotPresent` |
| `nameOverride` | String to partially override `descheduler.fullname` template (will prepend the release name) | `""` |
| `fullnameOverride` | String to fully override `descheduler.fullname` template | `""` |
| `cronJobApiVersion` | CronJob API Group Version | `"batch/v1"` |
| `schedule` | The cron schedule to run the _descheduler_ job on | `"*/2 * * * *"` |
| `startingDeadlineSeconds` | If set, configure `startingDeadlineSeconds` for the _descheduler_ job | `nil` |
| `successfulJobsHistoryLimit` | If set, configure `successfulJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `failedJobsHistoryLimit` | If set, configure `failedJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `cmdOptions` | The options to pass to the _descheduler_ command | _see values.yaml_ |
| `deschedulerPolicy.strategies` | The _descheduler_ strategies to apply | _see values.yaml_ |
| `priorityClassName` | The name of the priority class to add to pods | `system-cluster-critical` |
| `rbac.create` | If `true`, create & use RBAC resources | `true` |
| `podSecurityPolicy.create` | If `true`, create PodSecurityPolicy | `true` |
| `resources` | Descheduler container CPU and memory requests/limits | _see values.yaml_ |
| `serviceAccount.create` | If `true`, create a service account for the cron job | `true` |
| `serviceAccount.name` | The name of the service account to use, if not set and create is true a name is generated using the fullname template | `nil` |
| `nodeSelector` | Node selectors to run the descheduler cronjob on specific nodes | `nil` |
| Parameter | Description | Default |
|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------|--------------------------------------|
| `kind` | Use as CronJob or Deployment | `CronJob` |
| `image.repository` | Docker repository to use | `k8s.gcr.io/descheduler/descheduler` |
| `image.tag` | Docker tag to use | `v[chart appVersion]` |
| `image.pullPolicy` | Docker image pull policy | `IfNotPresent` |
| `imagePullSecrets` | Docker repository secrets | `[]` |
| `nameOverride` | String to partially override `descheduler.fullname` template (will prepend the release name) | `""` |
| `fullnameOverride` | String to fully override `descheduler.fullname` template | `""` |
| `cronJobApiVersion` | CronJob API Group Version | `"batch/v1"` |
| `schedule` | The cron schedule to run the _descheduler_ job on | `"*/2 * * * *"` |
| `startingDeadlineSeconds` | If set, configure `startingDeadlineSeconds` for the _descheduler_ job | `nil` |
| `successfulJobsHistoryLimit` | If set, configure `successfulJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `failedJobsHistoryLimit` | If set, configure `failedJobsHistoryLimit` for the _descheduler_ job | `nil` |
| `deschedulingInterval` | If using kind:Deployment, sets time between consecutive descheduler executions. | `5m` |
| `replicas` | The replica count for Deployment | `1` |
| `leaderElection` | The options for high availability when running replicated components | _see values.yaml_ |
| `cmdOptions` | The options to pass to the _descheduler_ command | _see values.yaml_ |
| `deschedulerPolicy.strategies` | The _descheduler_ strategies to apply | _see values.yaml_ |
| `priorityClassName` | The name of the priority class to add to pods | `system-cluster-critical` |
| `rbac.create` | If `true`, create & use RBAC resources | `true` |
| `podSecurityPolicy.create` | If `true`, create PodSecurityPolicy | `true` |
| `resources` | Descheduler container CPU and memory requests/limits | _see values.yaml_ |
| `serviceAccount.create` | If `true`, create a service account for the cron job | `true` |
| `serviceAccount.name` | The name of the service account to use, if not set and create is true a name is generated using the fullname template | `nil` |
| `serviceAccount.annotations` | Specifies custom annotations for the serviceAccount | `{}` |
| `nodeSelector` | Node selectors to run the descheduler cronjob/deployment on specific nodes | `nil` |
| `service.enabled` | If `true`, create a service for deployment | `false` |
| `serviceMonitor.enabled` | If `true`, create a ServiceMonitor for deployment | `false` |
| `serviceMonitor.namespace` | The namespace where Prometheus expects to find service monitors | `nil` |
| `serviceMonitor.interval` | The scrape interval. If not set, the Prometheus default scrape interval is used | `nil` |
| `serviceMonitor.honorLabels` | Keeps the scraped data's labels when labels are on collisions with target labels. | `true` |
| `serviceMonitor.insecureSkipVerify` | Skip TLS certificate validation when scraping | `true` |
| `serviceMonitor.serverName` | Name of the server to use when validating TLS certificate | `nil` |
| `serviceMonitor.metricRelabelings` | MetricRelabelConfigs to apply to samples after scraping, but before ingestion | `[]` |
| `serviceMonitor.relabelings` | RelabelConfigs to apply to samples before scraping | `[]` |
| `affinity` | Node affinity to run the descheduler cronjob/deployment on specific nodes | `nil` |
| `tolerations` | tolerations to run the descheduler cronjob/deployment on specific nodes | `nil` |
| `suspend` | Set spec.suspend in descheduler cronjob | `false` |
| `commonLabels` | Labels to apply to all resources | `{}` |
| `livenessProbe` | Liveness probe configuration for the descheduler container | _see values.yaml_ |

View File

@@ -1 +1,7 @@
Descheduler installed as a cron job.
Descheduler installed as a {{ .Values.kind }}.
{{- if eq .Values.kind "Deployment" }}
{{- if eq .Values.replicas 1.0}}
WARNING: You set replica count as 1 and workload kind as Deployment however leaderElection is not enabled. Consider enabling Leader Election for HA mode.
{{- end}}
{{- end}}

View File

@@ -42,6 +42,17 @@ app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- if .Values.commonLabels}}
{{ toYaml .Values.commonLabels }}
{{- end }}
{{- end -}}
{{/*
Selector labels
*/}}
{{- define "descheduler.selectorLabels" -}}
app.kubernetes.io/name: {{ include "descheduler.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
{{/*
@@ -54,3 +65,37 @@ Create the name of the service account to use
{{ default "default" .Values.serviceAccount.name }}
{{- end -}}
{{- end -}}
{{/*
Leader Election
*/}}
{{- define "descheduler.leaderElection"}}
{{- if .Values.leaderElection -}}
- --leader-elect
- {{ default false .Values.leaderElection.enabled }}
{{- if .Values.leaderElection.leaseDuration }}
- --leader-elect-lease-duration
- {{ .Values.leaderElection.leaseDuration }}
{{- end }}
{{- if .Values.leaderElection.renewDeadline }}
- --leader-elect-renew-deadline
- {{ .Values.leaderElection.renewDeadline }}
{{- end }}
{{- if .Values.leaderElection.retryPeriod }}
- --leader-elect-retry-period
- {{ .Values.leaderElection.retryPeriod }}
{{- end }}
{{- if .Values.leaderElection.resourceLock }}
- --leader-elect-resource-lock
- {{ .Values.leaderElection.resourceLock }}
{{- end }}
{{- if .Values.leaderElection.resourceName }}
- --leader-elect-resource-name
- {{ .Values.leaderElection.resourceName }}
{{- end }}
{{- if .Values.leaderElection.resourceNamescape }}
- --leader-elect-resource-namespace
- {{ .Values.leaderElection.resourceNamescape }}
{{- end -}}
{{- end }}
{{- end }}

View File

@@ -14,7 +14,7 @@ rules:
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]
@@ -24,6 +24,15 @@ rules:
- apiGroups: ["scheduling.k8s.io"]
resources: ["priorityclasses"]
verbs: ["get", "watch", "list"]
{{- if .Values.leaderElection.enabled }}
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
resourceNames: ["descheduler"]
verbs: ["get", "patch", "delete"]
{{- end }}
{{- if .Values.podSecurityPolicy.create }}
- apiGroups: ['policy']
resources: ['podsecuritypolicies']

View File

@@ -2,6 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ template "descheduler.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
data:

View File

@@ -1,11 +1,16 @@
{{- if eq .Values.kind "CronJob" }}
apiVersion: {{ .Values.cronJobApiVersion | default "batch/v1" }}
kind: CronJob
metadata:
name: {{ template "descheduler.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
spec:
schedule: {{ .Values.schedule | quote }}
{{- if .Values.suspend }}
suspend: {{ .Values.suspend }}
{{- end }}
concurrencyPolicy: "Forbid"
{{- if .Values.startingDeadlineSeconds }}
startingDeadlineSeconds: {{ .Values.startingDeadlineSeconds }}
@@ -27,8 +32,7 @@ spec:
{{- .Values.podAnnotations | toYaml | nindent 12 }}
{{- end }}
labels:
app.kubernetes.io/name: {{ include "descheduler.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- include "descheduler.selectorLabels" . | nindent 12 }}
{{- if .Values.podLabels }}
{{- .Values.podLabels | toYaml | nindent 12 }}
{{- end }}
@@ -37,11 +41,23 @@ spec:
nodeSelector:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
serviceAccountName: {{ template "descheduler.serviceAccountName" . }}
restartPolicy: "Never"
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 10 }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}"
@@ -57,6 +73,8 @@ spec:
- {{ $value | quote }}
{{- end }}
{{- end }}
livenessProbe:
{{- toYaml .Values.livenessProbe | nindent 16 }}
resources:
{{- toYaml .Values.resources | nindent 16 }}
securityContext:
@@ -74,3 +92,4 @@ spec:
- name: policy-volume
configMap:
name: {{ template "descheduler.fullname" . }}
{{- end }}

View File

@@ -0,0 +1,94 @@
{{- if eq .Values.kind "Deployment" }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ template "descheduler.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
spec:
{{- if gt .Values.replicas 1.0}}
{{- if not .Values.leaderElection.enabled }}
{{- fail "You must set leaderElection to use more than 1 replica"}}
{{- end}}
replicas: {{ required "leaderElection required for running more than one replica" .Values.replicas }}
{{- else }}
replicas: 1
{{- end }}
selector:
matchLabels:
{{- include "descheduler.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "descheduler.selectorLabels" . | nindent 8 }}
{{- if .Values.podLabels }}
{{- .Values.podLabels | toYaml | nindent 8 }}
{{- end }}
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
{{- if .Values.podAnnotations }}
{{- .Values.podAnnotations | toYaml | nindent 8 }}
{{- end }}
spec:
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
serviceAccountName: {{ template "descheduler.serviceAccountName" . }}
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 10 }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "v%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- "/bin/descheduler"
args:
- "--policy-config-file"
- "/policy-dir/policy.yaml"
- "--descheduling-interval"
- {{ required "deschedulingInterval required for running as Deployment" .Values.deschedulingInterval }}
{{- range $key, $value := .Values.cmdOptions }}
- {{ printf "--%s" $key | quote }}
{{- if $value }}
- {{ $value | quote }}
{{- end }}
{{- end }}
{{- include "descheduler.leaderElection" . | nindent 12 }}
ports:
- containerPort: 10258
protocol: TCP
livenessProbe:
{{- toYaml .Values.livenessProbe | nindent 12 }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
volumes:
- name: policy-volume
configMap:
name: {{ template "descheduler.fullname" . }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}

View File

@@ -3,6 +3,7 @@ apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: {{ template "descheduler.fullname" . }}
namespace: {{ .Release.Namespace }}
annotations:
seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'docker/default,runtime/default'
seccomp.security.alpha.kubernetes.io/defaultProfileName: 'runtime/default'

View File

@@ -0,0 +1,21 @@
{{- if eq .Values.kind "Deployment" }}
{{- if eq .Values.service.enabled true }}
apiVersion: v1
kind: Service
metadata:
labels:
{{- include "descheduler.labels" . | nindent 4 }}
name: {{ template "descheduler.fullname" . }}
namespace: {{ .Release.Namespace }}
spec:
clusterIP: None
ports:
- name: http-metrics
port: 10258
protocol: TCP
targetPort: 10258
selector:
{{- include "descheduler.selectorLabels" . | nindent 4 }}
type: ClusterIP
{{- end }}
{{- end }}

View File

@@ -3,6 +3,10 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "descheduler.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
{{- if .Values.serviceAccount.annotations }}
annotations: {{ toYaml .Values.serviceAccount.annotations | nindent 4 }}
{{- end }}
{{- end -}}

View File

@@ -0,0 +1,41 @@
{{- if eq .Values.kind "Deployment" }}
{{- if eq .Values.serviceMonitor.enabled true }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ template "descheduler.fullname" . }}-servicemonitor
namespace: {{ .Values.serviceMonitor.namespace | default .Release.Namespace }}
labels:
{{- include "descheduler.labels" . | nindent 4 }}
spec:
jobLabel: jobLabel
namespaceSelector:
matchNames:
- {{ .Release.Namespace }}
selector:
matchLabels:
{{- include "descheduler.selectorLabels" . | nindent 6 }}
endpoints:
- honorLabels: {{ .Values.serviceMonitor.honorLabels | default true }}
port: http-metrics
{{- if .Values.serviceMonitor.interval }}
interval: {{ .Values.serviceMonitor.interval }}
{{- end }}
scheme: https
tlsConfig:
{{- if eq .Values.serviceMonitor.insecureSkipVerify true }}
insecureSkipVerify: true
{{- end }}
{{- if .Values.serviceMonitor.serverName }}
serverName: {{ .Values.serviceMonitor.serverName }}
{{- end}}
{{- if .Values.serviceMonitor.metricRelabelings }}
metricRelabelings:
{{ tpl (toYaml .Values.serviceMonitor.metricRelabelings | indent 4) . }}
{{- end }}
{{- if .Values.serviceMonitor.relabelings }}
relabelings:
{{ tpl (toYaml .Values.serviceMonitor.relabelings | indent 4) . }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -2,6 +2,7 @@ apiVersion: v1
kind: Pod
metadata:
name: descheduler-test-pod
namespace: {{ .Release.Namespace }}
annotations:
"helm.sh/hook": test
spec:
@@ -26,4 +27,4 @@ spec:
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl &&
chmod +x ./kubectl &&
mv ./kubectl /usr/local/bin/kubectl &&
/usr/local/bin/kubectl get pods --namespace kube-system --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" | grep "descheduler" | grep "Completed"
/usr/local/bin/kubectl get pods --namespace {{ .Release.Namespace }} --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" | grep "descheduler" | grep "Completed"

View File

@@ -2,12 +2,18 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# CronJob or Deployment
kind: CronJob
image:
repository: k8s.gcr.io/descheduler/descheduler
# Overrides the image tag whose default is the chart version
tag: ""
pullPolicy: IfNotPresent
imagePullSecrets:
# - name: container-registry-secret
resources:
requests:
cpu: 500m
@@ -19,19 +25,45 @@ resources:
nameOverride: ""
fullnameOverride: ""
# labels that'll be applied to all resources
commonLabels: {}
cronJobApiVersion: "batch/v1" # Use "batch/v1beta1" for k8s version < 1.21.0. TODO(@7i) remove with 1.23 release
schedule: "*/2 * * * *"
suspend: false
#startingDeadlineSeconds: 200
#successfulJobsHistoryLimit: 1
#failedJobsHistoryLimit: 1
# Required when running as a Deployment
deschedulingInterval: 5m
# Specifies the replica count for Deployment
# Set leaderElection if you want to use more than 1 replica
# Set affinity.podAntiAffinity rule if you want to schedule onto a node
# only if that node is in the same zone as at least one already-running descheduler
replicas: 1
# Specifies whether Leader Election resources should be created
# Required when running as a Deployment
leaderElection: {}
# enabled: true
# leaseDuration: 15
# renewDeadline: 10
# retryPeriod: 2
# resourceLock: "leases"
# resourceName: "descheduler"
# resourceNamescape: "kube-system"
cmdOptions:
v: 3
# evict-local-storage-pods:
# max-pods-to-evict-per-node: 10
# node-selector: "key1=value1,key2=value2"
deschedulerPolicy:
# nodeSelector: "key1=value1,key2=value2"
# maxNoOfPodsToEvictPerNode: 10
# maxNoOfPodsToEvictPerNamespace: 10
# ignorePvcPods: true
# evictLocalStoragePods: true
strategies:
RemoveDuplicates:
enabled: true
@@ -62,6 +94,31 @@ priorityClassName: system-cluster-critical
nodeSelector: {}
# foo: bar
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/e2e-az-name
# operator: In
# values:
# - e2e-az1
# - e2e-az2
# podAntiAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# - labelSelector:
# matchExpressions:
# - key: app.kubernetes.io/name
# operator: In
# values:
# - descheduler
# topologyKey: "kubernetes.io/hostname"
tolerations: []
# - key: 'management'
# operator: 'Equal'
# value: 'tool'
# effect: 'NoSchedule'
rbac:
# Specifies whether RBAC resources should be created
create: true
@@ -76,3 +133,37 @@ serviceAccount:
# The name of the ServiceAccount to use.
# If not set and create is true, a name is generated using the fullname template
name:
# Specifies custom annotations for the serviceAccount
annotations: {}
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
service:
enabled: false
serviceMonitor:
enabled: false
# The namespace where Prometheus expects to find service monitors.
# namespace: ""
interval: ""
# honorLabels: true
insecureSkipVerify: true
serverName: null
metricRelabelings: []
# - action: keep
# regex: 'descheduler_(build_info|pods_evicted)'
# sourceLabels: [__name__]
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace

View File

@@ -7,7 +7,7 @@ timeout: 1200s
options:
substitution_option: ALLOW_LOOSE
steps:
- name: 'gcr.io/k8s-testimages/gcb-docker-gcloud:v20190906-745fed4'
- name: 'gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20211118-2f2d816b90'
entrypoint: make
env:
- DOCKER_CLI_EXPERIMENTAL=enabled

View File

@@ -19,15 +19,15 @@ package options
import (
"github.com/spf13/pflag"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
apiserveroptions "k8s.io/apiserver/pkg/server/options"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/component-base/logs"
componentbaseconfig "k8s.io/component-base/config"
componentbaseoptions "k8s.io/component-base/config/options"
"sigs.k8s.io/descheduler/pkg/apis/componentconfig"
"sigs.k8s.io/descheduler/pkg/apis/componentconfig/v1alpha1"
deschedulerscheme "sigs.k8s.io/descheduler/pkg/descheduler/scheme"
"time"
)
const (
@@ -39,7 +39,6 @@ type DeschedulerServer struct {
componentconfig.DeschedulerConfiguration
Client clientset.Interface
Logs *logs.Options
SecureServing *apiserveroptions.SecureServingOptionsWithLoopback
DisableMetrics bool
}
@@ -56,20 +55,22 @@ func NewDeschedulerServer() (*DeschedulerServer, error) {
return &DeschedulerServer{
DeschedulerConfiguration: *cfg,
Logs: logs.NewOptions(),
SecureServing: secureServing,
}, nil
}
// Validation checks for DeschedulerServer.
func (s *DeschedulerServer) Validate() error {
var errs []error
errs = append(errs, s.Logs.Validate()...)
return utilerrors.NewAggregate(errs)
}
func newDefaultComponentConfig() (*componentconfig.DeschedulerConfiguration, error) {
versionedCfg := v1alpha1.DeschedulerConfiguration{}
versionedCfg := v1alpha1.DeschedulerConfiguration{
LeaderElection: componentbaseconfig.LeaderElectionConfiguration{
LeaderElect: false,
LeaseDuration: metav1.Duration{Duration: 137 * time.Second},
RenewDeadline: metav1.Duration{Duration: 107 * time.Second},
RetryPeriod: metav1.Duration{Duration: 26 * time.Second},
ResourceLock: "leases",
ResourceName: "descheduler",
ResourceNamespace: "kube-system",
},
}
deschedulerscheme.Scheme.Default(&versionedCfg)
cfg := componentconfig.DeschedulerConfiguration{}
if err := deschedulerscheme.Scheme.Convert(&versionedCfg, &cfg, nil); err != nil {
@@ -85,13 +86,9 @@ func (rs *DeschedulerServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&rs.KubeconfigFile, "kubeconfig", rs.KubeconfigFile, "File with kube configuration.")
fs.StringVar(&rs.PolicyConfigFile, "policy-config-file", rs.PolicyConfigFile, "File with descheduler policy configuration.")
fs.BoolVar(&rs.DryRun, "dry-run", rs.DryRun, "execute descheduler in dry run mode.")
// node-selector query causes descheduler to run only on nodes that matches the node labels in the query
fs.StringVar(&rs.NodeSelector, "node-selector", rs.NodeSelector, "DEPRECATED: selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)")
// max-no-pods-to-evict limits the maximum number of pods to be evicted per node by descheduler.
fs.IntVar(&rs.MaxNoOfPodsToEvictPerNode, "max-pods-to-evict-per-node", rs.MaxNoOfPodsToEvictPerNode, "DEPRECATED: limits the maximum number of pods to be evicted per node by descheduler")
// evict-local-storage-pods allows eviction of pods that are using local storage. This is false by default.
fs.BoolVar(&rs.EvictLocalStoragePods, "evict-local-storage-pods", rs.EvictLocalStoragePods, "DEPRECATED: enables evicting pods using local storage by descheduler")
fs.BoolVar(&rs.DisableMetrics, "disable-metrics", rs.DisableMetrics, "Disables metrics. The metrics are by default served through https://localhost:10258/metrics. Secure address, resp. port can be changed through --bind-address, resp. --secure-port flags.")
componentbaseoptions.BindLeaderElectionFlags(&rs.LeaderElection, fs)
rs.SecureServing.AddFlags(fs)
}

View File

@@ -19,8 +19,11 @@ package app
import (
"context"
"flag"
"io"
"os/signal"
"syscall"
"k8s.io/apiserver/pkg/server/healthz"
"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/pkg/descheduler"
@@ -30,7 +33,9 @@ import (
apiserver "k8s.io/apiserver/pkg/server"
"k8s.io/apiserver/pkg/server/mux"
restclient "k8s.io/client-go/rest"
aflag "k8s.io/component-base/cli/flag"
"k8s.io/component-base/config"
_ "k8s.io/component-base/logs/json/register"
"k8s.io/component-base/logs/registry"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
)
@@ -48,8 +53,7 @@ func NewDeschedulerCommand(out io.Writer) *cobra.Command {
Short: "descheduler",
Long: `The descheduler evicts pods which may be bound to less desired nodes`,
Run: func(cmd *cobra.Command, args []string) {
s.Logs.LogFormat = s.Logging.Format
s.Logs.Apply()
// s.Logs.Config.Format = s.Logging.Format
// LoopbackClientConfig is a config for a privileged loopback connection
var LoopbackClientConfig *restclient.Config
@@ -59,36 +63,46 @@ func NewDeschedulerCommand(out io.Writer) *cobra.Command {
return
}
if err := s.Validate(); err != nil {
klog.ErrorS(err, "failed to validate server configuration")
factory, _ := registry.LogRegistry.Get(s.Logging.Format)
if factory == nil {
klog.ClearLogger()
} else {
log, logrFlush := factory.Create(config.LoggingConfiguration{})
defer logrFlush()
klog.SetLogger(log)
}
ctx, done := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
pathRecorderMux := mux.NewPathRecorderMux("descheduler")
if !s.DisableMetrics {
pathRecorderMux.Handle("/metrics", legacyregistry.HandlerWithReset())
}
healthz.InstallHandler(pathRecorderMux, healthz.NamedCheck("Descheduler", healthz.PingHealthz.Check))
stoppedCh, _, err := SecureServing.Serve(pathRecorderMux, 0, ctx.Done())
if err != nil {
klog.Fatalf("failed to start secure server: %v", err)
return
}
if !s.DisableMetrics {
ctx := context.TODO()
pathRecorderMux := mux.NewPathRecorderMux("descheduler")
pathRecorderMux.Handle("/metrics", legacyregistry.HandlerWithReset())
if _, err := SecureServing.Serve(pathRecorderMux, 0, ctx.Done()); err != nil {
klog.Fatalf("failed to start secure server: %v", err)
return
}
}
err := Run(s)
err = Run(ctx, s)
if err != nil {
klog.ErrorS(err, "descheduler server")
}
done()
// wait for metrics server to close
<-stoppedCh
},
}
cmd.SetOut(out)
flags := cmd.Flags()
flags.SetNormalizeFunc(aflag.WordSepNormalizeFunc)
flags.AddGoFlagSet(flag.CommandLine)
s.AddFlags(flags)
return cmd
}
func Run(rs *options.DeschedulerServer) error {
return descheduler.Run(rs)
func Run(ctx context.Context, rs *options.DeschedulerServer) error {
return descheduler.Run(ctx, rs)
}

View File

@@ -17,22 +17,23 @@ limitations under the License.
package main
import (
"fmt"
"k8s.io/component-base/logs"
"os"
"k8s.io/component-base/cli"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/cmd/descheduler/app"
)
func init() {
klog.SetOutput(os.Stdout)
klog.InitFlags(nil)
}
func main() {
out := os.Stdout
cmd := app.NewDeschedulerCommand(out)
cmd.AddCommand(app.NewVersionCommand())
logs.InitLogs()
defer logs.FlushLogs()
if err := cmd.Execute(); err != nil {
fmt.Println(err)
os.Exit(1)
}
code := cli.Run(cmd)
os.Exit(code)
}

16
docs/proposals.md Normal file
View File

@@ -0,0 +1,16 @@
# Proposals
This document walk you through about all the enhancements proposals for descheduler.
## Descheduler v1alpha2 Design Proposal
```yaml
title: Descheduler v1alpha2 Design Proposal
authors:
- "@damemi"
link:
- https://docs.google.com/document/d/1S1JCh-0F-QCJvBBG-kbmXiHAJFF8doArhDIAKbOj93I/edit#heading=h.imbp1ctnc8lx
- https://github.com/kubernetes-sigs/descheduler/issues/679
owning-sig: sig-scheduling
creation-date: 2021-05-01
status: implementable
```

View File

@@ -9,7 +9,7 @@
3. Push the release branch to the descheuler repo and ensure branch protection is enabled (not required for patch releases)
4. Tag the repository from the `master` branch (from the `release-1.18` branch for a patch release) and push the tag `VERSION=v0.18.0 git tag -m $VERSION $VERSION; git push origin $VERSION`
5. Publish a draft release using the tag you just created
6. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/master/k8s.gcr.io#image-promoter)
6. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/main/k8s.gcr.io#image-promoter)
7. Publish release
8. Email `kubernetes-sig-scheduling@googlegroups.com` to announce the release
@@ -22,7 +22,7 @@
5. Checkout the tag you just created and make sure your repo is clean by git's standards `git checkout $VERSION`
6. Build and push the container image to the staging registry `VERSION=$VERSION make push-all`
7. Publish a draft release using the tag you just created
8. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/master/k8s.gcr.io#image-promoter)
8. Perform the [image promotion process](https://github.com/kubernetes/k8s.io/tree/main/k8s.gcr.io#image-promoter)
9. Publish release
10. Email `kubernetes-sig-scheduling@googlegroups.com` to announce the release

View File

@@ -2,13 +2,16 @@
Starting with descheduler release v0.10.0 container images are available in the official k8s container registry.
Descheduler Version | Container Image | Architectures |
------------------- |-----------------------------------------------------|-------------------------|
v0.21.0 | k8s.gcr.io/descheduler/descheduler:v0.21.0 | AMD64<br>ARM64<br>ARMv7 |
v0.20.0 | k8s.gcr.io/descheduler/descheduler:v0.20.0 | AMD64<br>ARM64 |
v0.19.0 | k8s.gcr.io/descheduler/descheduler:v0.19.0 | AMD64 |
v0.18.0 | k8s.gcr.io/descheduler/descheduler:v0.18.0 | AMD64 |
v0.10.0 | k8s.gcr.io/descheduler/descheduler:v0.10.0 | AMD64 |
Descheduler Version | Container Image | Architectures |
------------------- |--------------------------------------------|-------------------------|
v0.24.0 | k8s.gcr.io/descheduler/descheduler:v0.24.0 | AMD64<br>ARM64<br>ARMv7 |
v0.23.1 | k8s.gcr.io/descheduler/descheduler:v0.23.1 | AMD64<br>ARM64<br>ARMv7 |
v0.22.0 | k8s.gcr.io/descheduler/descheduler:v0.22.0 | AMD64<br>ARM64<br>ARMv7 |
v0.21.0 | k8s.gcr.io/descheduler/descheduler:v0.21.0 | AMD64<br>ARM64<br>ARMv7 |
v0.20.0 | k8s.gcr.io/descheduler/descheduler:v0.20.0 | AMD64<br>ARM64 |
v0.19.0 | k8s.gcr.io/descheduler/descheduler:v0.19.0 | AMD64 |
v0.18.0 | k8s.gcr.io/descheduler/descheduler:v0.18.0 | AMD64 |
v0.10.0 | k8s.gcr.io/descheduler/descheduler:v0.10.0 | AMD64 |
Note that multi-arch container images cannot be pulled by [kind](https://kind.sigs.k8s.io) from a registry. Therefore
starting with descheduler release v0.20.0 use the below process to download the official descheduler
@@ -33,31 +36,52 @@ Usage:
descheduler [command]
Available Commands:
completion generate the autocompletion script for the specified shell
help Help about any command
version Version of descheduler
Flags:
--add-dir-header If true, adds the file directory to the header of the log messages
--alsologtostderr log to standard error as well as files
--descheduling-interval duration Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.
--dry-run execute descheduler in dry run mode.
--evict-local-storage-pods DEPRECATED: enables evicting pods using local storage by descheduler
-h, --help help for descheduler
--kubeconfig string File with kube configuration.
--log-backtrace-at traceLocation when logging hits line file:N, emit a stack trace (default :0)
--log-dir string If non-empty, write log files in this directory
--log-file string If non-empty, use this log file
--log-file-max-size uint Defines the maximum size a log file can grow to. Unit is megabytes. If the value is 0, the maximum file size is unlimited. (default 1800)
--log-flush-frequency duration Maximum number of seconds between log flushes (default 5s)
--logtostderr log to standard error instead of files (default true)
--max-pods-to-evict-per-node int DEPRECATED: limits the maximum number of pods to be evicted per node by descheduler
--node-selector string DEPRECATED: selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)
--policy-config-file string File with descheduler policy configuration.
--skip-headers If true, avoid header prefixes in the log messages
--skip-log-headers If true, avoid headers when opening log files
--stderrthreshold severity logs at or above this threshold go to stderr (default 2)
-v, --v Level number for the log level verbosity
--vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging
--add-dir-header If true, adds the file directory to the header of the log messages (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--alsologtostderr log to standard error as well as files (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--bind-address ip The IP address on which to listen for the --secure-port port. The associated interface(s) must be reachable by the rest of the cluster, and by CLI/web clients. If blank or an unspecified address (0.0.0.0 or ::), all interfaces will be used. (default 0.0.0.0)
--cert-dir string The directory where the TLS certs are located. If --tls-cert-file and --tls-private-key-file are provided, this flag will be ignored. (default "apiserver.local.config/certificates")
--descheduling-interval duration Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.
--disable-metrics Disables metrics. The metrics are by default served through https://localhost:10258/metrics. Secure address, resp. port can be changed through --bind-address, resp. --secure-port flags.
--dry-run execute descheduler in dry run mode.
-h, --help help for descheduler
--http2-max-streams-per-connection int The limit that the server gives to clients for the maximum number of streams in an HTTP/2 connection. Zero means to use golang's default.
--kubeconfig string File with kube configuration.
--leader-elect Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.
--leader-elect-lease-duration duration The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled. (default 15s)
--leader-elect-renew-deadline duration The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. This is only applicable if leader election is enabled. (default 10s)
--leader-elect-resource-lock string The type of resource object that is used for locking during leader election. Supported options are 'endpoints', 'configmaps', 'leases', 'endpointsleases' and 'configmapsleases'. (default "leases")
--leader-elect-resource-name string The name of resource object that is used for locking during leader election. (default "descheduler")
--leader-elect-resource-namespace string The namespace of resource object that is used for locking during leader election. (default "kube-system")
--leader-elect-retry-period duration The duration the clients should wait between attempting acquisition and renewal of a leadership. This is only applicable if leader election is enabled. (default 2s)
--log-backtrace-at traceLocation when logging hits line file:N, emit a stack trace (default :0) (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--log-dir string If non-empty, write log files in this directory (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--log-file string If non-empty, use this log file (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--log-file-max-size uint Defines the maximum size a log file can grow to. Unit is megabytes. If the value is 0, the maximum file size is unlimited. (default 1800) (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--log-flush-frequency duration Maximum number of seconds between log flushes (default 5s)
--logging-format string Sets the log format. Permitted formats: "text", "json". Non-default formats don't honor these flags: --add-dir-header, --alsologtostderr, --log-backtrace-at, --log-dir, --log-file, --log-file-max-size, --logtostderr, --skip-headers, --skip-log-headers, --stderrthreshold, --log-flush-frequency.\nNon-default choices are currently alpha and subject to change without warning. (default "text")
--logtostderr log to standard error instead of files (default true) (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--one-output If true, only write logs to their native severity level (vs also writing to each lower severity level) (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--permit-address-sharing If true, SO_REUSEADDR will be used when binding the port. This allows binding to wildcard IPs like 0.0.0.0 and specific IPs in parallel, and it avoids waiting for the kernel to release sockets in TIME_WAIT state. [default=false]
--permit-port-sharing If true, SO_REUSEPORT will be used when binding the port, which allows more than one instance to bind on the same address and port. [default=false]
--policy-config-file string File with descheduler policy configuration.
--secure-port int The port on which to serve HTTPS with authentication and authorization. If 0, don't serve HTTPS at all. (default 10258)
--skip-headers If true, avoid header prefixes in the log messages (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--skip-log-headers If true, avoid headers when opening log files (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--stderrthreshold severity logs at or above this threshold go to stderr (default 2) (DEPRECATED: will be removed in a future release, see https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/2845-deprecate-klog-specific-flags-in-k8s-components)
--tls-cert-file string File containing the default x509 Certificate for HTTPS. (CA cert, if any, concatenated after server cert). If HTTPS serving is enabled, and --tls-cert-file and --tls-private-key-file are not provided, a self-signed certificate and key are generated for the public address and saved to the directory specified by --cert-dir.
--tls-cipher-suites strings Comma-separated list of cipher suites for the server. If omitted, the default Go cipher suites will be used.
Preferred values: TLS_AES_128_GCM_SHA256, TLS_AES_256_GCM_SHA384, TLS_CHACHA20_POLY1305_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305, TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_256_GCM_SHA384.
Insecure values: TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_RC4_128_SHA, TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_RSA_WITH_RC4_128_SHA, TLS_RSA_WITH_3DES_EDE_CBC_SHA, TLS_RSA_WITH_AES_128_CBC_SHA256, TLS_RSA_WITH_RC4_128_SHA.
--tls-min-version string Minimum TLS version supported. Possible values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13
--tls-private-key-file string File containing the default x509 private key matching --tls-cert-file.
--tls-sni-cert-key namedCertKey A pair of x509 certificate and private key file paths, optionally suffixed with a list of domain patterns which are fully qualified domain names, possibly with prefixed wildcard segments. The domain patterns also allow IP addresses, but IPs should only be used if the apiserver has visibility to the IP address requested by a client. If no domain patterns are provided, the names of the certificate are extracted. Non-wildcard matches trump over wildcard matches, explicit domain patterns trump over extracted names. For multiple key/certificate pairs, use the --tls-sni-cert-key multiple times. Examples: "example.crt,example.key" or "foo.crt,foo.key:*.foo.com,foo.com". (default [])
-v, --v Level number for the log level verbosity
--vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging
Use "descheduler [command] --help" for more information about a command.
```
@@ -88,7 +112,8 @@ strategies:
"PodLifeTime":
enabled: true
params:
maxPodLifeTimeSeconds: 604800 # pods run for a maximum of 7 days
podLifeTime:
maxPodLifeTimeSeconds: 604800 # pods run for a maximum of 7 days
```
### Balance Cluster By Node Memory Utilization
@@ -116,7 +141,7 @@ strategies:
#### Balance low utilization nodes
Using `HighNodeUtilization`, descheduler will rebalance the cluster based on memory by evicting pods
from nodes with memory utilization lower than 20%. This should be used along with scheduler strategy `MostRequestedPriority`.
from nodes with memory utilization lower than 20%. This should be use `NodeResourcesFit` with the `MostAllocated` scoring strategy based on these [doc](https://kubernetes.io/docs/reference/scheduling/config/#scheduling-plugins).
The evicted pods will be compacted into minimal set of nodes.
```
@@ -135,7 +160,14 @@ strategies:
Descheduler's `RemovePodsViolatingNodeTaints` strategy can be combined with
[Node Problem Detector](https://github.com/kubernetes/node-problem-detector/) and
[Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) to automatically remove
Nodes which have problems. Node Problem Detector can detect specific Node problems and taint any Nodes which have those
problems. The Descheduler will then deschedule workloads from those Nodes. Finally, if the descheduled Node's resource
Nodes which have problems. Node Problem Detector can detect specific Node problems and report them to the API server.
There is a feature called TaintNodeByCondition of the node controller that takes some conditions and turns them into taints. Currently, this only works for the default node conditions: PIDPressure, MemoryPressure, DiskPressure, Ready, and some cloud provider specific conditions.
The Descheduler will then deschedule workloads from those Nodes. Finally, if the descheduled Node's resource
allocation falls below the Cluster Autoscaler's scale down threshold, the Node will become a scale down candidate
and can be removed by Cluster Autoscaler. These three components form an autohealing cycle for Node problems.
---
**NOTE**
Once [kubernetes/node-problem-detector#565](https://github.com/kubernetes/node-problem-detector/pull/565) is available in NPD, we need to update this section.
---

14
examples/failed-pods.yaml Normal file
View File

@@ -0,0 +1,14 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemoveFailedPods":
enabled: true
params:
failedPods:
reasons:
- "OutOfcpu"
- "CreateContainerConfigError"
includingInitContainers: true
excludeOwnerKinds:
- "Job"
minPodLifetimeSeconds: 3600 # 1 hour

View File

@@ -1,4 +1,3 @@
---
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:

View File

@@ -1,4 +1,3 @@
---
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:

View File

@@ -1,4 +1,3 @@
---
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:

View File

@@ -23,3 +23,7 @@ strategies:
podsHavingTooManyRestarts:
podRestartThreshold: 100
includingInitContainers: true
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
params:
includeSoftConstraints: true

View File

@@ -0,0 +1,7 @@
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
params:
includeSoftConstraints: true # Include 'ScheduleAnyways' constraints

118
go.mod
View File

@@ -1,19 +1,117 @@
module sigs.k8s.io/descheduler
go 1.16
go 1.17
require (
github.com/client9/misspell v0.3.4
github.com/spf13/cobra v1.1.1
github.com/spf13/cobra v1.4.0
github.com/spf13/pflag v1.0.5
k8s.io/api v0.21.0
k8s.io/apimachinery v0.21.0
k8s.io/apiserver v0.21.0
k8s.io/client-go v0.21.0
k8s.io/code-generator v0.21.0
k8s.io/component-base v0.21.0
k8s.io/component-helpers v0.21.0
k8s.io/klog/v2 v2.8.0
k8s.io/api v0.24.0
k8s.io/apimachinery v0.24.0
k8s.io/apiserver v0.24.0
k8s.io/client-go v0.24.0
k8s.io/code-generator v0.24.0
k8s.io/component-base v0.24.0
k8s.io/component-helpers v0.24.0
k8s.io/klog/v2 v2.60.1
k8s.io/kubectl v0.20.5
sigs.k8s.io/mdtoc v1.0.1
)
require (
cloud.google.com/go v0.81.0 // indirect
github.com/Azure/go-autorest v14.2.0+incompatible // indirect
github.com/Azure/go-autorest/autorest v0.11.18 // indirect
github.com/Azure/go-autorest/autorest/adal v0.9.13 // indirect
github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect
github.com/Azure/go-autorest/logger v0.2.1 // indirect
github.com/Azure/go-autorest/tracing v0.6.0 // indirect
github.com/BurntSushi/toml v0.3.1 // indirect
github.com/NYTimes/gziphandler v1.1.1 // indirect
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/coreos/go-semver v0.3.0 // indirect
github.com/coreos/go-systemd/v22 v22.3.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful v2.9.5+incompatible // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/felixge/httpsnoop v1.0.1 // indirect
github.com/form3tech-oss/jwt-go v3.2.3+incompatible // indirect
github.com/fsnotify/fsnotify v1.4.9 // indirect
github.com/go-logr/logr v1.2.0 // indirect
github.com/go-logr/zapr v1.2.0 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.19.5 // indirect
github.com/go-openapi/swag v0.19.14 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/gomarkdown/markdown v0.0.0-20200824053859-8c8b3816f167 // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.5 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/uuid v1.1.2 // indirect
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect
github.com/imdario/mergo v0.3.5 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.6 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
github.com/mmarkdown/mmark v2.0.40+incompatible // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.12.1 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.32.1 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
go.etcd.io/etcd/api/v3 v3.5.1 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.1 // indirect
go.etcd.io/etcd/client/v3 v3.5.1 // indirect
go.opentelemetry.io/contrib v0.20.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.20.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0 // indirect
go.opentelemetry.io/otel v0.20.0 // indirect
go.opentelemetry.io/otel/exporters/otlp v0.20.0 // indirect
go.opentelemetry.io/otel/metric v0.20.0 // indirect
go.opentelemetry.io/otel/sdk v0.20.0 // indirect
go.opentelemetry.io/otel/sdk/export/metric v0.20.0 // indirect
go.opentelemetry.io/otel/sdk/metric v0.20.0 // indirect
go.opentelemetry.io/otel/trace v0.20.0 // indirect
go.opentelemetry.io/proto/otlp v0.7.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
golang.org/x/crypto v0.0.0-20220214200702-86341886e292 // indirect
golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 // indirect
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect
golang.org/x/sys v0.0.0-20220209214540-3681064d5158 // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/tools v0.1.10-0.20220218145154-897bd77cd717 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368 // indirect
google.golang.org/grpc v1.40.0 // indirect
google.golang.org/protobuf v1.27.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 // indirect
k8s.io/kube-openapi v0.0.0-20220328201542-3ee0da9b0b42 // indirect
k8s.io/utils v0.0.0-20220210201930-3a6ce19ff2f9 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.30 // indirect
sigs.k8s.io/json v0.0.0-20211208200746-9f7c6b3444d2 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.1 // indirect
sigs.k8s.io/yaml v1.2.0 // indirect
)

545
go.sum

File diff suppressed because it is too large Load Diff

View File

@@ -18,15 +18,15 @@ E2E_GCE_HOME=$DESCHEDULER_ROOT/hack/e2e-gce
create_cluster() {
echo "#################### Creating instances ##########################"
gcloud compute instances create descheduler-$master_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
gcloud compute instances create descheduler-$master_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
# Keeping the --zone here so as to make sure that e2e's can run locally.
echo "gcloud compute instances delete descheduler-$master_uuid --zone=us-east1-b --quiet" > $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node1_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
gcloud compute instances create descheduler-$node1_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node1_uuid --zone=us-east1-b --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node2_uuid --image="ubuntu-1604-xenial-v20180306" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node2_uuid --zone=us-east1-b --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
gcloud compute instances create descheduler-$node2_uuid --image-family="ubuntu-1804-lts" --image-project="ubuntu-os-cloud" --zone=us-east1-b
echo "gcloud compute instances delete descheduler-$node2_uuid --zone=us-east1-c --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
# Delete the firewall port created for master.
echo "gcloud compute firewall-rules delete kubeapiserver-$master_uuid --quiet" >> $E2E_GCE_HOME/delete_cluster.sh
@@ -44,10 +44,10 @@ generate_kubeadm_instance_files() {
transfer_install_files() {
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_install.sh descheduler-$master_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node2_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_preinstall.sh descheduler-$node2_uuid:/tmp --zone=us-east1-c
}
@@ -55,19 +55,19 @@ install_kube() {
# Docker installation.
gcloud compute ssh descheduler-$master_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
gcloud compute ssh descheduler-$node1_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo apt-get update; sudo apt-get install -y docker.io" --zone=us-east1-c
# kubeadm installation.
# 1. Transfer files to master, nodes.
transfer_install_files
# 2. Install kubeadm.
#TODO: Add rm /tmp/kubeadm_install.sh
# Open port for kube API server
gcloud compute firewall-rules create kubeapiserver-$master_uuid --allow tcp:6443 --source-tags=descheduler-$master_uuid --source-ranges=0.0.0.0/0 --description="Opening api server port"
gcloud compute firewall-rules create kubeapiserver-$master_uuid --allow tcp:6443 --source-tags=descheduler-$master_uuid --source-ranges=0.0.0.0/0 --description="Opening api server port"
gcloud compute ssh descheduler-$master_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
kubeadm_join_command=$(gcloud compute ssh descheduler-$master_uuid --command "sudo chmod 755 /tmp/kubeadm_install.sh; sudo /tmp/kubeadm_install.sh" --zone=us-east1-b|grep 'kubeadm join')
# Copy the kubeconfig file onto /tmp for e2e tests.
# Copy the kubeconfig file onto /tmp for e2e tests.
gcloud compute ssh descheduler-$master_uuid --command "sudo cp /etc/kubernetes/admin.conf /tmp; sudo chmod 777 /tmp/admin.conf" --zone=us-east1-b
gcloud compute scp descheduler-$master_uuid:/tmp/admin.conf /tmp/admin.conf --zone=us-east1-b
@@ -75,16 +75,15 @@ install_kube() {
gcloud compute ssh descheduler-$master_uuid --command "sudo kubectl apply -f https://raw.githubusercontent.com/cloudnativelabs/kube-router/master/daemonset/kubeadm-kuberouter.yaml --kubeconfig /etc/kubernetes/admin.conf" --zone=us-east1-b
echo $kubeadm_join_command > $E2E_GCE_HOME/kubeadm_join.sh
# Copy kubeadm_join to every node.
# Copy kubeadm_join to every node.
#TODO: Put these in a loop, so that extension becomes possible.
gcloud compute ssh descheduler-$node1_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node1_uuid:/tmp --zone=us-east1-b
gcloud compute ssh descheduler-$node1_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-b
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node2_uuid:/tmp --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-b
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_preinstall.sh; sudo /tmp/kubeadm_preinstall.sh" --zone=us-east1-c
gcloud compute scp $E2E_GCE_HOME/kubeadm_join.sh descheduler-$node2_uuid:/tmp --zone=us-east1-c
gcloud compute ssh descheduler-$node2_uuid --command "sudo chmod 755 /tmp/kubeadm_join.sh; sudo /tmp/kubeadm_join.sh" --zone=us-east1-c
}

View File

@@ -3,4 +3,16 @@ apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
kubeadmConfigPatches:
- |
kind: JoinConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "topology.kubernetes.io/zone=local-a"
- role: worker
kubeadmConfigPatches:
- |
kind: JoinConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "topology.kubernetes.io/zone=local-b"

View File

@@ -1,3 +1,4 @@
//go:build tools
// +build tools
/*

View File

@@ -23,7 +23,7 @@ DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
GO_VERSION=($(go version))
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16') ]]; then
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16|go1.17') ]]; then
echo "Unknown go version '${GO_VERSION[2]}', skipping gofmt."
exit 1
fi

View File

@@ -28,7 +28,7 @@ pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated output differs:" >&2
echo "${_out}" >&2
echo "Generated conversions verify failed. Please run ./hack/update-generated-conversions.sh (and commit the result)"
echo "Generated conversions verify failed. Please run ./hack/update-generated-conversions.sh"
exit 1
fi
popd > /dev/null 2>&1

View File

@@ -28,7 +28,7 @@ pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated deep-copies output differs:" >&2
echo "${_out}" >&2
echo "Generated deep-copies verify failed. Please run ./hack/update-generated-deep-copies.sh (and commit the result)"
echo "Generated deep-copies verify failed. Please run ./hack/update-generated-deep-copies.sh"
exit 1
fi
popd > /dev/null 2>&1

35
hack/verify-defaulters.sh Executable file
View File

@@ -0,0 +1,35 @@
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail
source "$(dirname "${BASH_SOURCE}")/lib/init.sh"
DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
_tmpdir="$(mktemp -d "${DESCHEDULER_ROOT}/_tmp/kube-verify.XXXXXX")"
_deschedulertmp="${_tmpdir}"
mkdir -p "${_deschedulertmp}"
git archive --format=tar --prefix=descheduler/ "$(git write-tree)" | (cd "${_deschedulertmp}" && tar xf -)
_deschedulertmp="${_deschedulertmp}/descheduler"
pushd "${_deschedulertmp}" > /dev/null 2>&1
go build -o "${OS_OUTPUT_BINPATH}/defaulter-gen" "k8s.io/code-generator/cmd/defaulter-gen"
${OS_OUTPUT_BINPATH}/defaulter-gen \
--go-header-file "hack/boilerplate/boilerplate.go.txt" \
--input-dirs "${PRJ_PREFIX}/pkg/apis/componentconfig/v1alpha1,${PRJ_PREFIX}/pkg/api/v1alpha1" \
--extra-peer-dirs "${PRJ_PREFIX}/pkg/apis/componentconfig/v1alpha1,${PRJ_PREFIX}/pkg/api/v1alpha1" \
--output-file-base zz_generated.defaults
popd > /dev/null 2>&1
pushd "${DESCHEDULER_ROOT}" > /dev/null 2>&1
if ! _out="$(diff -Naupr pkg/ "${_deschedulertmp}/pkg/")"; then
echo "Generated defaulters output differs:" >&2
echo "${_out}" >&2
echo "Generated defaulters verify failed. Please run ./hack/update-generated-defaulters.sh"
fi
popd > /dev/null 2>&1
echo "Generated Defaulters verified."

View File

@@ -23,7 +23,7 @@ DESCHEDULER_ROOT=$(dirname "${BASH_SOURCE}")/..
GO_VERSION=($(go version))
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16') ]]; then
if [[ -z $(echo "${GO_VERSION[2]}" | grep -E 'go1.14|go1.15|go1.16|go1.17|go1.18') ]]; then
echo "Unknown go version '${GO_VERSION[2]}', skipping gofmt."
exit 1
fi

BIN
kind

Binary file not shown.

View File

@@ -12,7 +12,7 @@ rules:
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]
@@ -22,6 +22,13 @@ rules:
- apiGroups: ["scheduling.k8s.io"]
resources: ["priorityclasses"]
verbs: ["get", "watch", "list"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
resourceNames: ["descheduler"]
verbs: ["get", "patch", "delete"]
---
apiVersion: v1
kind: ServiceAccount
@@ -41,4 +48,3 @@ subjects:
- name: descheduler-sa
kind: ServiceAccount
namespace: kube-system

View File

@@ -16,7 +16,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.21.0
image: k8s.gcr.io/descheduler/descheduler:v0.24.0
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
@@ -31,6 +31,14 @@ spec:
requests:
cpu: "500m"
memory: "256Mi"
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
securityContext:
allowPrivilegeEscalation: false
capabilities:

View File

@@ -19,7 +19,7 @@ spec:
serviceAccountName: descheduler-sa
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.21.0
image: k8s.gcr.io/descheduler/descheduler:v0.24.0
imagePullPolicy: IfNotPresent
command:
- "/bin/descheduler"
@@ -33,6 +33,14 @@ spec:
ports:
- containerPort: 10258
protocol: TCP
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
resources:
requests:
cpu: 500m

View File

@@ -14,7 +14,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: descheduler
image: k8s.gcr.io/descheduler/descheduler:v0.21.0
image: k8s.gcr.io/descheduler/descheduler:v0.24.0
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
@@ -29,6 +29,14 @@ spec:
requests:
cpu: "500m"
memory: "256Mi"
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
securityContext:
allowPrivilegeEscalation: false
capabilities:

View File

@@ -34,9 +34,9 @@ var (
&metrics.CounterOpts{
Subsystem: DeschedulerSubsystem,
Name: "pods_evicted",
Help: "Number of evicted pods, by the result, by the strategy, by the namespace. 'failed' result means a pod could not be evicted",
Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
StabilityLevel: metrics.ALPHA,
}, []string{"result", "strategy", "namespace"})
}, []string{"result", "strategy", "namespace", "node"})
buildInfo = metrics.NewGauge(
&metrics.GaugeOpts{

View File

@@ -32,6 +32,9 @@ type DeschedulerPolicy struct {
// NodeSelector for a set of nodes to operate over
NodeSelector *string
// EvictFailedBarePods allows pods without ownerReferences and in failed phase to be evicted.
EvictFailedBarePods *bool
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods *bool
@@ -42,7 +45,10 @@ type DeschedulerPolicy struct {
IgnorePVCPods *bool
// MaxNoOfPodsToEvictPerNode restricts maximum of pods to be evicted per node.
MaxNoOfPodsToEvictPerNode *int
MaxNoOfPodsToEvictPerNode *uint
// MaxNoOfPodsToEvictPerNamespace restricts maximum of pods to be evicted per namespace.
MaxNoOfPodsToEvictPerNamespace *uint
}
type StrategyName string
@@ -75,21 +81,25 @@ type StrategyParameters struct {
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts
PodLifeTime *PodLifeTime
RemoveDuplicates *RemoveDuplicates
FailedPods *FailedPods
IncludeSoftConstraints bool
Namespaces *Namespaces
ThresholdPriority *int32
ThresholdPriorityClassName string
LabelSelector *metav1.LabelSelector
NodeFit bool
IncludePreferNoSchedule bool
ExcludedTaints []string
}
type Percentage float64
type ResourceThresholds map[v1.ResourceName]Percentage
type NodeResourceUtilizationThresholds struct {
Thresholds ResourceThresholds
TargetThresholds ResourceThresholds
NumberOfNodes int
UseDeviationThresholds bool
Thresholds ResourceThresholds
TargetThresholds ResourceThresholds
NumberOfNodes int
}
type PodsHavingTooManyRestarts struct {
@@ -105,3 +115,10 @@ type PodLifeTime struct {
MaxPodLifeTimeSeconds *uint
PodStatusPhases []string
}
type FailedPods struct {
ExcludeOwnerKinds []string
MinPodLifetimeSeconds *uint
Reasons []string
IncludingInitContainers bool
}

View File

@@ -32,6 +32,9 @@ type DeschedulerPolicy struct {
// NodeSelector for a set of nodes to operate over
NodeSelector *string `json:"nodeSelector,omitempty"`
// EvictFailedBarePods allows pods without ownerReferences and in failed phase to be evicted.
EvictFailedBarePods *bool `json:"evictFailedBarePods,omitempty"`
// EvictLocalStoragePods allows pods using local storage to be evicted.
EvictLocalStoragePods *bool `json:"evictLocalStoragePods,omitempty"`
@@ -43,6 +46,9 @@ type DeschedulerPolicy struct {
// MaxNoOfPodsToEvictPerNode restricts maximum of pods to be evicted per node.
MaxNoOfPodsToEvictPerNode *int `json:"maxNoOfPodsToEvictPerNode,omitempty"`
// MaxNoOfPodsToEvictPerNamespace restricts maximum of pods to be evicted per namespace.
MaxNoOfPodsToEvictPerNamespace *int `json:"maxNoOfPodsToEvictPerNamespace,omitempty"`
}
type StrategyName string
@@ -73,21 +79,25 @@ type StrategyParameters struct {
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts `json:"podsHavingTooManyRestarts,omitempty"`
PodLifeTime *PodLifeTime `json:"podLifeTime,omitempty"`
RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"`
FailedPods *FailedPods `json:"failedPods,omitempty"`
IncludeSoftConstraints bool `json:"includeSoftConstraints"`
Namespaces *Namespaces `json:"namespaces"`
ThresholdPriority *int32 `json:"thresholdPriority"`
ThresholdPriorityClassName string `json:"thresholdPriorityClassName"`
LabelSelector *metav1.LabelSelector `json:"labelSelector"`
NodeFit bool `json:"nodeFit"`
IncludePreferNoSchedule bool `json:"includePreferNoSchedule"`
ExcludedTaints []string `json:"excludedTaints,omitempty"`
}
type Percentage float64
type ResourceThresholds map[v1.ResourceName]Percentage
type NodeResourceUtilizationThresholds struct {
Thresholds ResourceThresholds `json:"thresholds,omitempty"`
TargetThresholds ResourceThresholds `json:"targetThresholds,omitempty"`
NumberOfNodes int `json:"numberOfNodes,omitempty"`
UseDeviationThresholds bool `json:"useDeviationThresholds,omitempty"`
Thresholds ResourceThresholds `json:"thresholds,omitempty"`
TargetThresholds ResourceThresholds `json:"targetThresholds,omitempty"`
NumberOfNodes int `json:"numberOfNodes,omitempty"`
}
type PodsHavingTooManyRestarts struct {
@@ -103,3 +113,10 @@ type PodLifeTime struct {
MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"`
PodStatusPhases []string `json:"podStatusPhases,omitempty"`
}
type FailedPods struct {
ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"`
MinPodLifetimeSeconds *uint `json:"minPodLifetimeSeconds,omitempty"`
Reasons []string `json:"reasons,omitempty"`
IncludingInitContainers bool `json:"includingInitContainers,omitempty"`
}

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -56,6 +57,16 @@ func RegisterConversions(s *runtime.Scheme) error {
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*FailedPods)(nil), (*api.FailedPods)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_FailedPods_To_api_FailedPods(a.(*FailedPods), b.(*api.FailedPods), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*api.FailedPods)(nil), (*FailedPods)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_api_FailedPods_To_v1alpha1_FailedPods(a.(*api.FailedPods), b.(*FailedPods), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*Namespaces)(nil), (*api.Namespaces)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1alpha1_Namespaces_To_api_Namespaces(a.(*Namespaces), b.(*api.Namespaces), scope)
}); err != nil {
@@ -122,10 +133,24 @@ func RegisterConversions(s *runtime.Scheme) error {
func autoConvert_v1alpha1_DeschedulerPolicy_To_api_DeschedulerPolicy(in *DeschedulerPolicy, out *api.DeschedulerPolicy, s conversion.Scope) error {
out.Strategies = *(*api.StrategyList)(unsafe.Pointer(&in.Strategies))
out.NodeSelector = (*string)(unsafe.Pointer(in.NodeSelector))
out.EvictFailedBarePods = (*bool)(unsafe.Pointer(in.EvictFailedBarePods))
out.EvictLocalStoragePods = (*bool)(unsafe.Pointer(in.EvictLocalStoragePods))
out.EvictSystemCriticalPods = (*bool)(unsafe.Pointer(in.EvictSystemCriticalPods))
out.IgnorePVCPods = (*bool)(unsafe.Pointer(in.IgnorePVCPods))
out.MaxNoOfPodsToEvictPerNode = (*int)(unsafe.Pointer(in.MaxNoOfPodsToEvictPerNode))
if in.MaxNoOfPodsToEvictPerNode != nil {
in, out := &in.MaxNoOfPodsToEvictPerNode, &out.MaxNoOfPodsToEvictPerNode
*out = new(uint)
**out = uint(**in)
} else {
out.MaxNoOfPodsToEvictPerNode = nil
}
if in.MaxNoOfPodsToEvictPerNamespace != nil {
in, out := &in.MaxNoOfPodsToEvictPerNamespace, &out.MaxNoOfPodsToEvictPerNamespace
*out = new(uint)
**out = uint(**in)
} else {
out.MaxNoOfPodsToEvictPerNamespace = nil
}
return nil
}
@@ -137,10 +162,24 @@ func Convert_v1alpha1_DeschedulerPolicy_To_api_DeschedulerPolicy(in *Descheduler
func autoConvert_api_DeschedulerPolicy_To_v1alpha1_DeschedulerPolicy(in *api.DeschedulerPolicy, out *DeschedulerPolicy, s conversion.Scope) error {
out.Strategies = *(*StrategyList)(unsafe.Pointer(&in.Strategies))
out.NodeSelector = (*string)(unsafe.Pointer(in.NodeSelector))
out.EvictFailedBarePods = (*bool)(unsafe.Pointer(in.EvictFailedBarePods))
out.EvictLocalStoragePods = (*bool)(unsafe.Pointer(in.EvictLocalStoragePods))
out.EvictSystemCriticalPods = (*bool)(unsafe.Pointer(in.EvictSystemCriticalPods))
out.IgnorePVCPods = (*bool)(unsafe.Pointer(in.IgnorePVCPods))
out.MaxNoOfPodsToEvictPerNode = (*int)(unsafe.Pointer(in.MaxNoOfPodsToEvictPerNode))
if in.MaxNoOfPodsToEvictPerNode != nil {
in, out := &in.MaxNoOfPodsToEvictPerNode, &out.MaxNoOfPodsToEvictPerNode
*out = new(int)
**out = int(**in)
} else {
out.MaxNoOfPodsToEvictPerNode = nil
}
if in.MaxNoOfPodsToEvictPerNamespace != nil {
in, out := &in.MaxNoOfPodsToEvictPerNamespace, &out.MaxNoOfPodsToEvictPerNamespace
*out = new(int)
**out = int(**in)
} else {
out.MaxNoOfPodsToEvictPerNamespace = nil
}
return nil
}
@@ -173,6 +212,32 @@ func Convert_api_DeschedulerStrategy_To_v1alpha1_DeschedulerStrategy(in *api.Des
return autoConvert_api_DeschedulerStrategy_To_v1alpha1_DeschedulerStrategy(in, out, s)
}
func autoConvert_v1alpha1_FailedPods_To_api_FailedPods(in *FailedPods, out *api.FailedPods, s conversion.Scope) error {
out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds))
out.MinPodLifetimeSeconds = (*uint)(unsafe.Pointer(in.MinPodLifetimeSeconds))
out.Reasons = *(*[]string)(unsafe.Pointer(&in.Reasons))
out.IncludingInitContainers = in.IncludingInitContainers
return nil
}
// Convert_v1alpha1_FailedPods_To_api_FailedPods is an autogenerated conversion function.
func Convert_v1alpha1_FailedPods_To_api_FailedPods(in *FailedPods, out *api.FailedPods, s conversion.Scope) error {
return autoConvert_v1alpha1_FailedPods_To_api_FailedPods(in, out, s)
}
func autoConvert_api_FailedPods_To_v1alpha1_FailedPods(in *api.FailedPods, out *FailedPods, s conversion.Scope) error {
out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds))
out.MinPodLifetimeSeconds = (*uint)(unsafe.Pointer(in.MinPodLifetimeSeconds))
out.Reasons = *(*[]string)(unsafe.Pointer(&in.Reasons))
out.IncludingInitContainers = in.IncludingInitContainers
return nil
}
// Convert_api_FailedPods_To_v1alpha1_FailedPods is an autogenerated conversion function.
func Convert_api_FailedPods_To_v1alpha1_FailedPods(in *api.FailedPods, out *FailedPods, s conversion.Scope) error {
return autoConvert_api_FailedPods_To_v1alpha1_FailedPods(in, out, s)
}
func autoConvert_v1alpha1_Namespaces_To_api_Namespaces(in *Namespaces, out *api.Namespaces, s conversion.Scope) error {
out.Include = *(*[]string)(unsafe.Pointer(&in.Include))
out.Exclude = *(*[]string)(unsafe.Pointer(&in.Exclude))
@@ -196,6 +261,7 @@ func Convert_api_Namespaces_To_v1alpha1_Namespaces(in *api.Namespaces, out *Name
}
func autoConvert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtilizationThresholds(in *NodeResourceUtilizationThresholds, out *api.NodeResourceUtilizationThresholds, s conversion.Scope) error {
out.UseDeviationThresholds = in.UseDeviationThresholds
out.Thresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.Thresholds))
out.TargetThresholds = *(*api.ResourceThresholds)(unsafe.Pointer(&in.TargetThresholds))
out.NumberOfNodes = in.NumberOfNodes
@@ -208,6 +274,7 @@ func Convert_v1alpha1_NodeResourceUtilizationThresholds_To_api_NodeResourceUtili
}
func autoConvert_api_NodeResourceUtilizationThresholds_To_v1alpha1_NodeResourceUtilizationThresholds(in *api.NodeResourceUtilizationThresholds, out *NodeResourceUtilizationThresholds, s conversion.Scope) error {
out.UseDeviationThresholds = in.UseDeviationThresholds
out.Thresholds = *(*ResourceThresholds)(unsafe.Pointer(&in.Thresholds))
out.TargetThresholds = *(*ResourceThresholds)(unsafe.Pointer(&in.TargetThresholds))
out.NumberOfNodes = in.NumberOfNodes
@@ -289,12 +356,15 @@ func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *Strat
out.PodsHavingTooManyRestarts = (*api.PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts))
out.PodLifeTime = (*api.PodLifeTime)(unsafe.Pointer(in.PodLifeTime))
out.RemoveDuplicates = (*api.RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates))
out.FailedPods = (*api.FailedPods)(unsafe.Pointer(in.FailedPods))
out.IncludeSoftConstraints = in.IncludeSoftConstraints
out.Namespaces = (*api.Namespaces)(unsafe.Pointer(in.Namespaces))
out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority))
out.ThresholdPriorityClassName = in.ThresholdPriorityClassName
out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector))
out.NodeFit = in.NodeFit
out.IncludePreferNoSchedule = in.IncludePreferNoSchedule
out.ExcludedTaints = *(*[]string)(unsafe.Pointer(&in.ExcludedTaints))
return nil
}
@@ -309,12 +379,15 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S
out.PodsHavingTooManyRestarts = (*PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts))
out.PodLifeTime = (*PodLifeTime)(unsafe.Pointer(in.PodLifeTime))
out.RemoveDuplicates = (*RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates))
out.FailedPods = (*FailedPods)(unsafe.Pointer(in.FailedPods))
out.IncludeSoftConstraints = in.IncludeSoftConstraints
out.Namespaces = (*Namespaces)(unsafe.Pointer(in.Namespaces))
out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority))
out.ThresholdPriorityClassName = in.ThresholdPriorityClassName
out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector))
out.NodeFit = in.NodeFit
out.IncludePreferNoSchedule = in.IncludePreferNoSchedule
out.ExcludedTaints = *(*[]string)(unsafe.Pointer(&in.ExcludedTaints))
return nil
}

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -41,6 +42,11 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
*out = new(string)
**out = **in
}
if in.EvictFailedBarePods != nil {
in, out := &in.EvictFailedBarePods, &out.EvictFailedBarePods
*out = new(bool)
**out = **in
}
if in.EvictLocalStoragePods != nil {
in, out := &in.EvictLocalStoragePods, &out.EvictLocalStoragePods
*out = new(bool)
@@ -61,6 +67,11 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
*out = new(int)
**out = **in
}
if in.MaxNoOfPodsToEvictPerNamespace != nil {
in, out := &in.MaxNoOfPodsToEvictPerNamespace, &out.MaxNoOfPodsToEvictPerNamespace
*out = new(int)
**out = **in
}
return
}
@@ -103,6 +114,37 @@ func (in *DeschedulerStrategy) DeepCopy() *DeschedulerStrategy {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *FailedPods) DeepCopyInto(out *FailedPods) {
*out = *in
if in.ExcludeOwnerKinds != nil {
in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.MinPodLifetimeSeconds != nil {
in, out := &in.MinPodLifetimeSeconds, &out.MinPodLifetimeSeconds
*out = new(uint)
**out = **in
}
if in.Reasons != nil {
in, out := &in.Reasons, &out.Reasons
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedPods.
func (in *FailedPods) DeepCopy() *FailedPods {
if in == nil {
return nil
}
out := new(FailedPods)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Namespaces) DeepCopyInto(out *Namespaces) {
*out = *in
@@ -294,6 +336,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(RemoveDuplicates)
(*in).DeepCopyInto(*out)
}
if in.FailedPods != nil {
in, out := &in.FailedPods, &out.FailedPods
*out = new(FailedPods)
(*in).DeepCopyInto(*out)
}
if in.Namespaces != nil {
in, out := &in.Namespaces, &out.Namespaces
*out = new(Namespaces)
@@ -309,6 +356,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(v1.LabelSelector)
(*in).DeepCopyInto(*out)
}
if in.ExcludedTaints != nil {
in, out := &in.ExcludedTaints, &out.ExcludedTaints
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -41,6 +42,11 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
*out = new(string)
**out = **in
}
if in.EvictFailedBarePods != nil {
in, out := &in.EvictFailedBarePods, &out.EvictFailedBarePods
*out = new(bool)
**out = **in
}
if in.EvictLocalStoragePods != nil {
in, out := &in.EvictLocalStoragePods, &out.EvictLocalStoragePods
*out = new(bool)
@@ -58,7 +64,12 @@ func (in *DeschedulerPolicy) DeepCopyInto(out *DeschedulerPolicy) {
}
if in.MaxNoOfPodsToEvictPerNode != nil {
in, out := &in.MaxNoOfPodsToEvictPerNode, &out.MaxNoOfPodsToEvictPerNode
*out = new(int)
*out = new(uint)
**out = **in
}
if in.MaxNoOfPodsToEvictPerNamespace != nil {
in, out := &in.MaxNoOfPodsToEvictPerNamespace, &out.MaxNoOfPodsToEvictPerNamespace
*out = new(uint)
**out = **in
}
return
@@ -103,6 +114,37 @@ func (in *DeschedulerStrategy) DeepCopy() *DeschedulerStrategy {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *FailedPods) DeepCopyInto(out *FailedPods) {
*out = *in
if in.ExcludeOwnerKinds != nil {
in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.MinPodLifetimeSeconds != nil {
in, out := &in.MinPodLifetimeSeconds, &out.MinPodLifetimeSeconds
*out = new(uint)
**out = **in
}
if in.Reasons != nil {
in, out := &in.Reasons, &out.Reasons
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedPods.
func (in *FailedPods) DeepCopy() *FailedPods {
if in == nil {
return nil
}
out := new(FailedPods)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Namespaces) DeepCopyInto(out *Namespaces) {
*out = *in
@@ -294,6 +336,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(RemoveDuplicates)
(*in).DeepCopyInto(*out)
}
if in.FailedPods != nil {
in, out := &in.FailedPods, &out.FailedPods
*out = new(FailedPods)
(*in).DeepCopyInto(*out)
}
if in.Namespaces != nil {
in, out := &in.Namespaces, &out.Namespaces
*out = new(Namespaces)
@@ -309,6 +356,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) {
*out = new(v1.LabelSelector)
(*in).DeepCopyInto(*out)
}
if in.ExcludedTaints != nil {
in, out := &in.ExcludedTaints, &out.ExcludedTaints
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}

View File

@@ -53,6 +53,9 @@ type DeschedulerConfiguration struct {
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool
// LeaderElection starts Deployment using leader election loop
LeaderElection componentbaseconfig.LeaderElectionConfiguration
// Logging specifies the options of logging.
// Refer [Logs Options](https://github.com/kubernetes/component-base/blob/master/logs/options.go) for more information.
Logging componentbaseconfig.LoggingConfiguration

View File

@@ -17,9 +17,10 @@ limitations under the License.
package v1alpha1
import (
componentbaseconfig "k8s.io/component-base/config"
"time"
componentbaseconfig "k8s.io/component-base/config"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -53,6 +54,9 @@ type DeschedulerConfiguration struct {
// IgnorePVCPods sets whether PVC pods should be allowed to be evicted
IgnorePVCPods bool `json:"ignorePvcPods,omitempty"`
// LeaderElection starts Deployment using leader election loop
LeaderElection componentbaseconfig.LeaderElectionConfiguration `json:"leaderElection,omitempty"`
// Logging specifies the options of logging.
// Refer [Logs Options](https://github.com/kubernetes/component-base/blob/master/logs/options.go) for more information.
Logging componentbaseconfig.LoggingConfiguration `json:"logging,omitempty"`

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -57,6 +58,7 @@ func autoConvert_v1alpha1_DeschedulerConfiguration_To_componentconfig_Deschedule
out.MaxNoOfPodsToEvictPerNode = in.MaxNoOfPodsToEvictPerNode
out.EvictLocalStoragePods = in.EvictLocalStoragePods
out.IgnorePVCPods = in.IgnorePVCPods
out.LeaderElection = in.LeaderElection
out.Logging = in.Logging
return nil
}
@@ -75,6 +77,7 @@ func autoConvert_componentconfig_DeschedulerConfiguration_To_v1alpha1_Deschedule
out.MaxNoOfPodsToEvictPerNode = in.MaxNoOfPodsToEvictPerNode
out.EvictLocalStoragePods = in.EvictLocalStoragePods
out.IgnorePVCPods = in.IgnorePVCPods
out.LeaderElection = in.LeaderElection
out.Logging = in.Logging
return nil
}

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,7 +29,8 @@ import (
func (in *DeschedulerConfiguration) DeepCopyInto(out *DeschedulerConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.Logging = in.Logging
out.LeaderElection = in.LeaderElection
in.Logging.DeepCopyInto(&out.Logging)
return
}

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@@ -1,7 +1,8 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright 2021 The Kubernetes Authors.
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,7 +29,8 @@ import (
func (in *DeschedulerConfiguration) DeepCopyInto(out *DeschedulerConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.Logging = in.Logging
out.LeaderElection = in.LeaderElection
in.Logging.DeepCopyInto(&out.Logging)
return
}

View File

@@ -19,14 +19,22 @@ package descheduler
import (
"context"
"fmt"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
policy "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
fakeclientset "k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"k8s.io/klog/v2"
corev1informers "k8s.io/client-go/informers/core/v1"
schedulingv1informers "k8s.io/client-go/informers/scheduling/v1"
"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/metrics"
"sigs.k8s.io/descheduler/pkg/api"
@@ -34,13 +42,14 @@ import (
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
eutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/nodeutilization"
)
func Run(rs *options.DeschedulerServer) error {
func Run(ctx context.Context, rs *options.DeschedulerServer) error {
metrics.Register()
ctx := context.Background()
rsclient, err := client.CreateClient(rs.KubeconfigFile)
if err != nil {
return err
@@ -60,18 +69,123 @@ func Run(rs *options.DeschedulerServer) error {
return err
}
stopChannel := make(chan struct{})
return RunDeschedulerStrategies(ctx, rs, deschedulerPolicy, evictionPolicyGroupVersion, stopChannel)
runFn := func() error {
return RunDeschedulerStrategies(ctx, rs, deschedulerPolicy, evictionPolicyGroupVersion)
}
if rs.LeaderElection.LeaderElect && rs.DeschedulingInterval.Seconds() == 0 {
return fmt.Errorf("leaderElection must be used with deschedulingInterval")
}
if rs.LeaderElection.LeaderElect && !rs.DryRun {
if err := NewLeaderElection(runFn, rsclient, &rs.LeaderElection, ctx); err != nil {
return fmt.Errorf("leaderElection: %w", err)
}
return nil
}
return runFn()
}
type strategyFunction func(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor)
type strategyFunction func(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc)
func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer, deschedulerPolicy *api.DeschedulerPolicy, evictionPolicyGroupVersion string, stopChannel chan struct{}) error {
func cachedClient(
realClient clientset.Interface,
podInformer corev1informers.PodInformer,
nodeInformer corev1informers.NodeInformer,
namespaceInformer corev1informers.NamespaceInformer,
priorityClassInformer schedulingv1informers.PriorityClassInformer,
) (clientset.Interface, error) {
fakeClient := fakeclientset.NewSimpleClientset()
// simulate a pod eviction by deleting a pod
fakeClient.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
if action.GetSubresource() == "eviction" {
createAct, matched := action.(core.CreateActionImpl)
if !matched {
return false, nil, fmt.Errorf("unable to convert action to core.CreateActionImpl")
}
eviction, matched := createAct.Object.(*policy.Eviction)
if !matched {
return false, nil, fmt.Errorf("unable to convert action object into *policy.Eviction")
}
if err := fakeClient.Tracker().Delete(action.GetResource(), eviction.GetNamespace(), eviction.GetName()); err != nil {
return false, nil, fmt.Errorf("unable to delete pod %v/%v: %v", eviction.GetNamespace(), eviction.GetName(), err)
}
return true, nil, nil
}
// fallback to the default reactor
return false, nil, nil
})
klog.V(3).Infof("Pulling resources for the cached client from the cluster")
pods, err := podInformer.Lister().List(labels.Everything())
if err != nil {
return nil, fmt.Errorf("unable to list pods: %v", err)
}
for _, item := range pods {
if _, err := fakeClient.CoreV1().Pods(item.Namespace).Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("unable to copy pod: %v", err)
}
}
nodes, err := nodeInformer.Lister().List(labels.Everything())
if err != nil {
return nil, fmt.Errorf("unable to list nodes: %v", err)
}
for _, item := range nodes {
if _, err := fakeClient.CoreV1().Nodes().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("unable to copy node: %v", err)
}
}
namespaces, err := namespaceInformer.Lister().List(labels.Everything())
if err != nil {
return nil, fmt.Errorf("unable to list namespaces: %v", err)
}
for _, item := range namespaces {
if _, err := fakeClient.CoreV1().Namespaces().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("unable to copy node: %v", err)
}
}
priorityClasses, err := priorityClassInformer.Lister().List(labels.Everything())
if err != nil {
return nil, fmt.Errorf("unable to list priorityclasses: %v", err)
}
for _, item := range priorityClasses {
if _, err := fakeClient.SchedulingV1().PriorityClasses().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
return nil, fmt.Errorf("unable to copy priorityclass: %v", err)
}
}
return fakeClient, nil
}
func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer, deschedulerPolicy *api.DeschedulerPolicy, evictionPolicyGroupVersion string) error {
sharedInformerFactory := informers.NewSharedInformerFactory(rs.Client, 0)
nodeInformer := sharedInformerFactory.Core().V1().Nodes()
podInformer := sharedInformerFactory.Core().V1().Pods()
namespaceInformer := sharedInformerFactory.Core().V1().Namespaces()
priorityClassInformer := sharedInformerFactory.Scheduling().V1().PriorityClasses()
sharedInformerFactory.Start(stopChannel)
sharedInformerFactory.WaitForCacheSync(stopChannel)
ctx, cancel := context.WithCancel(ctx)
defer cancel()
// create the informers
namespaceInformer.Informer()
priorityClassInformer.Informer()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
return fmt.Errorf("build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
strategyFuncs := map[api.StrategyName]strategyFunction{
"RemoveDuplicates": strategies.RemoveDuplicatePods,
@@ -83,18 +197,27 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
"RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
"PodLifeTime": strategies.PodLifeTime,
"RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint,
"RemoveFailedPods": strategies.RemoveFailedPods,
}
nodeSelector := rs.NodeSelector
var nodeSelector string
if deschedulerPolicy.NodeSelector != nil {
nodeSelector = *deschedulerPolicy.NodeSelector
}
evictLocalStoragePods := rs.EvictLocalStoragePods
var evictLocalStoragePods bool
if deschedulerPolicy.EvictLocalStoragePods != nil {
evictLocalStoragePods = *deschedulerPolicy.EvictLocalStoragePods
}
evictBarePods := false
if deschedulerPolicy.EvictFailedBarePods != nil {
evictBarePods = *deschedulerPolicy.EvictFailedBarePods
if evictBarePods {
klog.V(1).InfoS("Warning: EvictFailedBarePods is set to True. This could cause eviction of pods without ownerReferences.")
}
}
evictSystemCriticalPods := false
if deschedulerPolicy.EvictSystemCriticalPods != nil {
evictSystemCriticalPods = *deschedulerPolicy.EvictSystemCriticalPods
@@ -108,40 +231,70 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
ignorePvcPods = *deschedulerPolicy.IgnorePVCPods
}
maxNoOfPodsToEvictPerNode := rs.MaxNoOfPodsToEvictPerNode
if deschedulerPolicy.MaxNoOfPodsToEvictPerNode != nil {
maxNoOfPodsToEvictPerNode = *deschedulerPolicy.MaxNoOfPodsToEvictPerNode
}
wait.Until(func() {
wait.NonSlidingUntil(func() {
nodes, err := nodeutil.ReadyNodes(ctx, rs.Client, nodeInformer, nodeSelector)
if err != nil {
klog.V(1).InfoS("Unable to get ready nodes", "err", err)
close(stopChannel)
cancel()
return
}
if len(nodes) <= 1 {
klog.V(1).InfoS("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
close(stopChannel)
cancel()
return
}
var podEvictorClient clientset.Interface
// When the dry mode is enable, collect all the relevant objects (mostly pods) under a fake client.
// So when evicting pods while running multiple strategies in a row have the cummulative effect
// as is when evicting pods for real.
if rs.DryRun {
klog.V(3).Infof("Building a cached client from the cluster for the dry run")
// Create a new cache so we start from scratch without any leftovers
fakeClient, err := cachedClient(rs.Client, podInformer, nodeInformer, namespaceInformer, priorityClassInformer)
if err != nil {
klog.Error(err)
return
}
fakeSharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
getPodsAssignedToNode, err = podutil.BuildGetPodsAssignedToNodeFunc(fakeSharedInformerFactory.Core().V1().Pods())
if err != nil {
klog.Errorf("build get pods assigned to node function error: %v", err)
return
}
fakeCtx, cncl := context.WithCancel(context.TODO())
defer cncl()
fakeSharedInformerFactory.Start(fakeCtx.Done())
fakeSharedInformerFactory.WaitForCacheSync(fakeCtx.Done())
podEvictorClient = fakeClient
} else {
podEvictorClient = rs.Client
}
klog.V(3).Infof("Building a pod evictor")
podEvictor := evictions.NewPodEvictor(
rs.Client,
podEvictorClient,
evictionPolicyGroupVersion,
rs.DryRun,
maxNoOfPodsToEvictPerNode,
deschedulerPolicy.MaxNoOfPodsToEvictPerNode,
deschedulerPolicy.MaxNoOfPodsToEvictPerNamespace,
nodes,
getPodsAssignedToNode,
evictLocalStoragePods,
evictSystemCriticalPods,
ignorePvcPods,
evictBarePods,
!rs.DisableMetrics,
)
for name, strategy := range deschedulerPolicy.Strategies {
if f, ok := strategyFuncs[name]; ok {
if strategy.Enabled {
f(ctx, rs.Client, strategy, nodes, podEvictor)
f(ctx, rs.Client, strategy, nodes, podEvictor, getPodsAssignedToNode)
}
} else {
klog.ErrorS(fmt.Errorf("unknown strategy name"), "skipping strategy", "strategy", name)
@@ -152,9 +305,9 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
// If there was no interval specified, send a signal to the stopChannel to end the wait.Until loop after 1 iteration
if rs.DeschedulingInterval.Seconds() == 0 {
close(stopChannel)
cancel()
}
}, rs.DeschedulingInterval, stopChannel)
}, rs.DeschedulingInterval, ctx.Done())
return nil
}

View File

@@ -35,9 +35,6 @@ func TestTaintsUpdated(t *testing.T) {
},
}
stopChannel := make(chan struct{})
defer close(stopChannel)
rs, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("Unable to initialize server: %v", err)
@@ -47,7 +44,7 @@ func TestTaintsUpdated(t *testing.T) {
errChan := make(chan error, 1)
defer close(errChan)
go func() {
err := RunDeschedulerStrategies(ctx, rs, dp, "v1beta1", stopChannel)
err := RunDeschedulerStrategies(ctx, rs, dp, "v1beta1")
errChan <- err
}()
select {
@@ -55,7 +52,7 @@ func TestTaintsUpdated(t *testing.T) {
if err != nil {
t.Fatalf("Unable to run descheduler strategies: %v", err)
}
case <-time.After(300 * time.Millisecond):
case <-time.After(1 * time.Second):
// Wait for few cycles and then verify the only pod still exists
}
@@ -101,3 +98,69 @@ func TestTaintsUpdated(t *testing.T) {
t.Fatalf("Unable to evict pod, node taint did not get propagated to descheduler strategies")
}
}
func TestRootCancel(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
n1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
n2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
client := fakeclientset.NewSimpleClientset(n1, n2)
dp := &api.DeschedulerPolicy{
Strategies: api.StrategyList{}, // no strategies needed for this test
}
rs, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("Unable to initialize server: %v", err)
}
rs.Client = client
rs.DeschedulingInterval = 100 * time.Millisecond
errChan := make(chan error, 1)
defer close(errChan)
go func() {
err := RunDeschedulerStrategies(ctx, rs, dp, "v1beta1")
errChan <- err
}()
cancel()
select {
case err := <-errChan:
if err != nil {
t.Fatalf("Unable to run descheduler strategies: %v", err)
}
case <-time.After(1 * time.Second):
t.Fatal("Root ctx should have canceled immediately")
}
}
func TestRootCancelWithNoInterval(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
n1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
n2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
client := fakeclientset.NewSimpleClientset(n1, n2)
dp := &api.DeschedulerPolicy{
Strategies: api.StrategyList{}, // no strategies needed for this test
}
rs, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("Unable to initialize server: %v", err)
}
rs.Client = client
rs.DeschedulingInterval = 0
errChan := make(chan error, 1)
defer close(errChan)
go func() {
err := RunDeschedulerStrategies(ctx, rs, dp, "v1beta1")
errChan <- err
}()
cancel()
select {
case err := <-errChan:
if err != nil {
t.Fatalf("Unable to run descheduler strategies: %v", err)
}
case <-time.After(1 * time.Second):
t.Fatal("Root ctx should have canceled immediately")
}
}

View File

@@ -45,57 +45,73 @@ const (
)
// nodePodEvictedCount keeps count of pods evicted on node
type nodePodEvictedCount map[*v1.Node]int
type nodePodEvictedCount map[*v1.Node]uint
type namespacePodEvictCount map[string]uint
type PodEvictor struct {
client clientset.Interface
nodes []*v1.Node
policyGroupVersion string
dryRun bool
maxPodsToEvictPerNode int
nodepodCount nodePodEvictedCount
evictLocalStoragePods bool
evictSystemCriticalPods bool
ignorePvcPods bool
client clientset.Interface
nodes []*v1.Node
nodeIndexer podutil.GetPodsAssignedToNodeFunc
policyGroupVersion string
dryRun bool
maxPodsToEvictPerNode *uint
maxPodsToEvictPerNamespace *uint
nodepodCount nodePodEvictedCount
namespacePodCount namespacePodEvictCount
evictFailedBarePods bool
evictLocalStoragePods bool
evictSystemCriticalPods bool
ignorePvcPods bool
metricsEnabled bool
}
func NewPodEvictor(
client clientset.Interface,
policyGroupVersion string,
dryRun bool,
maxPodsToEvictPerNode int,
maxPodsToEvictPerNode *uint,
maxPodsToEvictPerNamespace *uint,
nodes []*v1.Node,
nodeIndexer podutil.GetPodsAssignedToNodeFunc,
evictLocalStoragePods bool,
evictSystemCriticalPods bool,
ignorePvcPods bool,
evictFailedBarePods bool,
metricsEnabled bool,
) *PodEvictor {
var nodePodCount = make(nodePodEvictedCount)
var namespacePodCount = make(namespacePodEvictCount)
for _, node := range nodes {
// Initialize podsEvicted till now with 0.
nodePodCount[node] = 0
}
return &PodEvictor{
client: client,
nodes: nodes,
policyGroupVersion: policyGroupVersion,
dryRun: dryRun,
maxPodsToEvictPerNode: maxPodsToEvictPerNode,
nodepodCount: nodePodCount,
evictLocalStoragePods: evictLocalStoragePods,
evictSystemCriticalPods: evictSystemCriticalPods,
ignorePvcPods: ignorePvcPods,
client: client,
nodes: nodes,
nodeIndexer: nodeIndexer,
policyGroupVersion: policyGroupVersion,
dryRun: dryRun,
maxPodsToEvictPerNode: maxPodsToEvictPerNode,
maxPodsToEvictPerNamespace: maxPodsToEvictPerNamespace,
nodepodCount: nodePodCount,
namespacePodCount: namespacePodCount,
evictLocalStoragePods: evictLocalStoragePods,
evictSystemCriticalPods: evictSystemCriticalPods,
evictFailedBarePods: evictFailedBarePods,
ignorePvcPods: ignorePvcPods,
metricsEnabled: metricsEnabled,
}
}
// NodeEvicted gives a number of pods evicted for node
func (pe *PodEvictor) NodeEvicted(node *v1.Node) int {
func (pe *PodEvictor) NodeEvicted(node *v1.Node) uint {
return pe.nodepodCount[node]
}
// TotalEvicted gives a number of pods evicted through all nodes
func (pe *PodEvictor) TotalEvicted() int {
var total int
func (pe *PodEvictor) TotalEvicted() uint {
var total uint
for _, count := range pe.nodepodCount {
total += count
}
@@ -110,38 +126,51 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node,
if len(reasons) > 0 {
reason += " (" + strings.Join(reasons, ", ") + ")"
}
if pe.maxPodsToEvictPerNode > 0 && pe.nodepodCount[node]+1 > pe.maxPodsToEvictPerNode {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number reached", "strategy": strategy, "namespace": pod.Namespace}).Inc()
return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", pe.maxPodsToEvictPerNode, node.Name)
if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[node]+1 > *pe.maxPodsToEvictPerNode {
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
}
return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", *pe.maxPodsToEvictPerNode, node.Name)
}
err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion, pe.dryRun)
if pe.maxPodsToEvictPerNamespace != nil && pe.namespacePodCount[pod.Namespace]+1 > *pe.maxPodsToEvictPerNamespace {
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
}
return false, fmt.Errorf("Maximum number %v of evicted pods per %q namespace reached", *pe.maxPodsToEvictPerNamespace, pod.Namespace)
}
err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion)
if err != nil {
// err is used only for logging purposes
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", reason)
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace}).Inc()
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
}
return false, nil
}
pe.nodepodCount[node]++
pe.namespacePodCount[pod.Namespace]++
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
}
if pe.dryRun {
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", reason)
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", node.Name)
} else {
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", reason)
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", reason, "strategy", strategy, "node", node.Name)
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartStructuredLogging(3)
eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: pe.client.CoreV1().Events(pod.Namespace)})
r := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "sigs.k8s.io.descheduler"})
r.Event(pod, v1.EventTypeNormal, "Descheduled", fmt.Sprintf("pod evicted by sigs.k8s.io/descheduler%s", reason))
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace}).Inc()
}
return true, nil
}
func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string, dryRun bool) error {
if dryRun {
return nil
}
func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string) error {
deleteOptions := &metav1.DeleteOptions{}
// GracePeriodSeconds ?
eviction := &policy.Eviction{
@@ -215,6 +244,25 @@ func (pe *PodEvictor) Evictable(opts ...func(opts *Options)) *evictable {
}
ev := &evictable{}
if pe.evictFailedBarePods {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
ownerRefList := podutil.OwnerRef(pod)
// Enable evictFailedBarePods to evict bare pods in failed phase
if len(ownerRefList) == 0 && pod.Status.Phase != v1.PodFailed {
return fmt.Errorf("pod does not have any ownerRefs and is not in failed phase")
}
return nil
})
} else {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
ownerRefList := podutil.OwnerRef(pod)
// Moved from IsEvictable function for backward compatibility
if len(ownerRefList) == 0 {
return fmt.Errorf("pod does not have any ownerRefs")
}
return nil
})
}
if !pe.evictSystemCriticalPods {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
// Moved from IsEvictable function to allow for disabling
@@ -251,7 +299,7 @@ func (pe *PodEvictor) Evictable(opts ...func(opts *Options)) *evictable {
}
if options.nodeFit {
ev.constraints = append(ev.constraints, func(pod *v1.Pod) error {
if !nodeutil.PodFitsAnyOtherNode(pod, pe.nodes) {
if !nodeutil.PodFitsAnyOtherNode(pe.nodeIndexer, pod, pe.nodes) {
return fmt.Errorf("pod does not fit on any other node because of nodeSelector(s), Taint(s), or nodes marked as unschedulable")
}
return nil
@@ -278,10 +326,6 @@ func (ev *evictable) IsEvictable(pod *v1.Pod) bool {
checkErrs = append(checkErrs, fmt.Errorf("pod is a DaemonSet pod"))
}
if len(ownerRefList) == 0 {
checkErrs = append(checkErrs, fmt.Errorf("pod does not have any ownerrefs"))
}
if utils.IsMirrorPod(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is a mirror pod"))
}
@@ -290,6 +334,10 @@ func (ev *evictable) IsEvictable(pod *v1.Pod) bool {
checkErrs = append(checkErrs, fmt.Errorf("pod is a static pod"))
}
if utils.IsPodTerminating(pod) {
checkErrs = append(checkErrs, fmt.Errorf("pod is terminating"))
}
for _, c := range ev.constraints {
if err := c(pod); err != nil {
checkErrs = append(checkErrs, err)

View File

@@ -21,8 +21,10 @@ import (
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
@@ -62,7 +64,7 @@ func TestEvictPod(t *testing.T) {
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: test.pods}, nil
})
got := evictPod(ctx, fakeClient, test.pod, "v1", false)
got := evictPod(ctx, fakeClient, test.pod, "v1")
if got != test.want {
t.Errorf("Test error for Desc: %s. Expected %v pod eviction to be %v, got %v", test.description, test.pod.Name, test.want, got)
}
@@ -80,9 +82,10 @@ func TestIsEvictable(t *testing.T) {
nodeLabelKey := "datacenter"
nodeLabelValue := "east"
type testCase struct {
pod *v1.Pod
description string
pods []*v1.Pod
nodes []*v1.Node
runBefore func(*v1.Pod, []*v1.Node)
evictFailedBarePods bool
evictLocalStoragePods bool
evictSystemCriticalPods bool
priorityThreshold *int32
@@ -91,241 +94,309 @@ func TestIsEvictable(t *testing.T) {
}
testCases := []testCase{
{ // Normal pod eviction with normal ownerRefs
pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
{
description: "Failed pod eviction with no ownerRefs",
pods: []*v1.Pod{
test.BuildTestPod("bare_pod_failed", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Status.Phase = v1.PodFailed
}),
},
evictFailedBarePods: false,
result: false,
}, {
description: "Normal pod eviction with no ownerRefs and evictFailedBarePods enabled",
pods: []*v1.Pod{test.BuildTestPod("bare_pod", 400, 0, n1.Name, nil)},
evictFailedBarePods: true,
result: false,
}, {
description: "Failed pod eviction with no ownerRefs",
pods: []*v1.Pod{
test.BuildTestPod("bare_pod_failed_but_can_be_evicted", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Status.Phase = v1.PodFailed
}),
},
evictFailedBarePods: true,
result: true,
}, {
description: "Normal pod eviction with normal ownerRefs",
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with normal ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p2", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}, {
description: "Normal pod eviction with normal ownerRefs and descheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p2", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with replicaSet ownerRefs
pod: test.BuildTestPod("p3", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
}, {
description: "Normal pod eviction with replicaSet ownerRefs",
pods: []*v1.Pod{
test.BuildTestPod("p3", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with replicaSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p4", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
}, {
description: "Normal pod eviction with replicaSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p4", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with statefulSet ownerRefs
pod: test.BuildTestPod("p18", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList()
}, {
description: "Normal pod eviction with statefulSet ownerRefs",
pods: []*v1.Pod{
test.BuildTestPod("p18", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Normal pod eviction with statefulSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p19", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList()
}, {
description: "Normal pod eviction with statefulSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p19", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted because it is bound to a PV and evictLocalStoragePods = false
pod: test.BuildTestPod("p5", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
}, {
description: "Pod not evicted because it is bound to a PV and evictLocalStoragePods = false",
pods: []*v1.Pod{
test.BuildTestPod("p5", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
},
}
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = true
pod: test.BuildTestPod("p6", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
}, {
description: "Pod is evicted because it is bound to a PV and evictLocalStoragePods = true",
pods: []*v1.Pod{
test.BuildTestPod("p6", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
},
}
}
}),
},
evictLocalStoragePods: true,
evictSystemCriticalPods: false,
result: true,
}, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = false, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p7", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
}, {
description: "Pod is evicted because it is bound to a PV and evictLocalStoragePods = false, but it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p7", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
},
}
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it is part of a daemonSet
pod: test.BuildTestPod("p8", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
}, {
description: "Pod not evicted becasuse it is part of a daemonSet",
pods: []*v1.Pod{
test.BuildTestPod("p8", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it is part of a daemonSet, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p9", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
}, {
description: "Pod is evicted becasuse it is part of a daemonSet, but it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p9", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it is a mirror pod
pod: test.BuildTestPod("p10", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = test.GetMirrorPodAnnotation()
}, {
description: "Pod not evicted becasuse it is a mirror poddsa",
pods: []*v1.Pod{
test.BuildTestPod("p10", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = test.GetMirrorPodAnnotation()
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it is a mirror pod, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p11", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = test.GetMirrorPodAnnotation()
pod.Annotations["descheduler.alpha.kubernetes.io/evict"] = "true"
}, {
description: "Pod is evicted becasuse it is a mirror pod, but it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p11", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = test.GetMirrorPodAnnotation()
pod.Annotations["descheduler.alpha.kubernetes.io/evict"] = "true"
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it has system critical priority
pod: test.BuildTestPod("p12", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}, {
description: "Pod not evicted becasuse it has system critical priority",
pods: []*v1.Pod{
test.BuildTestPod("p12", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: false,
}, { // Pod is evicted becasuse it has system critical priority, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p13", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
pod.Annotations = map[string]string{
"descheduler.alpha.kubernetes.io/evict": "true",
}
}, {
description: "Pod is evicted becasuse it has system critical priority, but it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p13", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
pod.Annotations = map[string]string{
"descheduler.alpha.kubernetes.io/evict": "true",
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
result: true,
}, { // Pod not evicted becasuse it has a priority higher than the configured priority threshold
pod: test.BuildTestPod("p14", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
}, {
description: "Pod not evicted becasuse it has a priority higher than the configured priority threshold",
pods: []*v1.Pod{
test.BuildTestPod("p14", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
priorityThreshold: &lowPriority,
result: false,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p15", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
}, {
description: "Pod is evicted becasuse it has a priority higher than the configured priority threshold, but it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p15", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true
pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}, {
description: "Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true",
pods: []*v1.Pod{
test.BuildTestPod("p16", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
result: true,
}, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}, {
description: "Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p16", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
result: true,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true
pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
}, {
description: "Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true",
pods: []*v1.Pod{
test.BuildTestPod("p17", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Priority = &highPriority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation
pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil),
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
}, {
description: "Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation",
pods: []*v1.Pod{
test.BuildTestPod("p17", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"}
pod.Spec.Priority = &highPriority
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
priorityThreshold: &lowPriority,
result: true,
}, { // Pod with no tolerations running on normal node, all other nodes tainted
pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
}, {
description: "Pod with no tolerations running on normal node, all other nodes tainted",
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 1000, 2000, 13, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
@@ -333,27 +404,8 @@ func TestIsEvictable(t *testing.T) {
Effect: v1.TaintEffectNoSchedule,
},
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, { // Pod with correct tolerations running on normal node, all other nodes tainted
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.Tolerations = []v1.Toleration{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
}),
test.BuildTestNode("node3", 1000, 2000, 13, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
@@ -361,80 +413,259 @@ func TestIsEvictable(t *testing.T) {
Effect: v1.TaintEffectNoSchedule,
},
}
}
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
}, { // Pod with incorrect node selector
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: "fail",
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, { // Pod with correct node selector
pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)},
runBefore: func(pod *v1.Pod, nodes []*v1.Node) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
for _, node := range nodes {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}, {
description: "Pod with correct tolerations running on normal node, all other nodes tainted",
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.Tolerations = []v1.Toleration{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 1000, 2000, 13, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
test.BuildTestNode("node3", 1000, 2000, 13, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
}, {
description: "Pod with incorrect node selector",
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: "fail",
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 1000, 2000, 13, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestNode("node3", 1000, 2000, 13, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, {
description: "Pod with correct node selector",
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 1000, 2000, 13, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestNode("node3", 1000, 2000, 13, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
}, {
description: "Pod with correct node selector, but only available node doesn't have enough CPU",
pods: []*v1.Pod{
test.BuildTestPod("p1", 12, 8, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2-TEST", 10, 16, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestNode("node3-TEST", 10, 16, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
}, {
description: "Pod with correct node selector, and one node has enough memory",
pods: []*v1.Pod{
test.BuildTestPod("p1", 12, 8, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestPod("node2-pod-10GB-mem", 20, 10, "node2", func(pod *v1.Pod) {
pod.ObjectMeta.Labels = map[string]string{
"test": "true",
}
}),
test.BuildTestPod("node3-pod-10GB-mem", 20, 10, "node3", func(pod *v1.Pod) {
pod.ObjectMeta.Labels = map[string]string{
"test": "true",
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 100, 16, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestNode("node3", 100, 20, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: true,
}, {
description: "Pod with correct node selector, but both nodes don't have enough memory",
pods: []*v1.Pod{
test.BuildTestPod("p1", 12, 8, n1.Name, func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestPod("node2-pod-10GB-mem", 10, 10, "node2", func(pod *v1.Pod) {
pod.ObjectMeta.Labels = map[string]string{
"test": "true",
}
}),
test.BuildTestPod("node3-pod-10GB-mem", 10, 10, "node3", func(pod *v1.Pod) {
pod.ObjectMeta.Labels = map[string]string{
"test": "true",
}
}),
},
nodes: []*v1.Node{
test.BuildTestNode("node2", 100, 16, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
test.BuildTestNode("node3", 100, 16, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
nodeFit: true,
result: false,
},
}
for _, test := range testCases {
test.runBefore(test.pod, test.nodes)
nodes := append(test.nodes, n1)
podEvictor := &PodEvictor{
evictLocalStoragePods: test.evictLocalStoragePods,
evictSystemCriticalPods: test.evictSystemCriticalPods,
nodes: nodes,
}
t.Run(test.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
evictable := podEvictor.Evictable()
var opts []func(opts *Options)
if test.priorityThreshold != nil {
opts = append(opts, WithPriorityThreshold(*test.priorityThreshold))
}
if test.nodeFit {
opts = append(opts, WithNodeFit(true))
}
evictable = podEvictor.Evictable(opts...)
nodes := append(test.nodes, n1)
result := evictable.IsEvictable(test.pod)
if result != test.result {
t.Errorf("IsEvictable should return for pod %s %t, but it returns %t", test.pod.Name, test.result, result)
}
var objs []runtime.Object
for _, node := range test.nodes {
objs = append(objs, node)
}
for _, pod := range test.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := &PodEvictor{
client: fakeClient,
nodes: nodes,
nodeIndexer: getPodsAssignedToNode,
policyGroupVersion: policyv1.SchemeGroupVersion.String(),
dryRun: false,
maxPodsToEvictPerNode: nil,
maxPodsToEvictPerNamespace: nil,
evictLocalStoragePods: test.evictLocalStoragePods,
evictSystemCriticalPods: test.evictSystemCriticalPods,
evictFailedBarePods: test.evictFailedBarePods,
}
var opts []func(opts *Options)
if test.priorityThreshold != nil {
opts = append(opts, WithPriorityThreshold(*test.priorityThreshold))
}
if test.nodeFit {
opts = append(opts, WithNodeFit(true))
}
evictable := podEvictor.Evictable(opts...)
result := evictable.IsEvictable(test.pods[0])
if result != test.result {
t.Errorf("IsEvictable should return for pod %s %t, but it returns %t", test.pods[0].Name, test.result, result)
}
})
}
}
func TestPodTypes(t *testing.T) {

View File

@@ -0,0 +1,99 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package descheduler
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/uuid"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
componentbaseconfig "k8s.io/component-base/config"
"k8s.io/klog/v2"
"os"
)
// NewLeaderElection starts the leader election code loop
func NewLeaderElection(
run func() error,
client clientset.Interface,
LeaderElectionConfig *componentbaseconfig.LeaderElectionConfiguration,
ctx context.Context,
) error {
var id string
if hostname, err := os.Hostname(); err != nil {
// on errors, make sure we're unique
id = string(uuid.NewUUID())
} else {
// add a uniquifier so that two processes on the same host don't accidentally both become active
id = hostname + "_" + string(uuid.NewUUID())
}
klog.V(3).Infof("Assigned unique lease holder id: %s", id)
if len(LeaderElectionConfig.ResourceNamespace) == 0 {
return fmt.Errorf("namespace may not be empty")
}
if len(LeaderElectionConfig.ResourceName) == 0 {
return fmt.Errorf("name may not be empty")
}
lock, err := resourcelock.New(
LeaderElectionConfig.ResourceLock,
LeaderElectionConfig.ResourceNamespace,
LeaderElectionConfig.ResourceName,
client.CoreV1(),
client.CoordinationV1(),
resourcelock.ResourceLockConfig{
Identity: id,
},
)
if err != nil {
return fmt.Errorf("unable to create leader election lock: %v", err)
}
leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
Lock: lock,
ReleaseOnCancel: true,
LeaseDuration: LeaderElectionConfig.LeaseDuration.Duration,
RenewDeadline: LeaderElectionConfig.RenewDeadline.Duration,
RetryPeriod: LeaderElectionConfig.RetryPeriod.Duration,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(ctx context.Context) {
klog.V(1).InfoS("Started leading")
err := run()
if err != nil {
klog.Error(err)
}
},
OnStoppedLeading: func() {
klog.V(1).InfoS("Leader lost")
},
OnNewLeader: func(identity string) {
// Just got the lock
if identity == id {
return
}
klog.V(1).Infof("New leader elected: %v", identity)
},
},
})
return nil
}

View File

@@ -18,13 +18,16 @@ package node
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
coreinformers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
@@ -96,32 +99,90 @@ func IsReady(node *v1.Node) bool {
return true
}
// PodFitsAnyOtherNode checks if the given pod fits any of the given nodes, besides the node
// the pod is already running on. The node fit is based on multiple criteria, like, pod node selector
// matching the node label (including affinity), the taints on the node, and the node being schedulable or not.
func PodFitsAnyOtherNode(pod *v1.Pod, nodes []*v1.Node) bool {
// NodeFit returns true if the provided pod can be scheduled onto the provided node.
// This function is used when the NodeFit pod filtering feature of the Descheduler is enabled.
// This function currently considers a subset of the Kubernetes Scheduler's predicates when
// deciding if a pod would fit on a node, but more predicates may be added in the future.
func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) []error {
// Check node selector and required affinity
var errors []error
if ok, err := utils.PodMatchNodeSelector(pod, node); err != nil {
errors = append(errors, err)
} else if !ok {
errors = append(errors, fmt.Errorf("pod node selector does not match the node label"))
}
// Check taints (we only care about NoSchedule and NoExecute taints)
ok := utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
})
if !ok {
errors = append(errors, fmt.Errorf("pod does not tolerate taints on the node"))
}
// Check if the pod can fit on a node based off it's requests
ok, reqErrors := fitsRequest(nodeIndexer, pod, node)
if !ok {
errors = append(errors, reqErrors...)
}
// Check if node is schedulable
if IsNodeUnschedulable(node) {
errors = append(errors, fmt.Errorf("node is not schedulable"))
}
return errors
}
// PodFitsAnyOtherNode checks if the given pod will fit any of the given nodes, besides the node
// the pod is already running on. The predicates used to determine if the pod will fit can be found in the NodeFit function.
func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
// Skip node pod is already on
if node.Name == pod.Spec.NodeName {
continue
}
// Check node selector and required affinity
ok, err := utils.PodMatchNodeSelector(pod, node)
if err != nil || !ok {
continue
}
// Check taints (we only care about NoSchedule and NoExecute taints)
ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
})
if !ok {
continue
}
// Check if node is schedulable
if !IsNodeUnschedulable(node) {
klog.V(2).InfoS("Pod can possibly be scheduled on a different node", "pod", klog.KObj(pod), "node", klog.KObj(node))
errors := NodeFit(nodeIndexer, pod, node)
if len(errors) == 0 {
klog.V(4).InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
} else {
klog.V(4).InfoS("Pod does not fit on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
for _, err := range errors {
klog.V(4).InfoS(err.Error())
}
}
}
return false
}
// PodFitsAnyNode checks if the given pod will fit any of the given nodes. The predicates used
// to determine if the pod will fit can be found in the NodeFit function.
func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
errors := NodeFit(nodeIndexer, pod, node)
if len(errors) == 0 {
klog.V(4).InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
} else {
klog.V(4).InfoS("Pod does not fit on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
for _, err := range errors {
klog.V(4).InfoS(err.Error())
}
}
}
return false
}
// PodFitsCurrentNode checks if the given pod will fit onto the given node. The predicates used
// to determine if the pod will fit can be found in the NodeFit function.
func PodFitsCurrentNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) bool {
errors := NodeFit(nodeIndexer, pod, node)
if len(errors) == 0 {
klog.V(4).InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
} else {
klog.V(4).InfoS("Pod does not fit on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
for _, err := range errors {
klog.V(4).InfoS(err.Error())
}
}
return false
@@ -133,39 +194,95 @@ func IsNodeUnschedulable(node *v1.Node) bool {
return node.Spec.Unschedulable
}
// PodFitsAnyNode checks if the given pod fits any of the given nodes, based on
// multiple criteria, like, pod node selector matching the node label, node
// being schedulable or not.
func PodFitsAnyNode(pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
// fitsRequest determines if a pod can fit on a node based on its resource requests. It returns true if
// the pod will fit.
func fitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) {
var insufficientResources []error
ok, err := utils.PodMatchNodeSelector(pod, node)
if err != nil || !ok {
continue
}
if !IsNodeUnschedulable(node) {
klog.V(2).InfoS("Pod can possibly be scheduled on a different node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
}
// Get pod requests
podRequests, _ := utils.PodRequestsAndLimits(pod)
resourceNames := make([]v1.ResourceName, 0, len(podRequests))
for name := range podRequests {
resourceNames = append(resourceNames, name)
}
return false
}
// PodFitsCurrentNode checks if the given pod fits on the given node if the pod
// node selector matches the node label.
func PodFitsCurrentNode(pod *v1.Pod, node *v1.Node) bool {
ok, err := utils.PodMatchNodeSelector(pod, node)
availableResources, err := nodeAvailableResources(nodeIndexer, node, resourceNames)
if err != nil {
klog.ErrorS(err, "Failed to match node selector")
return false
return false, []error{err}
}
if !ok {
klog.V(2).InfoS("Pod does not fit on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return false
podFitsOnNode := true
for _, resource := range resourceNames {
podResourceRequest := podRequests[resource]
availableResource, ok := availableResources[resource]
if !ok || podResourceRequest.MilliValue() > availableResource.MilliValue() {
insufficientResources = append(insufficientResources, fmt.Errorf("insufficient %v", resource))
podFitsOnNode = false
}
}
return podFitsOnNode, insufficientResources
}
// nodeAvailableResources returns resources mapped to the quanitity available on the node.
func nodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) {
podsOnNode, err := podutil.ListPodsOnANode(node.Name, nodeIndexer, nil)
if err != nil {
return nil, err
}
nodeUtilization := NodeUtilization(podsOnNode, resourceNames)
remainingResources := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-nodeUtilization[v1.ResourceCPU].MilliValue(), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-nodeUtilization[v1.ResourceMemory].Value(), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(node.Status.Allocatable.Pods().Value()-nodeUtilization[v1.ResourcePods].Value(), resource.DecimalSI),
}
for _, name := range resourceNames {
if !IsBasicResource(name) {
if _, exists := node.Status.Allocatable[name]; exists {
allocatableResource := node.Status.Allocatable[name]
remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-nodeUtilization[name].Value(), resource.DecimalSI)
} else {
remainingResources[name] = resource.NewQuantity(0, resource.DecimalSI)
}
}
}
return remainingResources, nil
}
// NodeUtilization returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated.
func NodeUtilization(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
totalReqs := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(len(pods)), resource.DecimalSI),
}
for _, name := range resourceNames {
if !IsBasicResource(name) {
totalReqs[name] = resource.NewQuantity(0, resource.DecimalSI)
}
}
for _, pod := range pods {
req, _ := utils.PodRequestsAndLimits(pod)
for _, name := range resourceNames {
quantity, ok := req[name]
if ok && name != v1.ResourcePods {
// As Quantity.Add says: Add adds the provided y quantity to the current value. If the current value is zero,
// the format of the quantity will be updated to the format of y.
totalReqs[name].Add(quantity)
}
}
}
return totalReqs
}
// IsBasicResource checks if resource is basic native.
func IsBasicResource(name v1.ResourceName) bool {
switch name {
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods:
return true
default:
return false
}
klog.V(2).InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
return true
}

View File

@@ -21,9 +21,12 @@ import (
"testing"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
@@ -147,13 +150,13 @@ func TestPodFitsCurrentNode(t *testing.T) {
},
},
},
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
node: test.BuildTestNode("node1", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
success: true,
},
{
@@ -181,27 +184,48 @@ func TestPodFitsCurrentNode(t *testing.T) {
},
},
},
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
node: test.BuildTestNode("node1", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "no",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
success: false,
},
}
for _, tc := range tests {
actual := PodFitsCurrentNode(tc.pod, tc.node)
if actual != tc.success {
t.Errorf("Test %#v failed", tc.description)
}
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
objs = append(objs, tc.node)
objs = append(objs, tc.pod)
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
actual := PodFitsCurrentNode(getPodsAssignedToNode, tc.pod, tc.node)
if actual != tc.success {
t.Errorf("Test %#v failed", tc.description)
}
})
}
}
func TestPodFitsAnyOtherNode(t *testing.T) {
nodeLabelKey := "kubernetes.io/desiredNode"
nodeLabelValue := "yes"
nodeTaintKey := "hardware"
@@ -215,238 +239,527 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
pod *v1.Pod
nodes []*v1.Node
success bool
podsOnNodes []*v1.Pod
}{
{
description: "Pod fits another node matching node affinity",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: test.BuildTestPod("p1", 0, 0, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "no",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
success: true,
podsOnNodes: []*v1.Pod{},
success: true,
},
{
description: "Pod expected to fit one of the nodes",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: test.BuildTestPod("p1", 0, 0, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "no",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
success: true,
podsOnNodes: []*v1.Pod{},
success: true,
},
{
description: "Pod expected to fit none of the nodes",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: test.BuildTestPod("p1", 0, 0, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: "unfit1",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "unfit2",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "unfit1",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "unfit2",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
success: false,
podsOnNodes: []*v1.Pod{},
success: false,
},
{
description: "Nodes are unschedulable but labels match, should fail",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
pod: test.BuildTestPod("p1", 0, 0, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
Spec: v1.NodeSpec{
Unschedulable: true,
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: "no",
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Unschedulable = true
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: "no",
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
success: false,
podsOnNodes: []*v1.Pod{},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, should pass",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
description: "Both nodes are tained, should fail",
pod: test.BuildTestPod("p1", 2000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, there is a pod on the available node, and requests are low, should pass",
pod: test.BuildTestPod("p1", 2000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
test.BuildTestNode(nodeNames[1], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("test-pod", 12*1000, 20*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
"test": "true",
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(40*1000*1000*1000, resource.DecimalSI)
}),
},
success: true,
},
{
description: "Both nodes are tained, should fail",
pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue),
description: "Two nodes matches node selector, one of them is tained, but CPU requests are too big, should fail",
pod: test.BuildTestPod("p1", 2000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[0],
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
// Notice that this node only has 4 cores, the pod already on the node below requests 3 cores, and the pod above requests 2 cores
test.BuildTestNode(nodeNames[1], 4000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(200*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("3-core-pod", 3000, 4*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
"test": "true",
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, but memory requests are too big, should fail",
pod: test.BuildTestPod("p1", 2000, 5*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[1],
}
}),
// Notice that this node only has 8GB of memory, the pod already on the node below requests 4GB, and the pod above requests 5GB
test.BuildTestNode(nodeNames[1], 10*1000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(200*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("4GB-mem-pod", 2000, 4*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
nodeLabelKey: nodeLabelValue,
"test": "true",
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoExecute,
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, but ephemeral storage requests are too big, should fail",
pod: test.BuildTestPod("p1", 2000, 4*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: nodeNames[2],
},
},
}
}),
// Notice that this node only has 20GB of storage, the pod already on the node below requests 11GB, and the pod above requests 10GB
test.BuildTestNode(nodeNames[1], 10*1000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(20*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("11GB-storage-pod", 2000, 4*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
"test": "true",
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(11*1000*1000*1000, resource.DecimalSI)
}),
},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, but custom resource requests are too big, should fail",
pod: test.BuildTestPod("p1", 2000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
pod.Spec.Containers[0].Resources.Requests["example.com/custom-resource"] = *resource.NewQuantity(10, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
node.Status.Allocatable["example.com/custom-resource"] = *resource.NewQuantity(15, resource.DecimalSI)
}),
// Notice that this node only has 15 of the custom resource, the pod already on the node below requests 10, and the pod above requests 10
test.BuildTestNode(nodeNames[1], 10*1000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(200*1000*1000*1000, resource.DecimalSI)
node.Status.Allocatable["example.com/custom-resource"] = *resource.NewQuantity(15, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("10-custom-resource-pod", 0, 0, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
"test": "true",
},
}
pod.Spec.Containers[0].Resources.Requests["example.com/custom-resource"] = *resource.NewQuantity(10, resource.DecimalSI)
}),
},
success: false,
},
{
description: "Two nodes matches node selector, one of them is tained, CPU requests will fit, and pod Overhead is low enough, should pass",
pod: test.BuildTestPod("p1", 1000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
// Notice that this node has 5 CPU cores, the pod below requests 2 cores, and has CPU overhead of 1 cores, and the pod above requests 1 core
test.BuildTestNode(nodeNames[1], 5000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(200*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("3-core-pod", 2000, 4*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
"test": "true",
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
pod.Spec.Overhead = createResourceList(1000, 1000*1000*1000, 1000*1000*1000)
}),
},
success: true,
},
{
description: "Two nodes matches node selector, one of them is tained, CPU requests will fit, but pod Overhead is too high, should fail",
pod: test.BuildTestPod("p1", 2000, 2*1000*1000*1000, nodeNames[2], func(pod *v1.Pod) {
pod.Spec.NodeSelector = map[string]string{
nodeLabelKey: nodeLabelValue,
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
}),
nodes: []*v1.Node{
test.BuildTestNode(nodeNames[0], 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(1000*1000*1000*1000, resource.DecimalSI)
node.Spec.Taints = []v1.Taint{
{
Key: nodeTaintKey,
Value: nodeTaintValue,
Effect: v1.TaintEffectNoSchedule,
},
}
}),
// Notice that this node only has 5 CPU cores, the pod below requests 2 cores, but has CPU overhead of 2 cores, and the pod above requests 2 cores
test.BuildTestNode(nodeNames[1], 5000, 8*1000*1000*1000, 12, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeLabelKey: nodeLabelValue,
}
node.Status.Allocatable[v1.ResourceEphemeralStorage] = *resource.NewQuantity(200*1000*1000*1000, resource.DecimalSI)
}),
test.BuildTestNode(nodeNames[2], 0, 0, 0, nil),
},
podsOnNodes: []*v1.Pod{
test.BuildTestPod("3-core-pod", 2000, 4*1000*1000*1000, nodeNames[1], func(pod *v1.Pod) {
pod.ObjectMeta = metav1.ObjectMeta{
Namespace: "test",
Labels: map[string]string{
"test": "true",
},
}
pod.Spec.Containers[0].Resources.Requests[v1.ResourceEphemeralStorage] = *resource.NewQuantity(10*1000*1000*1000, resource.DecimalSI)
pod.Spec.Overhead = createResourceList(2000, 1000*1000*1000, 1000*1000*1000)
}),
},
success: false,
},
}
for _, tc := range tests {
actual := PodFitsAnyOtherNode(tc.pod, tc.nodes)
if actual != tc.success {
t.Errorf("Test %#v failed", tc.description)
}
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.podsOnNodes {
objs = append(objs, pod)
}
objs = append(objs, tc.pod)
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
actual := PodFitsAnyOtherNode(getPodsAssignedToNode, tc.pod, tc.nodes)
if actual != tc.success {
t.Errorf("Test %#v failed", tc.description)
}
})
}
}
func createPodManifest(nodeName string, nodeSelectorKey string, nodeSelectorValue string) *v1.Pod {
return (&v1.Pod{
Spec: v1.PodSpec{
NodeName: nodeName,
Affinity: &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: nodeSelectorKey,
Operator: "In",
Values: []string{
nodeSelectorValue,
},
},
},
},
},
},
},
},
},
})
// createResourceList builds a small resource list of core resources
func createResourceList(cpu int64, memory int64, ephemeralStorage int64) v1.ResourceList {
resourceList := make(map[v1.ResourceName]resource.Quantity)
resourceList[v1.ResourceCPU] = *resource.NewMilliQuantity(cpu, resource.DecimalSI)
resourceList[v1.ResourceMemory] = *resource.NewQuantity(memory, resource.DecimalSI)
resourceList[v1.ResourceEphemeralStorage] = *resource.NewQuantity(ephemeralStorage, resource.DecimalSI)
return resourceList
}

View File

@@ -17,139 +17,174 @@ limitations under the License.
package pod
import (
"context"
"sort"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
coreinformers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/tools/cache"
"sigs.k8s.io/descheduler/pkg/utils"
)
const (
nodeNameKeyIndex = "spec.nodeName"
)
// FilterFunc is a filter for a pod.
type FilterFunc func(*v1.Pod) bool
// GetPodsAssignedToNodeFunc is a function which accept a node name and a pod filter function
// as input and returns the pods that assigned to the node.
type GetPodsAssignedToNodeFunc func(string, FilterFunc) ([]*v1.Pod, error)
// WrapFilterFuncs wraps a set of FilterFunc in one.
func WrapFilterFuncs(filters ...FilterFunc) FilterFunc {
return func(pod *v1.Pod) bool {
for _, filter := range filters {
if filter != nil && !filter(pod) {
return false
}
}
return true
}
}
type Options struct {
filter func(pod *v1.Pod) bool
includedNamespaces []string
excludedNamespaces []string
filter FilterFunc
includedNamespaces sets.String
excludedNamespaces sets.String
labelSelector *metav1.LabelSelector
}
// NewOptions returns an empty Options.
func NewOptions() *Options {
return &Options{}
}
// WithFilter sets a pod filter.
// The filter function should return true if the pod should be returned from ListPodsOnANode
func WithFilter(filter func(pod *v1.Pod) bool) func(opts *Options) {
return func(opts *Options) {
opts.filter = filter
}
func (o *Options) WithFilter(filter FilterFunc) *Options {
o.filter = filter
return o
}
// WithNamespaces sets included namespaces
func WithNamespaces(namespaces []string) func(opts *Options) {
return func(opts *Options) {
opts.includedNamespaces = namespaces
}
func (o *Options) WithNamespaces(namespaces sets.String) *Options {
o.includedNamespaces = namespaces
return o
}
// WithoutNamespaces sets excluded namespaces
func WithoutNamespaces(namespaces []string) func(opts *Options) {
return func(opts *Options) {
opts.excludedNamespaces = namespaces
}
func (o *Options) WithoutNamespaces(namespaces sets.String) *Options {
o.excludedNamespaces = namespaces
return o
}
// WithLabelSelector sets a pod label selector
func WithLabelSelector(labelSelector *metav1.LabelSelector) func(opts *Options) {
return func(opts *Options) {
opts.labelSelector = labelSelector
}
func (o *Options) WithLabelSelector(labelSelector *metav1.LabelSelector) *Options {
o.labelSelector = labelSelector
return o
}
// ListPodsOnANode lists all of the pods on a node
// It also accepts an optional "filter" function which can be used to further limit the pods that are returned.
// (Usually this is podEvictor.Evictable().IsEvictable, in order to only list the evictable pods on a node, but can
// be used by strategies to extend it if there are further restrictions, such as with NodeAffinity).
func ListPodsOnANode(
ctx context.Context,
client clientset.Interface,
node *v1.Node,
opts ...func(opts *Options),
) ([]*v1.Pod, error) {
options := &Options{}
for _, opt := range opts {
opt(options)
}
pods := make([]*v1.Pod, 0)
fieldSelectorString := "spec.nodeName=" + node.Name + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)
labelSelectorString := ""
if options.labelSelector != nil {
selector, err := metav1.LabelSelectorAsSelector(options.labelSelector)
// BuildFilterFunc builds a final FilterFunc based on Options.
func (o *Options) BuildFilterFunc() (FilterFunc, error) {
var s labels.Selector
var err error
if o.labelSelector != nil {
s, err = metav1.LabelSelectorAsSelector(o.labelSelector)
if err != nil {
return []*v1.Pod{}, err
return nil, err
}
labelSelectorString = selector.String()
}
if len(options.includedNamespaces) > 0 {
fieldSelector, err := fields.ParseSelector(fieldSelectorString)
if err != nil {
return []*v1.Pod{}, err
return func(pod *v1.Pod) bool {
if o.filter != nil && !o.filter(pod) {
return false
}
if len(o.includedNamespaces) > 0 && !o.includedNamespaces.Has(pod.Namespace) {
return false
}
if len(o.excludedNamespaces) > 0 && o.excludedNamespaces.Has(pod.Namespace) {
return false
}
if s != nil && !s.Matches(labels.Set(pod.GetLabels())) {
return false
}
return true
}, nil
}
for _, namespace := range options.includedNamespaces {
podList, err := client.CoreV1().Pods(namespace).List(ctx,
metav1.ListOptions{
FieldSelector: fieldSelector.String(),
LabelSelector: labelSelectorString,
})
if err != nil {
return []*v1.Pod{}, err
// BuildGetPodsAssignedToNodeFunc establishes an indexer to map the pods and their assigned nodes.
// It returns a function to help us get all the pods that assigned to a node based on the indexer.
func BuildGetPodsAssignedToNodeFunc(podInformer coreinformers.PodInformer) (GetPodsAssignedToNodeFunc, error) {
// Establish an indexer to map the pods and their assigned nodes.
err := podInformer.Informer().AddIndexers(cache.Indexers{
nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
pod, ok := obj.(*v1.Pod)
if !ok {
return []string{}, nil
}
for i := range podList.Items {
if options.filter != nil && !options.filter(&podList.Items[i]) {
continue
}
pods = append(pods, &podList.Items[i])
if len(pod.Spec.NodeName) == 0 {
return []string{}, nil
}
return []string{pod.Spec.NodeName}, nil
},
})
if err != nil {
return nil, err
}
// The indexer helps us get all the pods that assigned to a node.
podIndexer := podInformer.Informer().GetIndexer()
getPodsAssignedToNode := func(nodeName string, filter FilterFunc) ([]*v1.Pod, error) {
objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
if err != nil {
return nil, err
}
pods := make([]*v1.Pod, 0, len(objs))
for _, obj := range objs {
pod, ok := obj.(*v1.Pod)
if !ok {
continue
}
if filter(pod) {
pods = append(pods, pod)
}
}
return pods, nil
}
return getPodsAssignedToNode, nil
}
if len(options.excludedNamespaces) > 0 {
for _, namespace := range options.excludedNamespaces {
fieldSelectorString += ",metadata.namespace!=" + namespace
}
// ListPodsOnANode lists all pods on a node.
// It also accepts a "filter" function which can be used to further limit the pods that are returned.
// (Usually this is podEvictor.Evictable().IsEvictable, in order to only list the evictable pods on a node, but can
// be used by strategies to extend it if there are further restrictions, such as with NodeAffinity).
func ListPodsOnANode(
nodeName string,
getPodsAssignedToNode GetPodsAssignedToNodeFunc,
filter FilterFunc,
) ([]*v1.Pod, error) {
// Succeeded and failed pods are not considered because they don't occupy any resource.
f := func(pod *v1.Pod) bool {
return pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed
}
return ListAllPodsOnANode(nodeName, getPodsAssignedToNode, WrapFilterFuncs(f, filter))
}
fieldSelector, err := fields.ParseSelector(fieldSelectorString)
// ListAllPodsOnANode lists all the pods on a node no matter what the phase of the pod is.
func ListAllPodsOnANode(
nodeName string,
getPodsAssignedToNode GetPodsAssignedToNodeFunc,
filter FilterFunc,
) ([]*v1.Pod, error) {
pods, err := getPodsAssignedToNode(nodeName, filter)
if err != nil {
return []*v1.Pod{}, err
}
// INFO(jchaloup): field selectors do not work properly with listers
// Once the descheduler switches to pod listers (through informers),
// We need to flip to client-side filtering.
podList, err := client.CoreV1().Pods(v1.NamespaceAll).List(ctx,
metav1.ListOptions{
FieldSelector: fieldSelector.String(),
LabelSelector: labelSelectorString,
})
if err != nil {
return []*v1.Pod{}, err
}
for i := range podList.Items {
// fake client does not support field selectors
// so let's filter based on the node name as well (quite cheap)
if podList.Items[i].Spec.NodeName != node.Name {
continue
}
if options.filter != nil && !options.filter(&podList.Items[i]) {
continue
}
pods = append(pods, &podList.Items[i])
}
return pods, nil
}

View File

@@ -18,16 +18,15 @@ package pod
import (
"context"
"fmt"
"reflect"
"strings"
"testing"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/test"
)
@@ -39,19 +38,17 @@ var (
func TestListPodsOnANode(t *testing.T) {
testCases := []struct {
name string
pods map[string][]v1.Pod
pods []*v1.Pod
node *v1.Node
labelSelector *metav1.LabelSelector
expectedPodCount int
}{
{
name: "test listing pods on a node",
pods: map[string][]v1.Pod{
"n1": {
*test.BuildTestPod("pod1", 100, 0, "n1", nil),
*test.BuildTestPod("pod2", 100, 0, "n1", nil),
},
"n2": {*test.BuildTestPod("pod3", 100, 0, "n2", nil)},
pods: []*v1.Pod{
test.BuildTestPod("pod1", 100, 0, "n1", nil),
test.BuildTestPod("pod2", 100, 0, "n1", nil),
test.BuildTestPod("pod3", 100, 0, "n2", nil),
},
node: test.BuildTestNode("n1", 2000, 3000, 10, nil),
labelSelector: nil,
@@ -59,17 +56,15 @@ func TestListPodsOnANode(t *testing.T) {
},
{
name: "test listing pods with label selector",
pods: map[string][]v1.Pod{
"n1": {
*test.BuildTestPod("pod1", 100, 0, "n1", nil),
*test.BuildTestPod("pod2", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar"}
}),
*test.BuildTestPod("pod3", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar1"}
}),
},
"n2": {*test.BuildTestPod("pod4", 100, 0, "n2", nil)},
pods: []*v1.Pod{
test.BuildTestPod("pod1", 100, 0, "n1", nil),
test.BuildTestPod("pod2", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar"}
}),
test.BuildTestPod("pod3", 100, 0, "n1", func(pod *v1.Pod) {
pod.Labels = map[string]string{"foo": "bar1"}
}),
test.BuildTestPod("pod4", 100, 0, "n2", nil),
},
node: test.BuildTestNode("n1", 2000, 3000, 10, nil),
labelSelector: &metav1.LabelSelector{
@@ -85,21 +80,38 @@ func TestListPodsOnANode(t *testing.T) {
},
}
for _, testCase := range testCases {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
list := action.(core.ListAction)
fieldString := list.GetListRestrictions().Fields.String()
if strings.Contains(fieldString, "n1") {
return true, &v1.PodList{Items: testCase.pods["n1"]}, nil
} else if strings.Contains(fieldString, "n2") {
return true, &v1.PodList{Items: testCase.pods["n2"]}, nil
t.Run(testCase.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
objs = append(objs, testCase.node)
for _, pod := range testCase.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
filter, err := NewOptions().WithLabelSelector(testCase.labelSelector).BuildFilterFunc()
if err != nil {
t.Errorf("Build filter function error: %v", err)
}
pods, _ := ListPodsOnANode(testCase.node.Name, getPodsAssignedToNode, filter)
if len(pods) != testCase.expectedPodCount {
t.Errorf("Expected %v pods on node %v, got %+v", testCase.expectedPodCount, testCase.node.Name, len(pods))
}
return true, nil, fmt.Errorf("Failed to list: %v", list)
})
pods, _ := ListPodsOnANode(context.TODO(), fakeClient, testCase.node, WithLabelSelector(testCase.labelSelector))
if len(pods) != testCase.expectedPodCount {
t.Errorf("expected %v pods on node %v, got %+v", testCase.expectedPodCount, testCase.node.Name, len(pods))
}
}
}

View File

@@ -66,6 +66,7 @@ func RemoveDuplicatePods(
strategy api.DeschedulerStrategy,
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) {
if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemoveDuplicatePods parameters")
@@ -77,10 +78,10 @@ func RemoveDuplicatePods(
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces sets.String
if strategy.Params != nil && strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
nodeFit := false
@@ -95,15 +96,19 @@ func RemoveDuplicatePods(
nodeCount := 0
nodeMap := make(map[string]*v1.Node)
podFilter, err := podutil.NewOptions().
WithFilter(evictable.IsEvictable).
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
)
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
if err != nil {
klog.ErrorS(err, "Error listing evictable pods on node", "node", klog.KObj(node))
continue

View File

@@ -25,10 +25,12 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
@@ -43,7 +45,6 @@ func buildTestPodWithImage(podName, node, image string) *v1.Pod {
}
func TestFindDuplicatePods(t *testing.T) {
ctx := context.Background()
// first setup pods
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
@@ -66,6 +67,7 @@ func TestFindDuplicatePods(t *testing.T) {
Unschedulable: true,
}
})
node6 := test.BuildTestNode("n6", 200, 200, 10, nil)
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p1.Namespace = "dev"
@@ -101,6 +103,14 @@ func TestFindDuplicatePods(t *testing.T) {
p18 := test.BuildTestPod("TARGET", 100, 0, node1.Name, nil)
p18.Namespace = "node-fit"
// This pod sits on node6 and is used to take up CPU requests on the node
p19 := test.BuildTestPod("CPU-eater", 150, 150, node6.Name, nil)
p19.Namespace = "test"
// Dummy pod for node6 used to do the opposite of p19
p20 := test.BuildTestPod("CPU-saver", 100, 150, node6.Name, nil)
p20.Namespace = "test"
// ### Evictable Pods ###
// Three Pods in the "default" Namespace, bound to same ReplicaSet. 2 should be evicted.
@@ -173,128 +183,152 @@ func TestFindDuplicatePods(t *testing.T) {
testCases := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
pods []*v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
expectedEvictedPodCount uint
strategy api.DeschedulerStrategy
}{
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 1 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 1,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}},
},
{
description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 1 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p8, *p9, *p10},
pods: []*v1.Pod{p8, p9, p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 1,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10},
pods: []*v1.Pod{p1, p2, p3, p8, p9, p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 2,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.",
maxPodsToEvictPerNode: 2,
pods: []v1.Pod{*p4, *p5, *p6, *p7},
pods: []*v1.Pod{p4, p5, p6, p7},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
{
description: "Test all Pods: 4 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10},
pods: []*v1.Pod{p1, p2, p3, p4, p5, p6, p7, p8, p9, p10},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 2,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods with the same owner but different images should not be evicted",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11, *p12},
pods: []*v1.Pod{p11, p12},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods with multiple containers should not match themselves",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p13},
pods: []*v1.Pod{p13},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
{
description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11, *p13},
pods: []*v1.Pod{p11, p13},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{},
},
{
description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. Only node available has a taint, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet, all with a nodeSelector. Only node available has an incorrect node label, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p15, *p16, *p17},
pods: []*v1.Pod{p15, p16, p17},
nodes: []*v1.Node{node1, node4},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available is not schedulable, and nodeFit set to true. 0 should be evicted.",
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node5},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available does not have enough CPU, and nodeFit set to true. 0 should be evicted.",
pods: []*v1.Pod{p1, p2, p3, p19},
nodes: []*v1.Node{node1, node6},
expectedEvictedPodCount: 0,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
{
description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available has enough CPU, and nodeFit set to true. 1 should be evicted.",
pods: []*v1.Pod{p1, p2, p3, p20},
nodes: []*v1.Node{node1, node6},
expectedEvictedPodCount: 1,
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}},
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: testCase.pods}, nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range testCase.nodes {
objs = append(objs, node)
}
for _, pod := range testCase.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
testCase.maxPodsToEvictPerNode,
nil,
nil,
testCase.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != testCase.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)
@@ -305,8 +339,6 @@ func TestFindDuplicatePods(t *testing.T) {
}
func TestRemoveDuplicatesUniformly(t *testing.T) {
ctx := context.Background()
setRSOwnerRef2 := func(pod *v1.Pod) {
pod.ObjectMeta.OwnerReferences = []metav1.OwnerReference{
{Kind: "ReplicaSet", APIVersion: "v1", Name: "replicaset-2"},
@@ -346,7 +378,7 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
node.Spec.Taints = []v1.Taint{
{
Effect: v1.TaintEffectNoSchedule,
Key: "node-role.kubernetes.io/master",
Key: "node-role.kubernetes.io/control-plane",
},
}
}
@@ -355,7 +387,7 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
if node.ObjectMeta.Labels == nil {
node.ObjectMeta.Labels = map[string]string{}
}
node.ObjectMeta.Labels["node-role.kubernetes.io/master"] = ""
node.ObjectMeta.Labels["node-role.kubernetes.io/control-plane"] = ""
}
setWorkerLabel := func(node *v1.Node) {
@@ -375,7 +407,7 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "node-role.kubernetes.io/master",
Key: "node-role.kubernetes.io/control-plane",
Operator: v1.NodeSelectorOpDoesNotExist,
},
{
@@ -399,7 +431,7 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "node-role.kubernetes.io/master",
Key: "node-role.kubernetes.io/control-plane",
Operator: v1.NodeSelectorOpDoesNotExist,
},
{
@@ -432,25 +464,24 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
testCases := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
pods []*v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
expectedEvictedPodCount uint
strategy api.DeschedulerStrategy
}{
{
description: "Evict pods uniformly",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
@@ -462,17 +493,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly with one node left out",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1) -> (4,4,1) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p3", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p4", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p5", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p6", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p7", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p8", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p9", 100, 0, "n3", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
@@ -483,17 +514,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly with two replica sets",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p11", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p12", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p13", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p14", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p15", 100, 0, "n1", setTwoRSOwnerRef),
*test.BuildTestPod("p16", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p17", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p18", 100, 0, "n2", setTwoRSOwnerRef),
*test.BuildTestPod("p19", 100, 0, "n3", setTwoRSOwnerRef),
test.BuildTestPod("p11", 100, 0, "n1", setTwoRSOwnerRef),
test.BuildTestPod("p12", 100, 0, "n1", setTwoRSOwnerRef),
test.BuildTestPod("p13", 100, 0, "n1", setTwoRSOwnerRef),
test.BuildTestPod("p14", 100, 0, "n1", setTwoRSOwnerRef),
test.BuildTestPod("p15", 100, 0, "n1", setTwoRSOwnerRef),
test.BuildTestPod("p16", 100, 0, "n2", setTwoRSOwnerRef),
test.BuildTestPod("p17", 100, 0, "n2", setTwoRSOwnerRef),
test.BuildTestPod("p18", 100, 0, "n2", setTwoRSOwnerRef),
test.BuildTestPod("p19", 100, 0, "n3", setTwoRSOwnerRef),
},
expectedEvictedPodCount: 4,
nodes: []*v1.Node{
@@ -505,27 +536,27 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly with two owner references",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p11", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p12", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p13", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p14", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p15", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p16", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p17", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p18", 100, 0, "n2", test.SetRSOwnerRef),
*test.BuildTestPod("p19", 100, 0, "n3", test.SetRSOwnerRef),
test.BuildTestPod("p11", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p12", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p13", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p14", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p15", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p16", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p17", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p18", 100, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p19", 100, 0, "n3", test.SetRSOwnerRef),
// (1,3,5) -> (3,3,3) -> 2 evictions
*test.BuildTestPod("p21", 100, 0, "n1", setRSOwnerRef2),
*test.BuildTestPod("p22", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p23", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p24", 100, 0, "n2", setRSOwnerRef2),
*test.BuildTestPod("p25", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p26", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p27", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p28", 100, 0, "n3", setRSOwnerRef2),
*test.BuildTestPod("p29", 100, 0, "n3", setRSOwnerRef2),
test.BuildTestPod("p21", 100, 0, "n1", setRSOwnerRef2),
test.BuildTestPod("p22", 100, 0, "n2", setRSOwnerRef2),
test.BuildTestPod("p23", 100, 0, "n2", setRSOwnerRef2),
test.BuildTestPod("p24", 100, 0, "n2", setRSOwnerRef2),
test.BuildTestPod("p25", 100, 0, "n3", setRSOwnerRef2),
test.BuildTestPod("p26", 100, 0, "n3", setRSOwnerRef2),
test.BuildTestPod("p27", 100, 0, "n3", setRSOwnerRef2),
test.BuildTestPod("p28", 100, 0, "n3", setRSOwnerRef2),
test.BuildTestPod("p29", 100, 0, "n3", setRSOwnerRef2),
},
expectedEvictedPodCount: 4,
nodes: []*v1.Node{
@@ -537,10 +568,10 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods with number of pods less than nodes",
pods: []v1.Pod{
pods: []*v1.Pod{
// (2,0,0) -> (1,1,0) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
*test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p2", 100, 0, "n1", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
@@ -552,14 +583,14 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods with number of pods less than nodes, but ignore different pods with the same ownerref",
pods: []v1.Pod{
pods: []*v1.Pod{
// (1, 0, 0) for "bar","baz" images -> no eviction, even with a matching ownerKey
// (2, 0, 0) for "foo" image -> (1,1,0) - 1 eviction
// In this case the only "real" duplicates are p1 and p4, so one of those should be evicted
*buildTestPodWithImage("p1", "n1", "foo"),
*buildTestPodWithImage("p2", "n1", "bar"),
*buildTestPodWithImage("p3", "n1", "baz"),
*buildTestPodWithImage("p4", "n1", "foo"),
buildTestPodWithImage("p1", "n1", "foo"),
buildTestPodWithImage("p2", "n1", "bar"),
buildTestPodWithImage("p3", "n1", "baz"),
buildTestPodWithImage("p4", "n1", "foo"),
},
expectedEvictedPodCount: 1,
nodes: []*v1.Node{
@@ -571,9 +602,9 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods with a single pod with three nodes",
pods: []v1.Pod{
pods: []*v1.Pod{
// (2,0,0) -> (1,1,0) -> 1 eviction
*test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p1", 100, 0, "n1", test.SetRSOwnerRef),
},
expectedEvictedPodCount: 0,
nodes: []*v1.Node{
@@ -585,17 +616,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly respecting taints",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setTolerationsK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setTolerationsK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setTolerationsK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setTolerationsK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setTolerationsK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setTolerationsK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setTolerationsK1),
test.BuildTestPod("p1", 100, 0, "worker1", setTolerationsK1),
test.BuildTestPod("p2", 100, 0, "worker1", setTolerationsK2),
test.BuildTestPod("p3", 100, 0, "worker1", setTolerationsK1),
test.BuildTestPod("p4", 100, 0, "worker1", setTolerationsK2),
test.BuildTestPod("p5", 100, 0, "worker1", setTolerationsK1),
test.BuildTestPod("p6", 100, 0, "worker2", setTolerationsK2),
test.BuildTestPod("p7", 100, 0, "worker2", setTolerationsK1),
test.BuildTestPod("p8", 100, 0, "worker2", setTolerationsK2),
test.BuildTestPod("p9", 100, 0, "worker3", setTolerationsK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
@@ -610,17 +641,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly respecting RequiredDuringSchedulingIgnoredDuringExecution node affinity",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setNotMasterNodeSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setNotMasterNodeSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setNotMasterNodeSelectorK1),
test.BuildTestPod("p1", 100, 0, "worker1", setNotMasterNodeSelectorK1),
test.BuildTestPod("p2", 100, 0, "worker1", setNotMasterNodeSelectorK2),
test.BuildTestPod("p3", 100, 0, "worker1", setNotMasterNodeSelectorK1),
test.BuildTestPod("p4", 100, 0, "worker1", setNotMasterNodeSelectorK2),
test.BuildTestPod("p5", 100, 0, "worker1", setNotMasterNodeSelectorK1),
test.BuildTestPod("p6", 100, 0, "worker2", setNotMasterNodeSelectorK2),
test.BuildTestPod("p7", 100, 0, "worker2", setNotMasterNodeSelectorK1),
test.BuildTestPod("p8", 100, 0, "worker2", setNotMasterNodeSelectorK2),
test.BuildTestPod("p9", 100, 0, "worker3", setNotMasterNodeSelectorK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
@@ -635,17 +666,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly respecting node selector",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
},
expectedEvictedPodCount: 2,
nodes: []*v1.Node{
@@ -660,17 +691,17 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
},
{
description: "Evict pods uniformly respecting node selector with zero target nodes",
pods: []v1.Pod{
pods: []*v1.Pod{
// (5,3,1,0,0,0) -> (3,3,3,0,0,0) -> 2 evictions
*test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
*test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
*test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
*test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
*test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
test.BuildTestPod("p1", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p2", 100, 0, "worker1", setWorkerLabelSelectorK2),
test.BuildTestPod("p3", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p4", 100, 0, "worker1", setWorkerLabelSelectorK2),
test.BuildTestPod("p5", 100, 0, "worker1", setWorkerLabelSelectorK1),
test.BuildTestPod("p6", 100, 0, "worker2", setWorkerLabelSelectorK2),
test.BuildTestPod("p7", 100, 0, "worker2", setWorkerLabelSelectorK1),
test.BuildTestPod("p8", 100, 0, "worker2", setWorkerLabelSelectorK2),
test.BuildTestPod("p9", 100, 0, "worker3", setWorkerLabelSelectorK1),
},
expectedEvictedPodCount: 0,
nodes: []*v1.Node{
@@ -687,22 +718,45 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: testCase.pods}, nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range testCase.nodes {
objs = append(objs, node)
}
for _, pod := range testCase.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
testCase.maxPodsToEvictPerNode,
nil,
nil,
testCase.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor)
RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != testCase.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)

View File

@@ -0,0 +1,179 @@
package strategies
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/validation"
)
// validatedFailedPodsStrategyParams contains validated strategy parameters
type validatedFailedPodsStrategyParams struct {
validation.ValidatedStrategyParams
includingInitContainers bool
reasons sets.String
excludeOwnerKinds sets.String
minPodLifetimeSeconds *uint
}
// RemoveFailedPods removes Pods that are in failed status phase.
func RemoveFailedPods(
ctx context.Context,
client clientset.Interface,
strategy api.DeschedulerStrategy,
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) {
strategyParams, err := validateAndParseRemoveFailedPodsParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Invalid RemoveFailedPods parameters")
return
}
evictable := podEvictor.Evictable(
evictions.WithPriorityThreshold(strategyParams.ThresholdPriority),
evictions.WithNodeFit(strategyParams.NodeFit),
evictions.WithLabelSelector(strategyParams.LabelSelector),
)
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
labelSelector = strategy.Params.LabelSelector
}
podFilter, err := podutil.NewOptions().
WithFilter(evictable.IsEvictable).
WithNamespaces(strategyParams.IncludedNamespaces).
WithoutNamespaces(strategyParams.ExcludedNamespaces).
WithLabelSelector(labelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
// Only list failed pods
phaseFilter := func(pod *v1.Pod) bool { return pod.Status.Phase == v1.PodFailed }
podFilter = podutil.WrapFilterFuncs(phaseFilter, podFilter)
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListAllPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
if err != nil {
klog.ErrorS(err, "Error listing a nodes failed pods", "node", klog.KObj(node))
continue
}
for i, pod := range pods {
if err = validateFailedPodShouldEvict(pod, *strategyParams); err != nil {
klog.V(4).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(pod))
continue
}
if _, err = podEvictor.EvictPod(ctx, pods[i], node, "FailedPod"); err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
}
}
}
}
func validateAndParseRemoveFailedPodsParams(
ctx context.Context,
client clientset.Interface,
params *api.StrategyParameters,
) (*validatedFailedPodsStrategyParams, error) {
if params == nil {
return &validatedFailedPodsStrategyParams{
ValidatedStrategyParams: validation.DefaultValidatedStrategyParams(),
}, nil
}
strategyParams, err := validation.ValidateAndParseStrategyParams(ctx, client, params)
if err != nil {
return nil, err
}
var reasons, excludeOwnerKinds sets.String
var includingInitContainers bool
var minPodLifetimeSeconds *uint
if params.FailedPods != nil {
reasons = sets.NewString(params.FailedPods.Reasons...)
includingInitContainers = params.FailedPods.IncludingInitContainers
excludeOwnerKinds = sets.NewString(params.FailedPods.ExcludeOwnerKinds...)
minPodLifetimeSeconds = params.FailedPods.MinPodLifetimeSeconds
}
return &validatedFailedPodsStrategyParams{
ValidatedStrategyParams: *strategyParams,
includingInitContainers: includingInitContainers,
reasons: reasons,
excludeOwnerKinds: excludeOwnerKinds,
minPodLifetimeSeconds: minPodLifetimeSeconds,
}, nil
}
// validateFailedPodShouldEvict looks at strategy params settings to see if the Pod
// should be evicted given the params in the PodFailed policy.
func validateFailedPodShouldEvict(pod *v1.Pod, strategyParams validatedFailedPodsStrategyParams) error {
var errs []error
if strategyParams.minPodLifetimeSeconds != nil {
podAgeSeconds := uint(metav1.Now().Sub(pod.GetCreationTimestamp().Local()).Seconds())
if podAgeSeconds < *strategyParams.minPodLifetimeSeconds {
errs = append(errs, fmt.Errorf("pod does not exceed the min age seconds of %d", *strategyParams.minPodLifetimeSeconds))
}
}
if len(strategyParams.excludeOwnerKinds) > 0 {
ownerRefList := podutil.OwnerRef(pod)
for _, owner := range ownerRefList {
if strategyParams.excludeOwnerKinds.Has(owner.Kind) {
errs = append(errs, fmt.Errorf("pod's owner kind of %s is excluded", owner.Kind))
}
}
}
if len(strategyParams.reasons) > 0 {
reasons := getFailedContainerStatusReasons(pod.Status.ContainerStatuses)
if pod.Status.Phase == v1.PodFailed && pod.Status.Reason != "" {
reasons = append(reasons, pod.Status.Reason)
}
if strategyParams.includingInitContainers {
reasons = append(reasons, getFailedContainerStatusReasons(pod.Status.InitContainerStatuses)...)
}
if !strategyParams.reasons.HasAny(reasons...) {
errs = append(errs, fmt.Errorf("pod does not match any of the reasons"))
}
}
return utilerrors.NewAggregate(errs)
}
func getFailedContainerStatusReasons(containerStatuses []v1.ContainerStatus) []string {
reasons := make([]string, 0)
for _, containerStatus := range containerStatuses {
if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason != "" {
reasons = append(reasons, containerStatus.State.Waiting.Reason)
}
if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.Reason != "" {
reasons = append(reasons, containerStatus.State.Terminated.Reason)
}
}
return reasons
}

View File

@@ -0,0 +1,358 @@
package strategies
import (
"context"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
var (
OneHourInSeconds uint = 3600
)
func TestRemoveFailedPods(t *testing.T) {
createStrategy := func(enabled, includingInitContainers bool, reasons, excludeKinds []string, minAgeSeconds *uint, nodeFit bool) api.DeschedulerStrategy {
return api.DeschedulerStrategy{
Enabled: enabled,
Params: &api.StrategyParameters{
FailedPods: &api.FailedPods{
Reasons: reasons,
IncludingInitContainers: includingInitContainers,
ExcludeOwnerKinds: excludeKinds,
MinPodLifetimeSeconds: minAgeSeconds,
},
NodeFit: nodeFit,
},
}
}
tests := []struct {
description string
nodes []*v1.Node
strategy api.DeschedulerStrategy
expectedEvictedPodCount uint
pods []*v1.Pod
}{
{
description: "default empty strategy, 0 failures, 0 evictions",
strategy: api.DeschedulerStrategy{},
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{}, // no pods come back with field selector phase=Failed
},
{
description: "0 failures, 0 evictions",
strategy: createStrategy(true, false, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{}, // no pods come back with field selector phase=Failed
},
{
description: "1 container terminated with reason NodeAffinity, 1 eviction",
strategy: createStrategy(true, false, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}), nil),
},
},
{
description: "1 init container terminated with reason NodeAffinity, 1 eviction",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil), nil),
},
},
{
description: "1 init container waiting with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "CreateContainerConfigError"},
}, nil), nil),
},
},
{
description: "2 init container waiting with reason CreateContainerConfigError, 2 nodes, 2 evictions",
strategy: createStrategy(true, true, nil, nil, nil, false),
nodes: []*v1.Node{
test.BuildTestNode("node1", 2000, 3000, 10, nil),
test.BuildTestNode("node2", 2000, 3000, 10, nil),
},
expectedEvictedPodCount: 2,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}, nil), nil),
buildTestPod("p2", "node2", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}, nil), nil),
},
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}), nil),
},
},
{
description: "include reason=CreateContainerConfigError+NodeAffinity, 1 container terminated with reason CreateContainerConfigError, 1 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError", "NodeAffinity"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}), nil),
},
},
{
description: "include reason=CreateContainerConfigError, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}), nil),
},
},
{
description: "include init container=false, 1 init container waiting with reason CreateContainerConfigError, 0 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "CreateContainerConfigError"},
}, nil), nil),
},
},
{
description: "lifetime 1 hour, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, nil, nil, &OneHourInSeconds, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}), nil),
},
},
{
description: "nodeFit=true, 1 unschedulable node, 1 container terminated with reason NodeAffinity, 0 eviction",
strategy: createStrategy(true, false, nil, nil, nil, true),
nodes: []*v1.Node{
test.BuildTestNode("node1", 2000, 3000, 10, nil),
test.BuildTestNode("node2", 2000, 2000, 10, func(node *v1.Node) {
node.Spec.Unschedulable = true
}),
},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}), nil),
},
},
{
description: "nodeFit=true, only available node does not have enough resources, 1 container terminated with reason CreateContainerConfigError, 0 eviction",
strategy: createStrategy(true, false, []string{"CreateContainerConfigError"}, nil, nil, true),
nodes: []*v1.Node{test.BuildTestNode("node1", 1, 1, 10, nil), test.BuildTestNode("node2", 0, 0, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", nil, &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "CreateContainerConfigError"},
}), nil),
},
},
{
description: "excluded owner kind=ReplicaSet, 1 init container terminated with owner kind=ReplicaSet, 0 eviction",
strategy: createStrategy(true, true, nil, []string{"ReplicaSet"}, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil), nil),
},
},
{
description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 eviction",
strategy: createStrategy(true, true, nil, []string{"DaemonSet"}, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 1,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil), nil),
},
},
{
description: "excluded owner kind=DaemonSet, 1 init container terminated with owner kind=ReplicaSet, 1 pod in termination; nothing should be moved",
strategy: createStrategy(true, true, nil, []string{"DaemonSet"}, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("", "", &v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{Reason: "NodeAffinity"},
}, nil), &metav1.Time{}),
},
},
{
description: "1 container terminated with reason ShutDown, 0 evictions",
strategy: createStrategy(true, false, nil, nil, nil, true),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 0,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("Shutdown", v1.PodFailed, nil, nil), nil),
},
},
{
description: "include reason=Shutdown, 2 containers terminated with reason ShutDown, 2 evictions",
strategy: createStrategy(true, false, []string{"Shutdown"}, nil, nil, false),
nodes: []*v1.Node{test.BuildTestNode("node1", 2000, 3000, 10, nil)},
expectedEvictedPodCount: 2,
pods: []*v1.Pod{
buildTestPod("p1", "node1", newPodStatus("Shutdown", v1.PodFailed, nil, nil), nil),
buildTestPod("p2", "node1", newPodStatus("Shutdown", v1.PodFailed, nil, nil), nil),
},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
nil,
nil,
tc.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemoveFailedPods(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
})
}
}
func TestValidRemoveFailedPodsParams(t *testing.T) {
ctx := context.Background()
fakeClient := &fake.Clientset{}
testCases := []struct {
name string
params *api.StrategyParameters
}{
{name: "validate nil params", params: nil},
{name: "validate empty params", params: &api.StrategyParameters{}},
{name: "validate reasons params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
Reasons: []string{"CreateContainerConfigError"},
}}},
{name: "validate includingInitContainers params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
IncludingInitContainers: true,
}}},
{name: "validate excludeOwnerKinds params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
ExcludeOwnerKinds: []string{"Job"},
}}},
{name: "validate excludeOwnerKinds params", params: &api.StrategyParameters{FailedPods: &api.FailedPods{
MinPodLifetimeSeconds: &OneHourInSeconds,
}}},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
params, err := validateAndParseRemoveFailedPodsParams(ctx, fakeClient, tc.params)
if err != nil {
t.Errorf("strategy params should be valid but got err: %v", err.Error())
}
if params == nil {
t.Errorf("strategy params should return a ValidatedFailedPodsStrategyParams but got nil")
}
})
}
}
func newPodStatus(reason string, phase v1.PodPhase, initContainerState, containerState *v1.ContainerState) v1.PodStatus {
ps := v1.PodStatus{
Reason: reason,
Phase: phase,
}
if initContainerState != nil {
ps.InitContainerStatuses = []v1.ContainerStatus{{State: *initContainerState}}
ps.Phase = v1.PodFailed
}
if containerState != nil {
ps.ContainerStatuses = []v1.ContainerStatus{{State: *containerState}}
ps.Phase = v1.PodFailed
}
return ps
}
func buildTestPod(podName, nodeName string, podStatus v1.PodStatus, deletionTimestamp *metav1.Time) *v1.Pod {
pod := test.BuildTestPod(podName, 1, 1, nodeName, func(p *v1.Pod) {
p.Status = podStatus
})
pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
pod.ObjectMeta.SetCreationTimestamp(metav1.Now())
pod.DeletionTimestamp = deletionTimestamp
return pod
}

View File

@@ -21,6 +21,7 @@ import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
@@ -47,7 +48,7 @@ func validatePodsViolatingNodeAffinityParams(params *api.StrategyParameters) err
}
// RemovePodsViolatingNodeAffinity evicts pods on nodes which violate node affinity
func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validatePodsViolatingNodeAffinityParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingNodeAffinity parameters")
return
@@ -58,10 +59,10 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces sets.String
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
nodeFit := false
@@ -71,6 +72,16 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
podFilter, err := podutil.NewOptions().
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
WithLabelSelector(strategy.Params.LabelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
for _, nodeAffinity := range strategy.Params.NodeAffinityType {
klog.V(2).InfoS("Executing for nodeAffinityType", "nodeAffinity", nodeAffinity)
@@ -80,17 +91,13 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(func(pod *v1.Pod) bool {
node.Name,
getPodsAssignedToNode,
podutil.WrapFilterFuncs(podFilter, func(pod *v1.Pod) bool {
return evictable.IsEvictable(pod) &&
!nodeutil.PodFitsCurrentNode(pod, node) &&
nodeutil.PodFitsAnyNode(pod, nodes)
!nodeutil.PodFitsCurrentNode(getPodsAssignedToNode, pod, node) &&
nodeutil.PodFitsAnyNode(getPodsAssignedToNode, pod, nodes)
}),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(strategy.Params.LabelSelector),
)
if err != nil {
klog.ErrorS(err, "Failed to get pods", "node", klog.KObj(node))

View File

@@ -22,16 +22,18 @@ import (
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
ctx := context.Background()
requiredDuringSchedulingIgnoredDuringExecutionStrategy := api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
@@ -59,10 +61,10 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
nodeWithoutLabels := test.BuildTestNode("nodeWithoutLabels", 2000, 3000, 10, nil)
unschedulableNodeWithLabels := test.BuildTestNode("unschedulableNodeWithLabels", 2000, 3000, 10, nil)
nodeWithLabels.Labels[nodeLabelKey] = nodeLabelValue
unschedulableNodeWithLabels.Labels[nodeLabelKey] = nodeLabelValue
unschedulableNodeWithLabels.Spec.Unschedulable = true
addPodsToNode := func(node *v1.Node) []v1.Pod {
addPodsToNode := func(node *v1.Node, deletionTimestamp *metav1.Time) []*v1.Pod {
podWithNodeAffinity := test.BuildTestPod("podWithNodeAffinity", 100, 0, node.Name, nil)
podWithNodeAffinity.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
@@ -91,20 +93,25 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
pod1.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
pod2.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
return []v1.Pod{
*podWithNodeAffinity,
*pod1,
*pod2,
pod1.DeletionTimestamp = deletionTimestamp
pod2.DeletionTimestamp = deletionTimestamp
return []*v1.Pod{
podWithNodeAffinity,
pod1,
pod2,
}
}
var uint1 uint = 1
tests := []struct {
description string
nodes []*v1.Node
pods []v1.Pod
strategy api.DeschedulerStrategy
expectedEvictedPodCount int
maxPodsToEvictPerNode int
description string
nodes []*v1.Node
pods []*v1.Pod
strategy api.DeschedulerStrategy
expectedEvictedPodCount uint
maxPodsToEvictPerNode *uint
maxNoOfPodsToEvictPerNamespace *uint
}{
{
description: "Invalid strategy type, should not evict any pods",
@@ -117,74 +124,117 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
},
},
expectedEvictedPodCount: 0,
pods: addPodsToNode(nodeWithoutLabels),
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxPodsToEvictPerNode: 0,
},
{
description: "Pod is correctly scheduled on node, no eviction expected",
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
expectedEvictedPodCount: 0,
pods: addPodsToNode(nodeWithLabels),
pods: addPodsToNode(nodeWithLabels, nil),
nodes: []*v1.Node{nodeWithLabels},
maxPodsToEvictPerNode: 0,
},
{
description: "Pod is scheduled on node without matching labels, another schedulable node available, should be evicted",
expectedEvictedPodCount: 1,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels),
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxPodsToEvictPerNode: 0,
},
{
description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, should not be evicted",
expectedEvictedPodCount: 1,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels),
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxPodsToEvictPerNode: 1,
maxPodsToEvictPerNode: &uint1,
},
{
description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, no pod evicted since pod terminting",
expectedEvictedPodCount: 1,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxPodsToEvictPerNode: &uint1,
},
{
description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, should not be evicted",
expectedEvictedPodCount: 1,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxNoOfPodsToEvictPerNamespace: &uint1,
},
{
description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, no pod evicted since pod terminting",
expectedEvictedPodCount: 1,
strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy,
pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}),
nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels},
maxNoOfPodsToEvictPerNamespace: &uint1,
},
{
description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict",
expectedEvictedPodCount: 0,
strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy,
pods: addPodsToNode(nodeWithoutLabels),
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithoutLabels, unschedulableNodeWithLabels},
maxPodsToEvictPerNode: 0,
},
{
description: "Pod is scheduled on node without matching labels, and node where pod fits is available, should evict",
expectedEvictedPodCount: 0,
strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy,
pods: addPodsToNode(nodeWithoutLabels),
pods: addPodsToNode(nodeWithoutLabels, nil),
nodes: []*v1.Node{nodeWithLabels, unschedulableNodeWithLabels},
maxPodsToEvictPerNode: 1,
maxPodsToEvictPerNode: &uint1,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: tc.pods}, nil
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.maxNoOfPodsToEvictPerNamespace,
tc.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemovePodsViolatingNodeAffinity(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
false,
false,
false,
)
RemovePodsViolatingNodeAffinity(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
}
}

View File

@@ -20,6 +20,7 @@ import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
@@ -48,18 +49,21 @@ func validateRemovePodsViolatingNodeTaintsParams(params *api.StrategyParameters)
}
// RemovePodsViolatingNodeTaints evicts pods on the node which violate NoSchedule Taints on nodes
func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validateRemovePodsViolatingNodeTaintsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingNodeTaints parameters")
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces, excludedTaints sets.String
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
if strategy.Params.ExcludedTaints != nil {
excludedTaints = sets.NewString(strategy.Params.ExcludedTaints...)
}
labelSelector = strategy.Params.LabelSelector
}
@@ -77,17 +81,32 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
podFilter, err := podutil.NewOptions().
WithFilter(evictable.IsEvictable).
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
WithLabelSelector(labelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
excludeTaint := func(taint *v1.Taint) bool {
// Exclude taints by key *or* key=value
return excludedTaints.Has(taint.Key) || (taint.Value != "" && excludedTaints.Has(fmt.Sprintf("%s=%s", taint.Key, taint.Value)))
}
taintFilterFnc := func(taint *v1.Taint) bool { return (taint.Effect == v1.TaintEffectNoSchedule) && !excludeTaint(taint) }
if strategy.Params != nil && strategy.Params.IncludePreferNoSchedule {
taintFilterFnc = func(taint *v1.Taint) bool {
return (taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectPreferNoSchedule) && !excludeTaint(taint)
}
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
pods, err := podutil.ListAllPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
if err != nil {
//no pods evicted as error encountered retrieving evictable Pods
return
@@ -97,7 +116,7 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa
if !utils.TolerationsTolerateTaintsWithFilter(
pods[i].Spec.Tolerations,
node.Spec.Taints,
func(taint *v1.Taint) bool { return taint.Effect == v1.TaintEffectNoSchedule },
taintFilterFnc,
) {
klog.V(2).InfoS("Not all taints with NoSchedule effect are tolerated after update for pod on node", "pod", klog.KObj(pods[i]), "node", klog.KObj(node))
if _, err := podEvictor.EvictPod(ctx, pods[i], node, "NodeTaint"); err != nil {

View File

@@ -9,10 +9,12 @@ import (
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
@@ -25,6 +27,14 @@ func createNoScheduleTaint(key, value string, index int) v1.Taint {
}
}
func createPreferNoScheduleTaint(key, value string, index int) v1.Taint {
return v1.Taint{
Key: "testTaint" + fmt.Sprintf("%v", index),
Value: "test" + fmt.Sprintf("%v", index),
Effect: v1.TaintEffectPreferNoSchedule,
}
}
func addTaintsToNode(node *v1.Node, key, value string, indices []int) *v1.Node {
taints := []v1.Taint{}
for _, index := range indices {
@@ -34,22 +44,21 @@ func addTaintsToNode(node *v1.Node, key, value string, indices []int) *v1.Node {
return node
}
func addTolerationToPod(pod *v1.Pod, key, value string, index int) *v1.Pod {
func addTolerationToPod(pod *v1.Pod, key, value string, index int, effect v1.TaintEffect) *v1.Pod {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Spec.Tolerations = []v1.Toleration{{Key: key + fmt.Sprintf("%v", index), Value: value + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoSchedule}}
pod.Spec.Tolerations = []v1.Toleration{{Key: key + fmt.Sprintf("%v", index), Value: value + fmt.Sprintf("%v", index), Effect: effect}}
return pod
}
func TestDeletePodsViolatingNodeTaints(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node1 = addTaintsToNode(node1, "testTaint", "test", []int{1})
node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil)
node1 = addTaintsToNode(node2, "testingTaint", "testing", []int{1})
node2 = addTaintsToNode(node2, "testingTaint", "testing", []int{1})
node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
@@ -62,6 +71,16 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
}
})
node5 := test.BuildTestNode("n5", 2000, 3000, 10, nil)
node5.Spec.Taints = []v1.Taint{
createPreferNoScheduleTaint("testTaint", "test", 1),
}
node6 := test.BuildTestNode("n6", 1, 1, 1, nil)
node6.Spec.Taints = []v1.Taint{
createPreferNoScheduleTaint("testTaint", "test", 1),
}
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
@@ -108,152 +127,240 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
// A Mirror Pod.
p10.Annotations = test.GetMirrorPodAnnotation()
p1 = addTolerationToPod(p1, "testTaint", "test", 1)
p3 = addTolerationToPod(p3, "testTaint", "test", 1)
p4 = addTolerationToPod(p4, "testTaintX", "testX", 1)
p1 = addTolerationToPod(p1, "testTaint", "test", 1, v1.TaintEffectNoSchedule)
p3 = addTolerationToPod(p3, "testTaint", "test", 1, v1.TaintEffectNoSchedule)
p4 = addTolerationToPod(p4, "testTaintX", "testX", 1, v1.TaintEffectNoSchedule)
p12.Spec.NodeSelector = map[string]string{
"datacenter": "west",
}
p13 := test.BuildTestPod("p13", 100, 0, node5.Name, nil)
p13.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
// node5 has PreferNoSchedule:testTaint1=test1, so the p13 has to have
// PreferNoSchedule:testTaint0=test0 so the pod is not tolarated
p13 = addTolerationToPod(p13, "testTaint", "test", 0, v1.TaintEffectPreferNoSchedule)
var uint1 uint = 1
tests := []struct {
description string
nodes []*v1.Node
pods []v1.Pod
evictLocalStoragePods bool
evictSystemCriticalPods bool
maxPodsToEvictPerNode int
expectedEvictedPodCount int
nodeFit bool
description string
nodes []*v1.Node
pods []*v1.Pod
evictLocalStoragePods bool
evictSystemCriticalPods bool
maxPodsToEvictPerNode *uint
maxNoOfPodsToEvictPerNamespace *uint
expectedEvictedPodCount uint
nodeFit bool
includePreferNoSchedule bool
excludedTaints []string
}{
{
description: "Pods not tolerating node taint should be evicted",
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p2 gets evicted
},
{
description: "Pods with tolerations but not tolerating node taint should be evicted",
pods: []v1.Pod{*p1, *p3, *p4},
pods: []*v1.Pod{p1, p3, p4},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p4 gets evicted
},
{
description: "Only <maxPodsToEvictPerNode> number of Pods not tolerating node taint should be evicted",
pods: []v1.Pod{*p1, *p5, *p6},
pods: []*v1.Pod{p1, p5, p6},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 1,
maxPodsToEvictPerNode: &uint1,
expectedEvictedPodCount: 1, //p5 or p6 gets evicted
},
{
description: "Only <maxNoOfPodsToEvictPerNamespace> number of Pods not tolerating node taint should be evicted",
pods: []*v1.Pod{p1, p5, p6},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxNoOfPodsToEvictPerNamespace: &uint1,
expectedEvictedPodCount: 1, //p5 or p6 gets evicted
},
{
description: "Critical pods not tolerating node taint should not be evicted",
pods: []v1.Pod{*p7, *p8, *p9, *p10},
pods: []*v1.Pod{p7, p8, p9, p10},
nodes: []*v1.Node{node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //nothing is evicted
},
{
description: "Critical pods except storage pods not tolerating node taint should not be evicted",
pods: []v1.Pod{*p7, *p8, *p9, *p10},
pods: []*v1.Pod{p7, p8, p9, p10},
nodes: []*v1.Node{node2},
evictLocalStoragePods: true,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p9 gets evicted
},
{
description: "Critical and non critical pods, only non critical pods not tolerating node taint should be evicted",
pods: []v1.Pod{*p7, *p8, *p10, *p11},
pods: []*v1.Pod{p7, p8, p10, p11},
nodes: []*v1.Node{node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 1, //p11 gets evicted
},
{
description: "Critical and non critical pods, pods not tolerating node taint should be evicted even if they are critical",
pods: []v1.Pod{*p2, *p7, *p9, *p10},
nodes: []*v1.Node{node2},
pods: []*v1.Pod{p2, p7, p9, p10},
nodes: []*v1.Node{node1, node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 2, //p2 and p7 are evicted
},
{
description: "Pod p2 doesn't tolerate taint on it's node, but also doesn't tolerate taints on other nodes",
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node2},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
},
{
description: "Pod p12 doesn't tolerate taint on it's node, but other nodes don't match it's selector",
pods: []v1.Pod{*p1, *p3, *p12},
pods: []*v1.Pod{p1, p3, p12},
nodes: []*v1.Node{node1, node3},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
},
{
description: "Pod p2 doesn't tolerate taint on it's node, but other nodes are unschedulable",
pods: []v1.Pod{*p1, *p2, *p3},
pods: []*v1.Pod{p1, p2, p3},
nodes: []*v1.Node{node1, node4},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
maxPodsToEvictPerNode: 0,
expectedEvictedPodCount: 0, //p2 gets evicted
nodeFit: true,
},
{
description: "Pods not tolerating PreferNoSchedule node taint should not be evicted when not enabled",
pods: []*v1.Pod{p13},
nodes: []*v1.Node{node5},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
expectedEvictedPodCount: 0,
},
{
description: "Pods not tolerating PreferNoSchedule node taint should be evicted when enabled",
pods: []*v1.Pod{p13},
nodes: []*v1.Node{node5},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
includePreferNoSchedule: true,
expectedEvictedPodCount: 1, // p13 gets evicted
},
{
description: "Pods not tolerating excluded node taints (by key) should not be evicted",
pods: []*v1.Pod{p2},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
excludedTaints: []string{"excludedTaint1", "testTaint1"},
expectedEvictedPodCount: 0, // nothing gets evicted, as one of the specified excludedTaints matches the key of node1's taint
},
{
description: "Pods not tolerating excluded node taints (by key and value) should not be evicted",
pods: []*v1.Pod{p2},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
excludedTaints: []string{"testTaint1=test1"},
expectedEvictedPodCount: 0, // nothing gets evicted, as both the key and value of the excluded taint match node1's taint
},
{
description: "The excluded taint matches the key of node1's taint, but does not match the value",
pods: []*v1.Pod{p2},
nodes: []*v1.Node{node1},
evictLocalStoragePods: false,
evictSystemCriticalPods: false,
excludedTaints: []string{"testTaint1=test2"},
expectedEvictedPodCount: 1, // pod gets evicted, as excluded taint value does not match node1's taint value
},
{
description: "Critical and non critical pods, pods not tolerating node taint can't be evicted because the only available node does not have enough resources.",
pods: []*v1.Pod{p2, p7, p9, p10},
nodes: []*v1.Node{node1, node6},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
expectedEvictedPodCount: 0, //p2 and p7 can't be evicted
nodeFit: true,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// create fake client
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: tc.pods}, nil
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.maxNoOfPodsToEvictPerNamespace,
tc.nodes,
getPodsAssignedToNode,
tc.evictLocalStoragePods,
tc.evictSystemCriticalPods,
false,
false,
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: tc.nodeFit,
IncludePreferNoSchedule: tc.includePreferNoSchedule,
ExcludedTaints: tc.excludedTaints,
},
}
RemovePodsViolatingNodeTaints(ctx, fakeClient, strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, Unexpected no of pods evicted: pods evicted: %d, expected: %d", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount)
}
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
tc.evictLocalStoragePods,
tc.evictSystemCriticalPods,
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: tc.nodeFit,
},
}
RemovePodsViolatingNodeTaints(ctx, fakeClient, strategy, tc.nodes, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, Unexpected no of pods evicted: pods evicted: %d, expected: %d", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount)
}
}
}
func TestToleratesTaint(t *testing.T) {

View File

@@ -24,15 +24,17 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
// HighNodeUtilization evicts pods from under utilized nodes so that scheduler can schedule according to its strategy.
// Note that CPU/Memory requests are used to calculate nodes' utilization and not the actual resource usage.
func HighNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func HighNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validateNodeUtilizationParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid HighNodeUtilization parameters")
return
@@ -61,16 +63,17 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
resourceNames := getResourceNames(targetThresholds)
sourceNodes, highNodes := classifyNodes(
getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames),
func(node *v1.Node, usage NodeUsage) bool {
return isNodeWithLowUtilization(usage)
getNodeUsage(nodes, resourceNames, getPodsAssignedToNode),
getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames, getPodsAssignedToNode, false),
func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
return isNodeWithLowUtilization(usage, threshold.lowResourceThreshold)
},
func(node *v1.Node, usage NodeUsage) bool {
func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
if nodeutil.IsNodeUnschedulable(node) {
klog.V(2).InfoS("Node is unschedulable", "node", klog.KObj(node))
return false
}
return !isNodeWithLowUtilization(usage)
return !isNodeWithLowUtilization(usage, threshold.lowResourceThreshold)
})
// log message in one line
@@ -80,7 +83,7 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
"Pods", thresholds[v1.ResourcePods],
}
for name := range thresholds {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(thresholds[name]))
}
}
@@ -92,8 +95,8 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further")
return
}
if len(sourceNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less than NumberOfNodes, nothing to do here", "underutilizedNodes", len(sourceNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
if len(sourceNodes) <= strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(sourceNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
return
}
if len(sourceNodes) == len(nodes) {
@@ -108,7 +111,7 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
// stop if the total available usage has dropped to zero - no more pods can be scheduled
continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
continueEvictionCond := func(nodeInfo NodeInfo, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
for name := range totalAvailableUsage {
if totalAvailableUsage[name].CmpInt64(0) < 1 {
return false
@@ -117,6 +120,10 @@ func HighNodeUtilization(ctx context.Context, client clientset.Interface, strate
return true
}
// Sort the nodes by the usage in ascending order
sortNodesByUsage(sourceNodes, true)
evictPodsFromSourceNodes(
ctx,
sourceNodes,
@@ -157,7 +164,7 @@ func setDefaultForThresholds(thresholds, targetThresholds api.ResourceThresholds
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
for name := range thresholds {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
targetThresholds[name] = MaxResourcePercentage
}
}

View File

@@ -19,23 +19,24 @@ package nodeutilization
import (
"context"
"fmt"
"strings"
"testing"
v1 "k8s.io/api/core/v1"
"k8s.io/api/policy/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
func TestHighNodeUtilization(t *testing.T) {
ctx := context.Background()
n1NodeName := "n1"
n2NodeName := "n2"
n3NodeName := "n3"
@@ -44,13 +45,12 @@ func TestHighNodeUtilization(t *testing.T) {
nodeSelectorValue := "west"
testCases := []struct {
name string
thresholds api.ResourceThresholds
nodes map[string]*v1.Node
pods map[string]*v1.PodList
maxPodsToEvictPerNode int
expectedPodsEvicted int
evictedPods []string
name string
thresholds api.ResourceThresholds
nodes []*v1.Node
pods []*v1.Pod
expectedPodsEvicted uint
evictedPods []string
}{
{
name: "no node below threshold usage",
@@ -58,39 +58,26 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 20,
v1.ResourcePods: 20,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
pods: []*v1.Pod{
// These won't be evicted.
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
expectedPodsEvicted: 0,
},
{
name: "no evictable pods",
@@ -98,57 +85,44 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 40,
v1.ResourcePods: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetDSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetDSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
pods: []*v1.Pod{
// These won't be evicted.
test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
// These won't be evicted.
test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p8", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
expectedPodsEvicted: 0,
},
{
name: "no node to schedule evicted pods",
@@ -156,34 +130,21 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 20,
v1.ResourcePods: 20,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
// These can't be evicted.
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These can't be evicted.
*test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n3NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
pods: []*v1.Pod{
// These can't be evicted.
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These can't be evicted.
test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n3NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
expectedPodsEvicted: 0,
},
{
name: "without priorities",
@@ -191,42 +152,29 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
// These won't be evicted.
test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p7", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p7"},
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p7"},
},
{
name: "without priorities stop when resource capacity is depleted",
@@ -234,117 +182,79 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 2000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 2000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p2", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n3NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
expectedPodsEvicted: 1,
},
{
name: "with priorities",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 2000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 2000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, lowPriority)
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, highPriority)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p7", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p8", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetDSOwnerRef),
},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, lowPriority)
}),
test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodPriority(pod, highPriority)
}),
// These won't be evicted.
test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p7", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p8", 400, 0, n2NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p9", 400, 0, n3NodeName, test.SetDSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
},
{
name: "without priorities evicting best-effort pods only",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 3000, 3000, 10, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 3000, 3000, 5, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 3000, 3000, 10, test.SetNodeUnschedulable),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 3000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 3000, 3000, 5, nil),
test.BuildTestNode(n3NodeName, 3000, 3000, 10, test.SetNodeUnschedulable),
},
// All pods are assumed to be burstable (test.BuildTestNode always sets both cpu/memory resource requests to some value)
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.MakeBestEffortPod(pod)
}),
*test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
// These won't be evicted.
*test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.MakeBestEffortPod(pod)
}),
test.BuildTestPod("p2", 400, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
}),
// These won't be evicted.
test.BuildTestPod("p3", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 400, 0, n2NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
expectedPodsEvicted: 1,
evictedPods: []string{"p1"},
},
{
name: "with extended resource",
@@ -352,60 +262,44 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 20,
extendedResource: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
// These won't be evicted
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetDSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
// These won't be evicted
test.BuildTestPod("p3", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p4", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p5", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p6", 500, 0, n2NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p2"},
expectedPodsEvicted: 2,
evictedPods: []string{"p1", "p2"},
},
{
name: "with extended resource in some of nodes",
@@ -413,41 +307,29 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 40,
extendedResource: 40,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 10, func(node *v1.Node) {
test.SetNodeExtendedResource(node, extendedResource, 8)
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
n3NodeName: test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
//These won't be evicted
*test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
*test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p3", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p5", 500, 0, n2NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p6", 500, 0, n2NodeName, test.SetRSOwnerRef),
},
},
n3NodeName: {
Items: []v1.Pod{},
},
pods: []*v1.Pod{
//These won't be evicted
test.BuildTestPod("p1", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p2", 100, 0, n1NodeName, func(pod *v1.Pod) {
test.SetRSOwnerRef(pod)
test.SetPodExtendedResourceRequest(pod, extendedResource, 1)
}),
test.BuildTestPod("p3", 500, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 500, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p5", 500, 0, n2NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p6", 500, 0, n2NodeName, test.SetRSOwnerRef),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
expectedPodsEvicted: 0,
},
{
name: "Other node match pod node selector",
@@ -455,37 +337,28 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 9, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 1,
expectedPodsEvicted: 1,
},
{
name: "Other node does not match pod node selector",
@@ -493,67 +366,100 @@ func TestHighNodeUtilization(t *testing.T) {
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: map[string]*v1.Node{
n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: map[string]*v1.PodList{
n1NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
*test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
},
},
n2NodeName: {
Items: []v1.Pod{
*test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p5", 400, 0, n2NodeName, func(pod *v1.Pod) {
// A pod selecting nodes in the "west" datacenter
test.SetRSOwnerRef(pod)
pod.Spec.NodeSelector = map[string]string{
nodeSelectorKey: nodeSelectorValue,
}
}),
},
maxPodsToEvictPerNode: 0,
expectedPodsEvicted: 0,
expectedPodsEvicted: 0,
},
{
name: "Other node does not have enough Memory",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 200, 9, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 50, n1NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p5", 400, 100, n2NodeName, func(pod *v1.Pod) {
// A pod requesting more memory than is available on node1
test.SetRSOwnerRef(pod)
}),
},
expectedPodsEvicted: 0,
},
{
name: "Other node does not have enough Memory",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 4000, 200, 9, nil),
test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil),
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 50, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 50, n1NodeName, test.SetDSOwnerRef),
test.BuildTestPod("p5", 400, 100, n2NodeName, func(pod *v1.Pod) {
// A pod requesting more memory than is available on node1
test.SetRSOwnerRef(pod)
}),
},
expectedPodsEvicted: 0,
},
}
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
list := action.(core.ListAction)
fieldString := list.GetListRestrictions().Fields.String()
if strings.Contains(fieldString, n1NodeName) {
return true, test.pods[n1NodeName], nil
}
if strings.Contains(fieldString, n2NodeName) {
return true, test.pods[n2NodeName], nil
}
if strings.Contains(fieldString, n3NodeName) {
return true, test.pods[n3NodeName], nil
}
return true, nil, fmt.Errorf("Failed to list: %v", list)
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
getAction := action.(core.GetAction)
if node, exists := test.nodes[getAction.GetName()]; exists {
return true, node, nil
}
return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName())
})
for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range testCase.nodes {
objs = append(objs, node)
}
for _, pod := range testCase.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
podsForEviction := make(map[string]struct{})
for _, pod := range test.evictedPods {
for _, pod := range testCase.evictedPods {
podsForEviction[pod] = struct{}{}
}
evictionFailed := false
if len(test.evictedPods) > 0 {
if len(testCase.evictedPods) > 0 {
fakeClient.Fake.AddReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
getAction := action.(core.CreateAction)
obj := getAction.GetObject()
@@ -568,17 +474,42 @@ func TestHighNodeUtilization(t *testing.T) {
})
}
var nodes []*v1.Node
for _, node := range test.nodes {
nodes = append(nodes, node)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
//fakeClient := &fake.Clientset{}
//fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
// list := action.(core.ListAction)
// fieldString := list.GetListRestrictions().Fields.String()
// if strings.Contains(fieldString, n1NodeName) {
// return true, test.pods[n1NodeName], nil
// }
// if strings.Contains(fieldString, n2NodeName) {
// return true, test.pods[n2NodeName], nil
// }
// if strings.Contains(fieldString, n3NodeName) {
// return true, test.pods[n3NodeName], nil
// }
// return true, nil, fmt.Errorf("Failed to list: %v", list)
//})
//fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
// getAction := action.(core.GetAction)
// if node, exists := testCase.nodes[getAction.GetName()]; exists {
// return true, node, nil
// }
// return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName())
//})
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
test.maxPodsToEvictPerNode,
nodes,
nil,
nil,
testCase.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
@@ -588,16 +519,16 @@ func TestHighNodeUtilization(t *testing.T) {
Enabled: true,
Params: &api.StrategyParameters{
NodeResourceUtilizationThresholds: &api.NodeResourceUtilizationThresholds{
Thresholds: test.thresholds,
Thresholds: testCase.thresholds,
},
NodeFit: true,
},
}
HighNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor)
HighNodeUtilization(ctx, fakeClient, strategy, testCase.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if test.expectedPodsEvicted != podsEvicted {
t.Errorf("Expected %#v pods to be evicted but %#v got evicted", test.expectedPodsEvicted, podsEvicted)
if testCase.expectedPodsEvicted != podsEvicted {
t.Errorf("Expected %v pods to be evicted but %v got evicted", testCase.expectedPodsEvicted, podsEvicted)
}
if evictionFailed {
t.Errorf("Pod evictions failed unexpectedly")
@@ -674,7 +605,6 @@ func TestValidateHighNodeUtilizationStrategyConfig(t *testing.T) {
}
func TestHighNodeUtilizationWithTaints(t *testing.T) {
ctx := context.Background()
strategy := api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
@@ -710,7 +640,7 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) {
name string
nodes []*v1.Node
pods []*v1.Pod
evictionsExpected int
evictionsExpected uint
}{
{
name: "No taints",
@@ -752,6 +682,9 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) {
for _, item := range tests {
t.Run(item.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range item.nodes {
objs = append(objs, node)
@@ -762,19 +695,33 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) {
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
"policy/v1",
false,
item.evictionsExpected,
&item.evictionsExpected,
nil,
item.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
HighNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor)
HighNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor, getPodsAssignedToNode)
if item.evictionsExpected != podEvictor.TotalEvicted() {
t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, podEvictor.TotalEvicted())

View File

@@ -19,6 +19,7 @@ package nodeutilization
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
@@ -27,12 +28,13 @@ import (
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
)
// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used
// to calculate nodes' utilization and not the actual resource usage.
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func LowNodeUtilization(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
// TODO: May be create a struct for the strategy as well, so that we don't have to pass along the all the params?
if err := validateNodeUtilizationParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid LowNodeUtilization parameters")
@@ -48,40 +50,57 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
if strategy.Params != nil {
nodeFit = strategy.Params.NodeFit
}
useDeviationThresholds := strategy.Params.NodeResourceUtilizationThresholds.UseDeviationThresholds
thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds
targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds
if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds); err != nil {
if err := validateLowUtilizationStrategyConfig(thresholds, targetThresholds, useDeviationThresholds); err != nil {
klog.ErrorS(err, "LowNodeUtilization config is not valid")
return
}
// check if Pods/CPU/Mem are set, if not, set them to 100
if _, ok := thresholds[v1.ResourcePods]; !ok {
thresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
if useDeviationThresholds {
thresholds[v1.ResourcePods] = MinResourcePercentage
targetThresholds[v1.ResourcePods] = MinResourcePercentage
} else {
thresholds[v1.ResourcePods] = MaxResourcePercentage
targetThresholds[v1.ResourcePods] = MaxResourcePercentage
}
}
if _, ok := thresholds[v1.ResourceCPU]; !ok {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
if useDeviationThresholds {
thresholds[v1.ResourceCPU] = MinResourcePercentage
targetThresholds[v1.ResourceCPU] = MinResourcePercentage
} else {
thresholds[v1.ResourceCPU] = MaxResourcePercentage
targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
}
}
if _, ok := thresholds[v1.ResourceMemory]; !ok {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
if useDeviationThresholds {
thresholds[v1.ResourceMemory] = MinResourcePercentage
targetThresholds[v1.ResourceMemory] = MinResourcePercentage
} else {
thresholds[v1.ResourceMemory] = MaxResourcePercentage
targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
}
}
resourceNames := getResourceNames(thresholds)
lowNodes, sourceNodes := classifyNodes(
getNodeUsage(ctx, client, nodes, thresholds, targetThresholds, resourceNames),
getNodeUsage(nodes, resourceNames, getPodsAssignedToNode),
getNodeThresholds(nodes, thresholds, targetThresholds, resourceNames, getPodsAssignedToNode, useDeviationThresholds),
// The node has to be schedulable (to be able to move workload there)
func(node *v1.Node, usage NodeUsage) bool {
func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
if nodeutil.IsNodeUnschedulable(node) {
klog.V(2).InfoS("Node is unschedulable, thus not considered as underutilized", "node", klog.KObj(node))
return false
}
return isNodeWithLowUtilization(usage)
return isNodeWithLowUtilization(usage, threshold.lowResourceThreshold)
},
func(node *v1.Node, usage NodeUsage) bool {
return isNodeAboveTargetUtilization(usage)
func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool {
return isNodeAboveTargetUtilization(usage, threshold.highResourceThreshold)
},
)
@@ -92,7 +111,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
"Pods", thresholds[v1.ResourcePods],
}
for name := range thresholds {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(thresholds[name]))
}
}
@@ -106,7 +125,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
"Pods", targetThresholds[v1.ResourcePods],
}
for name := range targetThresholds {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), int64(targetThresholds[name]))
}
}
@@ -118,8 +137,8 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
return
}
if len(lowNodes) < strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
if len(lowNodes) <= strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes {
klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", strategy.Params.NodeResourceUtilizationThresholds.NumberOfNodes)
return
}
@@ -136,8 +155,8 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
// stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved
continueEvictionCond := func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
if !isNodeAboveTargetUtilization(nodeUsage) {
continueEvictionCond := func(nodeInfo NodeInfo, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool {
if !isNodeAboveTargetUtilization(nodeInfo.NodeUsage, nodeInfo.thresholds.highResourceThreshold) {
return false
}
for name := range totalAvailableUsage {
@@ -149,6 +168,9 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
return true
}
// Sort the nodes by the usage in descending order
sortNodesByUsage(sourceNodes, false)
evictPodsFromSourceNodes(
ctx,
sourceNodes,
@@ -163,7 +185,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg
}
// validateLowUtilizationStrategyConfig checks if the strategy's config is valid
func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds) error {
func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.ResourceThresholds, useDeviationThresholds bool) error {
// validate thresholds and targetThresholds config
if err := validateThresholds(thresholds); err != nil {
return fmt.Errorf("thresholds config is not valid: %v", err)
@@ -179,7 +201,7 @@ func validateLowUtilizationStrategyConfig(thresholds, targetThresholds api.Resou
for resourceName, value := range thresholds {
if targetValue, ok := targetThresholds[resourceName]; !ok {
return fmt.Errorf("thresholds and targetThresholds configured different resources")
} else if value > targetValue {
} else if value > targetValue && !useDeviationThresholds {
return fmt.Errorf("thresholds' %v percentage is greater than targetThresholds'", resourceName)
}
}

View File

@@ -19,15 +19,18 @@ package nodeutilization
import (
"context"
"fmt"
"sort"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/pkg/descheduler/node"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sort"
)
// NodeUsage stores a node's info, pods on it, thresholds and its resource usage
@@ -35,12 +38,19 @@ type NodeUsage struct {
node *v1.Node
usage map[v1.ResourceName]*resource.Quantity
allPods []*v1.Pod
}
type NodeThresholds struct {
lowResourceThreshold map[v1.ResourceName]*resource.Quantity
highResourceThreshold map[v1.ResourceName]*resource.Quantity
}
type continueEvictionCond func(nodeUsage NodeUsage, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool
type NodeInfo struct {
NodeUsage
thresholds NodeThresholds
}
type continueEvictionCond func(nodeInfo NodeInfo, totalAvailableUsage map[v1.ResourceName]*resource.Quantity) bool
// NodePodsMap is a set of (node, pods) pairs
type NodePodsMap map[*v1.Node][]*v1.Pod
@@ -65,7 +75,7 @@ func validateNodeUtilizationParams(params *api.StrategyParameters) error {
// validateThresholds checks if thresholds have valid resource name and resource percentage configured
func validateThresholds(thresholds api.ResourceThresholds) error {
if thresholds == nil || len(thresholds) == 0 {
if len(thresholds) == 0 {
return fmt.Errorf("no resource threshold is configured")
}
for name, percent := range thresholds {
@@ -76,64 +86,106 @@ func validateThresholds(thresholds api.ResourceThresholds) error {
return nil
}
func getNodeUsage(
ctx context.Context,
client clientset.Interface,
func normalizePercentage(percent api.Percentage) api.Percentage {
if percent > MaxResourcePercentage {
return MaxResourcePercentage
}
if percent < MinResourcePercentage {
return MinResourcePercentage
}
return percent
}
func getNodeThresholds(
nodes []*v1.Node,
lowThreshold, highThreshold api.ResourceThresholds,
resourceNames []v1.ResourceName,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
useDeviationThresholds bool,
) map[string]NodeThresholds {
nodeThresholdsMap := map[string]NodeThresholds{}
averageResourceUsagePercent := api.ResourceThresholds{}
if useDeviationThresholds {
averageResourceUsagePercent = averageNodeBasicresources(nodes, getPodsAssignedToNode, resourceNames)
}
for _, node := range nodes {
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
nodeThresholdsMap[node.Name] = NodeThresholds{
lowResourceThreshold: map[v1.ResourceName]*resource.Quantity{},
highResourceThreshold: map[v1.ResourceName]*resource.Quantity{},
}
for _, resourceName := range resourceNames {
if useDeviationThresholds {
cap := nodeCapacity[resourceName]
if lowThreshold[resourceName] == MinResourcePercentage {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = &cap
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = &cap
} else {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]-lowThreshold[resourceName]))
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, normalizePercentage(averageResourceUsagePercent[resourceName]+highThreshold[resourceName]))
}
} else {
nodeThresholdsMap[node.Name].lowResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, lowThreshold[resourceName])
nodeThresholdsMap[node.Name].highResourceThreshold[resourceName] = resourceThreshold(nodeCapacity, resourceName, highThreshold[resourceName])
}
}
}
return nodeThresholdsMap
}
func getNodeUsage(
nodes []*v1.Node,
resourceNames []v1.ResourceName,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) []NodeUsage {
var nodeUsageList []NodeUsage
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(ctx, client, node)
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, nil)
if err != nil {
klog.V(2).InfoS("Node will not be processed, error accessing its pods", "node", klog.KObj(node), "err", err)
continue
}
// A threshold is in percentages but in <0;100> interval.
// Performing `threshold * 0.01` will convert <0;100> interval into <0;1>.
// Multiplying it with capacity will give fraction of the capacity corresponding to the given high/low resource threshold in Quantity units.
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
lowResourceThreshold := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(int64(float64(lowThreshold[v1.ResourceCPU])*float64(nodeCapacity.Cpu().MilliValue())*0.01), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(int64(float64(lowThreshold[v1.ResourceMemory])*float64(nodeCapacity.Memory().Value())*0.01), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(float64(lowThreshold[v1.ResourcePods])*float64(nodeCapacity.Pods().Value())*0.01), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
cap := nodeCapacity[name]
lowResourceThreshold[name] = resource.NewQuantity(int64(float64(lowThreshold[name])*float64(cap.Value())*0.01), resource.DecimalSI)
}
}
highResourceThreshold := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(int64(float64(highThreshold[v1.ResourceCPU])*float64(nodeCapacity.Cpu().MilliValue())*0.01), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(int64(float64(highThreshold[v1.ResourceMemory])*float64(nodeCapacity.Memory().Value())*0.01), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(float64(highThreshold[v1.ResourcePods])*float64(nodeCapacity.Pods().Value())*0.01), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
cap := nodeCapacity[name]
highResourceThreshold[name] = resource.NewQuantity(int64(float64(highThreshold[name])*float64(cap.Value())*0.01), resource.DecimalSI)
}
}
nodeUsageList = append(nodeUsageList, NodeUsage{
node: node,
usage: nodeUtilization(node, pods, resourceNames),
allPods: pods,
lowResourceThreshold: lowResourceThreshold,
highResourceThreshold: highResourceThreshold,
node: node,
usage: nodeutil.NodeUtilization(pods, resourceNames),
allPods: pods,
})
}
return nodeUsageList
}
func resourceThreshold(nodeCapacity v1.ResourceList, resourceName v1.ResourceName, threshold api.Percentage) *resource.Quantity {
defaultFormat := resource.DecimalSI
if resourceName == v1.ResourceMemory {
defaultFormat = resource.BinarySI
}
resourceCapacityFraction := func(resourceNodeCapacity int64) int64 {
// A threshold is in percentages but in <0;100> interval.
// Performing `threshold * 0.01` will convert <0;100> interval into <0;1>.
// Multiplying it with capacity will give fraction of the capacity corresponding to the given resource threshold in Quantity units.
return int64(float64(threshold) * 0.01 * float64(resourceNodeCapacity))
}
resourceCapacityQuantity := nodeCapacity.Name(resourceName, defaultFormat)
if resourceName == v1.ResourceCPU {
return resource.NewMilliQuantity(resourceCapacityFraction(resourceCapacityQuantity.MilliValue()), defaultFormat)
}
return resource.NewQuantity(resourceCapacityFraction(resourceCapacityQuantity.Value()), defaultFormat)
}
func resourceUsagePercentages(nodeUsage NodeUsage) map[v1.ResourceName]float64 {
nodeCapacity := nodeUsage.node.Status.Capacity
if len(nodeUsage.node.Status.Allocatable) > 0 {
@@ -155,19 +207,24 @@ func resourceUsagePercentages(nodeUsage NodeUsage) map[v1.ResourceName]float64 {
// low and high thresholds, it is simply ignored.
func classifyNodes(
nodeUsages []NodeUsage,
lowThresholdFilter, highThresholdFilter func(node *v1.Node, usage NodeUsage) bool,
) ([]NodeUsage, []NodeUsage) {
lowNodes, highNodes := []NodeUsage{}, []NodeUsage{}
nodeThresholds map[string]NodeThresholds,
lowThresholdFilter, highThresholdFilter func(node *v1.Node, usage NodeUsage, threshold NodeThresholds) bool,
) ([]NodeInfo, []NodeInfo) {
lowNodes, highNodes := []NodeInfo{}, []NodeInfo{}
for _, nodeUsage := range nodeUsages {
if lowThresholdFilter(nodeUsage.node, nodeUsage) {
klog.V(2).InfoS("Node is underutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
lowNodes = append(lowNodes, nodeUsage)
} else if highThresholdFilter(nodeUsage.node, nodeUsage) {
klog.V(2).InfoS("Node is overutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
highNodes = append(highNodes, nodeUsage)
nodeInfo := NodeInfo{
NodeUsage: nodeUsage,
thresholds: nodeThresholds[nodeUsage.node.Name],
}
if lowThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
klog.InfoS("Node is underutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
lowNodes = append(lowNodes, nodeInfo)
} else if highThresholdFilter(nodeUsage.node, nodeUsage, nodeThresholds[nodeUsage.node.Name]) {
klog.InfoS("Node is overutilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
highNodes = append(highNodes, nodeInfo)
} else {
klog.V(2).InfoS("Node is appropriately utilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
klog.InfoS("Node is appropriately utilized", "node", klog.KObj(nodeUsage.node), "usage", nodeUsage.usage, "usagePercentage", resourceUsagePercentages(nodeUsage))
}
}
@@ -179,16 +236,13 @@ func classifyNodes(
// TODO: @ravig Break this function into smaller functions.
func evictPodsFromSourceNodes(
ctx context.Context,
sourceNodes, destinationNodes []NodeUsage,
sourceNodes, destinationNodes []NodeInfo,
podEvictor *evictions.PodEvictor,
podFilter func(pod *v1.Pod) bool,
resourceNames []v1.ResourceName,
strategy string,
continueEviction continueEvictionCond,
) {
sortNodesByUsage(sourceNodes)
// upper bound on total number of pods/cpu/memory and optional extended resources to be moved
totalAvailableUsage := map[v1.ResourceName]*resource.Quantity{
v1.ResourcePods: {},
@@ -204,7 +258,7 @@ func evictPodsFromSourceNodes(
if _, ok := totalAvailableUsage[name]; !ok {
totalAvailableUsage[name] = resource.NewQuantity(0, resource.DecimalSI)
}
totalAvailableUsage[name].Add(*node.highResourceThreshold[name])
totalAvailableUsage[name].Add(*node.thresholds.highResourceThreshold[name])
totalAvailableUsage[name].Sub(*node.usage[name])
}
}
@@ -216,7 +270,7 @@ func evictPodsFromSourceNodes(
"Pods", totalAvailableUsage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !isBasicResource(name) {
if !node.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
@@ -244,7 +298,7 @@ func evictPodsFromSourceNodes(
func evictPods(
ctx context.Context,
inputPods []*v1.Pod,
nodeUsage NodeUsage,
nodeInfo NodeInfo,
totalAvailableUsage map[v1.ResourceName]*resource.Quantity,
taintsOfLowNodes map[string][]v1.Taint,
podEvictor *evictions.PodEvictor,
@@ -252,14 +306,14 @@ func evictPods(
continueEviction continueEvictionCond,
) {
if continueEviction(nodeUsage, totalAvailableUsage) {
if continueEviction(nodeInfo, totalAvailableUsage) {
for _, pod := range inputPods {
if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod))
continue
}
success, err := podEvictor.EvictPod(ctx, pod, nodeUsage.node, strategy)
success, err := podEvictor.EvictPod(ctx, pod, nodeInfo.node, strategy)
if err != nil {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
@@ -270,30 +324,30 @@ func evictPods(
for name := range totalAvailableUsage {
if name == v1.ResourcePods {
nodeUsage.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
nodeInfo.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
totalAvailableUsage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI))
} else {
quantity := utils.GetResourceRequestQuantity(pod, name)
nodeUsage.usage[name].Sub(quantity)
nodeInfo.usage[name].Sub(quantity)
totalAvailableUsage[name].Sub(quantity)
}
}
keysAndValues := []interface{}{
"node", nodeUsage.node.Name,
"CPU", nodeUsage.usage[v1.ResourceCPU].MilliValue(),
"Mem", nodeUsage.usage[v1.ResourceMemory].Value(),
"Pods", nodeUsage.usage[v1.ResourcePods].Value(),
"node", nodeInfo.node.Name,
"CPU", nodeInfo.usage[v1.ResourceCPU].MilliValue(),
"Mem", nodeInfo.usage[v1.ResourceMemory].Value(),
"Pods", nodeInfo.usage[v1.ResourcePods].Value(),
}
for name := range totalAvailableUsage {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
keysAndValues = append(keysAndValues, string(name), totalAvailableUsage[name].Value())
}
}
klog.V(3).InfoS("Updated node usage", keysAndValues...)
// check if pods can be still evicted
if !continueEviction(nodeUsage, totalAvailableUsage) {
if !continueEviction(nodeInfo, totalAvailableUsage) {
break
}
}
@@ -301,31 +355,36 @@ func evictPods(
}
}
// sortNodesByUsage sorts nodes based on usage in descending order
func sortNodesByUsage(nodes []NodeUsage) {
// sortNodesByUsage sorts nodes based on usage according to the given strategy.
func sortNodesByUsage(nodes []NodeInfo, ascending bool) {
sort.Slice(nodes, func(i, j int) bool {
ti := nodes[i].usage[v1.ResourceMemory].Value() + nodes[i].usage[v1.ResourceCPU].MilliValue() + nodes[i].usage[v1.ResourcePods].Value()
tj := nodes[j].usage[v1.ResourceMemory].Value() + nodes[j].usage[v1.ResourceCPU].MilliValue() + nodes[j].usage[v1.ResourcePods].Value()
// extended resources
for name := range nodes[i].usage {
if !isBasicResource(name) {
if !nodeutil.IsBasicResource(name) {
ti = ti + nodes[i].usage[name].Value()
tj = tj + nodes[j].usage[name].Value()
}
}
// To return sorted in descending order
// Return ascending order for HighNodeUtilization strategy
if ascending {
return ti < tj
}
// Return descending order for LowNodeUtilization strategy
return ti > tj
})
}
// isNodeAboveTargetUtilization checks if a node is overutilized
// At least one resource has to be above the high threshold
func isNodeAboveTargetUtilization(usage NodeUsage) bool {
func isNodeAboveTargetUtilization(usage NodeUsage, threshold map[v1.ResourceName]*resource.Quantity) bool {
for name, nodeValue := range usage.usage {
// usage.highResourceThreshold[name] < nodeValue
if usage.highResourceThreshold[name].Cmp(*nodeValue) == -1 {
if threshold[name].Cmp(*nodeValue) == -1 {
return true
}
}
@@ -334,10 +393,10 @@ func isNodeAboveTargetUtilization(usage NodeUsage) bool {
// isNodeWithLowUtilization checks if a node is underutilized
// All resources have to be below the low threshold
func isNodeWithLowUtilization(usage NodeUsage) bool {
func isNodeWithLowUtilization(usage NodeUsage, threshold map[v1.ResourceName]*resource.Quantity) bool {
for name, nodeValue := range usage.usage {
// usage.lowResourceThreshold[name] < nodeValue
if usage.lowResourceThreshold[name].Cmp(*nodeValue) == -1 {
if threshold[name].Cmp(*nodeValue) == -1 {
return false
}
}
@@ -354,43 +413,6 @@ func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
return resourceNames
}
// isBasicResource checks if resource is basic native.
func isBasicResource(name v1.ResourceName) bool {
switch name {
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods:
return true
default:
return false
}
}
func nodeUtilization(node *v1.Node, pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
totalReqs := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(int64(len(pods)), resource.DecimalSI),
}
for _, name := range resourceNames {
if !isBasicResource(name) {
totalReqs[name] = resource.NewQuantity(0, resource.DecimalSI)
}
}
for _, pod := range pods {
req, _ := utils.PodRequestsAndLimits(pod)
for _, name := range resourceNames {
quantity, ok := req[name]
if ok && name != v1.ResourcePods {
// As Quantity.Add says: Add adds the provided y quantity to the current value. If the current value is zero,
// the format of the quantity will be updated to the format of y.
totalReqs[name].Add(quantity)
}
}
}
return totalReqs
}
func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
var nonRemovablePods, removablePods []*v1.Pod
@@ -404,3 +426,34 @@ func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*
return nonRemovablePods, removablePods
}
func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, resourceNames []v1.ResourceName) api.ResourceThresholds {
total := api.ResourceThresholds{}
average := api.ResourceThresholds{}
numberOfNodes := len(nodes)
for _, node := range nodes {
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, nil)
if err != nil {
numberOfNodes--
continue
}
usage := nodeutil.NodeUtilization(pods, resourceNames)
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable
}
for resource, value := range usage {
nodeCapacityValue := nodeCapacity[resource]
if resource == v1.ResourceCPU {
total[resource] += api.Percentage(value.MilliValue()) / api.Percentage(nodeCapacityValue.MilliValue()) * 100.0
} else {
total[resource] += api.Percentage(value.Value()) / api.Percentage(nodeCapacityValue.Value()) * 100.0
}
}
}
for resource, value := range total {
average[resource] = value / api.Percentage(numberOfNodes)
}
return average
}

View File

@@ -18,17 +18,91 @@ package nodeutilization
import (
"fmt"
"math"
"testing"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"math"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/descheduler/pkg/api"
"testing"
)
var (
lowPriority = int32(0)
highPriority = int32(10000)
extendedResource = v1.ResourceName("example.com/foo")
testNode1 = NodeInfo{
NodeUsage: NodeUsage{
node: &v1.Node{
Status: v1.NodeStatus{
Capacity: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
},
ObjectMeta: metav1.ObjectMeta{Name: "node1"},
},
usage: map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(1730, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(25, resource.BinarySI),
},
},
}
testNode2 = NodeInfo{
NodeUsage: NodeUsage{
node: &v1.Node{
Status: v1.NodeStatus{
Capacity: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
},
ObjectMeta: metav1.ObjectMeta{Name: "node2"},
},
usage: map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(1220, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(11, resource.BinarySI),
},
},
}
testNode3 = NodeInfo{
NodeUsage: NodeUsage{
node: &v1.Node{
Status: v1.NodeStatus{
Capacity: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI),
},
},
ObjectMeta: metav1.ObjectMeta{Name: "node3"},
},
usage: map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(1530, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(5038982964, resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(20, resource.BinarySI),
},
},
}
)
func TestValidateThresholds(t *testing.T) {
@@ -156,3 +230,27 @@ func TestResourceUsagePercentages(t *testing.T) {
t.Logf("resourceUsagePercentage: %#v\n", resourceUsagePercentage)
}
func TestSortNodesByUsageDescendingOrder(t *testing.T) {
nodeList := []NodeInfo{testNode1, testNode2, testNode3}
expectedNodeList := []NodeInfo{testNode3, testNode1, testNode2} // testNode3 has the highest usage
sortNodesByUsage(nodeList, false) // ascending=false, sort nodes in descending order
for i := 0; i < len(expectedNodeList); i++ {
if nodeList[i].NodeUsage.node.Name != expectedNodeList[i].NodeUsage.node.Name {
t.Errorf("Expected %v, got %v", expectedNodeList[i].NodeUsage.node.Name, nodeList[i].NodeUsage.node.Name)
}
}
}
func TestSortNodesByUsageAscendingOrder(t *testing.T) {
nodeList := []NodeInfo{testNode1, testNode2, testNode3}
expectedNodeList := []NodeInfo{testNode2, testNode1, testNode3}
sortNodesByUsage(nodeList, true) // ascending=true, sort nodes in ascending order
for i := 0; i < len(expectedNodeList); i++ {
if nodeList[i].NodeUsage.node.Name != expectedNodeList[i].NodeUsage.node.Name {
t.Errorf("Expected %v, got %v", expectedNodeList[i].NodeUsage.node.Name, nodeList[i].NodeUsage.node.Name)
}
}
}

View File

@@ -20,6 +20,7 @@ import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
@@ -48,18 +49,18 @@ func validateRemovePodsViolatingInterPodAntiAffinityParams(params *api.StrategyP
}
// RemovePodsViolatingInterPodAntiAffinity evicts pods on the node which are having a pod affinity rules.
func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validateRemovePodsViolatingInterPodAntiAffinityParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingInterPodAntiAffinity parameters")
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces sets.String
var labelSelector *metav1.LabelSelector
if strategy.Params != nil {
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
labelSelector = strategy.Params.LabelSelector
}
@@ -77,16 +78,19 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
podFilter, err := podutil.NewOptions().
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
WithLabelSelector(labelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
if err != nil {
return
}

View File

@@ -24,16 +24,17 @@ import (
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/utils"
"sigs.k8s.io/descheduler/test"
)
func TestPodAntiAffinity(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("n2", 2000, 3000, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
@@ -46,6 +47,7 @@ func TestPodAntiAffinity(t *testing.T) {
Unschedulable: true,
}
})
node4 := test.BuildTestNode("n4", 2, 2, 1, nil)
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
@@ -55,6 +57,10 @@ func TestPodAntiAffinity(t *testing.T) {
p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil)
p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil)
p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil)
p9 := test.BuildTestPod("p9", 100, 0, node1.Name, nil)
p10 := test.BuildTestPod("p10", 100, 0, node1.Name, nil)
p9.DeletionTimestamp = &metav1.Time{}
p10.DeletionTimestamp = &metav1.Time{}
criticalPriority := utils.SystemCriticalPriority
nonEvictablePod := test.BuildTestPod("non-evict", 100, 0, node1.Name, func(pod *v1.Pod) {
@@ -72,6 +78,8 @@ func TestPodAntiAffinity(t *testing.T) {
test.SetNormalOwnerRef(p5)
test.SetNormalOwnerRef(p6)
test.SetNormalOwnerRef(p7)
test.SetNormalOwnerRef(p9)
test.SetNormalOwnerRef(p10)
// set pod anti affinity
setPodAntiAffinity(p1, "foo", "bar")
@@ -80,6 +88,8 @@ func TestPodAntiAffinity(t *testing.T) {
setPodAntiAffinity(p5, "foo1", "bar1")
setPodAntiAffinity(p6, "foo1", "bar1")
setPodAntiAffinity(p7, "foo", "bar")
setPodAntiAffinity(p9, "foo", "bar")
setPodAntiAffinity(p10, "foo", "bar")
// set pod priority
test.SetPodPriority(p5, 100)
@@ -91,98 +101,142 @@ func TestPodAntiAffinity(t *testing.T) {
"datacenter": "west",
}
var uint1 uint = 1
var uint3 uint = 3
tests := []struct {
description string
maxPodsToEvictPerNode int
pods []v1.Pod
expectedEvictedPodCount int
nodeFit bool
nodes []*v1.Node
description string
maxPodsToEvictPerNode *uint
maxNoOfPodsToEvictPerNamespace *uint
pods []*v1.Pod
expectedEvictedPodCount uint
nodeFit bool
nodes []*v1.Node
}{
{
description: "Maximum pods to evict - 0",
maxPodsToEvictPerNode: 0,
pods: []v1.Pod{*p1, *p2, *p3, *p4},
pods: []*v1.Pod{p1, p2, p3, p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
},
{
description: "Maximum pods to evict - 3",
maxPodsToEvictPerNode: 3,
pods: []v1.Pod{*p1, *p2, *p3, *p4},
maxPodsToEvictPerNode: &uint3,
pods: []*v1.Pod{p1, p2, p3, p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
},
{
description: "Maximum pods to evict (maxPodsToEvictPerNamespace=3) - 3",
maxNoOfPodsToEvictPerNamespace: &uint3,
pods: []*v1.Pod{p1, p2, p3, p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
},
{
description: "Evict only 1 pod after sorting",
maxPodsToEvictPerNode: 0,
pods: []v1.Pod{*p5, *p6, *p7},
pods: []*v1.Pod{p5, p6, p7},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p1, *nonEvictablePod},
maxPodsToEvictPerNode: &uint1,
pods: []*v1.Pod{p1, nonEvictablePod},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p1, *nonEvictablePod},
maxPodsToEvictPerNode: &uint1,
pods: []*v1.Pod{p1, nonEvictablePod},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Won't evict pods because node selectors don't match available nodes",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p8, *nonEvictablePod},
maxPodsToEvictPerNode: &uint1,
pods: []*v1.Pod{p8, nonEvictablePod},
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
nodeFit: true,
},
{
description: "Won't evict pods because only other node is not schedulable",
maxPodsToEvictPerNode: 1,
pods: []v1.Pod{*p8, *nonEvictablePod},
maxPodsToEvictPerNode: &uint1,
pods: []*v1.Pod{p8, nonEvictablePod},
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
nodeFit: true,
},
{
description: "No pod to evicted since all pod terminating",
pods: []*v1.Pod{p9, p10},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
{
description: "Won't evict pods because only other node doesn't have enough resources",
maxPodsToEvictPerNode: &uint3,
pods: []*v1.Pod{p1, p2, p3, p4},
nodes: []*v1.Node{node1, node4},
expectedEvictedPodCount: 0,
nodeFit: true,
},
}
for _, test := range tests {
// create fake client
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: test.pods}, nil
})
fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) {
return true, node1, nil
})
t.Run(test.description, func(t *testing.T) {
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
test.maxPodsToEvictPerNode,
test.nodes,
false,
false,
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: test.nodeFit,
},
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, strategy, test.nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != test.expectedEvictedPodCount {
t.Errorf("Unexpected no of pods evicted: pods evicted: %d, expected: %d", podsEvicted, test.expectedEvictedPodCount)
}
var objs []runtime.Object
for _, node := range test.nodes {
objs = append(objs, node)
}
for _, pod := range test.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
test.maxPodsToEvictPerNode,
test.maxNoOfPodsToEvictPerNamespace,
test.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
strategy := api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: test.nodeFit,
},
}
RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, strategy, test.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != test.expectedEvictedPodCount {
t.Errorf("Unexpected no of pods evicted: pods evicted: %d, expected: %d", podsEvicted, test.expectedEvictedPodCount)
}
})
}
}

View File

@@ -22,6 +22,7 @@ import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
@@ -56,7 +57,7 @@ func validatePodLifeTimeParams(params *api.StrategyParameters) error {
}
// PodLifeTime evicts pods on nodes that were created more than strategy.Params.MaxPodLifeTimeSeconds seconds ago.
func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validatePodLifeTimeParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid PodLifeTime parameters")
return
@@ -68,10 +69,10 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces sets.String
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
@@ -88,10 +89,21 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D
}
}
podFilter, err := podutil.NewOptions().
WithFilter(filter).
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
WithLabelSelector(strategy.Params.LabelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods := listOldPodsOnNode(ctx, client, node, includedNamespaces, excludedNamespaces, strategy.Params.LabelSelector, *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds, filter)
pods := listOldPodsOnNode(node.Name, getPodsAssignedToNode, podFilter, *strategy.Params.PodLifeTime.MaxPodLifeTimeSeconds)
for _, pod := range pods {
success, err := podEvictor.EvictPod(ctx, pod, node, "PodLifeTime")
if success {
@@ -108,23 +120,12 @@ func PodLifeTime(ctx context.Context, client clientset.Interface, strategy api.D
}
func listOldPodsOnNode(
ctx context.Context,
client clientset.Interface,
node *v1.Node,
includedNamespaces, excludedNamespaces []string,
labelSelector *metav1.LabelSelector,
nodeName string,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
filter podutil.FilterFunc,
maxPodLifeTimeSeconds uint,
filter func(pod *v1.Pod) bool,
) []*v1.Pod {
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(filter),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(labelSelector),
)
pods, err := podutil.ListPodsOnANode(nodeName, getPodsAssignedToNode, filter)
if err != nil {
return nil
}

View File

@@ -25,15 +25,16 @@ import (
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
func TestPodLifeTime(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil)
olderPodCreationTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC))
newerPodCreationTime := metav1.NewTime(time.Now())
@@ -125,14 +126,24 @@ func TestPodLifeTime(t *testing.T) {
p12.ObjectMeta.OwnerReferences = ownerRef1
p13.ObjectMeta.OwnerReferences = ownerRef1
p14 := test.BuildTestPod("p14", 100, 0, node1.Name, nil)
p15 := test.BuildTestPod("p15", 100, 0, node1.Name, nil)
p14.Namespace = "dev"
p15.Namespace = "dev"
p14.ObjectMeta.CreationTimestamp = olderPodCreationTime
p15.ObjectMeta.CreationTimestamp = olderPodCreationTime
p14.ObjectMeta.OwnerReferences = ownerRef1
p15.ObjectMeta.OwnerReferences = ownerRef1
p14.DeletionTimestamp = &metav1.Time{}
p15.DeletionTimestamp = &metav1.Time{}
var maxLifeTime uint = 600
testCases := []struct {
description string
strategy api.DeschedulerStrategy
maxPodsToEvictPerNode int
pods []v1.Pod
pods []*v1.Pod
nodes []*v1.Node
expectedEvictedPodCount int
expectedEvictedPodCount uint
ignorePvcPods bool
}{
{
@@ -143,8 +154,7 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p1, *p2},
pods: []*v1.Pod{p1, p2},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
@@ -156,8 +166,7 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p3, *p4},
pods: []*v1.Pod{p3, p4},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
@@ -169,8 +178,7 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p5, *p6},
pods: []*v1.Pod{p5, p6},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
@@ -182,8 +190,7 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p7, *p8},
pods: []*v1.Pod{p7, p8},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
@@ -198,8 +205,7 @@ func TestPodLifeTime(t *testing.T) {
},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p9, *p10},
pods: []*v1.Pod{p9, p10},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
@@ -211,8 +217,7 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11},
pods: []*v1.Pod{p11},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
ignorePvcPods: true,
@@ -225,13 +230,12 @@ func TestPodLifeTime(t *testing.T) {
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p11},
pods: []*v1.Pod{p11},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "Two old pods with different labels, 1 selected by labelSelector",
description: "No pod to evicted since all pod terminating",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
@@ -241,36 +245,72 @@ func TestPodLifeTime(t *testing.T) {
},
},
},
maxPodsToEvictPerNode: 5,
pods: []v1.Pod{*p12, *p13},
pods: []*v1.Pod{p12, p13},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
},
{
description: "No pod should be evicted since pod terminating",
strategy: api.DeschedulerStrategy{
Enabled: true,
Params: &api.StrategyParameters{
PodLifeTime: &api.PodLifeTime{MaxPodLifeTimeSeconds: &maxLifeTime},
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{"foo": "bar"},
},
},
},
pods: []*v1.Pod{p14, p15},
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
},
}
for _, tc := range testCases {
fakeClient := &fake.Clientset{}
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: tc.pods}, nil
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
nil,
nil,
tc.nodes,
getPodsAssignedToNode,
false,
false,
tc.ignorePvcPods,
false,
false,
)
PodLifeTime(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != tc.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.description, tc.expectedEvictedPodCount, podsEvicted)
}
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
false,
false,
tc.ignorePvcPods,
)
PodLifeTime(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != tc.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.description, tc.expectedEvictedPodCount, podsEvicted)
}
}
}

View File

@@ -21,6 +21,7 @@ import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
@@ -49,7 +50,7 @@ func validateRemovePodsHavingTooManyRestartsParams(params *api.StrategyParameter
// RemovePodsHavingTooManyRestarts removes the pods that have too many restarts on node.
// There are too many cases leading this issue: Volume mount failed, app error due to nodes' different settings.
// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages.
func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor) {
func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc) {
if err := validateRemovePodsHavingTooManyRestartsParams(strategy.Params); err != nil {
klog.ErrorS(err, "Invalid RemovePodsHavingTooManyRestarts parameters")
return
@@ -61,10 +62,10 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
return
}
var includedNamespaces, excludedNamespaces []string
var includedNamespaces, excludedNamespaces sets.String
if strategy.Params.Namespaces != nil {
includedNamespaces = strategy.Params.Namespaces.Include
excludedNamespaces = strategy.Params.Namespaces.Exclude
includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
}
nodeFit := false
@@ -74,17 +75,20 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
podFilter, err := podutil.NewOptions().
WithFilter(evictable.IsEvictable).
WithNamespaces(includedNamespaces).
WithoutNamespaces(excludedNamespaces).
WithLabelSelector(strategy.Params.LabelSelector).
BuildFilterFunc()
if err != nil {
klog.ErrorS(err, "Error initializing pod filter function")
return
}
for _, node := range nodes {
klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
pods, err := podutil.ListPodsOnANode(
ctx,
client,
node,
podutil.WithFilter(evictable.IsEvictable),
podutil.WithNamespaces(includedNamespaces),
podutil.WithoutNamespaces(excludedNamespaces),
podutil.WithLabelSelector(strategy.Params.LabelSelector),
)
pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
if err != nil {
klog.ErrorS(err, "Error listing a nodes pods", "node", klog.KObj(node))
continue

View File

@@ -18,23 +18,24 @@ package strategies
import (
"context"
"testing"
"fmt"
"testing"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
func initPods(node *v1.Node) []v1.Pod {
pods := make([]v1.Pod, 0)
func initPods(node *v1.Node) []*v1.Pod {
pods := make([]*v1.Pod, 0)
for i := int32(0); i <= 9; i++ {
pod := test.BuildTestPod(fmt.Sprintf("pod-%d", i), 100, 0, node.Name, nil)
@@ -56,7 +57,7 @@ func initPods(node *v1.Node) []v1.Pod {
},
},
}
pods = append(pods, *pod)
pods = append(pods, pod)
}
// The following 3 pods won't get evicted.
@@ -81,8 +82,6 @@ func initPods(node *v1.Node) []v1.Pod {
}
func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
ctx := context.Background()
node1 := test.BuildTestNode("node1", 2000, 3000, 10, nil)
node2 := test.BuildTestNode("node2", 2000, 3000, 10, func(node *v1.Node) {
node.Spec.Taints = []v1.Taint{
@@ -98,8 +97,10 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
Unschedulable: true,
}
})
node4 := test.BuildTestNode("node4", 200, 3000, 10, nil)
node5 := test.BuildTestNode("node5", 2000, 3000, 10, nil)
pods := initPods(node1)
pods := append(append(initPods(node1), test.BuildTestPod("CPU-consumer-1", 150, 100, node4.Name, nil)), test.BuildTestPod("CPU-consumer-2", 150, 100, node5.Name, nil))
createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32, nodeFit bool) api.DeschedulerStrategy {
return api.DeschedulerStrategy{
@@ -114,115 +115,154 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
}
}
var uint3 uint = 3
tests := []struct {
description string
nodes []*v1.Node
strategy api.DeschedulerStrategy
expectedEvictedPodCount int
maxPodsToEvictPerNode int
description string
nodes []*v1.Node
strategy api.DeschedulerStrategy
expectedEvictedPodCount uint
maxPodsToEvictPerNode *uint
maxNoOfPodsToEvictPerNamespace *uint
}{
{
description: "All pods have total restarts under threshold, no pod evictions",
strategy: createStrategy(true, true, 10000, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 0,
},
{
description: "Some pods have total restarts bigger than threshold",
strategy: createStrategy(true, true, 1, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions",
strategy: createStrategy(true, true, 1*25, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 5 pod evictions",
strategy: createStrategy(true, false, 1*25, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 5,
maxPodsToEvictPerNode: 0,
},
{
description: "All pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions",
strategy: createStrategy(true, true, 1*20, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 6 pod evictions",
strategy: createStrategy(true, false, 1*20, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 6,
maxPodsToEvictPerNode: 0,
},
{
description: "Five pods have total restarts bigger than threshold(includingInitContainers=true), but only 1 pod eviction",
strategy: createStrategy(true, true, 5*25+1, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
maxPodsToEvictPerNode: 0,
},
{
description: "Five pods have total restarts bigger than threshold(includingInitContainers=false), but only 1 pod eviction",
strategy: createStrategy(true, false, 5*20+1, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 1,
maxPodsToEvictPerNode: 0,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3), 3 pod evictions",
strategy: createStrategy(true, true, 1, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
maxPodsToEvictPerNode: 3,
maxPodsToEvictPerNode: &uint3,
},
{
description: "All pods have total restarts equals threshold(maxNoOfPodsToEvictPerNamespace=3), 3 pod evictions",
strategy: createStrategy(true, true, 1, false),
nodes: []*v1.Node{node1},
expectedEvictedPodCount: 3,
maxNoOfPodsToEvictPerNamespace: &uint3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is tained, 0 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node2},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 3,
maxPodsToEvictPerNode: &uint3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is not schedulable, 0 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node3},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: 3,
maxPodsToEvictPerNode: &uint3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node does not have enough CPU, 0 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node4},
expectedEvictedPodCount: 0,
maxPodsToEvictPerNode: &uint3,
},
{
description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node has enough CPU, 3 pod evictions",
strategy: createStrategy(true, true, 1, true),
nodes: []*v1.Node{node1, node5},
expectedEvictedPodCount: 3,
maxPodsToEvictPerNode: &uint3,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: pods}, nil
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range pods {
objs = append(objs, pod)
}
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.maxNoOfPodsToEvictPerNamespace,
tc.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
policyv1.SchemeGroupVersion.String(),
false,
tc.maxPodsToEvictPerNode,
tc.nodes,
false,
false,
false,
)
RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
}
}
}

View File

@@ -18,21 +18,20 @@ package strategies
import (
"context"
"fmt"
"math"
"sort"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
"sigs.k8s.io/descheduler/pkg/descheduler/node"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies/validation"
"sigs.k8s.io/descheduler/pkg/utils"
)
@@ -47,70 +46,24 @@ type topology struct {
pods []*v1.Pod
}
// topologySpreadStrategyParams contains validated strategy parameters
type topologySpreadStrategyParams struct {
thresholdPriority int32
includedNamespaces sets.String
excludedNamespaces sets.String
labelSelector labels.Selector
nodeFit bool
}
// validateAndParseTopologySpreadParams will validate parameters to ensure that they do not contain invalid values.
func validateAndParseTopologySpreadParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (*topologySpreadStrategyParams, error) {
var includedNamespaces, excludedNamespaces sets.String
if params == nil {
return &topologySpreadStrategyParams{includedNamespaces: includedNamespaces, excludedNamespaces: excludedNamespaces}, nil
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return nil, fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return nil, fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
if err != nil {
return nil, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
}
if params.Namespaces != nil {
includedNamespaces = sets.NewString(params.Namespaces.Include...)
excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
}
var selector labels.Selector
if params.LabelSelector != nil {
selector, err = metav1.LabelSelectorAsSelector(params.LabelSelector)
if err != nil {
return nil, fmt.Errorf("failed to get label selectors from strategy's params: %+v", err)
}
}
return &topologySpreadStrategyParams{
thresholdPriority: thresholdPriority,
includedNamespaces: includedNamespaces,
excludedNamespaces: excludedNamespaces,
labelSelector: selector,
nodeFit: params.NodeFit,
}, nil
}
func RemovePodsViolatingTopologySpreadConstraint(
ctx context.Context,
client clientset.Interface,
strategy api.DeschedulerStrategy,
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) {
strategyParams, err := validateAndParseTopologySpreadParams(ctx, client, strategy.Params)
strategyParams, err := validation.ValidateAndParseStrategyParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Invalid RemovePodsViolatingTopologySpreadConstraint parameters")
return
}
evictable := podEvictor.Evictable(
evictions.WithPriorityThreshold(strategyParams.thresholdPriority),
evictions.WithNodeFit(strategyParams.nodeFit),
evictions.WithLabelSelector(strategyParams.labelSelector),
evictions.WithPriorityThreshold(strategyParams.ThresholdPriority),
evictions.WithNodeFit(strategyParams.NodeFit),
evictions.WithLabelSelector(strategyParams.LabelSelector),
)
nodeMap := make(map[string]*v1.Node, len(nodes))
@@ -140,8 +93,8 @@ func RemovePodsViolatingTopologySpreadConstraint(
podsForEviction := make(map[*v1.Pod]struct{})
// 1. for each namespace...
for _, namespace := range namespaces.Items {
if (len(strategyParams.includedNamespaces) > 0 && !strategyParams.includedNamespaces.Has(namespace.Name)) ||
(len(strategyParams.excludedNamespaces) > 0 && strategyParams.excludedNamespaces.Has(namespace.Name)) {
if (len(strategyParams.IncludedNamespaces) > 0 && !strategyParams.IncludedNamespaces.Has(namespace.Name)) ||
(len(strategyParams.ExcludedNamespaces) > 0 && strategyParams.ExcludedNamespaces.Has(namespace.Name)) {
continue
}
namespacePods, err := client.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{})
@@ -186,6 +139,10 @@ func RemovePodsViolatingTopologySpreadConstraint(
// (this loop is where we count the number of pods per topologyValue that match this constraint's selector)
var sumPods float64
for i := range namespacePods.Items {
// skip pods that are being deleted.
if utils.IsPodTerminating(&namespacePods.Items[i]) {
continue
}
// 4. if the pod matches this TopologySpreadConstraint LabelSelector
if !selector.Matches(labels.Set(namespacePods.Items[i].Labels)) {
continue
@@ -211,7 +168,7 @@ func RemovePodsViolatingTopologySpreadConstraint(
klog.V(2).InfoS("Skipping topology constraint because it is already balanced", "constraint", constraint)
continue
}
balanceDomains(podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodeMap)
balanceDomains(client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes)
}
}
@@ -266,12 +223,14 @@ func topologyIsBalanced(topology map[topologyPair][]*v1.Pod, constraint v1.Topol
// [5, 5, 5, 5, 5, 5]
// (assuming even distribution by the scheduler of the evicted pods)
func balanceDomains(
client clientset.Interface,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
podsForEviction map[*v1.Pod]struct{},
constraint v1.TopologySpreadConstraint,
constraintTopologies map[topologyPair][]*v1.Pod,
sumPods float64,
isEvictable func(*v1.Pod) bool,
nodeMap map[string]*v1.Node) {
isEvictable func(pod *v1.Pod) bool,
nodes []*v1.Node) {
idealAvg := sumPods / float64(len(constraintTopologies))
sortedDomains := sortDomains(constraintTopologies, isEvictable)
@@ -281,7 +240,7 @@ func balanceDomains(
j := len(sortedDomains) - 1
for i < j {
// if j has no more to give without falling below the ideal average, move to next aboveAvg
if float64(len(sortedDomains[j].pods)) < idealAvg {
if float64(len(sortedDomains[j].pods)) <= idealAvg {
j--
}
@@ -314,8 +273,19 @@ func balanceDomains(
// also (just for tracking), add them to the list of pods in the lower topology
aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
for k := range aboveToEvict {
if err := validatePodFitsOnOtherNodes(aboveToEvict[k], nodeMap); err != nil {
klog.V(2).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(aboveToEvict[k]))
// PodFitsAnyOtherNode excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
// Also, if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above).
// In other words, PTS can perform suboptimally if some of its chosen pods don't fit on other nodes.
// This is because the chosen pods aren't sorted, but immovable pods still count as "evicted" toward the PTS algorithm.
// So, a better selection heuristic could improve performance.
if !node.PodFitsAnyOtherNode(getPodsAssignedToNode, aboveToEvict[k], nodes) {
klog.V(2).InfoS("ignoring pod for eviction as it does not fit on any other node", "pod", klog.KObj(aboveToEvict[k]))
continue
}
@@ -326,56 +296,6 @@ func balanceDomains(
}
}
// validatePodFitsOnOtherNodes performs validation based on scheduling predicates for affinity and toleration.
// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
func validatePodFitsOnOtherNodes(pod *v1.Pod, nodeMap map[string]*v1.Node) error {
// if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
isRequiredDuringSchedulingIgnoredDuringExecution := pod.Spec.Affinity != nil &&
pod.Spec.Affinity.NodeAffinity != nil &&
pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
hardTaintsFilter := func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
}
var eligibleNodesCount, ineligibleAffinityNodesCount, ineligibleTaintedNodesCount int
for _, node := range nodeMap {
if node == nodeMap[pod.Spec.NodeName] {
continue
}
if pod.Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution {
if !nodeutil.PodFitsCurrentNode(pod, node) {
ineligibleAffinityNodesCount++
continue
}
}
if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, hardTaintsFilter) {
ineligibleTaintedNodesCount++
continue
}
eligibleNodesCount++
}
if eligibleNodesCount == 0 {
var errs []error
if ineligibleAffinityNodesCount > 0 {
errs = append(errs, fmt.Errorf("%d nodes with ineligible selector/affinity", ineligibleAffinityNodesCount))
}
if ineligibleTaintedNodesCount > 0 {
errs = append(errs, fmt.Errorf("%d nodes with taints that are not tolerated", ineligibleTaintedNodesCount))
}
return utilerrors.NewAggregate(errs)
}
return nil
}
// sortDomains sorts and splits the list of topology domains based on their size
// it also sorts the list of pods within the domains based on their node affinity/selector and priority in the following order:
// 1. non-evictable pods
@@ -383,7 +303,7 @@ func validatePodFitsOnOtherNodes(pod *v1.Pod, nodeMap map[string]*v1.Node) error
// 3. pods in descending priority
// 4. all other pods
// We then pop pods off the back of the list for eviction
func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(*v1.Pod) bool) []topology {
func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology {
sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
// sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted)
for pair, list := range constraintTopologyPairs {

View File

@@ -5,23 +5,23 @@ import (
"fmt"
"testing"
"sigs.k8s.io/descheduler/pkg/api"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
func TestTopologySpreadConstraint(t *testing.T) {
ctx := context.Background()
testCases := []struct {
name string
pods []*v1.Pod
expectedEvictedCount int
expectedEvictedCount uint
nodes []*v1.Node
strategy api.DeschedulerStrategy
namespaces []string
@@ -483,6 +483,38 @@ func TestTopologySpreadConstraint(t *testing.T) {
},
namespaces: []string{"ns1"},
},
{
name: "2 domains size [2 6], maxSkew=2, can't move any because node1 does not have enough CPU",
nodes: []*v1.Node{
test.BuildTestNode("n1", 200, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(2),
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
{
count: 6,
node: "n2",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: true,
},
},
namespaces: []string{"ns1"},
},
{
// see https://github.com/kubernetes-sigs/descheduler/issues/564
name: "Multiple constraints (6 nodes/2 zones, 4 pods)",
@@ -588,6 +620,62 @@ func TestTopologySpreadConstraint(t *testing.T) {
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{IncludeSoftConstraints: true}},
namespaces: []string{"ns1"},
},
{
name: "3 domains size [8 7 0], maxSkew=1, should move 5 to get [5 5 5]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
},
pods: createTestPods([]testPodList{
{
count: 8,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
{
count: 7,
node: "n2",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
}),
expectedEvictedCount: 5,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "3 domains size [5 5 5], maxSkew=1, should move 0 to retain [5 5 5]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
},
pods: createTestPods([]testPodList{
{
count: 5,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
{
count: 5,
node: "n2",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
{
count: 5,
node: "n3",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod since pod tolerates the node with taint",
nodes: []*v1.Node{
@@ -630,7 +718,7 @@ func TestTopologySpreadConstraint(t *testing.T) {
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 0 pods since pod does not tolerate the tainted node",
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pods since pod does not tolerate the tainted node",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
@@ -662,6 +750,43 @@ func TestTopologySpreadConstraint(t *testing.T) {
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 0 pods since pod does not tolerate the tainted node, and NodeFit is enabled",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{
Params: &api.StrategyParameters{
NodeFit: true,
},
},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod for node with PreferNoSchedule Taint",
nodes: []*v1.Node{
@@ -776,33 +901,84 @@ func TestTopologySpreadConstraint(t *testing.T) {
},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [4,2], maxSkew=1, 2 pods in termination; nothing should be moved",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 2,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
{
count: 2,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
deletionTimestamp: &metav1.Time{},
},
{
count: 2,
node: "n2",
labels: map[string]string{"foo": "bar"},
constraints: getDefaultTopologyConstraints(1),
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{
Params: &api.StrategyParameters{
LabelSelector: getLabelSelector("foo", []string{"bar"}, metav1.LabelSelectorOpIn),
},
},
namespaces: []string{"ns1"},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
podList := make([]v1.Pod, 0, len(tc.pods))
for _, pod := range tc.pods {
podList = append(podList, *pod)
}
return true, &v1.PodList{Items: podList}, nil
})
fakeClient.Fake.AddReactor("list", "namespaces", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.NamespaceList{Items: []v1.Namespace{{ObjectMeta: metav1.ObjectMeta{Name: "ns1", Namespace: "ns1"}}}}, nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var objs []runtime.Object
for _, node := range tc.nodes {
objs = append(objs, node)
}
for _, pod := range tc.pods {
objs = append(objs, pod)
}
objs = append(objs, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "ns1"}})
fakeClient := fake.NewSimpleClientset(objs...)
sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("Build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
100,
nil,
nil,
tc.nodes,
getPodsAssignedToNode,
false,
false,
false,
false,
false,
)
RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != tc.expectedEvictedCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.name, tc.expectedEvictedCount, podsEvicted)
@@ -812,14 +988,15 @@ func TestTopologySpreadConstraint(t *testing.T) {
}
type testPodList struct {
count int
node string
labels map[string]string
constraints []v1.TopologySpreadConstraint
nodeSelector map[string]string
nodeAffinity *v1.Affinity
noOwners bool
tolerations []v1.Toleration
count int
node string
labels map[string]string
constraints []v1.TopologySpreadConstraint
nodeSelector map[string]string
nodeAffinity *v1.Affinity
noOwners bool
tolerations []v1.Toleration
deletionTimestamp *metav1.Time
}
func createTestPods(testPods []testPodList) []*v1.Pod {
@@ -840,6 +1017,7 @@ func createTestPods(testPods []testPodList) []*v1.Pod {
p.Spec.NodeSelector = tp.nodeSelector
p.Spec.Affinity = tp.nodeAffinity
p.Spec.Tolerations = tp.tolerations
p.ObjectMeta.DeletionTimestamp = tp.deletionTimestamp
}))
podNum++
}

View File

@@ -0,0 +1,71 @@
package validation
import (
"context"
"fmt"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/utils"
)
// ValidatedStrategyParams contains validated common strategy parameters
type ValidatedStrategyParams struct {
ThresholdPriority int32
IncludedNamespaces sets.String
ExcludedNamespaces sets.String
LabelSelector labels.Selector
NodeFit bool
}
func DefaultValidatedStrategyParams() ValidatedStrategyParams {
return ValidatedStrategyParams{ThresholdPriority: utils.SystemCriticalPriority}
}
func ValidateAndParseStrategyParams(
ctx context.Context,
client clientset.Interface,
params *api.StrategyParameters,
) (*ValidatedStrategyParams, error) {
if params == nil {
defaultValidatedStrategyParams := DefaultValidatedStrategyParams()
return &defaultValidatedStrategyParams, nil
}
// At most one of include/exclude can be set
var includedNamespaces, excludedNamespaces sets.String
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return nil, fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return nil, fmt.Errorf("only one of ThresholdPriority and thresholdPriorityClassName can be set")
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
if err != nil {
return nil, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
}
if params.Namespaces != nil {
includedNamespaces = sets.NewString(params.Namespaces.Include...)
excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
}
var selector labels.Selector
if params.LabelSelector != nil {
selector, err = metav1.LabelSelectorAsSelector(params.LabelSelector)
if err != nil {
return nil, fmt.Errorf("failed to get label selectors from strategy's params: %+v", err)
}
}
return &ValidatedStrategyParams{
ThresholdPriority: thresholdPriority,
IncludedNamespaces: includedNamespaces,
ExcludedNamespaces: excludedNamespaces,
LabelSelector: selector,
NodeFit: params.NodeFit,
}, nil
}

View File

@@ -0,0 +1,79 @@
package validation
import (
"context"
"testing"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
"sigs.k8s.io/descheduler/pkg/api"
)
var (
thresholdPriority int32 = 1000
)
func TestValidStrategyParams(t *testing.T) {
ctx := context.Background()
fakeClient := &fake.Clientset{}
testCases := []struct {
name string
params *api.StrategyParameters
}{
{name: "validate nil params", params: nil},
{name: "validate empty params", params: &api.StrategyParameters{}},
{name: "validate params with NodeFit", params: &api.StrategyParameters{NodeFit: true}},
{name: "validate params with ThresholdPriority", params: &api.StrategyParameters{ThresholdPriority: &thresholdPriority}},
{name: "validate params with priorityClassName", params: &api.StrategyParameters{ThresholdPriorityClassName: "high-priority"}},
{name: "validate params with excluded namespace", params: &api.StrategyParameters{Namespaces: &api.Namespaces{Exclude: []string{"excluded-ns"}}}},
{name: "validate params with included namespace", params: &api.StrategyParameters{Namespaces: &api.Namespaces{Include: []string{"include-ns"}}}},
{name: "validate params with empty label selector", params: &api.StrategyParameters{LabelSelector: &metav1.LabelSelector{}}},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
params, err := ValidateAndParseStrategyParams(ctx, fakeClient, tc.params)
if err != nil {
t.Errorf("strategy params should be valid but got err: %v", err.Error())
}
if params == nil {
t.Errorf("strategy params should return a strategyParams but got nil")
}
})
}
}
func TestInvalidStrategyParams(t *testing.T) {
ctx := context.Background()
fakeClient := &fake.Clientset{}
testCases := []struct {
name string
params *api.StrategyParameters
}{
{
name: "invalid params with both included and excluded namespaces nil params",
params: &api.StrategyParameters{Namespaces: &api.Namespaces{Include: []string{"include-ns"}, Exclude: []string{"exclude-ns"}}},
},
{
name: "invalid params with both threshold priority and priority class name",
params: &api.StrategyParameters{ThresholdPriorityClassName: "high-priority", ThresholdPriority: &thresholdPriority},
},
{
name: "invalid params with bad label selector",
params: &api.StrategyParameters{LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"": "missing-label"}}},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
params, err := ValidateAndParseStrategyParams(ctx, fakeClient, tc.params)
if err == nil {
t.Errorf("strategy params should be invalid but did not get err")
}
if params != nil {
t.Errorf("strategy params should return a nil strategyParams but got %v", params)
}
})
}
}

View File

@@ -6,25 +6,9 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/featuregate"
"k8s.io/klog/v2"
)
const (
// owner: @jinxu
// beta: v1.10
//
// New local storage types to support local storage capacity isolation
LocalStorageCapacityIsolation featuregate.Feature = "LocalStorageCapacityIsolation"
// owner: @egernst
// alpha: v1.16
//
// Enables PodOverhead, for accounting pod overheads which are specific to a given RuntimeClass
PodOverhead featuregate.Feature = "PodOverhead"
)
// GetResourceRequest finds and returns the request value for a specific resource.
func GetResourceRequest(pod *v1.Pod, resource v1.ResourceName) int64 {
if resource == v1.ResourcePods {
@@ -53,11 +37,6 @@ func GetResourceRequestQuantity(pod *v1.Pod, resourceName v1.ResourceName) resou
requestQuantity = resource.Quantity{Format: resource.DecimalSI}
}
if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(LocalStorageCapacityIsolation) {
// if the local storage capacity isolation feature gate is disabled, pods request 0 disk
return requestQuantity
}
for _, container := range pod.Spec.Containers {
if rQuantity, ok := container.Resources.Requests[resourceName]; ok {
requestQuantity.Add(rQuantity)
@@ -72,9 +51,9 @@ func GetResourceRequestQuantity(pod *v1.Pod, resourceName v1.ResourceName) resou
}
}
// if PodOverhead feature is supported, add overhead for running a pod
// to the total requests if the resource total is non-zero
if pod.Spec.Overhead != nil && utilfeature.DefaultFeatureGate.Enabled(PodOverhead) {
// We assume pod overhead feature gate is enabled.
// We can't import the scheduler settings so we will inherit the default.
if pod.Spec.Overhead != nil {
if podOverhead, ok := pod.Spec.Overhead[resourceName]; ok && !requestQuantity.IsZero() {
requestQuantity.Add(podOverhead)
}
@@ -89,6 +68,11 @@ func IsMirrorPod(pod *v1.Pod) bool {
return ok
}
// IsPodTerminating returns true if the pod DeletionTimestamp is set.
func IsPodTerminating(pod *v1.Pod) bool {
return pod.DeletionTimestamp != nil
}
// IsStaticPod returns true if the pod is a static pod.
func IsStaticPod(pod *v1.Pod) bool {
source, err := GetPodSource(pod)
@@ -157,9 +141,9 @@ func PodRequestsAndLimits(pod *v1.Pod) (reqs, limits v1.ResourceList) {
maxResourceList(limits, container.Resources.Limits)
}
// if PodOverhead feature is supported, add overhead for running a pod
// to the sum of reqeuests and to non-zero limits:
if pod.Spec.Overhead != nil && utilfeature.DefaultFeatureGate.Enabled(PodOverhead) {
// We assume pod overhead feature gate is enabled.
// We can't import the scheduler settings so we will inherit the default.
if pod.Spec.Overhead != nil {
addResourceList(reqs, pod.Spec.Overhead)
for name, quantity := range pod.Spec.Overhead {
@@ -202,12 +186,31 @@ func maxResourceList(list, new v1.ResourceList) {
// PodToleratesTaints returns true if a pod tolerates one node's taints
func PodToleratesTaints(pod *v1.Pod, taintsOfNodes map[string][]v1.Taint) bool {
for nodeName, taintsForNode := range taintsOfNodes {
if len(pod.Spec.Tolerations) >= len(taintsForNode) && TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, taintsForNode, nil) {
return true
}
klog.V(5).InfoS("Pod doesn't tolerate nodes taint", "pod", klog.KObj(pod), "nodeName", nodeName)
}
for nodeName, taintsForNode := range taintsOfNodes {
if len(pod.Spec.Tolerations) >= len(taintsForNode) {
if TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, taintsForNode, nil) {
return true
}
if klog.V(5).Enabled() {
for i := range taintsForNode {
if !TolerationsTolerateTaint(pod.Spec.Tolerations, &taintsForNode[i]) {
klog.V(5).InfoS("Pod doesn't tolerate node taint",
"pod", klog.KObj(pod),
"nodeName", nodeName,
"taint", fmt.Sprintf("%s:%s=%s", taintsForNode[i].Key, taintsForNode[i].Value, taintsForNode[i].Effect),
)
}
}
}
} else {
klog.V(5).InfoS("Pod doesn't tolerate nodes taint, count mismatch",
"pod", klog.KObj(pod),
"nodeName", nodeName,
)
}
}
return false
}

View File

@@ -0,0 +1,177 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"context"
"strings"
"testing"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
deschedulerapi "sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
eutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies"
)
func TestRemoveDuplicates(t *testing.T) {
ctx := context.Background()
clientSet, _, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Error listing node with %v", err)
}
nodes, workerNodes := splitNodesAndWorkerNodes(nodeList.Items)
t.Log("Creating testing namespace")
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{})
t.Log("Creating duplicates pods")
deploymentObj := &appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "duplicate-pod",
Namespace: testNamespace.Name,
Labels: map[string]string{"app": "test-duplicate", "name": "test-duplicatePods"},
},
Spec: appsv1.DeploymentSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "test-duplicate", "name": "test-duplicatePods"},
},
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{"app": "test-duplicate", "name": "test-duplicatePods"},
},
Spec: v1.PodSpec{
Containers: []v1.Container{{
Name: "pause",
ImagePullPolicy: "Always",
Image: "kubernetes/pause",
Ports: []v1.ContainerPort{{ContainerPort: 80}},
}},
},
},
},
}
tests := []struct {
description string
replicasNum int
beforeFunc func(deployment *appsv1.Deployment)
expectedEvictedPodCount uint
}{
{
description: "Evict Pod even Pods schedule to specific node",
replicasNum: 4,
beforeFunc: func(deployment *appsv1.Deployment) {
deployment.Spec.Replicas = func(i int32) *int32 { return &i }(4)
deployment.Spec.Template.Spec.NodeName = workerNodes[0].Name
},
expectedEvictedPodCount: 2,
},
{
description: "Evict Pod even Pods with local storage",
replicasNum: 5,
beforeFunc: func(deployment *appsv1.Deployment) {
deployment.Spec.Replicas = func(i int32) *int32 { return &i }(5)
deployment.Spec.Template.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI),
},
},
},
}
},
expectedEvictedPodCount: 2,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
t.Logf("Creating deployment %v in %v namespace", deploymentObj.Name, deploymentObj.Namespace)
tc.beforeFunc(deploymentObj)
_, err = clientSet.AppsV1().Deployments(deploymentObj.Namespace).Create(ctx, deploymentObj, metav1.CreateOptions{})
if err != nil {
t.Logf("Error creating deployment: %v", err)
if err = clientSet.AppsV1().Deployments(deploymentObj.Namespace).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(labels.Set(map[string]string{"app": "test-duplicate", "name": "test-duplicatePods"})).String(),
}); err != nil {
t.Fatalf("Unable to delete deployment: %v", err)
}
return
}
defer clientSet.AppsV1().Deployments(deploymentObj.Namespace).Delete(ctx, deploymentObj.Name, metav1.DeleteOptions{})
waitForPodsRunning(ctx, t, clientSet, map[string]string{"app": "test-duplicate", "name": "test-duplicatePods"}, tc.replicasNum, testNamespace.Name)
// Run DeschedulerStrategy strategy
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientSet)
if err != nil || len(evictionPolicyGroupVersion) == 0 {
t.Fatalf("Error creating eviction policy group %v", err)
}
podEvictor := evictions.NewPodEvictor(
clientSet,
evictionPolicyGroupVersion,
false,
nil,
nil,
nodes,
getPodsAssignedToNode,
true,
false,
false,
false,
false,
)
t.Log("Running DeschedulerStrategy strategy")
strategies.RemoveDuplicatePods(
ctx,
clientSet,
deschedulerapi.DeschedulerStrategy{
Enabled: true,
Params: &deschedulerapi.StrategyParameters{
RemoveDuplicates: &deschedulerapi.RemoveDuplicates{},
},
},
workerNodes,
podEvictor,
getPodsAssignedToNode,
)
waitForTerminatingPodsToDisappear(ctx, t, clientSet, testNamespace.Name)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount != tc.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Unexpected number of pods have been evicted, got %v, expected %v", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount)
}
})
}
}

View File

@@ -0,0 +1,156 @@
package e2e
import (
"context"
"strings"
"testing"
"time"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
deschedulerapi "sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies"
)
var oneHourPodLifetimeSeconds uint = 3600
func TestFailedPods(t *testing.T) {
ctx := context.Background()
clientSet, _, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Error listing node with %v", err)
}
nodes, _ := splitNodesAndWorkerNodes(nodeList.Items)
t.Log("Creating testing namespace")
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{})
testCases := map[string]struct {
expectedEvictedCount uint
strategyParams *deschedulerapi.StrategyParameters
}{
"test-failed-pods-nil-strategy": {
expectedEvictedCount: 1,
strategyParams: nil,
},
"test-failed-pods-default-strategy": {
expectedEvictedCount: 1,
strategyParams: &deschedulerapi.StrategyParameters{},
},
"test-failed-pods-default-failed-pods": {
expectedEvictedCount: 1,
strategyParams: &deschedulerapi.StrategyParameters{
FailedPods: &deschedulerapi.FailedPods{},
},
},
"test-failed-pods-reason-unmatched": {
expectedEvictedCount: 0,
strategyParams: &deschedulerapi.StrategyParameters{
FailedPods: &deschedulerapi.FailedPods{Reasons: []string{"ReasonDoesNotMatch"}},
},
},
"test-failed-pods-min-age-unmet": {
expectedEvictedCount: 0,
strategyParams: &deschedulerapi.StrategyParameters{
FailedPods: &deschedulerapi.FailedPods{MinPodLifetimeSeconds: &oneHourPodLifetimeSeconds},
},
},
"test-failed-pods-exclude-job-kind": {
expectedEvictedCount: 0,
strategyParams: &deschedulerapi.StrategyParameters{
FailedPods: &deschedulerapi.FailedPods{ExcludeOwnerKinds: []string{"Job"}},
},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
job := initFailedJob(name, testNamespace.Namespace)
t.Logf("Creating job %s in %s namespace", job.Name, job.Namespace)
jobClient := clientSet.BatchV1().Jobs(testNamespace.Name)
if _, err := jobClient.Create(ctx, job, metav1.CreateOptions{}); err != nil {
t.Fatalf("Error creating Job %s: %v", name, err)
}
deletePropagationPolicy := metav1.DeletePropagationForeground
defer jobClient.Delete(ctx, job.Name, metav1.DeleteOptions{PropagationPolicy: &deletePropagationPolicy})
waitForJobPodPhase(ctx, t, clientSet, job, v1.PodFailed)
podEvictor := initPodEvictorOrFail(t, clientSet, getPodsAssignedToNode, nodes)
t.Logf("Running RemoveFailedPods strategy for %s", name)
strategies.RemoveFailedPods(
ctx,
clientSet,
deschedulerapi.DeschedulerStrategy{
Enabled: true,
Params: tc.strategyParams,
},
nodes,
podEvictor,
getPodsAssignedToNode,
)
t.Logf("Finished RemoveFailedPods strategy for %s", name)
if actualEvictedCount := podEvictor.TotalEvicted(); actualEvictedCount == tc.expectedEvictedCount {
t.Logf("Total of %d Pods were evicted for %s", actualEvictedCount, name)
} else {
t.Errorf("Unexpected number of pods have been evicted, got %v, expected %v", actualEvictedCount, tc.expectedEvictedCount)
}
})
}
}
func initFailedJob(name, namespace string) *batchv1.Job {
podSpec := MakePodSpec("", nil)
podSpec.Containers[0].Command = []string{"/bin/false"}
podSpec.RestartPolicy = v1.RestartPolicyNever
labelsSet := labels.Set{"test": name, "name": name}
jobBackoffLimit := int32(0)
return &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Labels: labelsSet,
Name: name,
Namespace: namespace,
},
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Spec: podSpec,
ObjectMeta: metav1.ObjectMeta{Labels: labelsSet},
},
BackoffLimit: &jobBackoffLimit,
},
}
}
func waitForJobPodPhase(ctx context.Context, t *testing.T, clientSet clientset.Interface, job *batchv1.Job, phase v1.PodPhase) {
podClient := clientSet.CoreV1().Pods(job.Namespace)
if err := wait.PollImmediate(5*time.Second, 30*time.Second, func() (bool, error) {
t.Log(labels.FormatLabels(job.Labels))
if podList, err := podClient.List(ctx, metav1.ListOptions{LabelSelector: labels.FormatLabels(job.Labels)}); err != nil {
return false, err
} else {
if len(podList.Items) == 0 {
t.Logf("Job controller has not created Pod for job %s yet", job.Name)
return false, nil
}
for _, pod := range podList.Items {
if pod.Status.Phase != phase {
t.Logf("Pod %v not in %s phase yet, is %v instead", pod.Name, phase, pod.Status.Phase)
return false, nil
}
}
t.Logf("Job %v Pod is in %s phase now", job.Name, phase)
return true, nil
}
}); err != nil {
t.Fatalf("Error waiting for pods in %s phase: %v", phase, err)
}
}

View File

@@ -0,0 +1,201 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"context"
"fmt"
"os"
"reflect"
"strings"
"testing"
"time"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
clientset "k8s.io/client-go/kubernetes"
"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/pkg/descheduler"
)
func TestLeaderElection(t *testing.T) {
ctx := context.Background()
clientSet, _, _, stopCh := initializeClient(t)
defer close(stopCh)
ns1 := "e2e-" + strings.ToLower(t.Name()+"-a")
ns2 := "e2e-" + strings.ToLower(t.Name()+"-b")
t.Logf("Creating testing namespace %v", ns1)
testNamespace1 := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns1}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace1, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace1.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace1.Name, metav1.DeleteOptions{})
t.Logf("Creating testing namespace %v", ns2)
testNamespace2 := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns2}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace2, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace2.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace2.Name, metav1.DeleteOptions{})
deployment1, err := createDeployment(ctx, clientSet, ns1, 5, t)
if err != nil {
t.Fatalf("create deployment 1: %v", err)
}
defer clientSet.AppsV1().Deployments(deployment1.Namespace).Delete(ctx, deployment1.Name, metav1.DeleteOptions{})
deployment2, err := createDeployment(ctx, clientSet, ns2, 5, t)
if err != nil {
t.Fatalf("create deployment 2: %v", err)
}
defer clientSet.AppsV1().Deployments(deployment2.Namespace).Delete(ctx, deployment2.Name, metav1.DeleteOptions{})
waitForPodsRunning(ctx, t, clientSet, map[string]string{"test": "leaderelection", "name": "test-leaderelection"}, 5, ns1)
podListAOrg := getPodNameList(ctx, clientSet, ns1, t)
waitForPodsRunning(ctx, t, clientSet, map[string]string{"test": "leaderelection", "name": "test-leaderelection"}, 5, ns2)
podListBOrg := getPodNameList(ctx, clientSet, ns2, t)
s1, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("unable to initialize server: %v", err)
}
s1.Client = clientSet
s1.DeschedulingInterval = 5 * time.Second
s1.LeaderElection.LeaderElect = true
s1.KubeconfigFile = os.Getenv("KUBECONFIG")
s1.PolicyConfigFile = "./policy_leaderelection_a.yaml"
s2, err := options.NewDeschedulerServer()
if err != nil {
t.Fatalf("unable to initialize server: %v", err)
}
s2.Client = clientSet
s2.DeschedulingInterval = 5 * time.Second
s2.LeaderElection.LeaderElect = true
s2.KubeconfigFile = os.Getenv("KUBECONFIG")
s2.PolicyConfigFile = "./policy_leaderelection_b.yaml"
t.Log("starting deschedulers")
go func() {
err := descheduler.Run(ctx, s1)
if err != nil {
t.Errorf("unable to start descheduler: %v", err)
return
}
}()
time.Sleep(1 * time.Second)
go func() {
err := descheduler.Run(ctx, s2)
if err != nil {
t.Errorf("unable to start descheduler: %v", err)
return
}
}()
defer clientSet.CoordinationV1().Leases(s1.LeaderElection.ResourceNamespace).Delete(ctx, s1.LeaderElection.ResourceName, metav1.DeleteOptions{})
defer clientSet.CoordinationV1().Leases(s2.LeaderElection.ResourceNamespace).Delete(ctx, s2.LeaderElection.ResourceName, metav1.DeleteOptions{})
// wait for a while so all the pods are 5 seconds older
time.Sleep(7 * time.Second)
// validate only pods from e2e-testleaderelection-a namespace are evicted.
podListA := getPodNameList(ctx, clientSet, ns1, t)
podListB := getPodNameList(ctx, clientSet, ns2, t)
left := reflect.DeepEqual(podListAOrg, podListA)
right := reflect.DeepEqual(podListBOrg, podListB)
singleNamespaceEvicted := (left && !right) || (!left && right)
if singleNamespaceEvicted {
if !left {
t.Logf("Only the pods in %s namespace are evicted. Pods before: %s, Pods after %s", ns1, podListAOrg, podListA)
} else {
t.Logf("Only the pods in %s namespace are evicted. Pods before: %s, Pods after %s", ns2, podListBOrg, podListB)
}
} else {
t.Fatalf("Pods are evicted in both namespaces. For %s namespace Pods before: %s, Pods after %s. And, for %s namespace Pods before: %s, Pods after: %s", ns1, podListAOrg, podListA, ns2, podListBOrg, podListB)
}
}
func createDeployment(ctx context.Context, clientSet clientset.Interface, namespace string, replicas int32, t *testing.T) (*appsv1.Deployment, error) {
deployment := &appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "leaderelection",
Namespace: namespace,
Labels: map[string]string{"test": "leaderelection", "name": "test-leaderelection"},
},
Spec: appsv1.DeploymentSpec{
Replicas: func(i int32) *int32 { return &i }(replicas),
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"test": "leaderelection", "name": "test-leaderelection"},
},
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{"test": "leaderelection", "name": "test-leaderelection"},
},
Spec: v1.PodSpec{
Containers: []v1.Container{{
Name: "pause",
ImagePullPolicy: "Always",
Image: "kubernetes/pause",
Ports: []v1.ContainerPort{{ContainerPort: 80}},
}},
},
},
},
}
t.Logf("Creating deployment %v for namespace %s", deployment.Name, deployment.Namespace)
deployment, err := clientSet.AppsV1().Deployments(deployment.Namespace).Create(ctx, deployment, metav1.CreateOptions{})
if err != nil {
t.Logf("Error creating deployment: %v", err)
if err = clientSet.AppsV1().Deployments(deployment.Namespace).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(labels.Set(map[string]string{"test": "leaderelection", "name": "test-leaderelection"})).String(),
}); err != nil {
t.Fatalf("Unable to delete deployment: %v", err)
}
return nil, fmt.Errorf("create deployment %v", err)
}
return deployment, nil
}
func getPodNameList(ctx context.Context, clientSet clientset.Interface, namespace string, t *testing.T) []string {
podList, err := clientSet.CoreV1().Pods(namespace).List(
ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(labels.Set(map[string]string{"test": "leaderelection", "name": "test-leaderelection"})).String()})
if err != nil {
t.Fatalf("Unable to list pods from ns: %s: %v", namespace, err)
}
podNames := make([]string, len(podList.Items))
for i, pod := range podList.Items {
podNames[i] = pod.Name
}
return podNames
}

View File

@@ -39,7 +39,6 @@ import (
v1qos "k8s.io/kubectl/pkg/util/qos"
"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
"sigs.k8s.io/descheduler/pkg/api"
deschedulerapi "sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler"
"sigs.k8s.io/descheduler/pkg/descheduler/client"
@@ -62,11 +61,11 @@ func MakePodSpec(priorityClassName string, gracePeriod *int64) v1.PodSpec {
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("1000Mi"),
v1.ResourceMemory: resource.MustParse("200Mi"),
},
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("100m"),
v1.ResourceMemory: resource.MustParse("800Mi"),
v1.ResourceMemory: resource.MustParse("100Mi"),
},
},
}},
@@ -75,7 +74,7 @@ func MakePodSpec(priorityClassName string, gracePeriod *int64) v1.PodSpec {
}
}
// RcByNameContainer returns a ReplicationControoler with specified name and container
// RcByNameContainer returns a ReplicationController with specified name and container
func RcByNameContainer(name, namespace string, replicas int32, labels map[string]string, gracePeriod *int64, priorityClassName string) *v1.ReplicationController {
zeroGracePeriod := int64(0)
@@ -108,7 +107,7 @@ func RcByNameContainer(name, namespace string, replicas int32, labels map[string
}
}
func initializeClient(t *testing.T) (clientset.Interface, coreinformers.NodeInformer, chan struct{}) {
func initializeClient(t *testing.T) (clientset.Interface, coreinformers.NodeInformer, podutil.GetPodsAssignedToNodeFunc, chan struct{}) {
clientSet, err := client.CreateClient(os.Getenv("KUBECONFIG"))
if err != nil {
t.Errorf("Error during client creation with %v", err)
@@ -117,12 +116,41 @@ func initializeClient(t *testing.T) (clientset.Interface, coreinformers.NodeInfo
stopChannel := make(chan struct{})
sharedInformerFactory := informers.NewSharedInformerFactory(clientSet, 0)
nodeInformer := sharedInformerFactory.Core().V1().Nodes()
podInformer := sharedInformerFactory.Core().V1().Pods()
getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
if err != nil {
t.Errorf("build get pods assigned to node function error: %v", err)
}
sharedInformerFactory.Start(stopChannel)
sharedInformerFactory.WaitForCacheSync(stopChannel)
nodeInformer := sharedInformerFactory.Core().V1().Nodes()
waitForNodesReady(context.Background(), t, clientSet, nodeInformer)
return clientSet, nodeInformer, getPodsAssignedToNode, stopChannel
}
return clientSet, nodeInformer, stopChannel
func waitForNodesReady(ctx context.Context, t *testing.T, clientSet clientset.Interface, nodeInformer coreinformers.NodeInformer) {
if err := wait.PollImmediate(5*time.Second, 30*time.Second, func() (bool, error) {
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return false, err
}
readyNodes, err := nodeutil.ReadyNodes(ctx, clientSet, nodeInformer, "")
if err != nil {
return false, err
}
if len(nodeList.Items) != len(readyNodes) {
t.Logf("%v/%v nodes are ready. Waiting for all nodes to be ready...", len(readyNodes), len(nodeList.Items))
return false, nil
}
return true, nil
}); err != nil {
t.Fatalf("Error waiting for nodes to be ready: %v", err)
}
}
func runPodLifetimeStrategy(
@@ -135,6 +163,7 @@ func runPodLifetimeStrategy(
priority *int32,
evictCritical bool,
labelSelector *metav1.LabelSelector,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) {
// Run descheduler.
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientset)
@@ -166,12 +195,17 @@ func runPodLifetimeStrategy(
clientset,
evictionPolicyGroupVersion,
false,
0,
nil,
nil,
nodes,
getPodsAssignedToNode,
false,
evictCritical,
false,
false,
false,
),
getPodsAssignedToNode,
)
}
@@ -201,7 +235,7 @@ func intersectStrings(lista, listb []string) []string {
func TestLowNodeUtilization(t *testing.T) {
ctx := context.Background()
clientSet, _, stopCh := initializeClient(t)
clientSet, _, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
@@ -209,15 +243,7 @@ func TestLowNodeUtilization(t *testing.T) {
t.Errorf("Error listing node with %v", err)
}
var nodes []*v1.Node
var workerNodes []*v1.Node
for i := range nodeList.Items {
node := nodeList.Items[i]
nodes = append(nodes, &node)
if _, exists := node.Labels["node-role.kubernetes.io/master"]; !exists {
workerNodes = append(workerNodes, &node)
}
}
nodes, workerNodes := splitNodesAndWorkerNodes(nodeList.Items)
t.Log("Creating testing namespace")
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -298,22 +324,14 @@ func TestLowNodeUtilization(t *testing.T) {
waitForRCPodsRunning(ctx, t, clientSet, rc)
// Run LowNodeUtilization strategy
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientSet)
if err != nil || len(evictionPolicyGroupVersion) == 0 {
t.Fatalf("%v", err)
}
podEvictor := evictions.NewPodEvictor(
clientSet,
evictionPolicyGroupVersion,
false,
0,
nodes,
true,
false,
false,
)
podEvictor := initPodEvictorOrFail(t, clientSet, getPodsAssignedToNode, nodes)
podsOnMosttUtilizedNode, err := podutil.ListPodsOnANode(ctx, clientSet, workerNodes[0], podutil.WithFilter(podEvictor.Evictable().IsEvictable))
podFilter, err := podutil.NewOptions().WithFilter(podEvictor.Evictable().IsEvictable).BuildFilterFunc()
if err != nil {
t.Errorf("Error initializing pod filter function, %v", err)
}
podsOnMosttUtilizedNode, err := podutil.ListPodsOnANode(workerNodes[0].Name, getPodsAssignedToNode, podFilter)
if err != nil {
t.Errorf("Error listing pods on a node %v", err)
}
@@ -338,11 +356,17 @@ func TestLowNodeUtilization(t *testing.T) {
},
workerNodes,
podEvictor,
getPodsAssignedToNode,
)
waitForTerminatingPodsToDisappear(ctx, t, clientSet, rc.Namespace)
podsOnMosttUtilizedNode, err = podutil.ListPodsOnANode(ctx, clientSet, workerNodes[0], podutil.WithFilter(podEvictor.Evictable().IsEvictable))
podFilter, err = podutil.NewOptions().WithFilter(podEvictor.Evictable().IsEvictable).BuildFilterFunc()
if err != nil {
t.Errorf("Error initializing pod filter function, %v", err)
}
podsOnMosttUtilizedNode, err = podutil.ListPodsOnANode(workerNodes[0].Name, getPodsAssignedToNode, podFilter)
if err != nil {
t.Errorf("Error listing pods on a node %v", err)
}
@@ -359,7 +383,7 @@ func TestLowNodeUtilization(t *testing.T) {
func TestNamespaceConstraintsInclude(t *testing.T) {
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -394,7 +418,7 @@ func TestNamespaceConstraintsInclude(t *testing.T) {
t.Logf("set the strategy to delete pods from %v namespace", rc.Namespace)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, &deschedulerapi.Namespaces{
Include: []string{rc.Namespace},
}, "", nil, false, nil)
}, "", nil, false, nil, getPodsAssignedToNode)
// All pods are supposed to be deleted, wait until all the old pods are deleted
if err := wait.PollImmediate(time.Second, 20*time.Second, func() (bool, error) {
@@ -430,7 +454,7 @@ func TestNamespaceConstraintsInclude(t *testing.T) {
func TestNamespaceConstraintsExclude(t *testing.T) {
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -465,7 +489,7 @@ func TestNamespaceConstraintsExclude(t *testing.T) {
t.Logf("set the strategy to delete pods from namespaces except the %v namespace", rc.Namespace)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, &deschedulerapi.Namespaces{
Exclude: []string{rc.Namespace},
}, "", nil, false, nil)
}, "", nil, false, nil, getPodsAssignedToNode)
t.Logf("Waiting 10s")
time.Sleep(10 * time.Second)
@@ -493,11 +517,11 @@ func TestEvictSystemCriticalPriorityClass(t *testing.T) {
}
func testEvictSystemCritical(t *testing.T, isPriorityClass bool) {
var highPriority = int32(1000)
var lowPriority = int32(500)
highPriority := int32(1000)
lowPriority := int32(500)
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -578,9 +602,9 @@ func testEvictSystemCritical(t *testing.T, isPriorityClass bool) {
t.Logf("Existing pods: %v", initialPodNames)
if isPriorityClass {
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, highPriorityClass.Name, nil, true, nil)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, highPriorityClass.Name, nil, true, nil, getPodsAssignedToNode)
} else {
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", &highPriority, true, nil)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", &highPriority, true, nil, getPodsAssignedToNode)
}
// All pods are supposed to be deleted, wait until all pods in the test namespace are terminating
@@ -623,11 +647,11 @@ func TestThresholdPriorityClass(t *testing.T) {
}
func testPriority(t *testing.T, isPriorityClass bool) {
var highPriority = int32(1000)
var lowPriority = int32(500)
highPriority := int32(1000)
lowPriority := int32(500)
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -697,10 +721,10 @@ func testPriority(t *testing.T, isPriorityClass bool) {
if isPriorityClass {
t.Logf("set the strategy to delete pods with priority lower than priority class %s", highPriorityClass.Name)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, highPriorityClass.Name, nil, false, nil)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, highPriorityClass.Name, nil, false, nil, getPodsAssignedToNode)
} else {
t.Logf("set the strategy to delete pods with priority lower than %d", highPriority)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", &highPriority, false, nil)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", &highPriority, false, nil, getPodsAssignedToNode)
}
t.Logf("Waiting 10s")
@@ -721,7 +745,7 @@ func testPriority(t *testing.T, isPriorityClass bool) {
t.Fatalf("None of %v high priority pods are expected to be deleted", expectReservePodNames)
}
//check if all pods with low priority class are evicted
// check if all pods with low priority class are evicted
if err := wait.PollImmediate(time.Second, 30*time.Second, func() (bool, error) {
podListLowPriority, err := clientSet.CoreV1().Pods(rcLowPriority.Namespace).List(
ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcLowPriority.Spec.Template.Labels).String()})
@@ -756,7 +780,7 @@ func testPriority(t *testing.T, isPriorityClass bool) {
func TestPodLabelSelector(t *testing.T) {
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
@@ -804,7 +828,7 @@ func TestPodLabelSelector(t *testing.T) {
t.Logf("Pods not expected to be evicted: %v, pods expected to be evicted: %v", expectReservePodNames, expectEvictPodNames)
t.Logf("set the strategy to delete pods with label test:podlifetime-evict")
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", nil, false, &metav1.LabelSelector{MatchLabels: map[string]string{"test": "podlifetime-evict"}})
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", nil, false, &metav1.LabelSelector{MatchLabels: map[string]string{"test": "podlifetime-evict"}}, getPodsAssignedToNode)
t.Logf("Waiting 10s")
time.Sleep(10 * time.Second)
@@ -824,7 +848,7 @@ func TestPodLabelSelector(t *testing.T) {
t.Fatalf("None of %v unevictable pods are expected to be deleted", expectReservePodNames)
}
//check if all selected pods are evicted
// check if all selected pods are evicted
if err := wait.PollImmediate(time.Second, 30*time.Second, func() (bool, error) {
podListEvict, err := clientSet.CoreV1().Pods(rcEvict.Namespace).List(
ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rcEvict.Spec.Template.Labels).String()})
@@ -859,20 +883,9 @@ func TestPodLabelSelector(t *testing.T) {
func TestEvictAnnotation(t *testing.T) {
ctx := context.Background()
clientSet, nodeInformer, stopCh := initializeClient(t)
clientSet, nodeInformer, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Error listing node with %v", err)
}
var nodes []*v1.Node
for i := range nodeList.Items {
node := nodeList.Items[i]
nodes = append(nodes, &node)
}
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace.Name)
@@ -887,7 +900,8 @@ func TestEvictAnnotation(t *testing.T) {
Name: "sample",
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI),
},
},
},
}
@@ -914,7 +928,7 @@ func TestEvictAnnotation(t *testing.T) {
t.Logf("Existing pods: %v", initialPodNames)
t.Log("Running PodLifetime strategy")
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", nil, false, nil)
runPodLifetimeStrategy(ctx, t, clientSet, nodeInformer, nil, "", nil, false, nil, getPodsAssignedToNode)
if err := wait.PollImmediate(5*time.Second, time.Minute, func() (bool, error) {
podList, err = clientSet.CoreV1().Pods(rc.Namespace).List(ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(rc.Spec.Template.Labels).String()})
@@ -952,7 +966,7 @@ func TestDeschedulingInterval(t *testing.T) {
}
s.Client = clientSet
deschedulerPolicy := &api.DeschedulerPolicy{}
deschedulerPolicy := &deschedulerapi.DeschedulerPolicy{}
c := make(chan bool, 1)
go func() {
@@ -960,9 +974,7 @@ func TestDeschedulingInterval(t *testing.T) {
if err != nil || len(evictionPolicyGroupVersion) == 0 {
t.Errorf("Error when checking support for eviction: %v", err)
}
stopChannel := make(chan struct{})
if err := descheduler.RunDeschedulerStrategies(ctx, s, deschedulerPolicy, evictionPolicyGroupVersion, stopChannel); err != nil {
if err := descheduler.RunDeschedulerStrategies(ctx, s, deschedulerPolicy, evictionPolicyGroupVersion); err != nil {
t.Errorf("Error running descheduler strategies: %+v", err)
}
c <- true
@@ -1121,8 +1133,8 @@ func createBalancedPodForNodes(
// find the max, if the node has the max,use the one, if not,use the ratio parameter
var maxCPUFraction, maxMemFraction float64 = ratio, ratio
var cpuFractionMap = make(map[string]float64)
var memFractionMap = make(map[string]float64)
cpuFractionMap := make(map[string]float64)
memFractionMap := make(map[string]float64)
for _, node := range nodes {
cpuFraction, memFraction, _, _ := computeCPUMemFraction(t, cs, node, podRequestedResource)
@@ -1159,7 +1171,7 @@ func createBalancedPodForNodes(
// add crioMinMemLimit to ensure that all pods are setting at least that much for a limit, while keeping the same ratios
needCreateResource[v1.ResourceMemory] = *resource.NewQuantity(int64((ratio-memFraction)*float64(memAllocatableVal)+float64(crioMinMemLimit)), resource.BinarySI)
var gracePeriod = int64(1)
gracePeriod := int64(1)
// Don't set OwnerReferences to avoid pod eviction
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
@@ -1288,3 +1300,61 @@ func waitForPodRunning(ctx context.Context, t *testing.T, clientSet clientset.In
t.Fatalf("Error waiting for pod running: %v", err)
}
}
func waitForPodsRunning(ctx context.Context, t *testing.T, clientSet clientset.Interface, labelMap map[string]string, desireRunningPodNum int, namespace string) {
if err := wait.PollImmediate(10*time.Second, 60*time.Second, func() (bool, error) {
podList, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(labelMap).String(),
})
if err != nil {
return false, err
}
if len(podList.Items) != desireRunningPodNum {
t.Logf("Waiting for %v pods to be running, got %v instead", desireRunningPodNum, len(podList.Items))
return false, nil
}
for _, pod := range podList.Items {
if pod.Status.Phase != v1.PodRunning {
t.Logf("Pod %v not running yet, is %v instead", pod.Name, pod.Status.Phase)
return false, nil
}
}
return true, nil
}); err != nil {
t.Fatalf("Error waiting for pods running: %v", err)
}
}
func splitNodesAndWorkerNodes(nodes []v1.Node) ([]*v1.Node, []*v1.Node) {
var allNodes []*v1.Node
var workerNodes []*v1.Node
for i := range nodes {
node := nodes[i]
allNodes = append(allNodes, &node)
if _, exists := node.Labels["node-role.kubernetes.io/control-plane"]; !exists {
workerNodes = append(workerNodes, &node)
}
}
return allNodes, workerNodes
}
func initPodEvictorOrFail(t *testing.T, clientSet clientset.Interface, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, nodes []*v1.Node) *evictions.PodEvictor {
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientSet)
if err != nil || len(evictionPolicyGroupVersion) == 0 {
t.Fatalf("Error creating eviction policy group: %v", err)
}
return evictions.NewPodEvictor(
clientSet,
evictionPolicyGroupVersion,
false,
nil,
nil,
nodes,
getPodsAssignedToNode,
true,
false,
false,
false,
false,
)
}

View File

@@ -0,0 +1,208 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"context"
"fmt"
"strings"
"testing"
"time"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
clientset "k8s.io/client-go/kubernetes"
deschedulerapi "sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
eutils "sigs.k8s.io/descheduler/pkg/descheduler/evictions/utils"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies"
)
func TestTooManyRestarts(t *testing.T) {
ctx := context.Background()
clientSet, _, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Error listing node with %v", err)
}
nodes, workerNodes := splitNodesAndWorkerNodes(nodeList.Items)
t.Logf("Creating testing namespace %v", t.Name())
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{})
deploymentObj := &appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "restart-pod",
Namespace: testNamespace.Name,
Labels: map[string]string{"test": "restart-pod", "name": "test-toomanyrestarts"},
},
Spec: appsv1.DeploymentSpec{
Replicas: func(i int32) *int32 { return &i }(4),
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{"test": "restart-pod", "name": "test-toomanyrestarts"},
},
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{"test": "restart-pod", "name": "test-toomanyrestarts"},
},
Spec: v1.PodSpec{
Containers: []v1.Container{{
Name: "pause",
ImagePullPolicy: "Always",
Image: "kubernetes/pause",
Command: []string{"/bin/sh"},
Args: []string{"-c", "sleep 1s && exit 1"},
Ports: []v1.ContainerPort{{ContainerPort: 80}},
}},
},
},
},
}
t.Logf("Creating deployment %v", deploymentObj.Name)
_, err = clientSet.AppsV1().Deployments(deploymentObj.Namespace).Create(ctx, deploymentObj, metav1.CreateOptions{})
if err != nil {
t.Logf("Error creating deployment: %v", err)
if err = clientSet.AppsV1().Deployments(deploymentObj.Namespace).DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(labels.Set(map[string]string{"test": "restart-pod", "name": "test-toomanyrestarts"})).String(),
}); err != nil {
t.Fatalf("Unable to delete deployment: %v", err)
}
return
}
defer clientSet.AppsV1().Deployments(deploymentObj.Namespace).Delete(ctx, deploymentObj.Name, metav1.DeleteOptions{})
// Need to wait restartCount more than 4
result, err := waitPodRestartCount(ctx, clientSet, testNamespace.Name, t)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
if !result {
t.Fatal("Pod restart count not as expected")
}
tests := []struct {
name string
podRestartThreshold int32
includingInitContainers bool
expectedEvictedPodCount uint
}{
{
name: "test-no-evictions",
podRestartThreshold: int32(10000),
includingInitContainers: true,
expectedEvictedPodCount: 0,
},
{
name: "test-one-evictions",
podRestartThreshold: int32(4),
includingInitContainers: true,
expectedEvictedPodCount: 4,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
evictionPolicyGroupVersion, err := eutils.SupportEviction(clientSet)
if err != nil || len(evictionPolicyGroupVersion) == 0 {
t.Fatalf("Error creating eviction policy group: %v", err)
}
podEvictor := evictions.NewPodEvictor(
clientSet,
evictionPolicyGroupVersion,
false,
nil,
nil,
nodes,
getPodsAssignedToNode,
true,
false,
false,
false,
false,
)
// Run RemovePodsHavingTooManyRestarts strategy
t.Log("Running RemovePodsHavingTooManyRestarts strategy")
strategies.RemovePodsHavingTooManyRestarts(
ctx,
clientSet,
deschedulerapi.DeschedulerStrategy{
Enabled: true,
Params: &deschedulerapi.StrategyParameters{
PodsHavingTooManyRestarts: &deschedulerapi.PodsHavingTooManyRestarts{
PodRestartThreshold: tc.podRestartThreshold,
IncludingInitContainers: tc.includingInitContainers,
},
},
},
workerNodes,
podEvictor,
getPodsAssignedToNode,
)
waitForTerminatingPodsToDisappear(ctx, t, clientSet, testNamespace.Name)
actualEvictedPodCount := podEvictor.TotalEvicted()
if actualEvictedPodCount < tc.expectedEvictedPodCount {
t.Errorf("Test error for description: %s. Unexpected number of pods have been evicted, got %v, expected %v", tc.name, actualEvictedPodCount, tc.expectedEvictedPodCount)
}
})
}
}
func waitPodRestartCount(ctx context.Context, clientSet clientset.Interface, namespace string, t *testing.T) (bool, error) {
timeout := time.After(5 * time.Minute)
tick := time.Tick(5 * time.Second)
for {
select {
case <-timeout:
t.Log("Timeout, still restart count not as expected")
return false, fmt.Errorf("timeout Error")
case <-tick:
podList, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: labels.SelectorFromSet(labels.Set(map[string]string{"test": "restart-pod", "name": "test-toomanyrestarts"})).String(),
})
if err != nil {
t.Fatalf("Unexpected err: %v", err)
return false, err
}
if len(podList.Items) < 4 {
t.Log("Waiting for 4 pods")
return false, nil
}
for i := 0; i < 4; i++ {
if len(podList.Items[0].Status.ContainerStatuses) < 1 {
t.Logf("Waiting for podList.Items[%v].Status.ContainerStatuses to be populated", i)
return false, nil
}
}
if podList.Items[0].Status.ContainerStatuses[0].RestartCount >= 4 && podList.Items[1].Status.ContainerStatuses[0].RestartCount >= 4 && podList.Items[2].Status.ContainerStatuses[0].RestartCount >= 4 && podList.Items[3].Status.ContainerStatuses[0].RestartCount >= 4 {
t.Log("Pod restartCount as expected")
return true, nil
}
}
}
}

View File

@@ -0,0 +1,159 @@
package e2e
import (
"context"
"fmt"
"math"
"strings"
"testing"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
deschedulerapi "sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/strategies"
)
const zoneTopologyKey string = "topology.kubernetes.io/zone"
func TestTopologySpreadConstraint(t *testing.T) {
ctx := context.Background()
clientSet, _, getPodsAssignedToNode, stopCh := initializeClient(t)
defer close(stopCh)
nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
t.Errorf("Error listing node with %v", err)
}
nodes, workerNodes := splitNodesAndWorkerNodes(nodeList.Items)
t.Log("Creating testing namespace")
testNamespace := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-" + strings.ToLower(t.Name())}}
if _, err := clientSet.CoreV1().Namespaces().Create(ctx, testNamespace, metav1.CreateOptions{}); err != nil {
t.Fatalf("Unable to create ns %v", testNamespace.Name)
}
defer clientSet.CoreV1().Namespaces().Delete(ctx, testNamespace.Name, metav1.DeleteOptions{})
testCases := map[string]struct {
replicaCount int
maxSkew int
labelKey string
labelValue string
constraint v1.UnsatisfiableConstraintAction
}{
"test-rc-topology-spread-hard-constraint": {
replicaCount: 4,
maxSkew: 1,
labelKey: "test",
labelValue: "topology-spread-hard-constraint",
constraint: v1.DoNotSchedule,
},
"test-rc-topology-spread-soft-constraint": {
replicaCount: 4,
maxSkew: 1,
labelKey: "test",
labelValue: "topology-spread-soft-constraint",
constraint: v1.ScheduleAnyway,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
t.Logf("Creating RC %s with %d replicas", name, tc.replicaCount)
rc := RcByNameContainer(name, testNamespace.Name, int32(tc.replicaCount), map[string]string{tc.labelKey: tc.labelValue}, nil, "")
rc.Spec.Template.Spec.TopologySpreadConstraints = makeTopologySpreadConstraints(tc.maxSkew, tc.labelKey, tc.labelValue, tc.constraint)
if _, err := clientSet.CoreV1().ReplicationControllers(rc.Namespace).Create(ctx, rc, metav1.CreateOptions{}); err != nil {
t.Fatalf("Error creating RC %s %v", name, err)
}
defer deleteRC(ctx, t, clientSet, rc)
waitForRCPodsRunning(ctx, t, clientSet, rc)
// Create a "Violator" RC that has the same label and is forced to be on the same node using a nodeSelector
violatorRcName := name + "-violator"
violatorCount := tc.maxSkew + 1
violatorRc := RcByNameContainer(violatorRcName, testNamespace.Name, int32(violatorCount), map[string]string{tc.labelKey: tc.labelValue}, nil, "")
violatorRc.Spec.Template.Spec.NodeSelector = map[string]string{zoneTopologyKey: workerNodes[0].Labels[zoneTopologyKey]}
rc.Spec.Template.Spec.TopologySpreadConstraints = makeTopologySpreadConstraints(tc.maxSkew, tc.labelKey, tc.labelValue, tc.constraint)
if _, err := clientSet.CoreV1().ReplicationControllers(rc.Namespace).Create(ctx, violatorRc, metav1.CreateOptions{}); err != nil {
t.Fatalf("Error creating RC %s: %v", violatorRcName, err)
}
defer deleteRC(ctx, t, clientSet, violatorRc)
waitForRCPodsRunning(ctx, t, clientSet, violatorRc)
podEvictor := initPodEvictorOrFail(t, clientSet, getPodsAssignedToNode, nodes)
// Run TopologySpreadConstraint strategy
t.Logf("Running RemovePodsViolatingTopologySpreadConstraint strategy for %s", name)
strategies.RemovePodsViolatingTopologySpreadConstraint(
ctx,
clientSet,
deschedulerapi.DeschedulerStrategy{
Enabled: true,
Params: &deschedulerapi.StrategyParameters{
IncludeSoftConstraints: tc.constraint != v1.DoNotSchedule,
},
},
nodes,
podEvictor,
getPodsAssignedToNode,
)
t.Logf("Finished RemovePodsViolatingTopologySpreadConstraint strategy for %s", name)
t.Logf("Wait for terminating pods of %s to disappear", name)
waitForTerminatingPodsToDisappear(ctx, t, clientSet, rc.Namespace)
if totalEvicted := podEvictor.TotalEvicted(); totalEvicted > 0 {
t.Logf("Total of %d Pods were evicted for %s", totalEvicted, name)
} else {
t.Fatalf("Pods were not evicted for %s TopologySpreadConstraint", name)
}
// Ensure recently evicted Pod are rescheduled and running before asserting for a balanced topology spread
waitForRCPodsRunning(ctx, t, clientSet, rc)
pods, err := clientSet.CoreV1().Pods(testNamespace.Name).List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("%s=%s", tc.labelKey, tc.labelValue)})
if err != nil {
t.Errorf("Error listing pods for %s: %v", name, err)
}
nodePodCountMap := make(map[string]int)
for _, pod := range pods.Items {
nodePodCountMap[pod.Spec.NodeName]++
}
if len(nodePodCountMap) != len(workerNodes) {
t.Errorf("%s Pods were scheduled on only '%d' nodes and were not properly distributed on the nodes", name, len(nodePodCountMap))
}
min, max := getMinAndMaxPodDistribution(nodePodCountMap)
if max-min > tc.maxSkew {
t.Errorf("Pod distribution for %s is still violating the max skew of %d as it is %d", name, tc.maxSkew, max-min)
}
t.Logf("Pods for %s were distributed in line with max skew of %d", name, tc.maxSkew)
})
}
}
func makeTopologySpreadConstraints(maxSkew int, labelKey, labelValue string, constraint v1.UnsatisfiableConstraintAction) []v1.TopologySpreadConstraint {
return []v1.TopologySpreadConstraint{
{
MaxSkew: int32(maxSkew),
TopologyKey: zoneTopologyKey,
WhenUnsatisfiable: constraint,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{labelKey: labelValue}},
},
}
}
func getMinAndMaxPodDistribution(nodePodCountMap map[string]int) (int, int) {
min := math.MaxInt32
max := math.MinInt32
for _, podCount := range nodePodCountMap {
if podCount < min {
min = podCount
}
if podCount > max {
max = podCount
}
}
return min, max
}

Some files were not shown because too many files have changed in this diff Show More