From b63b09089e4fb4ffbc21d70a4a5478ecd6d2e0b3 Mon Sep 17 00:00:00 2001 From: lowang-bh Date: Sat, 26 Aug 2023 20:03:05 +0800 Subject: [PATCH] metrics name refact and mark old one deprecated Signed-off-by: lowang-bh --- README.md | 9 ++++--- metrics/metrics.go | 37 +++++++++++++++++++++++--- pkg/descheduler/descheduler.go | 1 + pkg/descheduler/evictions/evictions.go | 5 ++++ pkg/framework/profile/profile.go | 2 ++ 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 09ee596af..49a599393 100644 --- a/README.md +++ b/README.md @@ -1050,9 +1050,12 @@ To get best results from HA mode some additional configurations might require: | name | type | description | |---------------------------------------|--------------|-----------------------------------------------------------------------------------| | build_info | gauge | constant 1 | -| pods_evicted | CounterVec | total number of pods evicted | -| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) | -| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each stragtegy of descheduling operation (support _bucket, _sum, _count) | +| pods_evicted | CounterVec | total number of pods evicted (deprecated in v0.34.0) | +| pods_evicted_total | CounterVec | total number of pods evicted | +| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) (deprecated in v0.34.0) | +| loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) | +| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each strategy of descheduling operation (support _bucket, _sum, _count) (deprecated in v0.34.0) | +| strategy_duration_seconds | HistogramVec | time taken to complete each strategy of descheduling operation 
(support _bucket, _sum, _count) | The metrics are served through https://localhost:10258/metrics by default. The address and port can be changed by setting `--binding-address` and `--secure-port` flags. diff --git a/metrics/metrics.go b/metrics/metrics.go index 07e9c3abd..2b8884939 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -31,10 +31,18 @@ const ( var ( PodsEvicted = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: DeschedulerSubsystem, + Name: "pods_evicted", + Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted", + StabilityLevel: metrics.ALPHA, + DeprecatedVersion: "0.34.0", + }, []string{"result", "strategy", "profile", "namespace", "node"}) + PodsEvictedTotal = metrics.NewCounterVec( &metrics.CounterOpts{ Subsystem: DeschedulerSubsystem, - Name: "pods_evicted", - Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted", + Name: "pods_evicted_total", + Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 
'error' result means a pod could not be evicted", StabilityLevel: metrics.ALPHA, }, []string{"result", "strategy", "profile", "namespace", "node"}) @@ -49,18 +57,36 @@ var ( ) DeschedulerLoopDuration = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Subsystem: DeschedulerSubsystem, + Name: "descheduler_loop_duration_seconds", + Help: "Time taken to complete a full descheduling cycle", + StabilityLevel: metrics.ALPHA, + DeprecatedVersion: "0.34.0", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500}, + }, []string{}) + LoopDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: DeschedulerSubsystem, - Name: "descheduler_loop_duration_seconds", + Name: "loop_duration_seconds", Help: "Time taken to complete a full descheduling cycle", StabilityLevel: metrics.ALPHA, Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500}, }, []string{}) DeschedulerStrategyDuration = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Subsystem: DeschedulerSubsystem, + Name: "descheduler_strategy_duration_seconds", + Help: "Time taken to complete Each strategy of the descheduling operation", + StabilityLevel: metrics.ALPHA, + DeprecatedVersion: "0.34.0", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100}, + }, []string{"strategy", "profile"}) + StrategyDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: DeschedulerSubsystem, - Name: "descheduler_strategy_duration_seconds", + Name: "strategy_duration_seconds", Help: "Time taken to complete Each strategy of the descheduling operation", StabilityLevel: metrics.ALPHA, Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100}, @@ -68,9 +94,12 @@ var ( metricsList = []metrics.Registerable{ PodsEvicted, + PodsEvictedTotal, buildInfo, DeschedulerLoopDuration, DeschedulerStrategyDuration, + LoopDuration, + StrategyDuration, } ) diff --git a/pkg/descheduler/descheduler.go 
b/pkg/descheduler/descheduler.go index 1a7129c59..b21d5c222 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -351,6 +351,7 @@ func (d *descheduler) runDeschedulerLoop(ctx context.Context, nodes []*v1.Node) defer span.End() defer func(loopStartDuration time.Time) { metrics.DeschedulerLoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds()) + metrics.LoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds()) }(time.Now()) // if len is still <= 1 error out diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go index 02c349d1d..258c3fa8f 100644 --- a/pkg/descheduler/evictions/evictions.go +++ b/pkg/descheduler/evictions/evictions.go @@ -482,6 +482,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio err := NewEvictionTotalLimitError() if pe.metricsEnabled { metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() + metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() } span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error()))) klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictTotal) @@ -496,6 +497,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio err := NewEvictionNodeLimitError(pod.Spec.NodeName) if pe.metricsEnabled { metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() + metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, 
"namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() } span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error()))) klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName) @@ -510,6 +512,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio err := NewEvictionNamespaceLimitError(pod.Namespace) if pe.metricsEnabled { metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() + metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() } span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error()))) klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace, "pod", klog.KObj(pod)) @@ -526,6 +529,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason) if pe.metricsEnabled { metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() + metrics.PodsEvictedTotal.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() } if pe.evictionFailureEventNotification { pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: %v", pod.Spec.NodeName, err.Error()) @@ -545,6 +549,7 @@ 
func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio if pe.metricsEnabled { metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() + metrics.PodsEvictedTotal.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc() } if pe.dryRun { diff --git a/pkg/framework/profile/profile.go b/pkg/framework/profile/profile.go index cd51accf0..8cf1f82b0 100644 --- a/pkg/framework/profile/profile.go +++ b/pkg/framework/profile/profile.go @@ -339,6 +339,7 @@ func (d profileImpl) RunDeschedulePlugins(ctx context.Context, nodes []*v1.Node) strategyStart := time.Now() status := pl.Deschedule(ctx, nodes) metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds()) + metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds()) if status != nil && status.Err != nil { span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error()))) @@ -368,6 +369,7 @@ func (d profileImpl) RunBalancePlugins(ctx context.Context, nodes []*v1.Node) *f strategyStart := time.Now() status := pl.Balance(ctx, nodes) metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds()) + metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds()) if status != nil && status.Err != nil { span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))