1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-25 20:59:28 +01:00

metrics name refact and mark old one deprecated

Signed-off-by: lowang-bh <lhui_wang@163.com>
This commit is contained in:
lowang-bh
2023-08-26 20:03:05 +08:00
parent 2d7528411a
commit b63b09089e
5 changed files with 47 additions and 7 deletions

View File

@@ -1050,9 +1050,12 @@ To get best results from HA mode some additional configurations might require:
| name | type | description |
|---------------------------------------|--------------|-----------------------------------------------------------------------------------|
| build_info | gauge | constant 1 |
| pods_evicted | CounterVec | total number of pods evicted |
| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) |
| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each stragtegy of descheduling operation (support _bucket, _sum, _count) |
| pods_evicted | CounterVec | total number of pods evicted, is deprecated in version v0.34.0 |
| pods_evicted_total | CounterVec | total number of pods evicted |
| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count), is deprecated in version v0.34.0 |
| loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) |
| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each stragtegy of descheduling operation (support _bucket, _sum, _count), is deprecated in version v0.34.0 |
| strategy_duration_seconds | HistogramVec | time taken to complete each stragtegy of descheduling operation (support _bucket, _sum, _count) |
The metrics are served through https://localhost:10258/metrics by default.
The address and port can be changed by setting `--binding-address` and `--secure-port` flags.

View File

@@ -31,10 +31,18 @@ const (
var (
PodsEvicted = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: DeschedulerSubsystem,
Name: "pods_evicted",
Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
StabilityLevel: metrics.ALPHA,
DeprecatedVersion: "0.34.0",
}, []string{"result", "strategy", "profile", "namespace", "node"})
PodsEvictedTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: DeschedulerSubsystem,
Name: "pods_evicted",
Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
Name: "pods_evicted_total",
Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
StabilityLevel: metrics.ALPHA,
}, []string{"result", "strategy", "profile", "namespace", "node"})
@@ -49,18 +57,36 @@ var (
)
DeschedulerLoopDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: DeschedulerSubsystem,
Name: "descheduler_loop_duration_seconds",
Help: "Time taken to complete a full descheduling cycle",
StabilityLevel: metrics.ALPHA,
DeprecatedVersion: "0.34.0",
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500},
}, []string{})
LoopDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: DeschedulerSubsystem,
Name: "descheduler_loop_duration_seconds",
Name: "loop_duration_seconds",
Help: "Time taken to complete a full descheduling cycle",
StabilityLevel: metrics.ALPHA,
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500},
}, []string{})
DeschedulerStrategyDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: DeschedulerSubsystem,
Name: "descheduler_strategy_duration_seconds",
Help: "Time taken to complete Each strategy of the descheduling operation",
StabilityLevel: metrics.ALPHA,
DeprecatedVersion: "0.34.0",
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100},
}, []string{"strategy", "profile"})
StrategyDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: DeschedulerSubsystem,
Name: "descheduler_strategy_duration_seconds",
Name: "strategy_duration_seconds",
Help: "Time taken to complete Each strategy of the descheduling operation",
StabilityLevel: metrics.ALPHA,
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100},
@@ -68,9 +94,12 @@ var (
metricsList = []metrics.Registerable{
PodsEvicted,
PodsEvictedTotal,
buildInfo,
DeschedulerLoopDuration,
DeschedulerStrategyDuration,
LoopDuration,
StrategyDuration,
}
)

View File

@@ -351,6 +351,7 @@ func (d *descheduler) runDeschedulerLoop(ctx context.Context, nodes []*v1.Node)
defer span.End()
defer func(loopStartDuration time.Time) {
metrics.DeschedulerLoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds())
metrics.LoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds())
}(time.Now())
// if len is still <= 1 error out

View File

@@ -482,6 +482,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
err := NewEvictionTotalLimitError()
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictTotal)
@@ -496,6 +497,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
err := NewEvictionNodeLimitError(pod.Spec.NodeName)
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
@@ -510,6 +512,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
err := NewEvictionNamespaceLimitError(pod.Namespace)
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace, "pod", klog.KObj(pod))
@@ -526,6 +529,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason)
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
metrics.PodsEvictedTotal.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
if pe.evictionFailureEventNotification {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: %v", pod.Spec.NodeName, err.Error())
@@ -545,6 +549,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
metrics.PodsEvictedTotal.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
if pe.dryRun {

View File

@@ -339,6 +339,7 @@ func (d profileImpl) RunDeschedulePlugins(ctx context.Context, nodes []*v1.Node)
strategyStart := time.Now()
status := pl.Deschedule(ctx, nodes)
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
if status != nil && status.Err != nil {
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))
@@ -368,6 +369,7 @@ func (d profileImpl) RunBalancePlugins(ctx context.Context, nodes []*v1.Node) *f
strategyStart := time.Now()
status := pl.Balance(ctx, nodes)
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
if status != nil && status.Err != nil {
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))