mirror of
https://github.com/kubernetes-sigs/descheduler.git
synced 2026-01-26 05:14:13 +01:00
Refactor metric names and mark the old ones as deprecated
Signed-off-by: lowang-bh <lhui_wang@163.com>
This commit is contained in:
@@ -1050,9 +1050,12 @@ To get best results from HA mode some additional configurations might require:
|
|||||||
| name | type | description |
|
| name | type | description |
|
||||||
|---------------------------------------|--------------|-----------------------------------------------------------------------------------|
|
|---------------------------------------|--------------|-----------------------------------------------------------------------------------|
|
||||||
| build_info | gauge | constant 1 |
|
| build_info | gauge | constant 1 |
|
||||||
| pods_evicted | CounterVec | total number of pods evicted |
|
| pods_evicted | CounterVec | total number of pods evicted (deprecated since v0.34.0) |
|
||||||
| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) |
|
| pods_evicted_total | CounterVec | total number of pods evicted |
|
||||||
| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each strategy of descheduling operation (support _bucket, _sum, _count) |
|
| descheduler_loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) (deprecated since v0.34.0) |
|
||||||
|
| loop_duration_seconds | HistogramVec | time taken to complete a whole descheduling cycle (support _bucket, _sum, _count) |
|
||||||
|
| descheduler_strategy_duration_seconds | HistogramVec | time taken to complete each strategy of descheduling operation (support _bucket, _sum, _count) (deprecated since v0.34.0) |
|
||||||
|
| strategy_duration_seconds | HistogramVec | time taken to complete each strategy of descheduling operation (support _bucket, _sum, _count) |
|
||||||
|
|
||||||
The metrics are served through https://localhost:10258/metrics by default.
|
The metrics are served through https://localhost:10258/metrics by default.
|
||||||
The address and port can be changed by setting `--binding-address` and `--secure-port` flags.
|
The address and port can be changed by setting `--binding-address` and `--secure-port` flags.
|
||||||
|
|||||||
@@ -31,10 +31,18 @@ const (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
PodsEvicted = metrics.NewCounterVec(
|
PodsEvicted = metrics.NewCounterVec(
|
||||||
|
&metrics.CounterOpts{
|
||||||
|
Subsystem: DeschedulerSubsystem,
|
||||||
|
Name: "pods_evicted",
|
||||||
|
Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
DeprecatedVersion: "0.34.0",
|
||||||
|
}, []string{"result", "strategy", "profile", "namespace", "node"})
|
||||||
|
PodsEvictedTotal = metrics.NewCounterVec(
|
||||||
&metrics.CounterOpts{
|
&metrics.CounterOpts{
|
||||||
Subsystem: DeschedulerSubsystem,
|
Subsystem: DeschedulerSubsystem,
|
||||||
Name: "pods_evicted",
|
Name: "pods_evicted_total",
|
||||||
Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
|
Help: "Number of total evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.ALPHA,
|
||||||
}, []string{"result", "strategy", "profile", "namespace", "node"})
|
}, []string{"result", "strategy", "profile", "namespace", "node"})
|
||||||
|
|
||||||
@@ -49,18 +57,36 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
DeschedulerLoopDuration = metrics.NewHistogramVec(
|
DeschedulerLoopDuration = metrics.NewHistogramVec(
|
||||||
|
&metrics.HistogramOpts{
|
||||||
|
Subsystem: DeschedulerSubsystem,
|
||||||
|
Name: "descheduler_loop_duration_seconds",
|
||||||
|
Help: "Time taken to complete a full descheduling cycle",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
DeprecatedVersion: "0.34.0",
|
||||||
|
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500},
|
||||||
|
}, []string{})
|
||||||
|
LoopDuration = metrics.NewHistogramVec(
|
||||||
&metrics.HistogramOpts{
|
&metrics.HistogramOpts{
|
||||||
Subsystem: DeschedulerSubsystem,
|
Subsystem: DeschedulerSubsystem,
|
||||||
Name: "descheduler_loop_duration_seconds",
|
Name: "loop_duration_seconds",
|
||||||
Help: "Time taken to complete a full descheduling cycle",
|
Help: "Time taken to complete a full descheduling cycle",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.ALPHA,
|
||||||
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500},
|
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500},
|
||||||
}, []string{})
|
}, []string{})
|
||||||
|
|
||||||
DeschedulerStrategyDuration = metrics.NewHistogramVec(
|
DeschedulerStrategyDuration = metrics.NewHistogramVec(
|
||||||
|
&metrics.HistogramOpts{
|
||||||
|
Subsystem: DeschedulerSubsystem,
|
||||||
|
Name: "descheduler_strategy_duration_seconds",
|
||||||
|
Help: "Time taken to complete Each strategy of the descheduling operation",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
DeprecatedVersion: "0.34.0",
|
||||||
|
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||||
|
}, []string{"strategy", "profile"})
|
||||||
|
StrategyDuration = metrics.NewHistogramVec(
|
||||||
&metrics.HistogramOpts{
|
&metrics.HistogramOpts{
|
||||||
Subsystem: DeschedulerSubsystem,
|
Subsystem: DeschedulerSubsystem,
|
||||||
Name: "descheduler_strategy_duration_seconds",
|
Name: "strategy_duration_seconds",
|
||||||
Help: "Time taken to complete Each strategy of the descheduling operation",
|
Help: "Time taken to complete Each strategy of the descheduling operation",
|
||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.ALPHA,
|
||||||
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100},
|
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||||
@@ -68,9 +94,12 @@ var (
|
|||||||
|
|
||||||
metricsList = []metrics.Registerable{
|
metricsList = []metrics.Registerable{
|
||||||
PodsEvicted,
|
PodsEvicted,
|
||||||
|
PodsEvictedTotal,
|
||||||
buildInfo,
|
buildInfo,
|
||||||
DeschedulerLoopDuration,
|
DeschedulerLoopDuration,
|
||||||
DeschedulerStrategyDuration,
|
DeschedulerStrategyDuration,
|
||||||
|
LoopDuration,
|
||||||
|
StrategyDuration,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -351,6 +351,7 @@ func (d *descheduler) runDeschedulerLoop(ctx context.Context, nodes []*v1.Node)
|
|||||||
defer span.End()
|
defer span.End()
|
||||||
defer func(loopStartDuration time.Time) {
|
defer func(loopStartDuration time.Time) {
|
||||||
metrics.DeschedulerLoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds())
|
metrics.DeschedulerLoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds())
|
||||||
|
metrics.LoopDuration.With(map[string]string{}).Observe(time.Since(loopStartDuration).Seconds())
|
||||||
}(time.Now())
|
}(time.Now())
|
||||||
|
|
||||||
// if len is still <= 1 error out
|
// if len is still <= 1 error out
|
||||||
|
|||||||
@@ -482,6 +482,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
|||||||
err := NewEvictionTotalLimitError()
|
err := NewEvictionTotalLimitError()
|
||||||
if pe.metricsEnabled {
|
if pe.metricsEnabled {
|
||||||
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
|
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
}
|
}
|
||||||
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
||||||
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictTotal)
|
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictTotal)
|
||||||
@@ -496,6 +497,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
|||||||
err := NewEvictionNodeLimitError(pod.Spec.NodeName)
|
err := NewEvictionNodeLimitError(pod.Spec.NodeName)
|
||||||
if pe.metricsEnabled {
|
if pe.metricsEnabled {
|
||||||
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
|
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
}
|
}
|
||||||
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
||||||
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
|
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
|
||||||
@@ -510,6 +512,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
|||||||
err := NewEvictionNamespaceLimitError(pod.Namespace)
|
err := NewEvictionNamespaceLimitError(pod.Namespace)
|
||||||
if pe.metricsEnabled {
|
if pe.metricsEnabled {
|
||||||
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
metrics.PodsEvicted.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
|
metrics.PodsEvictedTotal.With(map[string]string{"result": err.Error(), "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
}
|
}
|
||||||
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
|
||||||
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace, "pod", klog.KObj(pod))
|
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace, "pod", klog.KObj(pod))
|
||||||
@@ -526,6 +529,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
|||||||
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason)
|
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason)
|
||||||
if pe.metricsEnabled {
|
if pe.metricsEnabled {
|
||||||
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
|
metrics.PodsEvictedTotal.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
}
|
}
|
||||||
if pe.evictionFailureEventNotification {
|
if pe.evictionFailureEventNotification {
|
||||||
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: %v", pod.Spec.NodeName, err.Error())
|
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: %v", pod.Spec.NodeName, err.Error())
|
||||||
@@ -545,6 +549,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
|
|||||||
|
|
||||||
if pe.metricsEnabled {
|
if pe.metricsEnabled {
|
||||||
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
|
metrics.PodsEvictedTotal.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
if pe.dryRun {
|
if pe.dryRun {
|
||||||
|
|||||||
@@ -339,6 +339,7 @@ func (d profileImpl) RunDeschedulePlugins(ctx context.Context, nodes []*v1.Node)
|
|||||||
strategyStart := time.Now()
|
strategyStart := time.Now()
|
||||||
status := pl.Deschedule(ctx, nodes)
|
status := pl.Deschedule(ctx, nodes)
|
||||||
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
||||||
|
metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
||||||
|
|
||||||
if status != nil && status.Err != nil {
|
if status != nil && status.Err != nil {
|
||||||
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))
|
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))
|
||||||
@@ -368,6 +369,7 @@ func (d profileImpl) RunBalancePlugins(ctx context.Context, nodes []*v1.Node) *f
|
|||||||
strategyStart := time.Now()
|
strategyStart := time.Now()
|
||||||
status := pl.Balance(ctx, nodes)
|
status := pl.Balance(ctx, nodes)
|
||||||
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
||||||
|
metrics.StrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
|
||||||
|
|
||||||
if status != nil && status.Err != nil {
|
if status != nil && status.Err != nil {
|
||||||
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))
|
span.AddEvent("Plugin Execution Failed", trace.WithAttributes(attribute.String("err", status.Err.Error())))
|
||||||
|
|||||||
Reference in New Issue
Block a user