instrumentation: add native histograms to complement high-traffic summaries (#17374)

This adds the following native histograms (each with a few classic buckets for backward compatibility) while keeping the corresponding summaries (same names, just without `_histogram`); see the sketch after the list for the shared construction pattern:

- `prometheus_sd_refresh_duration_histogram_seconds`
- `prometheus_rule_evaluation_duration_histogram_seconds`
- `prometheus_rule_group_duration_histogram_seconds`
- `prometheus_target_sync_length_histogram_seconds`
- `prometheus_target_interval_length_histogram_seconds`
- `prometheus_engine_query_duration_histogram_seconds`
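
All six metrics follow the same construction pattern: a classic histogram with a coarse bucket layout plus client_golang's native-histogram options. Below is a minimal, self-contained sketch of that pattern; the metric name and help text are placeholders (not metrics added by this commit), while the option values mirror the ones used throughout the diff.

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Classic buckets keep the metric usable by consumers that cannot ingest
	// native histograms; the NativeHistogram* options enable the sparse,
	// high-resolution representation alongside them.
	hist := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "example_operation_duration_histogram_seconds", // placeholder name
		Help:    "Duration of an example operation.",
		Buckets: []float64{.01, .1, 1, 10},

		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 1 * time.Hour,
	})
	prometheus.MustRegister(hist)

	start := time.Now()
	// ... timed work ...
	hist.Observe(time.Since(start).Seconds())
}
```

When scraped with native-histogram ingestion enabled, these metrics can be queried with `histogram_quantile()` directly on the native buckets, while the handful of classic buckets keeps existing consumers working.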

Signed-off-by: Harsh <harshmastic@gmail.com>
Signed-off-by: harsh kumar <135993950+hxrshxz@users.noreply.github.com>
Co-authored-by: Björn Rabenstein <github@rabenste.in>
Committed by harsh kumar on 2025-11-27 23:15:35 +05:30 (commit 30be1483d1, parent 7bb95d548c)
8 changed files with 135 additions and 36 deletions

@ -63,8 +63,9 @@ type DiscovererOptions struct {
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
type RefreshMetrics struct {
Failures prometheus.Counter
Duration prometheus.Observer
Failures prometheus.Counter
Duration prometheus.Observer
DurationHistogram prometheus.Observer
}
// RefreshMetricsInstantiator instantiates the metrics used by the "refresh" package.

@ -14,6 +14,8 @@
package discovery
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
@ -21,8 +23,9 @@ import (
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
type RefreshMetricsVecs struct {
failuresVec *prometheus.CounterVec
durationVec *prometheus.SummaryVec
failuresVec *prometheus.CounterVec
durationVec *prometheus.SummaryVec
durationHistVec *prometheus.HistogramVec
metricRegisterer MetricRegisterer
}
@ -44,6 +47,16 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"mechanism", "config"}),
durationHistVec: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_sd_refresh_duration_histogram_seconds",
Help: "The duration of a refresh for the given SD mechanism.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"mechanism"}),
}
// The reason we register metric vectors instead of metrics is so that
@ -51,6 +64,7 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
m.metricRegisterer = NewMetricRegisterer(reg, []prometheus.Collector{
m.failuresVec,
m.durationVec,
m.durationHistVec,
})
return m
@ -59,8 +73,9 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
// Instantiate returns metrics out of metric vectors for a given mechanism and config.
func (m *RefreshMetricsVecs) Instantiate(mech, config string) *RefreshMetrics {
return &RefreshMetrics{
Failures: m.failuresVec.WithLabelValues(mech, config),
Duration: m.durationVec.WithLabelValues(mech, config),
Failures: m.failuresVec.WithLabelValues(mech, config),
Duration: m.durationVec.WithLabelValues(mech, config),
DurationHistogram: m.durationHistVec.WithLabelValues(mech),
}
}

@ -108,6 +108,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
now := time.Now()
defer func() {
d.metrics.Duration.Observe(time.Since(now).Seconds())
d.metrics.DurationHistogram.Observe(time.Since(now).Seconds())
}()
tgs, err := d.refreshf(ctx)

@ -76,15 +76,19 @@ const (
)
type engineMetrics struct {
currentQueries prometheus.Gauge
maxConcurrentQueries prometheus.Gauge
queryLogEnabled prometheus.Gauge
queryLogFailures prometheus.Counter
queryQueueTime prometheus.Observer
queryPrepareTime prometheus.Observer
queryInnerEval prometheus.Observer
queryResultSort prometheus.Observer
querySamples prometheus.Counter
currentQueries prometheus.Gauge
maxConcurrentQueries prometheus.Gauge
queryLogEnabled prometheus.Gauge
queryLogFailures prometheus.Counter
queryQueueTime prometheus.Observer
queryQueueTimeHistogram prometheus.Observer
queryPrepareTime prometheus.Observer
queryPrepareTimeHistogram prometheus.Observer
queryInnerEval prometheus.Observer
queryInnerEvalHistogram prometheus.Observer
queryResultSort prometheus.Observer
queryResultSortHistogram prometheus.Observer
querySamples prometheus.Counter
}
type (
@ -363,6 +367,19 @@ func NewEngine(opts EngineOpts) *Engine {
[]string{"slice"},
)
queryResultHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "query_duration_histogram_seconds",
Help: "The duration of various parts of PromQL query execution.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"slice"},
)
metrics := &engineMetrics{
currentQueries: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
@ -394,10 +411,14 @@ func NewEngine(opts EngineOpts) *Engine {
Name: "query_samples_total",
Help: "The total number of samples loaded by all queries.",
}),
queryQueueTime: queryResultSummary.WithLabelValues("queue_time"),
queryPrepareTime: queryResultSummary.WithLabelValues("prepare_time"),
queryInnerEval: queryResultSummary.WithLabelValues("inner_eval"),
queryResultSort: queryResultSummary.WithLabelValues("result_sort"),
queryQueueTime: queryResultSummary.WithLabelValues("queue_time"),
queryQueueTimeHistogram: queryResultHistogram.WithLabelValues("queue_time"),
queryPrepareTime: queryResultSummary.WithLabelValues("prepare_time"),
queryPrepareTimeHistogram: queryResultHistogram.WithLabelValues("prepare_time"),
queryInnerEval: queryResultSummary.WithLabelValues("inner_eval"),
queryInnerEvalHistogram: queryResultHistogram.WithLabelValues("inner_eval"),
queryResultSort: queryResultSummary.WithLabelValues("result_sort"),
queryResultSortHistogram: queryResultHistogram.WithLabelValues("result_sort"),
}
if t := opts.ActiveQueryTracker; t != nil {
@ -421,6 +442,7 @@ func NewEngine(opts EngineOpts) *Engine {
metrics.queryLogFailures,
metrics.querySamples,
queryResultSummary,
queryResultHistogram,
)
}
@ -701,7 +723,7 @@ func (ng *Engine) queueActive(ctx context.Context, q *query) (func(), error) {
if ng.activeQueryTracker == nil {
return func() {}, nil
}
queueSpanTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime)
queueSpanTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime, ng.metrics.queryQueueTimeHistogram)
queryIndex, err := ng.activeQueryTracker.Insert(ctx, q.q)
queueSpanTimer.Finish()
return func() { ng.activeQueryTracker.Delete(queryIndex) }, err
@ -717,7 +739,7 @@ func durationMilliseconds(d time.Duration) int64 {
// execEvalStmt evaluates the expression of an evaluation statement for the given time range.
func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.EvalStmt) (parser.Value, annotations.Annotations, error) {
prepareSpanTimer, ctxPrepare := query.stats.GetSpanTimer(ctx, stats.QueryPreparationTime, ng.metrics.queryPrepareTime)
prepareSpanTimer, ctxPrepare := query.stats.GetSpanTimer(ctx, stats.QueryPreparationTime, ng.metrics.queryPrepareTime, ng.metrics.queryPrepareTimeHistogram)
mint, maxt := FindMinMaxTime(s)
querier, err := query.queryable.Querier(mint, maxt)
if err != nil {
@ -732,7 +754,7 @@ func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.Eval
// Modify the offset of vector and matrix selectors for the @ modifier
// w.r.t. the start time since only 1 evaluation will be done on them.
setOffsetForAtModifier(timeMilliseconds(s.Start), s.Expr)
evalSpanTimer, ctxInnerEval := query.stats.GetSpanTimer(ctx, stats.InnerEvalTime, ng.metrics.queryInnerEval)
evalSpanTimer, ctxInnerEval := query.stats.GetSpanTimer(ctx, stats.InnerEvalTime, ng.metrics.queryInnerEval, ng.metrics.queryInnerEvalHistogram)
// Instant evaluation. This is executed as a range evaluation with one step.
if s.Start.Equal(s.End) && s.Interval == 0 {
start := timeMilliseconds(s.Start)
@ -835,7 +857,7 @@ func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.Eval
}
func (ng *Engine) sortMatrixResult(ctx context.Context, query *query, mat Matrix) {
sortSpanTimer, _ := query.stats.GetSpanTimer(ctx, stats.ResultSortTime, ng.metrics.queryResultSort)
sortSpanTimer, _ := query.stats.GetSpanTimer(ctx, stats.ResultSortTime, ng.metrics.queryResultSort, ng.metrics.queryResultSortHistogram)
sort.Sort(mat)
sortSpanTimer.Finish()
}

@ -519,6 +519,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
since := time.Since(t)
g.metrics.EvalDuration.Observe(since.Seconds())
g.metrics.EvalDurationHistogram.Observe(since.Seconds())
rule.SetEvaluationDuration(since)
rule.SetEvaluationTimestamp(t)
}(time.Now())
@ -910,19 +911,21 @@ const namespace = "prometheus"
// Metrics for rule evaluation.
type Metrics struct {
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
EvalDuration prometheus.Summary
EvalDurationHistogram prometheus.Histogram
IterationDuration prometheus.Summary
IterationDurationHistogram prometheus.Histogram
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
}
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
@ -936,12 +939,30 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}),
EvalDurationHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "rule_evaluation_duration_histogram_seconds",
Help: "The duration for a rule to execute.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}),
IterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace,
Name: "rule_group_duration_seconds",
Help: "The duration of rule group evaluations.",
Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
}),
IterationDurationHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "rule_group_duration_histogram_seconds",
Help: "The duration of rule group evaluations.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}),
IterationsMissed: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
@ -1035,7 +1056,9 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
if reg != nil {
reg.MustRegister(
m.EvalDuration,
m.EvalDurationHistogram,
m.IterationDuration,
m.IterationDurationHistogram,
m.IterationsMissed,
m.IterationsScheduled,
m.EvalTotal,

@ -85,6 +85,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
timeSinceStart := time.Since(start)
g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.metrics.IterationDurationHistogram.Observe(timeSinceStart.Seconds())
g.updateRuleEvaluationTimeSum()
g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start)

@ -15,6 +15,7 @@ package scrape
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
)
@ -36,6 +37,7 @@ type scrapeMetrics struct {
targetScrapePoolTargetsAdded *prometheus.GaugeVec
targetScrapePoolSymbolTableItems *prometheus.GaugeVec
targetSyncIntervalLength *prometheus.SummaryVec
targetSyncIntervalLengthHistogram *prometheus.HistogramVec
targetSyncFailed *prometheus.CounterVec
// Used by targetScraper.
@ -46,6 +48,7 @@ type scrapeMetrics struct {
// Used by scrapeLoop.
targetIntervalLength *prometheus.SummaryVec
targetIntervalLengthHistogram *prometheus.HistogramVec
targetScrapeSampleLimit prometheus.Counter
targetScrapeSampleDuplicate prometheus.Counter
targetScrapeSampleOutOfOrder prometheus.Counter
@ -152,6 +155,17 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
},
[]string{"scrape_job"},
)
sm.targetSyncIntervalLengthHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_target_sync_length_histogram_seconds",
Help: "Actual interval to sync the scrape pool.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"scrape_job"},
)
sm.targetSyncFailed = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_target_sync_failed_total",
@ -185,6 +199,17 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
},
[]string{"interval"},
)
sm.targetIntervalLengthHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_target_interval_length_histogram_seconds",
Help: "Actual intervals between scrapes.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"interval"},
)
sm.targetScrapeSampleLimit = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_target_scrapes_exceeded_sample_limit_total",
@ -238,6 +263,7 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
sm.targetScrapePoolReloads,
sm.targetScrapePoolReloadsFailed,
sm.targetSyncIntervalLength,
sm.targetSyncIntervalLengthHistogram,
sm.targetScrapePoolSyncsCounter,
sm.targetScrapePoolExceededTargetLimit,
sm.targetScrapePoolTargetLimit,
@ -250,6 +276,7 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
sm.targetScrapeCacheFlushForced,
// Used by scrapeLoop.
sm.targetIntervalLength,
sm.targetIntervalLengthHistogram,
sm.targetScrapeSampleLimit,
sm.targetScrapeSampleDuplicate,
sm.targetScrapeSampleOutOfOrder,
@ -279,6 +306,7 @@ func (sm *scrapeMetrics) Unregister() {
sm.reg.Unregister(sm.targetScrapePoolReloads)
sm.reg.Unregister(sm.targetScrapePoolReloadsFailed)
sm.reg.Unregister(sm.targetSyncIntervalLength)
sm.reg.Unregister(sm.targetSyncIntervalLengthHistogram)
sm.reg.Unregister(sm.targetScrapePoolSyncsCounter)
sm.reg.Unregister(sm.targetScrapePoolExceededTargetLimit)
sm.reg.Unregister(sm.targetScrapePoolTargetLimit)
@ -288,6 +316,7 @@ func (sm *scrapeMetrics) Unregister() {
sm.reg.Unregister(sm.targetScrapeExceededBodySizeLimit)
sm.reg.Unregister(sm.targetScrapeCacheFlushForced)
sm.reg.Unregister(sm.targetIntervalLength)
sm.reg.Unregister(sm.targetIntervalLengthHistogram)
sm.reg.Unregister(sm.targetScrapeSampleLimit)
sm.reg.Unregister(sm.targetScrapeSampleDuplicate)
sm.reg.Unregister(sm.targetScrapeSampleOutOfOrder)

@ -309,6 +309,7 @@ func (sp *scrapePool) stop() {
sp.metrics.targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetScrapePoolSymbolTableItems.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncIntervalLengthHistogram.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncFailed.DeleteLabelValues(sp.config.JobName)
}
}
@ -505,6 +506,9 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
sp.metrics.targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(
time.Since(start).Seconds(),
)
sp.metrics.targetSyncIntervalLengthHistogram.WithLabelValues(sp.config.JobName).Observe(
time.Since(start).Seconds(),
)
sp.metrics.targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}
@ -1420,6 +1424,9 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er
sl.metrics.targetIntervalLength.WithLabelValues(sl.interval.String()).Observe(
time.Since(last).Seconds(),
)
sl.metrics.targetIntervalLengthHistogram.WithLabelValues(sl.interval.String()).Observe(
time.Since(last).Seconds(),
)
}
var total, added, seriesAdded, bytesRead int