instrumentation: add native histograms to complement high-traffic summaries (#17374)

This adds the following native histograms (each with a few classic buckets for backward compatibility) while keeping the corresponding summaries (same names, just without `_histogram`); see the sketch after the list for the shared construction pattern:

- `prometheus_sd_refresh_duration_histogram_seconds`
- `prometheus_rule_evaluation_duration_histogram_seconds`
- `prometheus_rule_group_duration_histogram_seconds`
- `prometheus_target_sync_length_histogram_seconds`
- `prometheus_target_interval_length_histogram_seconds`
- `prometheus_engine_query_duration_histogram_seconds`
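
All six metrics follow the same construction pattern: a classic histogram with a coarse bucket layout plus client_golang's native-histogram options. Below is a minimal, self-contained sketch of that pattern; the metric name and help text are placeholders (not metrics added by this commit), while the option values mirror the ones used throughout the diff.

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Classic buckets keep the metric usable by consumers that cannot ingest
	// native histograms; the NativeHistogram* options enable the sparse,
	// high-resolution representation alongside them.
	hist := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "example_operation_duration_histogram_seconds", // placeholder name
		Help:    "Duration of an example operation.",
		Buckets: []float64{.01, .1, 1, 10},

		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 1 * time.Hour,
	})
	prometheus.MustRegister(hist)

	start := time.Now()
	// ... timed work ...
	hist.Observe(time.Since(start).Seconds())
}
```

When scraped with native-histogram ingestion enabled, these metrics can be queried with `histogram_quantile()` directly on the native buckets, while the handful of classic buckets keeps existing consumers working.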

Signed-off-by: Harsh <harshmastic@gmail.com>
Signed-off-by: harsh kumar <135993950+hxrshxz@users.noreply.github.com>
Co-authored-by: Björn Rabenstein <github@rabenste.in>
Committed by harsh kumar on 2025-11-27 23:15:35 +05:30 (commit 30be1483d1, parent 7bb95d548c)
8 changed files with 135 additions and 36 deletions

@ -63,8 +63,9 @@ type DiscovererOptions struct {
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
type RefreshMetrics struct {
Failures prometheus.Counter
Duration prometheus.Observer
Failures prometheus.Counter
Duration prometheus.Observer
DurationHistogram prometheus.Observer
}
// RefreshMetricsInstantiator instantiates the metrics used by the "refresh" package.

@ -14,6 +14,8 @@
package discovery
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
@ -21,8 +23,9 @@ import (
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
type RefreshMetricsVecs struct {
failuresVec *prometheus.CounterVec
durationVec *prometheus.SummaryVec
failuresVec *prometheus.CounterVec
durationVec *prometheus.SummaryVec
durationHistVec *prometheus.HistogramVec
metricRegisterer MetricRegisterer
}
@ -44,6 +47,16 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"mechanism", "config"}),
durationHistVec: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_sd_refresh_duration_histogram_seconds",
Help: "The duration of a refresh for the given SD mechanism.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"mechanism"}),
}
// The reason we register metric vectors instead of metrics is so that
@ -51,6 +64,7 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
m.metricRegisterer = NewMetricRegisterer(reg, []prometheus.Collector{
m.failuresVec,
m.durationVec,
m.durationHistVec,
})
return m
@ -59,8 +73,9 @@ func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
// Instantiate returns metrics out of metric vectors for a given mechanism and config.
func (m *RefreshMetricsVecs) Instantiate(mech, config string) *RefreshMetrics {
return &RefreshMetrics{
Failures: m.failuresVec.WithLabelValues(mech, config),
Duration: m.durationVec.WithLabelValues(mech, config),
Failures: m.failuresVec.WithLabelValues(mech, config),
Duration: m.durationVec.WithLabelValues(mech, config),
DurationHistogram: m.durationHistVec.WithLabelValues(mech),
}
}

@ -108,6 +108,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
now := time.Now()
defer func() {
d.metrics.Duration.Observe(time.Since(now).Seconds())
d.metrics.DurationHistogram.Observe(time.Since(now).Seconds())
}()
tgs, err := d.refreshf(ctx)

@ -76,15 +76,19 @@ const (
)
type engineMetrics struct {
currentQueries prometheus.Gauge
maxConcurrentQueries prometheus.Gauge
queryLogEnabled prometheus.Gauge
queryLogFailures prometheus.Counter
queryQueueTime prometheus.Observer
queryPrepareTime prometheus.Observer
queryInnerEval prometheus.Observer
queryResultSort prometheus.Observer
querySamples prometheus.Counter
currentQueries prometheus.Gauge
maxConcurrentQueries prometheus.Gauge
queryLogEnabled prometheus.Gauge
queryLogFailures prometheus.Counter
queryQueueTime prometheus.Observer
queryQueueTimeHistogram prometheus.Observer
queryPrepareTime prometheus.Observer
queryPrepareTimeHistogram prometheus.Observer
queryInnerEval prometheus.Observer
queryInnerEvalHistogram prometheus.Observer
queryResultSort prometheus.Observer
queryResultSortHistogram prometheus.Observer
querySamples prometheus.Counter
}
type (
@ -363,6 +367,19 @@ func NewEngine(opts EngineOpts) *Engine {
[]string{"slice"},
)
queryResultHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "query_duration_histogram_seconds",
Help: "The duration of various parts of PromQL query execution.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"slice"},
)
metrics := &engineMetrics{
currentQueries: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
@ -394,10 +411,14 @@ func NewEngine(opts EngineOpts) *Engine {
Name: "query_samples_total",
Help: "The total number of samples loaded by all queries.",
}),
queryQueueTime: queryResultSummary.WithLabelValues("queue_time"),
queryPrepareTime: queryResultSummary.WithLabelValues("prepare_time"),
queryInnerEval: queryResultSummary.WithLabelValues("inner_eval"),
queryResultSort: queryResultSummary.WithLabelValues("result_sort"),
queryQueueTime: queryResultSummary.WithLabelValues("queue_time"),
queryQueueTimeHistogram: queryResultHistogram.WithLabelValues("queue_time"),
queryPrepareTime: queryResultSummary.WithLabelValues("prepare_time"),
queryPrepareTimeHistogram: queryResultHistogram.WithLabelValues("prepare_time"),
queryInnerEval: queryResultSummary.WithLabelValues("inner_eval"),
queryInnerEvalHistogram: queryResultHistogram.WithLabelValues("inner_eval"),
queryResultSort: queryResultSummary.WithLabelValues("result_sort"),
queryResultSortHistogram: queryResultHistogram.WithLabelValues("result_sort"),
}
if t := opts.ActiveQueryTracker; t != nil {
@ -421,6 +442,7 @@ func NewEngine(opts EngineOpts) *Engine {
metrics.queryLogFailures,
metrics.querySamples,
queryResultSummary,
queryResultHistogram,
)
}
@ -701,7 +723,7 @@ func (ng *Engine) queueActive(ctx context.Context, q *query) (func(), error) {
if ng.activeQueryTracker == nil {
return func() {}, nil
}
queueSpanTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime)
queueSpanTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime, ng.metrics.queryQueueTimeHistogram)
queryIndex, err := ng.activeQueryTracker.Insert(ctx, q.q)
queueSpanTimer.Finish()
return func() { ng.activeQueryTracker.Delete(queryIndex) }, err
@ -717,7 +739,7 @@ func durationMilliseconds(d time.Duration) int64 {
// execEvalStmt evaluates the expression of an evaluation statement for the given time range.
func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.EvalStmt) (parser.Value, annotations.Annotations, error) {
prepareSpanTimer, ctxPrepare := query.stats.GetSpanTimer(ctx, stats.QueryPreparationTime, ng.metrics.queryPrepareTime)
prepareSpanTimer, ctxPrepare := query.stats.GetSpanTimer(ctx, stats.QueryPreparationTime, ng.metrics.queryPrepareTime, ng.metrics.queryPrepareTimeHistogram)
mint, maxt := FindMinMaxTime(s)
querier, err := query.queryable.Querier(mint, maxt)
if err != nil {
@ -732,7 +754,7 @@ func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.Eval
// Modify the offset of vector and matrix selectors for the @ modifier
// w.r.t. the start time since only 1 evaluation will be done on them.
setOffsetForAtModifier(timeMilliseconds(s.Start), s.Expr)
evalSpanTimer, ctxInnerEval := query.stats.GetSpanTimer(ctx, stats.InnerEvalTime, ng.metrics.queryInnerEval)
evalSpanTimer, ctxInnerEval := query.stats.GetSpanTimer(ctx, stats.InnerEvalTime, ng.metrics.queryInnerEval, ng.metrics.queryInnerEvalHistogram)
// Instant evaluation. This is executed as a range evaluation with one step.
if s.Start.Equal(s.End) && s.Interval == 0 {
start := timeMilliseconds(s.Start)
@ -835,7 +857,7 @@ func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *parser.Eval
}
func (ng *Engine) sortMatrixResult(ctx context.Context, query *query, mat Matrix) {
sortSpanTimer, _ := query.stats.GetSpanTimer(ctx, stats.ResultSortTime, ng.metrics.queryResultSort)
sortSpanTimer, _ := query.stats.GetSpanTimer(ctx, stats.ResultSortTime, ng.metrics.queryResultSort, ng.metrics.queryResultSortHistogram)
sort.Sort(mat)
sortSpanTimer.Finish()
}

@ -519,6 +519,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
since := time.Since(t)
g.metrics.EvalDuration.Observe(since.Seconds())
g.metrics.EvalDurationHistogram.Observe(since.Seconds())
rule.SetEvaluationDuration(since)
rule.SetEvaluationTimestamp(t)
}(time.Now())
@ -910,19 +911,21 @@ const namespace = "prometheus"
// Metrics for rule evaluation.
type Metrics struct {
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
EvalDuration prometheus.Summary
EvalDurationHistogram prometheus.Histogram
IterationDuration prometheus.Summary
IterationDurationHistogram prometheus.Histogram
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
}
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
@ -936,12 +939,30 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}),
EvalDurationHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "rule_evaluation_duration_histogram_seconds",
Help: "The duration for a rule to execute.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}),
IterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace,
Name: "rule_group_duration_seconds",
Help: "The duration of rule group evaluations.",
Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
}),
IterationDurationHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "rule_group_duration_histogram_seconds",
Help: "The duration of rule group evaluations.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}),
IterationsMissed: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
@ -1035,7 +1056,9 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
if reg != nil {
reg.MustRegister(
m.EvalDuration,
m.EvalDurationHistogram,
m.IterationDuration,
m.IterationDurationHistogram,
m.IterationsMissed,
m.IterationsScheduled,
m.EvalTotal,

@ -85,6 +85,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
timeSinceStart := time.Since(start)
g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.metrics.IterationDurationHistogram.Observe(timeSinceStart.Seconds())
g.updateRuleEvaluationTimeSum()
g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start)

@ -15,6 +15,7 @@ package scrape
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
)
@ -36,6 +37,7 @@ type scrapeMetrics struct {
targetScrapePoolTargetsAdded *prometheus.GaugeVec
targetScrapePoolSymbolTableItems *prometheus.GaugeVec
targetSyncIntervalLength *prometheus.SummaryVec
targetSyncIntervalLengthHistogram *prometheus.HistogramVec
targetSyncFailed *prometheus.CounterVec
// Used by targetScraper.
@ -46,6 +48,7 @@ type scrapeMetrics struct {
// Used by scrapeLoop.
targetIntervalLength *prometheus.SummaryVec
targetIntervalLengthHistogram *prometheus.HistogramVec
targetScrapeSampleLimit prometheus.Counter
targetScrapeSampleDuplicate prometheus.Counter
targetScrapeSampleOutOfOrder prometheus.Counter
@ -152,6 +155,17 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
},
[]string{"scrape_job"},
)
sm.targetSyncIntervalLengthHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_target_sync_length_histogram_seconds",
Help: "Actual interval to sync the scrape pool.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"scrape_job"},
)
sm.targetSyncFailed = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_target_sync_failed_total",
@ -185,6 +199,17 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
},
[]string{"interval"},
)
sm.targetIntervalLengthHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "prometheus_target_interval_length_histogram_seconds",
Help: "Actual intervals between scrapes.",
Buckets: []float64{.01, .1, 1, 10},
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{"interval"},
)
sm.targetScrapeSampleLimit = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_target_scrapes_exceeded_sample_limit_total",
@ -238,6 +263,7 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
sm.targetScrapePoolReloads,
sm.targetScrapePoolReloadsFailed,
sm.targetSyncIntervalLength,
sm.targetSyncIntervalLengthHistogram,
sm.targetScrapePoolSyncsCounter,
sm.targetScrapePoolExceededTargetLimit,
sm.targetScrapePoolTargetLimit,
@ -250,6 +276,7 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) {
sm.targetScrapeCacheFlushForced,
// Used by scrapeLoop.
sm.targetIntervalLength,
sm.targetIntervalLengthHistogram,
sm.targetScrapeSampleLimit,
sm.targetScrapeSampleDuplicate,
sm.targetScrapeSampleOutOfOrder,
@ -279,6 +306,7 @@ func (sm *scrapeMetrics) Unregister() {
sm.reg.Unregister(sm.targetScrapePoolReloads)
sm.reg.Unregister(sm.targetScrapePoolReloadsFailed)
sm.reg.Unregister(sm.targetSyncIntervalLength)
sm.reg.Unregister(sm.targetSyncIntervalLengthHistogram)
sm.reg.Unregister(sm.targetScrapePoolSyncsCounter)
sm.reg.Unregister(sm.targetScrapePoolExceededTargetLimit)
sm.reg.Unregister(sm.targetScrapePoolTargetLimit)
@ -288,6 +316,7 @@ func (sm *scrapeMetrics) Unregister() {
sm.reg.Unregister(sm.targetScrapeExceededBodySizeLimit)
sm.reg.Unregister(sm.targetScrapeCacheFlushForced)
sm.reg.Unregister(sm.targetIntervalLength)
sm.reg.Unregister(sm.targetIntervalLengthHistogram)
sm.reg.Unregister(sm.targetScrapeSampleLimit)
sm.reg.Unregister(sm.targetScrapeSampleDuplicate)
sm.reg.Unregister(sm.targetScrapeSampleOutOfOrder)

@ -309,6 +309,7 @@ func (sp *scrapePool) stop() {
sp.metrics.targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetScrapePoolSymbolTableItems.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncIntervalLengthHistogram.DeleteLabelValues(sp.config.JobName)
sp.metrics.targetSyncFailed.DeleteLabelValues(sp.config.JobName)
}
}
@ -505,6 +506,9 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
sp.metrics.targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(
time.Since(start).Seconds(),
)
sp.metrics.targetSyncIntervalLengthHistogram.WithLabelValues(sp.config.JobName).Observe(
time.Since(start).Seconds(),
)
sp.metrics.targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}
@ -1420,6 +1424,9 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er
sl.metrics.targetIntervalLength.WithLabelValues(sl.interval.String()).Observe(
time.Since(last).Seconds(),
)
sl.metrics.targetIntervalLengthHistogram.WithLabelValues(sl.interval.String()).Observe(
time.Since(last).Seconds(),
)
}
var total, added, seriesAdded, bytesRead int