[VAULT-17826] Remove mount point from rollback metrics (#22400)

* remove metrics * add test and documentation * update docs * changelog * fix TestConfig_Sanitized * Update website/content/docs/upgrading/upgrade-to-1.15.x.mdx Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com> * Update website/content/docs/upgrading/upgrade-to-1.15.x.mdx Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com> * Update website/content/partials/telemetry-metrics/rollback-intro.mdx Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com> * Update website/content/partials/telemetry-metrics/route-intro.mdx Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com> * separate partials for metrics * remove debugging line * add high cardinality warning --------- Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>
2026-02-03 20:40:45 -05:00 · 2023-08-21 16:55:37 +02:00 · 2023-08-21 16:55:37 +02:00 · 616c3a5ba5
commit 616c3a5ba5
parent 35a5fbfc60
22 changed files with 235 additions and 36 deletions
--- a/changelog/22400.txt
+++ b/changelog/22400.txt
@ -0,0 +1,3 @@
+```release-note:change
+telemetry: Replace `vault.rollback.attempt.{MOUNT_POINT}` and `vault.route.rollback.{MOUNT_POINT}` metrics with `vault.rollback.attempt` and `vault.route.rollback metrics` by default. Added a telemetry configuration `add_mount_point_rollback_metrics` which, when set to true, causes vault to emit the metrics with mount points in their names.
+```
--- a/command/server/config_telemetry_test.go
+++ b/command/server/config_telemetry_test.go
@ -7,6 +7,7 @@ import (
 	"testing"

 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )

 func TestMetricFilterConfigs(t *testing.T) {
@ -41,3 +42,35 @@ func TestMetricFilterConfigs(t *testing.T) {
 		}
 	})
 }
+
+// TestRollbackMountPointMetricsConfig verifies that the add_mount_point_rollback_metrics
+// config option is parsed correctly, when it is set to true. Also verifies that
+// the default for this setting is false
+func TestRollbackMountPointMetricsConfig(t *testing.T) {
+	t.Parallel()
+	testCases := []struct {
+		name           string
+		configFile     string
+		wantMountPoint bool
+	}{
+		{
+			name:           "include mount point",
+			configFile:     "./test-fixtures/telemetry/rollback_mount_point.hcl",
+			wantMountPoint: true,
+		},
+		{
+			name:           "exclude mount point",
+			configFile:     "./test-fixtures/telemetry/valid_prefix_filter.hcl",
+			wantMountPoint: false,
+		},
+	}
+	for _, tc := range testCases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			config, err := LoadConfigFile(tc.configFile)
+			require.NoError(t, err)
+			require.Equal(t, tc.wantMountPoint, config.Telemetry.RollbackMetricsIncludeMountPoint)
+		})
+	}
+}
--- a/command/server/config_test_helpers.go
+++ b/command/server/config_test_helpers.go
@ -846,6 +846,7 @@ func testConfig_Sanitized(t *testing.T) {
 			"lease_metrics_epsilon":                  time.Hour,
 			"num_lease_metrics_buckets":              168,
 			"add_lease_metrics_namespace_labels":     false,
+			"add_mount_point_rollback_metrics":       false,
 		},
 		"administrative_namespace_path": "admin/",
 	}
--- a/command/server/test-fixtures/telemetry/rollback_mount_point.hcl
+++ b/command/server/test-fixtures/telemetry/rollback_mount_point.hcl
@ -0,0 +1,9 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+disable_mlock = true
+ui            = true
+
+telemetry {
+  add_mount_point_rollback_metrics = true
+}
--- a/helper/metricsutil/wrapped_metrics.go
+++ b/helper/metricsutil/wrapped_metrics.go
@ -37,9 +37,10 @@ type ClusterMetricSink struct {
 }

 type TelemetryConstConfig struct {
-	LeaseMetricsEpsilon         time.Duration
-	NumLeaseMetricsTimeBuckets  int
-	LeaseMetricsNameSpaceLabels bool
+	LeaseMetricsEpsilon              time.Duration
+	NumLeaseMetricsTimeBuckets       int
+	LeaseMetricsNameSpaceLabels      bool
+	RollbackMetricsIncludeMountPoint bool
 }

 type Metrics interface {
--- a/internalshared/configutil/config.go
+++ b/internalshared/configutil/config.go
@ -271,6 +271,7 @@ func (c *SharedConfig) Sanitized() map[string]interface{} {
 			"lease_metrics_epsilon":                  c.Telemetry.LeaseMetricsEpsilon,
 			"num_lease_metrics_buckets":              c.Telemetry.NumLeaseMetricsTimeBuckets,
 			"add_lease_metrics_namespace_labels":     c.Telemetry.LeaseMetricsNameSpaceLabels,
+			"add_mount_point_rollback_metrics":       c.Telemetry.RollbackMetricsIncludeMountPoint,
 		}
 		result["telemetry"] = sanitizedTelemetry
 	}
--- a/internalshared/configutil/telemetry.go
+++ b/internalshared/configutil/telemetry.go
@ -162,6 +162,10 @@ type Telemetry struct {
 	// PrefixFilter is a list of filter rules to apply for allowing
 	// or blocking metrics by prefix.
 	PrefixFilter []string `hcl:"prefix_filter"`
+
+	// Whether or not telemetry should include the mount point in the rollback
+	// metrics
+	RollbackMetricsIncludeMountPoint bool `hcl:"add_mount_point_rollback_metrics"`
 }

 func (t *Telemetry) Validate(source string) []ConfigError {
@ -402,6 +406,7 @@ func SetupTelemetry(opts *SetupTelemetryOpts) (*metrics.InmemSink, *metricsutil.
 	wrapper.TelemetryConsts.LeaseMetricsEpsilon = opts.Config.LeaseMetricsEpsilon
 	wrapper.TelemetryConsts.LeaseMetricsNameSpaceLabels = opts.Config.LeaseMetricsNameSpaceLabels
 	wrapper.TelemetryConsts.NumLeaseMetricsTimeBuckets = opts.Config.NumLeaseMetricsTimeBuckets
+	wrapper.TelemetryConsts.RollbackMetricsIncludeMountPoint = opts.Config.RollbackMetricsIncludeMountPoint

 	// Parse the metric filters
 	telemetryAllowedPrefixes, telemetryBlockedPrefixes, err := parsePrefixFilter(opts.Config.PrefixFilter)
--- a/vault/core.go
+++ b/vault/core.go
@ -677,7 +677,8 @@ type Core struct {
 	// heartbeating with the active node. Default to the current SDK version.
 	effectiveSDKVersion string

-	rollbackPeriod time.Duration
+	rollbackPeriod           time.Duration
+	rollbackMountPathMetrics bool

 	experiments []string

@ -1027,6 +1028,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
 		experiments:                    conf.Experiments,
 		pendingRemovalMountsAllowed:    conf.PendingRemovalMountsAllowed,
 		expirationRevokeRetryBase:      conf.ExpirationRevokeRetryBase,
+		rollbackMountPathMetrics:       conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
 	}

 	c.standbyStopCh.Store(make(chan struct{}))
@ -1036,6 +1038,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
 	c.shutdownDoneCh.Store(make(chan struct{}))

 	c.router.logger = c.logger.Named("router")
+	c.router.rollbackMetricsMountName = c.rollbackMountPathMetrics

 	c.inFlightReqData = &InFlightRequests{
 		InFlightReqMap:   &sync.Map{},
--- a/vault/rollback.go
+++ b/vault/rollback.go
@ -39,9 +39,10 @@ type RollbackManager struct {
 	router *Router
 	period time.Duration

-	inflightAll  sync.WaitGroup
-	inflight     map[string]*rollbackState
-	inflightLock sync.RWMutex
+	rollbackMetricsMountName bool
+	inflightAll              sync.WaitGroup
+	inflight                 map[string]*rollbackState
+	inflightLock             sync.RWMutex

 	doneCh          chan struct{}
 	shutdown        bool
@ -52,6 +53,8 @@ type RollbackManager struct {
 	quitContext     context.Context

 	core *Core
+	// This channel is used for testing
+	rollbacksDoneCh chan struct{}
 }

 // rollbackState is used to track the state of a single rollback attempt
@ -65,16 +68,18 @@ type rollbackState struct {
 // NewRollbackManager is used to create a new rollback manager
 func NewRollbackManager(ctx context.Context, logger log.Logger, backendsFunc func() []*MountEntry, router *Router, core *Core) *RollbackManager {
 	r := &RollbackManager{
-		logger:      logger,
-		backends:    backendsFunc,
-		router:      router,
-		period:      core.rollbackPeriod,
-		inflight:    make(map[string]*rollbackState),
-		doneCh:      make(chan struct{}),
-		shutdownCh:  make(chan struct{}),
-		stopTicker:  make(chan struct{}),
-		quitContext: ctx,
-		core:        core,
+		logger:                   logger,
+		backends:                 backendsFunc,
+		router:                   router,
+		period:                   core.rollbackPeriod,
+		inflight:                 make(map[string]*rollbackState),
+		doneCh:                   make(chan struct{}),
+		shutdownCh:               make(chan struct{}),
+		stopTicker:               make(chan struct{}),
+		quitContext:              ctx,
+		core:                     core,
+		rollbackMetricsMountName: core.rollbackMountPathMetrics,
+		rollbacksDoneCh:          make(chan struct{}),
 	}
 	return r
 }
@ -121,7 +126,6 @@ func (m *RollbackManager) run() {
 		select {
 		case <-tick.C:
 			m.triggerRollbacks()
-
 		case <-m.shutdownCh:
 			m.logger.Info("stopping rollback manager")
 			return
@ -179,14 +183,23 @@ func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath st
 	m.inflight[fullPath] = rs
 	rs.Add(1)
 	m.inflightAll.Add(1)
-	go m.attemptRollback(ctx, fullPath, rs, grabStatelock)
+	go func() {
+		m.attemptRollback(ctx, fullPath, rs, grabStatelock)
+		select {
+		case m.rollbacksDoneCh <- struct{}{}:
+		default:
+		}
+	}()
 	return rs
 }

 // attemptRollback invokes a RollbackOperation for the given path
 func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
-	defer metrics.MeasureSince([]string{"rollback", "attempt", strings.ReplaceAll(fullPath, "/", "-")}, time.Now())
-
+	metricName := []string{"rollback", "attempt"}
+	if m.rollbackMetricsMountName {
+		metricName = append(metricName, strings.ReplaceAll(fullPath, "/", "-"))
+	}
+	defer metrics.MeasureSince(metricName, time.Now())
 	defer func() {
 		rs.lastError = err
 		rs.Done()
--- a/vault/rollback_test.go
+++ b/vault/rollback_test.go
@ -5,14 +5,18 @@ package vault

 import (
 	"context"
+	"strings"
 	"sync"
 	"testing"
 	"time"

+	"github.com/armon/go-metrics"
 	log "github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/go-uuid"
+	"github.com/hashicorp/vault/helper/metricsutil"
 	"github.com/hashicorp/vault/helper/namespace"
 	"github.com/hashicorp/vault/sdk/helper/logging"
+	"github.com/stretchr/testify/require"
 )

 // mockRollback returns a mock rollback manager
@ -120,3 +124,80 @@ func TestRollbackManager_Join(t *testing.T) {
 		t.Fatalf("Error on rollback:%v", err)
 	}
 }
+
+// TestRollbackMetrics verifies that the rollback metrics only include the mount
+// point in their names when RollbackMetricsIncludeMountPoint is true.
+// This test cannot be run in parallel, because we are using the global metrics
+// instance
+func TestRollbackMetrics(t *testing.T) {
+	testCases := []struct {
+		name          string
+		addMountPoint bool
+	}{
+		{
+			name:          "include mount point",
+			addMountPoint: true,
+		},
+		{
+			name:          "exclude mount point",
+			addMountPoint: false,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			inMemSink := metrics.NewInmemSink(10000*time.Hour, 10000*time.Hour)
+			sink := metricsutil.NewClusterMetricSink("test", inMemSink)
+			sink.TelemetryConsts.RollbackMetricsIncludeMountPoint = tc.addMountPoint
+			_, err := metrics.NewGlobal(metrics.DefaultConfig("vault"), inMemSink)
+			require.NoError(t, err)
+			conf := &CoreConfig{
+				MetricSink:     sink,
+				RollbackPeriod: 50 * time.Millisecond,
+				MetricsHelper:  metricsutil.NewMetricsHelper(inMemSink, true),
+			}
+
+			core, _, _ := TestCoreUnsealedWithConfig(t, conf)
+
+			samplesWith := func(intervals []*metrics.IntervalMetrics, with func(string) bool) []metrics.SampledValue {
+				t.Helper()
+				samples := make([]metrics.SampledValue, 0)
+				for _, interval := range intervals {
+					for name, summary := range interval.Samples {
+						if with(name) {
+							samples = append(samples, summary)
+						}
+					}
+				}
+				return samples
+			}
+
+			<-core.rollback.rollbacksDoneCh
+			intervals := inMemSink.Data()
+
+			mountPointAttempts := samplesWith(intervals, func(s string) bool {
+				return strings.HasPrefix(s, "vault.rollback.attempt.")
+			})
+			mountPointRoutes := samplesWith(intervals, func(s string) bool {
+				return strings.HasPrefix(s, "vault.route.rollback.")
+			})
+
+			noMountPointAttempts := samplesWith(intervals, func(s string) bool {
+				return s == "vault.rollback.attempt"
+			})
+			noMountPointRoutes := samplesWith(intervals, func(s string) bool {
+				return s == "vault.route.rollback"
+			})
+			if tc.addMountPoint {
+				require.NotEmpty(t, mountPointAttempts)
+				require.NotEmpty(t, mountPointRoutes)
+				require.Empty(t, noMountPointAttempts)
+				require.Empty(t, noMountPointRoutes)
+			} else {
+				require.Empty(t, mountPointAttempts)
+				require.Empty(t, mountPointRoutes)
+				require.NotEmpty(t, noMountPointAttempts)
+				require.NotEmpty(t, noMountPointRoutes)
+			}
+		})
+	}
+}
--- a/vault/router.go
+++ b/vault/router.go
@ -39,8 +39,9 @@ type Router struct {
 	// storagePrefix maps the prefix used for storage (ala the BarrierView)
 	// to the backend. This is used to map a key back into the backend that owns it.
 	// For example, logical/uuid1/foobar -> secrets/ (kv backend) + foobar
-	storagePrefix *radix.Tree
-	logger        hclog.Logger
+	storagePrefix            *radix.Tree
+	logger                   hclog.Logger
+	rollbackMetricsMountName bool
 }

 // NewRouter returns a new router
@ -581,10 +582,11 @@ func (r *Router) routeCommon(ctx context.Context, req *logical.Request, existenc
 	}
 	req.Path = adjustedPath
 	if !existenceCheck {
-		defer metrics.MeasureSince([]string{
-			"route", string(req.Operation),
-			strings.ReplaceAll(mount, "/", "-"),
-		}, time.Now())
+		metricName := []string{"route", string(req.Operation)}
+		if req.Operation != logical.RollbackOperation || r.rollbackMetricsMountName {
+			metricName = append(metricName, strings.ReplaceAll(mount, "/", "-"))
+		}
+		defer metrics.MeasureSince(metricName, time.Now())
 	}
 	re := raw.(*routeEntry)

--- a/vault/testing.go
+++ b/vault/testing.go
@ -244,6 +244,9 @@ func TestCoreWithSealAndUINoCleanup(t testing.T, opts *CoreConfig) *Core {
 	for k, v := range opts.AuditBackends {
 		conf.AuditBackends[k] = v
 	}
+	if opts.RollbackPeriod != time.Duration(0) {
+		conf.RollbackPeriod = opts.RollbackPeriod
+	}

 	conf.ActivityLogConfig = opts.ActivityLogConfig
 	testApplyEntBaseConfig(conf, opts)
--- a/website/content/docs/configuration/telemetry.mdx
+++ b/website/content/docs/configuration/telemetry.mdx
@ -52,6 +52,11 @@ The following options are available on all telemetry configurations.
 - `add_lease_metrics_namespace_labels` `(bool: false)` - If this value is set to true, then `vault.expire.leases.by_expiration`
  will break down expiring leases by both time and namespace. This parameter is disabled by default because enabling it can lead
  to a large-cardinality metric.
+- `add_mount_point_rollback_metrics` `(bool: false)` - If this value is set to true, then `vault.rollback.attempt.{MOUNT_POINT}`
+  and `vault.route.rollback.{MOUNT_POINT}` metrics will be reported for every mount point. If this parameter is false, then
+  `vault.rollback.attempt` and `vault.route.rollback` metrics (which do not have the mount point in the metric name)
+  will be reported instead. This parameter is disabled by default starting in Vault 1.15 due to the high cardinality of
+  these metrics.
 - `filter_default` `(bool: true)` - This controls whether to allow metrics that have not been specified by the filter.
  Defaults to `true`, which will allow all metrics when no filters are provided.
  When set to `false` with no filters, no metrics will be sent.
--- a/website/content/docs/internals/telemetry/metrics/all.mdx
+++ b/website/content/docs/internals/telemetry/metrics/all.mdx
@ -616,6 +616,8 @@ alphabetic order by name.

@include 'telemetry-metrics/vault/rollback/attempt/mountpoint.mdx'

+@include 'telemetry-metrics/vault/rollback/attempt.mdx'
+
@include 'telemetry-metrics/vault/route/create/mountpoint.mdx'

@include 'telemetry-metrics/vault/route/delete/mountpoint.mdx'
@ -626,6 +628,8 @@ alphabetic order by name.

@include 'telemetry-metrics/vault/route/rollback/mountpoint.mdx'

+@include 'telemetry-metrics/vault/route/rollback.mdx'
+
@include 'telemetry-metrics/vault/runtime/alloc_bytes.mdx'

@include 'telemetry-metrics/vault/runtime/free_count.mdx'
@ -722,4 +726,4 @@ alphabetic order by name.

@include 'telemetry-metrics/vault/zookeeper/list.mdx'

-@include 'telemetry-metrics/vault/zookeeper/put.mdx'
+@include 'telemetry-metrics/vault/zookeeper/put.mdx'
--- a/website/content/docs/internals/telemetry/metrics/core-system.mdx
+++ b/website/content/docs/internals/telemetry/metrics/core-system.mdx
@ -112,6 +112,8 @@ Vault instance.

@include 'telemetry-metrics/vault/rollback/attempt/mountpoint.mdx'

+@include 'telemetry-metrics/vault/rollback/attempt.mdx'
+
 ## Route metrics

@include 'telemetry-metrics/route-intro.mdx'
@ -126,6 +128,8 @@ Vault instance.

@include 'telemetry-metrics/vault/route/rollback/mountpoint.mdx'

+@include 'telemetry-metrics/vault/route/rollback.mdx'
+
 ## Runtime metrics

@include 'telemetry-metrics/runtime-note.mdx'
@ -146,4 +150,4 @@ Vault instance.

@include 'telemetry-metrics/vault/runtime/total_gc_pause_ns.mdx'

-@include 'telemetry-metrics/vault/runtime/total_gc_runs.mdx'
+@include 'telemetry-metrics/vault/runtime/total_gc_runs.mdx'
--- a/website/content/docs/upgrading/upgrade-to-1.15.x.mdx
+++ b/website/content/docs/upgrading/upgrade-to-1.15.x.mdx
@ -30,5 +30,13 @@ Please audit your Consul storage stanza to ensure that you either:
 * Manually convert your `service_tags` to lowercase if required
 * Ensure that any system that relies on the tags is aware of the new case-preserving behavior

+## Rollback metrics

+Vault no longer measures and reports the metrics `vault.rollback.attempts.{MOUNTPOINT}` and `vault.route.rollback.{MOUNTPOINT}` by default. The new default metrics are `vault.rollback.attempts`
+and `vault.route.rollback`, which **do not** contain the mount point in the metric name.

+To continue measuring `vault.rollback.attempts.{MOUNTPOINT}` and
+`vault.route.rollback.{MOUNTPOINT}`, you must explicitly enable mount-specific
+metrics in the `telemetry` stanza of your Vault configuration with the
+[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
+option.
--- a/website/content/partials/telemetry-metrics/rollback-intro.mdx
+++ b/website/content/partials/telemetry-metrics/rollback-intro.mdx
@ -1,4 +1,9 @@
-Rollback metrics for each configured mount point. Metric names convert
-forward slashes (`/`) in mount names to dashes (`-`). For example, if you
-have the `auth/token` backend configured, the corresponding mount point metric
-string is `auth-token`
+By default, Vault does not separate rollback metrics by mountpoint. To enable
+explictly named metrics for mountpoints, enable the
+[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
+option in your `telemetry` configuration stanza.
+
+If you enable named metrics for mountpoints, the metric name converts forward
+slashes (`/`) in mount names to dashes (`-`). For example, if you have the
+`auth/token` backend configured and mountpoint names enabled for telemetry,
+the corresponding mount point metric string is `auth-token`.
--- a/website/content/partials/telemetry-metrics/route-intro.mdx
+++ b/website/content/partials/telemetry-metrics/route-intro.mdx
@ -1,4 +1,9 @@
 Mount-specific route metrics for each configured mount point. Metric names
 convert forward slashes (`/`) in mount names to dashes (`-`). For example, if
 you have the `auth/token` backend configured, the corresponding mount point
-metric string is `auth-token`
+metric string is `auth-token`
+
+By default, Vault does not separate rollback metrics by mountpoint. To enable
+explictly named metrics for mountpoints, enable the
+[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
+option in your `telemetry` configuration stanza.
--- a/website/content/partials/telemetry-metrics/vault/rollback/attempt.mdx
+++ b/website/content/partials/telemetry-metrics/vault/rollback/attempt.mdx
@ -0,0 +1,5 @@
+### vault.rollback.attempt ((#vault-rollback-attempt))
+
+Metric type | Value | Description
+----------- | ----- | -----------
+summary     | ms    | Time required to perform a rollback operation
--- a/website/content/partials/telemetry-metrics/vault/rollback/attempt/mountpoint.mdx
+++ b/website/content/partials/telemetry-metrics/vault/rollback/attempt/mountpoint.mdx
@ -2,4 +2,4 @@

 Metric type | Value | Description
 ----------- | ----- | -----------
-summary     | ms    | Time required to perform a rollback operation on the given mount point
+summary     | ms    | Time required to perform a rollback operation on the given mount point
--- a/website/content/partials/telemetry-metrics/vault/route/rollback.mdx
+++ b/website/content/partials/telemetry-metrics/vault/route/rollback.mdx
@ -0,0 +1,8 @@
+### vault.route.rollback ((#vault-route-rollback))
+
+Metric type | Value | Description
+----------- | ----- | -----------
+summary     | ms    | Time required to send a rollback request to the backend and for the backend to complete the operation
+
+Vault automatically schedules and performs mount point rollback operations to
+clean up partial errors.
--- a/website/content/partials/telemetry-metrics/vault/route/rollback/mountpoint.mdx
+++ b/website/content/partials/telemetry-metrics/vault/route/rollback/mountpoint.mdx
@ -5,4 +5,4 @@ Metric type | Value | Description
 summary     | ms    | Time required to send a rollback request to the backend and for the backend to complete the operation for the given mount point

 Vault automatically schedules and performs mount point rollback operations to
-clean up partial errors.
+clean up partial errors.