[VAULT-17826] Remove mount point from rollback metrics (#22400)

* remove metrics

* add test and documentation

* update docs

* changelog

* fix TestConfig_Sanitized

* Update website/content/docs/upgrading/upgrade-to-1.15.x.mdx

Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>

* Update website/content/docs/upgrading/upgrade-to-1.15.x.mdx

Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>

* Update website/content/partials/telemetry-metrics/rollback-intro.mdx

Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>

* Update website/content/partials/telemetry-metrics/route-intro.mdx

Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>

* separate partials for metrics

* remove debugging line

* add high cardinality warning

---------

Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com>
This commit is contained in:
miagilepner 2023-08-21 16:55:37 +02:00 committed by GitHub
parent 35a5fbfc60
commit 616c3a5ba5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 235 additions and 36 deletions

3
changelog/22400.txt Normal file
View file

@ -0,0 +1,3 @@
```release-note:change
telemetry: Replace `vault.rollback.attempt.{MOUNT_POINT}` and `vault.route.rollback.{MOUNT_POINT}` metrics with `vault.rollback.attempt` and `vault.route.rollback metrics` by default. Added a telemetry configuration `add_mount_point_rollback_metrics` which, when set to true, causes vault to emit the metrics with mount points in their names.
```

View file

@ -7,6 +7,7 @@ import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMetricFilterConfigs(t *testing.T) {
@ -41,3 +42,35 @@ func TestMetricFilterConfigs(t *testing.T) {
}
})
}
// TestRollbackMountPointMetricsConfig verifies that the add_mount_point_rollback_metrics
// config option is parsed correctly, when it is set to true. Also verifies that
// the default for this setting is false
func TestRollbackMountPointMetricsConfig(t *testing.T) {
t.Parallel()
testCases := []struct {
name string
configFile string
wantMountPoint bool
}{
{
name: "include mount point",
configFile: "./test-fixtures/telemetry/rollback_mount_point.hcl",
wantMountPoint: true,
},
{
name: "exclude mount point",
configFile: "./test-fixtures/telemetry/valid_prefix_filter.hcl",
wantMountPoint: false,
},
}
for _, tc := range testCases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
config, err := LoadConfigFile(tc.configFile)
require.NoError(t, err)
require.Equal(t, tc.wantMountPoint, config.Telemetry.RollbackMetricsIncludeMountPoint)
})
}
}

View file

@ -846,6 +846,7 @@ func testConfig_Sanitized(t *testing.T) {
"lease_metrics_epsilon": time.Hour,
"num_lease_metrics_buckets": 168,
"add_lease_metrics_namespace_labels": false,
"add_mount_point_rollback_metrics": false,
},
"administrative_namespace_path": "admin/",
}

View file

@ -0,0 +1,9 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
disable_mlock = true
ui = true
telemetry {
add_mount_point_rollback_metrics = true
}

View file

@ -37,9 +37,10 @@ type ClusterMetricSink struct {
}
type TelemetryConstConfig struct {
LeaseMetricsEpsilon time.Duration
NumLeaseMetricsTimeBuckets int
LeaseMetricsNameSpaceLabels bool
LeaseMetricsEpsilon time.Duration
NumLeaseMetricsTimeBuckets int
LeaseMetricsNameSpaceLabels bool
RollbackMetricsIncludeMountPoint bool
}
type Metrics interface {

View file

@ -271,6 +271,7 @@ func (c *SharedConfig) Sanitized() map[string]interface{} {
"lease_metrics_epsilon": c.Telemetry.LeaseMetricsEpsilon,
"num_lease_metrics_buckets": c.Telemetry.NumLeaseMetricsTimeBuckets,
"add_lease_metrics_namespace_labels": c.Telemetry.LeaseMetricsNameSpaceLabels,
"add_mount_point_rollback_metrics": c.Telemetry.RollbackMetricsIncludeMountPoint,
}
result["telemetry"] = sanitizedTelemetry
}

View file

@ -162,6 +162,10 @@ type Telemetry struct {
// PrefixFilter is a list of filter rules to apply for allowing
// or blocking metrics by prefix.
PrefixFilter []string `hcl:"prefix_filter"`
// Whether or not telemetry should include the mount point in the rollback
// metrics
RollbackMetricsIncludeMountPoint bool `hcl:"add_mount_point_rollback_metrics"`
}
func (t *Telemetry) Validate(source string) []ConfigError {
@ -402,6 +406,7 @@ func SetupTelemetry(opts *SetupTelemetryOpts) (*metrics.InmemSink, *metricsutil.
wrapper.TelemetryConsts.LeaseMetricsEpsilon = opts.Config.LeaseMetricsEpsilon
wrapper.TelemetryConsts.LeaseMetricsNameSpaceLabels = opts.Config.LeaseMetricsNameSpaceLabels
wrapper.TelemetryConsts.NumLeaseMetricsTimeBuckets = opts.Config.NumLeaseMetricsTimeBuckets
wrapper.TelemetryConsts.RollbackMetricsIncludeMountPoint = opts.Config.RollbackMetricsIncludeMountPoint
// Parse the metric filters
telemetryAllowedPrefixes, telemetryBlockedPrefixes, err := parsePrefixFilter(opts.Config.PrefixFilter)

View file

@ -677,7 +677,8 @@ type Core struct {
// heartbeating with the active node. Default to the current SDK version.
effectiveSDKVersion string
rollbackPeriod time.Duration
rollbackPeriod time.Duration
rollbackMountPathMetrics bool
experiments []string
@ -1027,6 +1028,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
experiments: conf.Experiments,
pendingRemovalMountsAllowed: conf.PendingRemovalMountsAllowed,
expirationRevokeRetryBase: conf.ExpirationRevokeRetryBase,
rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
}
c.standbyStopCh.Store(make(chan struct{}))
@ -1036,6 +1038,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
c.shutdownDoneCh.Store(make(chan struct{}))
c.router.logger = c.logger.Named("router")
c.router.rollbackMetricsMountName = c.rollbackMountPathMetrics
c.inFlightReqData = &InFlightRequests{
InFlightReqMap: &sync.Map{},

View file

@ -39,9 +39,10 @@ type RollbackManager struct {
router *Router
period time.Duration
inflightAll sync.WaitGroup
inflight map[string]*rollbackState
inflightLock sync.RWMutex
rollbackMetricsMountName bool
inflightAll sync.WaitGroup
inflight map[string]*rollbackState
inflightLock sync.RWMutex
doneCh chan struct{}
shutdown bool
@ -52,6 +53,8 @@ type RollbackManager struct {
quitContext context.Context
core *Core
// This channel is used for testing
rollbacksDoneCh chan struct{}
}
// rollbackState is used to track the state of a single rollback attempt
@ -65,16 +68,18 @@ type rollbackState struct {
// NewRollbackManager is used to create a new rollback manager
func NewRollbackManager(ctx context.Context, logger log.Logger, backendsFunc func() []*MountEntry, router *Router, core *Core) *RollbackManager {
r := &RollbackManager{
logger: logger,
backends: backendsFunc,
router: router,
period: core.rollbackPeriod,
inflight: make(map[string]*rollbackState),
doneCh: make(chan struct{}),
shutdownCh: make(chan struct{}),
stopTicker: make(chan struct{}),
quitContext: ctx,
core: core,
logger: logger,
backends: backendsFunc,
router: router,
period: core.rollbackPeriod,
inflight: make(map[string]*rollbackState),
doneCh: make(chan struct{}),
shutdownCh: make(chan struct{}),
stopTicker: make(chan struct{}),
quitContext: ctx,
core: core,
rollbackMetricsMountName: core.rollbackMountPathMetrics,
rollbacksDoneCh: make(chan struct{}),
}
return r
}
@ -121,7 +126,6 @@ func (m *RollbackManager) run() {
select {
case <-tick.C:
m.triggerRollbacks()
case <-m.shutdownCh:
m.logger.Info("stopping rollback manager")
return
@ -179,14 +183,23 @@ func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath st
m.inflight[fullPath] = rs
rs.Add(1)
m.inflightAll.Add(1)
go m.attemptRollback(ctx, fullPath, rs, grabStatelock)
go func() {
m.attemptRollback(ctx, fullPath, rs, grabStatelock)
select {
case m.rollbacksDoneCh <- struct{}{}:
default:
}
}()
return rs
}
// attemptRollback invokes a RollbackOperation for the given path
func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
defer metrics.MeasureSince([]string{"rollback", "attempt", strings.ReplaceAll(fullPath, "/", "-")}, time.Now())
metricName := []string{"rollback", "attempt"}
if m.rollbackMetricsMountName {
metricName = append(metricName, strings.ReplaceAll(fullPath, "/", "-"))
}
defer metrics.MeasureSince(metricName, time.Now())
defer func() {
rs.lastError = err
rs.Done()

View file

@ -5,14 +5,18 @@ package vault
import (
"context"
"strings"
"sync"
"testing"
"time"
"github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-uuid"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/helper/namespace"
"github.com/hashicorp/vault/sdk/helper/logging"
"github.com/stretchr/testify/require"
)
// mockRollback returns a mock rollback manager
@ -120,3 +124,80 @@ func TestRollbackManager_Join(t *testing.T) {
t.Fatalf("Error on rollback:%v", err)
}
}
// TestRollbackMetrics verifies that the rollback metrics only include the mount
// point in their names when RollbackMetricsIncludeMountPoint is true.
// This test cannot be run in parallel, because we are using the global metrics
// instance
func TestRollbackMetrics(t *testing.T) {
testCases := []struct {
name string
addMountPoint bool
}{
{
name: "include mount point",
addMountPoint: true,
},
{
name: "exclude mount point",
addMountPoint: false,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
inMemSink := metrics.NewInmemSink(10000*time.Hour, 10000*time.Hour)
sink := metricsutil.NewClusterMetricSink("test", inMemSink)
sink.TelemetryConsts.RollbackMetricsIncludeMountPoint = tc.addMountPoint
_, err := metrics.NewGlobal(metrics.DefaultConfig("vault"), inMemSink)
require.NoError(t, err)
conf := &CoreConfig{
MetricSink: sink,
RollbackPeriod: 50 * time.Millisecond,
MetricsHelper: metricsutil.NewMetricsHelper(inMemSink, true),
}
core, _, _ := TestCoreUnsealedWithConfig(t, conf)
samplesWith := func(intervals []*metrics.IntervalMetrics, with func(string) bool) []metrics.SampledValue {
t.Helper()
samples := make([]metrics.SampledValue, 0)
for _, interval := range intervals {
for name, summary := range interval.Samples {
if with(name) {
samples = append(samples, summary)
}
}
}
return samples
}
<-core.rollback.rollbacksDoneCh
intervals := inMemSink.Data()
mountPointAttempts := samplesWith(intervals, func(s string) bool {
return strings.HasPrefix(s, "vault.rollback.attempt.")
})
mountPointRoutes := samplesWith(intervals, func(s string) bool {
return strings.HasPrefix(s, "vault.route.rollback.")
})
noMountPointAttempts := samplesWith(intervals, func(s string) bool {
return s == "vault.rollback.attempt"
})
noMountPointRoutes := samplesWith(intervals, func(s string) bool {
return s == "vault.route.rollback"
})
if tc.addMountPoint {
require.NotEmpty(t, mountPointAttempts)
require.NotEmpty(t, mountPointRoutes)
require.Empty(t, noMountPointAttempts)
require.Empty(t, noMountPointRoutes)
} else {
require.Empty(t, mountPointAttempts)
require.Empty(t, mountPointRoutes)
require.NotEmpty(t, noMountPointAttempts)
require.NotEmpty(t, noMountPointRoutes)
}
})
}
}

View file

@ -39,8 +39,9 @@ type Router struct {
// storagePrefix maps the prefix used for storage (ala the BarrierView)
// to the backend. This is used to map a key back into the backend that owns it.
// For example, logical/uuid1/foobar -> secrets/ (kv backend) + foobar
storagePrefix *radix.Tree
logger hclog.Logger
storagePrefix *radix.Tree
logger hclog.Logger
rollbackMetricsMountName bool
}
// NewRouter returns a new router
@ -581,10 +582,11 @@ func (r *Router) routeCommon(ctx context.Context, req *logical.Request, existenc
}
req.Path = adjustedPath
if !existenceCheck {
defer metrics.MeasureSince([]string{
"route", string(req.Operation),
strings.ReplaceAll(mount, "/", "-"),
}, time.Now())
metricName := []string{"route", string(req.Operation)}
if req.Operation != logical.RollbackOperation || r.rollbackMetricsMountName {
metricName = append(metricName, strings.ReplaceAll(mount, "/", "-"))
}
defer metrics.MeasureSince(metricName, time.Now())
}
re := raw.(*routeEntry)

View file

@ -244,6 +244,9 @@ func TestCoreWithSealAndUINoCleanup(t testing.T, opts *CoreConfig) *Core {
for k, v := range opts.AuditBackends {
conf.AuditBackends[k] = v
}
if opts.RollbackPeriod != time.Duration(0) {
conf.RollbackPeriod = opts.RollbackPeriod
}
conf.ActivityLogConfig = opts.ActivityLogConfig
testApplyEntBaseConfig(conf, opts)

View file

@ -52,6 +52,11 @@ The following options are available on all telemetry configurations.
- `add_lease_metrics_namespace_labels` `(bool: false)` - If this value is set to true, then `vault.expire.leases.by_expiration`
will break down expiring leases by both time and namespace. This parameter is disabled by default because enabling it can lead
to a large-cardinality metric.
- `add_mount_point_rollback_metrics` `(bool: false)` - If this value is set to true, then `vault.rollback.attempt.{MOUNT_POINT}`
and `vault.route.rollback.{MOUNT_POINT}` metrics will be reported for every mount point. If this parameter is false, then
`vault.rollback.attempt` and `vault.route.rollback` metrics (which do not have the mount point in the metric name)
will be reported instead. This parameter is disabled by default starting in Vault 1.15 due to the high cardinality of
these metrics.
- `filter_default` `(bool: true)` - This controls whether to allow metrics that have not been specified by the filter.
Defaults to `true`, which will allow all metrics when no filters are provided.
When set to `false` with no filters, no metrics will be sent.

View file

@ -616,6 +616,8 @@ alphabetic order by name.
@include 'telemetry-metrics/vault/rollback/attempt/mountpoint.mdx'
@include 'telemetry-metrics/vault/rollback/attempt.mdx'
@include 'telemetry-metrics/vault/route/create/mountpoint.mdx'
@include 'telemetry-metrics/vault/route/delete/mountpoint.mdx'
@ -626,6 +628,8 @@ alphabetic order by name.
@include 'telemetry-metrics/vault/route/rollback/mountpoint.mdx'
@include 'telemetry-metrics/vault/route/rollback.mdx'
@include 'telemetry-metrics/vault/runtime/alloc_bytes.mdx'
@include 'telemetry-metrics/vault/runtime/free_count.mdx'
@ -722,4 +726,4 @@ alphabetic order by name.
@include 'telemetry-metrics/vault/zookeeper/list.mdx'
@include 'telemetry-metrics/vault/zookeeper/put.mdx'
@include 'telemetry-metrics/vault/zookeeper/put.mdx'

View file

@ -112,6 +112,8 @@ Vault instance.
@include 'telemetry-metrics/vault/rollback/attempt/mountpoint.mdx'
@include 'telemetry-metrics/vault/rollback/attempt.mdx'
## Route metrics
@include 'telemetry-metrics/route-intro.mdx'
@ -126,6 +128,8 @@ Vault instance.
@include 'telemetry-metrics/vault/route/rollback/mountpoint.mdx'
@include 'telemetry-metrics/vault/route/rollback.mdx'
## Runtime metrics
@include 'telemetry-metrics/runtime-note.mdx'
@ -146,4 +150,4 @@ Vault instance.
@include 'telemetry-metrics/vault/runtime/total_gc_pause_ns.mdx'
@include 'telemetry-metrics/vault/runtime/total_gc_runs.mdx'
@include 'telemetry-metrics/vault/runtime/total_gc_runs.mdx'

View file

@ -30,5 +30,13 @@ Please audit your Consul storage stanza to ensure that you either:
* Manually convert your `service_tags` to lowercase if required
* Ensure that any system that relies on the tags is aware of the new case-preserving behavior
## Rollback metrics
Vault no longer measures and reports the metrics `vault.rollback.attempts.{MOUNTPOINT}` and `vault.route.rollback.{MOUNTPOINT}` by default. The new default metrics are `vault.rollback.attempts`
and `vault.route.rollback`, which **do not** contain the mount point in the metric name.
To continue measuring `vault.rollback.attempts.{MOUNTPOINT}` and
`vault.route.rollback.{MOUNTPOINT}`, you must explicitly enable mount-specific
metrics in the `telemetry` stanza of your Vault configuration with the
[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
option.

View file

@ -1,4 +1,9 @@
Rollback metrics for each configured mount point. Metric names convert
forward slashes (`/`) in mount names to dashes (`-`). For example, if you
have the `auth/token` backend configured, the corresponding mount point metric
string is `auth-token`
By default, Vault does not separate rollback metrics by mountpoint. To enable
explictly named metrics for mountpoints, enable the
[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
option in your `telemetry` configuration stanza.
If you enable named metrics for mountpoints, the metric name converts forward
slashes (`/`) in mount names to dashes (`-`). For example, if you have the
`auth/token` backend configured and mountpoint names enabled for telemetry,
the corresponding mount point metric string is `auth-token`.

View file

@ -1,4 +1,9 @@
Mount-specific route metrics for each configured mount point. Metric names
convert forward slashes (`/`) in mount names to dashes (`-`). For example, if
you have the `auth/token` backend configured, the corresponding mount point
metric string is `auth-token`
metric string is `auth-token`
By default, Vault does not separate rollback metrics by mountpoint. To enable
explictly named metrics for mountpoints, enable the
[`add_mount_point_rollback_metrics`](/vault/docs/configuration/telemetry#add_mount_point_rollback_metrics)
option in your `telemetry` configuration stanza.

View file

@ -0,0 +1,5 @@
### vault.rollback.attempt ((#vault-rollback-attempt))
Metric type | Value | Description
----------- | ----- | -----------
summary | ms | Time required to perform a rollback operation

View file

@ -2,4 +2,4 @@
Metric type | Value | Description
----------- | ----- | -----------
summary | ms | Time required to perform a rollback operation on the given mount point
summary | ms | Time required to perform a rollback operation on the given mount point

View file

@ -0,0 +1,8 @@
### vault.route.rollback ((#vault-route-rollback))
Metric type | Value | Description
----------- | ----- | -----------
summary | ms | Time required to send a rollback request to the backend and for the backend to complete the operation
Vault automatically schedules and performs mount point rollback operations to
clean up partial errors.

View file

@ -5,4 +5,4 @@ Metric type | Value | Description
summary | ms | Time required to send a rollback request to the backend and for the backend to complete the operation for the given mount point
Vault automatically schedules and performs mount point rollback operations to
clean up partial errors.
clean up partial errors.