From ecb6ab131e84bf6384a56be9faacdbef583115a3 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 21 Nov 2025 16:05:42 -0500 Subject: [PATCH 1/6] fix: delete expired refresh metrics on reload Building off config-specific Prometheus refresh metrics from an earlier PR (https://github.com/prometheus/prometheus/pull/17138), this deletes refresh metrics like `prometheus_sd_refresh_duration_seconds` and `prometheus_sd_refresh_failures_total` when the underlying scrape job configuration is removed on reload. This reduces unneeded cardinality from scrape job specific metrics while still preserving metrics that indicate overall health of a service discovery engine. For example, `prometheus_sd_refresh_failures_total{config="linode-servers",mechanism="linode"} 1` will no longer be exported by Prometheus when the `linode-servers` scrape job for the Linode service provider is removed. The generic, service discovery specific `prometheus_sd_linode_failures_total` metric will persist however. Signed-off-by: Will Bollock --- cmd/prometheus/main.go | 6 ++-- discovery/discovery.go | 1 + discovery/manager.go | 53 ++++++++++++++++++++++++++---------- discovery/metrics_refresh.go | 9 ++++++ scrape/manager_test.go | 4 ++- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 6ea65c879a..066ede1fc6 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -816,19 +816,19 @@ func main() { os.Exit(1) } - sdMetrics, err := discovery.CreateAndRegisterSDMetrics(prometheus.DefaultRegisterer) + sdMetrics, refreshMetrics, err := discovery.CreateAndRegisterSDMetrics(prometheus.DefaultRegisterer) if err != nil { logger.Error("failed to register service discovery metrics", "err", err) os.Exit(1) } - discoveryManagerScrape = discovery.NewManager(ctxScrape, logger.With("component", "discovery manager scrape"), prometheus.DefaultRegisterer, sdMetrics, discovery.Name("scrape")) + discoveryManagerScrape = 
discovery.NewManager(ctxScrape, logger.With("component", "discovery manager scrape"), prometheus.DefaultRegisterer, sdMetrics, refreshMetrics, discovery.Name("scrape")) if discoveryManagerScrape == nil { logger.Error("failed to create a discovery manager scrape") os.Exit(1) } - discoveryManagerNotify = discovery.NewManager(ctxNotify, logger.With("component", "discovery manager notify"), prometheus.DefaultRegisterer, sdMetrics, discovery.Name("notify")) + discoveryManagerNotify = discovery.NewManager(ctxNotify, logger.With("component", "discovery manager notify"), prometheus.DefaultRegisterer, sdMetrics, refreshMetrics, discovery.Name("notify")) if discoveryManagerNotify == nil { logger.Error("failed to create a discovery manager notify") os.Exit(1) diff --git a/discovery/discovery.go b/discovery/discovery.go index 70cd856bb2..2764c5a373 100644 --- a/discovery/discovery.go +++ b/discovery/discovery.go @@ -82,6 +82,7 @@ type RefreshMetricsInstantiator interface { type RefreshMetricsManager interface { DiscovererMetrics RefreshMetricsInstantiator + DeleteLabelValues(mech, config string) } // A Config provides the configuration and constructor for a Discoverer. diff --git a/discovery/manager.go b/discovery/manager.go index 878bc5f6d4..930637fed0 100644 --- a/discovery/manager.go +++ b/discovery/manager.go @@ -69,34 +69,34 @@ func (p *Provider) Config() any { // CreateAndRegisterSDMetrics registers the metrics needed for SD mechanisms. // Does not register the metrics for the Discovery Manager. -// TODO(ptodev): Add ability to unregister the metrics? -func CreateAndRegisterSDMetrics(reg prometheus.Registerer) (map[string]DiscovererMetrics, error) { +func CreateAndRegisterSDMetrics(reg prometheus.Registerer) (map[string]DiscovererMetrics, RefreshMetricsManager, error) { // Some SD mechanisms use the "refresh" package, which has its own metrics. 
refreshSdMetrics := NewRefreshMetrics(reg) // Register the metrics specific for each SD mechanism, and the ones for the refresh package. sdMetrics, err := RegisterSDMetrics(reg, refreshSdMetrics) if err != nil { - return nil, fmt.Errorf("failed to register service discovery metrics: %w", err) + return nil, nil, fmt.Errorf("failed to register service discovery metrics: %w", err) } - return sdMetrics, nil + return sdMetrics, refreshSdMetrics, nil } // NewManager is the Discovery Manager constructor. -func NewManager(ctx context.Context, logger *slog.Logger, registerer prometheus.Registerer, sdMetrics map[string]DiscovererMetrics, options ...func(*Manager)) *Manager { +func NewManager(ctx context.Context, logger *slog.Logger, registerer prometheus.Registerer, sdMetrics map[string]DiscovererMetrics, refreshMetrics RefreshMetricsManager, options ...func(*Manager)) *Manager { if logger == nil { logger = promslog.NewNopLogger() } mgr := &Manager{ - logger: logger, - syncCh: make(chan map[string][]*targetgroup.Group), - targets: make(map[poolKey]map[string]*targetgroup.Group), - ctx: ctx, - updatert: 5 * time.Second, - triggerSend: make(chan struct{}, 1), - registerer: registerer, - sdMetrics: sdMetrics, + logger: logger, + syncCh: make(chan map[string][]*targetgroup.Group), + targets: make(map[poolKey]map[string]*targetgroup.Group), + ctx: ctx, + updatert: 5 * time.Second, + triggerSend: make(chan struct{}, 1), + registerer: registerer, + sdMetrics: sdMetrics, + refreshMetrics: refreshMetrics, } for _, option := range options { option(mgr) @@ -173,8 +173,9 @@ type Manager struct { // A registerer for all service discovery metrics. registerer prometheus.Registerer - metrics *Metrics - sdMetrics map[string]DiscovererMetrics + metrics *Metrics + sdMetrics map[string]DiscovererMetrics + refreshMetrics RefreshMetricsManager } // Providers returns the currently configured SD providers. 
@@ -231,6 +232,19 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { prov.cancel() prov.mu.RUnlock() + + // Clear up refresh metrics associated with this cancelled provider (sub means scrape job name). + for s := range prov.subs { + // Also clean up discovered targets metric. + delete(m.targets, poolKey{s, prov.name}) + m.metrics.DiscoveredTargets.DeleteLabelValues(s) + + if m.refreshMetrics != nil { + if cfg, ok := prov.config.(Config); ok { + m.refreshMetrics.DeleteLabelValues(cfg.Name(), s) + } + } + } continue } prov.mu.RUnlock() @@ -247,6 +261,15 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { if _, ok := prov.newSubs[s]; !ok { delete(m.targets, poolKey{s, prov.name}) m.metrics.DiscoveredTargets.DeleteLabelValues(m.name, s) + + // TODO - okay to calculate this again down here? better method? + // Clean up refresh metrics again for subs that are being removed from a provider that is still running. + if m.refreshMetrics != nil { + cfg, ok := prov.config.(Config) + if ok { + m.refreshMetrics.DeleteLabelValues(cfg.Name(), s) + } + } } } // Set metrics and targets for new subs. diff --git a/discovery/metrics_refresh.go b/discovery/metrics_refresh.go index 8a8bf221b8..04eedc0278 100644 --- a/discovery/metrics_refresh.go +++ b/discovery/metrics_refresh.go @@ -73,3 +73,12 @@ func (m *RefreshMetricsVecs) Register() error { func (m *RefreshMetricsVecs) Unregister() { m.metricRegisterer.UnregisterMetrics() } + +// DeleteLabelValues deletes refresh metrics for a specific mechanism and config. +// Use this when a scrape job is removed, so its per-config metrics do not linger. +func (m *RefreshMetricsVecs) DeleteLabelValues(mech, config string) { + // DeleteLabelValues is used over UnregisterMetrics to only delete metrics for a specific + // mechanism and config combination.
+ m.failuresVec.DeleteLabelValues(mech, config) + m.durationVec.DeleteLabelValues(mech, config) +} diff --git a/scrape/manager_test.go b/scrape/manager_test.go index 1ec4875d19..f82fbb2e41 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -1207,13 +1207,15 @@ func runManagers(t *testing.T, ctx context.Context, opts *Options, app storage.A } reg := prometheus.NewRegistry() - sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg)) + refreshMetrics := discovery.NewRefreshMetrics(reg) + sdMetrics, err := discovery.RegisterSDMetrics(reg, refreshMetrics) require.NoError(t, err) discoveryManager := discovery.NewManager( ctx, promslog.NewNopLogger(), reg, sdMetrics, + refreshMetrics, discovery.Updatert(100*time.Millisecond), ) scrapeManager, err := NewManager( From 75a9432048ef1d5861fde4ccf76a21e90c856e7e Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 25 Nov 2025 10:54:06 -0500 Subject: [PATCH 2/6] docs: old refresh metrics are cleaned up on reload Signed-off-by: Will Bollock --- docs/http_sd.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/http_sd.md b/docs/http_sd.md index d329ce07af..bf75aa2ae5 100644 --- a/docs/http_sd.md +++ b/docs/http_sd.md @@ -41,7 +41,8 @@ Prometheus caches target lists. If an error occurs while fetching an updated targets list, Prometheus keeps using the current targets list. The targets list is not saved across restart. The `prometheus_sd_refresh_failures_total` counter metric tracks the number of refresh failures and the `prometheus_sd_refresh_duration_seconds` -bucket can be used to track HTTP SD refresh attempts or performance. +bucket can be used to track HTTP SD refresh attempts or performance. These metrics are +removed when the underlying scrape job disappears on Prometheus configuration reload. The whole list of targets must be returned on every scrape. There is no support for incremental updates. 
A Prometheus instance does not send its hostname and it From 968b1a7a43a8b7f2f379ccd9ce78f9676990587e Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 25 Nov 2025 11:02:24 -0500 Subject: [PATCH 3/6] fix: add refreshMetrics to tests Signed-off-by: Will Bollock --- discovery/manager_test.go | 58 +++++++++---------- .../examples/custom-sd/adapter-usage/main.go | 2 +- .../examples/custom-sd/adapter/adapter.go | 4 +- .../custom-sd/adapter/adapter_test.go | 2 +- notifier/manager_test.go | 4 +- 5 files changed, 36 insertions(+), 34 deletions(-) diff --git a/discovery/manager_test.go b/discovery/manager_test.go index 5d34cb7ac0..780c5ab133 100644 --- a/discovery/manager_test.go +++ b/discovery/manager_test.go @@ -673,9 +673,9 @@ func TestTargetUpdatesOrder(t *testing.T) { defer cancel() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond @@ -786,9 +786,9 @@ func TestTargetSetTargetGroupsPresentOnConfigReload(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -822,9 +822,9 @@ func TestTargetSetTargetGroupsPresentOnConfigRename(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, 
promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -861,9 +861,9 @@ func TestTargetSetTargetGroupsPresentOnConfigDuplicateAndDeleteOriginal(t *testi ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -903,9 +903,9 @@ func TestTargetSetTargetGroupsPresentOnConfigChange(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -970,9 +970,9 @@ func TestTargetSetRecreatesTargetGroupsOnConfigChange(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1013,9 +1013,9 @@ func TestDiscovererConfigs(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + 
refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1049,9 +1049,9 @@ func TestTargetSetRecreatesEmptyStaticConfigs(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1090,9 +1090,9 @@ func TestIdenticalConfigurationsAreCoalesced(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, nil, reg, sdMetrics) + discoveryManager := NewManager(ctx, nil, reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1128,9 +1128,9 @@ func TestApplyConfigDoesNotModifyStaticTargets(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1188,9 +1188,9 @@ func TestGaugeFailedConfigs(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := 
NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1344,9 +1344,9 @@ func TestCoordinationWithReceiver(t *testing.T) { defer cancel() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - mgr := NewManager(ctx, nil, reg, sdMetrics) + mgr := NewManager(ctx, nil, reg, sdMetrics, refreshMetrics) require.NotNil(t, mgr) mgr.updatert = updateDelay go mgr.Run() @@ -1438,9 +1438,9 @@ func TestTargetSetTargetGroupsUpdateDuringApplyConfig(t *testing.T) { ctx := t.Context() reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond go discoveryManager.Run() @@ -1537,7 +1537,7 @@ func TestUnregisterMetrics(t *testing.T) { refreshMetrics, sdMetrics := NewTestMetrics(t, reg) - discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) // discoveryManager will be nil if there was an error configuring metrics. require.NotNil(t, discoveryManager) // Unregister all metrics. @@ -1554,10 +1554,10 @@ func TestUnregisterMetrics(t *testing.T) { // the manager should not hang. 
func TestConfigReloadAndShutdownRace(t *testing.T) { reg := prometheus.NewRegistry() - _, sdMetrics := NewTestMetrics(t, reg) + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) mgrCtx, mgrCancel := context.WithCancel(context.Background()) - discoveryManager := NewManager(mgrCtx, promslog.NewNopLogger(), reg, sdMetrics) + discoveryManager := NewManager(mgrCtx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) require.NotNil(t, discoveryManager) discoveryManager.updatert = 100 * time.Millisecond diff --git a/documentation/examples/custom-sd/adapter-usage/main.go b/documentation/examples/custom-sd/adapter-usage/main.go index e7f7a69b5d..320ae60c79 100644 --- a/documentation/examples/custom-sd/adapter-usage/main.go +++ b/documentation/examples/custom-sd/adapter-usage/main.go @@ -281,7 +281,7 @@ func main() { os.Exit(1) } - sdAdapter := adapter.NewAdapter(ctx, *outputFile, "exampleSD", disc, logger, metrics, reg) + sdAdapter := adapter.NewAdapter(ctx, *outputFile, "exampleSD", disc, logger, metrics, refreshMetrics, reg) sdAdapter.Run() <-ctx.Done() diff --git a/documentation/examples/custom-sd/adapter/adapter.go b/documentation/examples/custom-sd/adapter/adapter.go index b242c4eaa0..56a3584455 100644 --- a/documentation/examples/custom-sd/adapter/adapter.go +++ b/documentation/examples/custom-sd/adapter/adapter.go @@ -162,12 +162,12 @@ func (a *Adapter) Run() { } // NewAdapter creates a new instance of Adapter. 
-func NewAdapter(ctx context.Context, file, name string, d discovery.Discoverer, logger *slog.Logger, sdMetrics map[string]discovery.DiscovererMetrics, registerer prometheus.Registerer) *Adapter { +func NewAdapter(ctx context.Context, file, name string, d discovery.Discoverer, logger *slog.Logger, sdMetrics map[string]discovery.DiscovererMetrics, refreshMetrics discovery.RefreshMetricsManager, registerer prometheus.Registerer) *Adapter { return &Adapter{ ctx: ctx, disc: d, groups: make(map[string]*customSD), - manager: discovery.NewManager(ctx, logger, registerer, sdMetrics), + manager: discovery.NewManager(ctx, logger, registerer, sdMetrics, refreshMetrics), output: file, name: name, logger: logger, diff --git a/documentation/examples/custom-sd/adapter/adapter_test.go b/documentation/examples/custom-sd/adapter/adapter_test.go index 329ca8c29a..804faa8593 100644 --- a/documentation/examples/custom-sd/adapter/adapter_test.go +++ b/documentation/examples/custom-sd/adapter/adapter_test.go @@ -235,6 +235,6 @@ func TestWriteOutput(t *testing.T) { sdMetrics, err := discovery.RegisterSDMetrics(reg, refreshMetrics) require.NoError(t, err) - adapter := NewAdapter(ctx, tmpfile.Name(), "test_sd", nil, nil, sdMetrics, reg) + adapter := NewAdapter(ctx, tmpfile.Name(), "test_sd", nil, nil, sdMetrics, refreshMetrics, reg) require.NoError(t, adapter.writeOutput()) } diff --git a/notifier/manager_test.go b/notifier/manager_test.go index 64de020338..3e2a32ad32 100644 --- a/notifier/manager_test.go +++ b/notifier/manager_test.go @@ -754,13 +754,15 @@ func TestHangingNotifier(t *testing.T) { // The old implementation of TestHangingNotifier didn't take that into account. 
ctx := t.Context() reg := prometheus.NewRegistry() - sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg)) + refreshMetrics := discovery.NewRefreshMetrics(reg) + sdMetrics, err := discovery.RegisterSDMetrics(reg, refreshMetrics) require.NoError(t, err) sdManager := discovery.NewManager( ctx, promslog.NewNopLogger(), reg, sdMetrics, + refreshMetrics, discovery.Name("sd-manager"), discovery.Updatert(sdUpdatert), ) From 559321e6108995a71768dfe383d01873fdda0d19 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 25 Nov 2025 11:03:50 -0500 Subject: [PATCH 4/6] fix: prometheus_sd_discovered_targets deletion This fixes a bug with the prometheus_sd_discovered_targets metric in which it would not be deleted for obsolete scrape jobs. It was passing too many parameters and the metric would persist for old scrape jobs instead of rightfully being deleted. Signed-off-by: Will Bollock --- discovery/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/manager.go b/discovery/manager.go index 930637fed0..ae40262d6f 100644 --- a/discovery/manager.go +++ b/discovery/manager.go @@ -260,7 +260,7 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { // Remove obsolete subs' targets. if _, ok := prov.newSubs[s]; !ok { delete(m.targets, poolKey{s, prov.name}) - m.metrics.DiscoveredTargets.DeleteLabelValues(m.name, s) + m.metrics.DiscoveredTargets.DeleteLabelValues(s) // TODO - okay to calculate this again down here? better method? // Clean up refresh metrics again for subs that are being removed from a provider that is still running. 
From b4c044f1f81e59675abf11d373ef52c0023b9d64 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 25 Nov 2025 11:13:46 -0500 Subject: [PATCH 5/6] fix: add targetsMtx lock for targets access Signed-off-by: Will Bollock --- discovery/manager.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discovery/manager.go b/discovery/manager.go index ae40262d6f..904e2c1384 100644 --- a/discovery/manager.go +++ b/discovery/manager.go @@ -234,8 +234,9 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { prov.mu.RUnlock() // Clear up refresh metrics associated with this cancelled provider (sub means scrape job name). + m.targetsMtx.Lock() for s := range prov.subs { - // Also clean up discovered targets metric. + // Also clean up discovered targets metric. targetsMtx lock needed for safe access to m.targets. delete(m.targets, poolKey{s, prov.name}) m.metrics.DiscoveredTargets.DeleteLabelValues(s) @@ -245,6 +246,7 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { } } } + m.targetsMtx.Unlock() continue } prov.mu.RUnlock() @@ -262,7 +264,6 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error { delete(m.targets, poolKey{s, prov.name}) m.metrics.DiscoveredTargets.DeleteLabelValues(s) - // TODO - okay to calculate this again down here? better method? // Clean up refresh metrics again for subs that are being removed from a provider that is still running. 
if m.refreshMetrics != nil { cfg, ok := prov.config.(Config) From bef93d121cab85141ca569df241ef29b95d73e12 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 25 Nov 2025 12:02:48 -0500 Subject: [PATCH 6/6] test: validate refresh/discover metrics are gone Signed-off-by: Will Bollock --- discovery/manager_test.go | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/discovery/manager_test.go b/discovery/manager_test.go index 780c5ab133..abf2af6f52 100644 --- a/discovery/manager_test.go +++ b/discovery/manager_test.go @@ -1550,6 +1550,56 @@ func TestUnregisterMetrics(t *testing.T) { } } +// Refresh and discovery metrics should be deleted for providers that are removed. +func TestMetricsCleanupAfterConfigReload(t *testing.T) { + ctx := t.Context() + + reg := prometheus.NewRegistry() + refreshMetrics, sdMetrics := NewTestMetrics(t, reg) + + discoveryManager := NewManager(ctx, promslog.NewNopLogger(), reg, sdMetrics, refreshMetrics) + require.NotNil(t, discoveryManager) + discoveryManager.updatert = 100 * time.Millisecond + go discoveryManager.Run() + + c := map[string]Configs{ + "prometheus": { + staticConfig("foo:9090", "bar:9090"), + }, + "other": { + staticConfig("baz:9090"), + }, + } + discoveryManager.ApplyConfig(c) + <-discoveryManager.SyncCh() + + // Manually instantiate refresh metrics to make them visible + refreshMetrics.Instantiate("static", "prometheus").Failures.Add(0) + refreshMetrics.Instantiate("static", "other").Failures.Add(0) + + count, err := client_testutil.GatherAndCount(reg, "prometheus_sd_discovered_targets") + require.NoError(t, err) + require.Equal(t, 2, count) + + count, err = client_testutil.GatherAndCount(reg, "prometheus_sd_refresh_failures_total") + require.NoError(t, err) + require.Equal(t, 2, count) + + // Simulate a config refresh. + delete(c, "prometheus") + discoveryManager.ApplyConfig(c) + <-discoveryManager.SyncCh() + + // Ensure we still have metrics for the remaining provider. 
+ count, err = client_testutil.GatherAndCount(reg, "prometheus_sd_discovered_targets") + require.NoError(t, err) + require.Equal(t, 1, count) + + count, err = client_testutil.GatherAndCount(reg, "prometheus_sd_refresh_failures_total") + require.NoError(t, err) + require.Equal(t, 1, count) +} + // Calling ApplyConfig() that removes providers at the same time as shutting down // the manager should not hang. func TestConfigReloadAndShutdownRace(t *testing.T) {