From 8ca9dc102c4a0ed2820c60ae1dfeb20d8966ae09 Mon Sep 17 00:00:00 2001
From: Jainil Rana
Date: Wed, 28 Jan 2026 20:26:11 -0500
Subject: [PATCH] Add unauthenticated probe listener for health endpoints

Signed-off-by: Jainil Rana
---
 cmd/prometheus/main.go          | 30 ++++++++++++
 docs/command-line/prometheus.md |  1 +
 web/web.go                      | 73 +++++++++++++++++++++++++++++-
 web/web_test.go                 | 80 +++++++++++++++++++++++++++++++++
 4 files changed, 183 insertions(+), 1 deletion(-)

diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go
index e4f15f5cb8..59644939f5 100644
--- a/cmd/prometheus/main.go
+++ b/cmd/prometheus/main.go
@@ -375,6 +375,11 @@ func main() {
 	a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry. Can be repeated.").
 		Default("0.0.0.0:9090").StringsVar(&cfg.web.ListenAddresses)
 
+	// Deliberately no Default(""): on a repeatable flag it would produce a
+	// one-element slice holding "", and the probe server would be started for it.
+	a.Flag("web.probe-listen-address", "Address to listen on for unauthenticated health probes (/-/healthy, /-/ready). Can be repeated.").
+		StringsVar(&cfg.web.ProbeListenAddresses)
+
 	a.Flag("auto-gomaxprocs", "Automatically set GOMAXPROCS to match Linux container CPU quota").
 		Default("true").BoolVar(&cfg.maxprocsEnable)
 
@@ -1104,6 +1109,14 @@ func main() {
 		os.Exit(1)
 	}
 
+	// Bind the probe listeners during startup so that an unusable address
+	// exits immediately instead of failing later inside the run group.
+	probeListeners, err := webHandler.ProbeListeners()
+	if err != nil {
+		logger.Error("Unable to start probe listener", "err", err)
+		os.Exit(1)
+	}
+
 	err = toolkit_web.Validate(*webConfig)
 	if err != nil {
 		logger.Error("Unable to validate web configuration file", "err", err)
@@ -1165,6 +1178,23 @@ func main() {
 			},
 		)
 	}
+	{
+		// Probe server: only part of the run group when probe listeners exist.
+		if len(probeListeners) > 0 {
+			g.Add(
+				func() error {
+					if err := webHandler.RunProbes(ctxWeb, probeListeners); err != nil {
+						return fmt.Errorf("error starting the probe server: %w", err)
+					}
+					return nil
+				},
+				func(error) {
+					// Cancelling ctxWeb makes RunProbes return.
+					cancelWeb()
+				},
+			)
+		}
+	}
 	if !agentMode {
 		// Rule manager.
g.Add( diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 251fdfd6a4..c8a87c5925 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -14,6 +14,7 @@ The Prometheus monitoring server | --config.file | Prometheus configuration file path. | `prometheus.yml` | | --config.auto-reload-interval | Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes. | `30s` | | --web.listen-address ... | Address to listen on for UI, API, and telemetry. Can be repeated. | `0.0.0.0:9090` | +| --web.probe-listen-address ... | Address to listen on for unauthenticated health probes (/-/healthy, /-/ready). Can be repeated. | | | --auto-gomaxprocs | Automatically set GOMAXPROCS to match Linux container CPU quota | `true` | | --auto-gomemlimit | Automatically set GOMEMLIMIT to match Linux container or system memory limit | `true` | | --auto-gomemlimit.ratio | The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory | `0.9` | diff --git a/web/web.go b/web/web.go index 4df447be64..5b641a5025 100644 --- a/web/web.go +++ b/web/web.go @@ -17,6 +17,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "log/slog" @@ -272,7 +273,9 @@ type Options struct { NotificationsSub func() (<-chan notifications.Notification, func(), bool) Flags map[string]string - ListenAddresses []string + ListenAddresses []string + ProbeListenAddresses []string + CORSOrigin *regexp.Regexp ReadTimeout time.Duration MaxConnections int @@ -672,6 +675,22 @@ func (h *Handler) Listeners() ([]net.Listener, error) { return listeners, nil } +func (h *Handler) ProbeListeners() ([]net.Listener, error) { + if len(h.options.ProbeListenAddresses) == 0 { + return nil, nil + } + var listeners []net.Listener + sem := netconnlimit.NewSharedSemaphore(h.options.MaxConnections) + for _, address := range h.options.ProbeListenAddresses { + listener, err := 
h.Listener(address, sem) + if err != nil { + return listeners, err + } + listeners = append(listeners, listener) + } + return listeners, nil +} + // Listener creates the TCP listener for web requests. func (h *Handler) Listener(address string, sem chan struct{}) (net.Listener, error) { h.logger.Info("Start listening for connections", "address", address) @@ -741,6 +760,58 @@ func (h *Handler) Run(ctx context.Context, listeners []net.Listener, webConfig s } } +func (h *Handler) RunProbes(ctx context.Context, listeners []net.Listener) error { + if len(listeners) == 0 { + var err error + listeners, err = h.ProbeListeners() + if err != nil || len(listeners) == 0 { + return err + } + } + + mux := http.NewServeMux() + mux.HandleFunc("/-/healthy", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet && r.Method != http.MethodHead { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + fmt.Fprintf(w, "%s is Healthy.\n", h.options.AppName) + }) + + readyHandler := h.testReady(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet && r.Method != http.MethodHead { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + fmt.Fprintf(w, "%s is Ready.\n", h.options.AppName) + }) + + mux.Handle("/-/ready", readyHandler) + errlog := slog.NewLogLogger(h.logger.Handler(), slog.LevelError) + httpSrv := &http.Server{ + Handler: mux, + ErrorLog: errlog, + ReadTimeout: h.options.ReadTimeout, + } + + errCh := make(chan error, len(listeners)) + for _, l := range listeners { + go func(ln net.Listener) { + errCh <- httpSrv.Serve(ln) + }(l) + } + select { + case err := <-errCh: + if errors.Is(err, http.ErrServerClosed) { + return nil + } + return err + case <-ctx.Done(): + _ = httpSrv.Shutdown(ctx) + return nil + } +} + func (h *Handler) consoles(w http.ResponseWriter, r *http.Request) { ctx := r.Context() name := route.Param(ctx, "filepath") diff --git a/web/web_test.go b/web/web_test.go index ce682912a9..89e084bcb3 100644 --- 
a/web/web_test.go
+++ b/web/web_test.go
@@ -32,6 +32,7 @@ import (
 
 	"github.com/prometheus/client_golang/prometheus"
 	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/prometheus/common/promslog"
 	"github.com/stretchr/testify/require"
 
 	"github.com/prometheus/prometheus/config"
@@ -39,6 +40,7 @@ import (
 	"github.com/prometheus/prometheus/rules"
 	"github.com/prometheus/prometheus/scrape"
 	"github.com/prometheus/prometheus/tsdb"
+	"github.com/prometheus/prometheus/util/features"
 	"github.com/prometheus/prometheus/util/testutil"
 )
 
@@ -209,6 +211,91 @@ func TestReadyAndHealthy(t *testing.T) {
 	cleanupTestResponse(t, resp)
 }
 
+// TestProbeServer checks that the probe server answers /-/healthy and
+// /-/ready for GET only, follows the handler's readiness, serves no other
+// path, and returns without error once its context is cancelled.
+func TestProbeServer(t *testing.T) {
+	logger := promslog.NewNopLogger()
+	reg := prometheus.NewRegistry()
+
+	dbDir := t.TempDir()
+	db, err := tsdb.Open(dbDir, nil, nil, nil, nil)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		require.NoError(t, db.Close())
+	})
+
+	port := fmt.Sprintf(":%d", testutil.RandomUnprivilegedPort(t))
+
+	opts := &Options{
+		ListenAddresses: []string{port},
+		RoutePrefix:     "/",
+		ExternalURL: &url.URL{
+			Scheme: "http",
+			Host:   "localhost" + port,
+			Path:   "/",
+		},
+		LocalStorage:    &dbAdapter{db},
+		TSDBDir:         dbDir,
+		ScrapeManager:   &scrape.Manager{},
+		RuleManager:     &rules.Manager{},
+		Registerer:      reg,
+		Gatherer:        reg,
+		FeatureRegistry: features.DefaultRegistry,
+		AppName:         "Prometheus",
+		Flags:           map[string]string{},
+	}
+
+	h := New(logger, opts)
+
+	// Hand RunProbes a pre-bound loopback listener so the test knows the address.
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	require.NoError(t, err)
+	defer ln.Close()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- h.RunProbes(ctx, []net.Listener{ln})
+	}()
+
+	baseURL := "http://" + ln.Addr().String()
+
+	// requireStatus sends one request and asserts the response status code.
+	requireStatus := func(method, path string, want int) {
+		t.Helper()
+		req, err := http.NewRequest(method, baseURL+path, nil)
+		require.NoError(t, err)
+		resp, err := http.DefaultClient.Do(req)
+		require.NoError(t, err)
+		defer cleanupTestResponse(t, resp)
+		require.Equal(t, want, resp.StatusCode)
+	}
+
+	requireStatus(http.MethodGet, "/-/healthy", http.StatusOK)
+	requireStatus(http.MethodPost, "/-/healthy", http.StatusMethodNotAllowed)
+
+	// Not ready until SetReady(Ready) is called.
+	requireStatus(http.MethodGet, "/-/ready", http.StatusServiceUnavailable)
+	h.SetReady(Ready)
+	requireStatus(http.MethodGet, "/-/ready", http.StatusOK)
+	requireStatus(http.MethodPost, "/-/ready", http.StatusMethodNotAllowed)
+
+	// Nothing but the probe paths is exposed.
+	requireStatus(http.MethodGet, "/metrics", http.StatusNotFound)
+
+	cancel()
+
+	select {
+	case err := <-errCh:
+		require.NoError(t, err)
+	case <-time.After(2 * time.Second):
+		t.Fatal("probe server did not stop")
+	}
+}
+
 func TestRoutePrefix(t *testing.T) {
 	t.Parallel()
 	dbDir := t.TempDir()