VAULT-36112: Better handling for Retry-After rate limit header. (#30887)

* round up * round up, test, update backoff * add external test * changelog * use released version of go-retryablehttp * update api version of go-retryablehttp * fix name
2026-02-03 20:40:45 -05:00 · 2025-06-19 11:22:11 +02:00 · 2025-06-19 11:22:11 +02:00 · 544edd58d6
commit 544edd58d6
parent bef4afdb44
9 changed files with 77 additions and 10 deletions
--- a/api/client.go
+++ b/api/client.go
@ -258,7 +258,7 @@ func DefaultConfig() *Config {
 		MinRetryWait: time.Millisecond * 1000,
 		MaxRetryWait: time.Millisecond * 1500,
 		MaxRetries:   2,
-		Backoff:      retryablehttp.LinearJitterBackoff,
+		Backoff:      retryablehttp.RateLimitLinearJitterBackoff,
 	}

 	transport := config.HttpClient.Transport.(*http.Transport)
--- a/api/go.mod
+++ b/api/go.mod
@ -17,7 +17,7 @@ require (
 	github.com/hashicorp/go-cleanhttp v0.5.2
 	github.com/hashicorp/go-hclog v1.6.3
 	github.com/hashicorp/go-multierror v1.1.1
-	github.com/hashicorp/go-retryablehttp v0.7.7
+	github.com/hashicorp/go-retryablehttp v0.7.8
 	github.com/hashicorp/go-rootcerts v1.0.2
 	github.com/hashicorp/go-secure-stdlib/parseutil v0.1.6
 	github.com/hashicorp/go-secure-stdlib/strutil v0.1.2
--- a/api/go.sum
+++ b/api/go.sum
@ -25,8 +25,8 @@ github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVH
 github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
 github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
 github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
-github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU=
-github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk=
+github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48=
+github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw=
 github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc=
 github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8=
 github.com/hashicorp/go-secure-stdlib/parseutil v0.1.6 h1:om4Al8Oy7kCm/B86rLCLah4Dt5Aa0Fr5rYBG60OzwHQ=
--- a/changelog/30887.txt
+++ b/changelog/30887.txt
@ -0,0 +1,6 @@
+```release-note:change
+api: Update the default API client to check for the `Retry-After` header and, if it exists, wait for the specified duration before retrying the request.
+```
+```release-note:change
+quotas/rate-limit: Round up the `Retry-After` value to the nearest second when calculating the retry delay.
+```
--- a/go.mod
+++ b/go.mod
@ -105,7 +105,7 @@ require (
 	github.com/hashicorp/go-pgmultiauth v1.0.0
 	github.com/hashicorp/go-plugin v1.6.1
 	github.com/hashicorp/go-raftchunking v0.6.3-0.20191002164813-7e9e8525653a
-	github.com/hashicorp/go-retryablehttp v0.7.7
+	github.com/hashicorp/go-retryablehttp v0.7.8
 	github.com/hashicorp/go-rootcerts v1.0.2
 	github.com/hashicorp/go-secure-stdlib/awsutil v0.3.0
 	github.com/hashicorp/go-secure-stdlib/base62 v0.1.2
--- a/go.sum
+++ b/go.sum
@ -1449,8 +1449,8 @@ github.com/hashicorp/go-plugin v1.6.1/go.mod h1:XPHFku2tFo3o3QKFgSYo+cghcUhw1NA1
 github.com/hashicorp/go-raftchunking v0.6.3-0.20191002164813-7e9e8525653a h1:FmnBDwGwlTgugDGbVxwV8UavqSMACbGrUpfc98yFLR4=
 github.com/hashicorp/go-raftchunking v0.6.3-0.20191002164813-7e9e8525653a/go.mod h1:xbXnmKqX9/+RhPkJ4zrEx4738HacP72aaUPlT2RZ4sU=
 github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
-github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU=
-github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk=
+github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48=
+github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw=
 github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc=
 github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8=
 github.com/hashicorp/go-secure-stdlib/awsutil v0.3.0 h1:I8bynUKMh9I7JdwtW9voJ0xmHvBpxQtLjrMFDYmhOxY=
--- a/vault/external_tests/quotas/quotas_test.go
+++ b/vault/external_tests/quotas/quotas_test.go
@ -6,6 +6,7 @@ package quotas
 import (
 	"encoding/json"
 	"fmt"
+	"sync"
 	"testing"
 	"time"

@ -13,6 +14,7 @@ import (
 	"github.com/hashicorp/vault/builtin/credential/userpass"
 	"github.com/hashicorp/vault/builtin/logical/pki"
 	"github.com/hashicorp/vault/helper/constants"
+	"github.com/hashicorp/vault/helper/testhelpers"
 	"github.com/hashicorp/vault/helper/testhelpers/teststorage"
 	"github.com/hashicorp/vault/sdk/helper/testhelpers/schema"
 	"github.com/hashicorp/vault/sdk/logical"
@ -727,3 +729,40 @@ func TestQuotas_RateLimitQuota_GroupByConfig(t *testing.T) {
 		})
 	}
 }
+
+// TestQuotas_RateLimit_ZeroRetryRegression verifies that the rate limit response
+// headers do not return a Retry-After value of 0.
+func TestQuotas_RateLimit_ZeroRetryRegression(t *testing.T) {
+	conf, opts := teststorage.ClusterSetup(coreConfig, nil, nil)
+	cluster := vault.NewTestCluster(t, conf, opts)
+	cluster.Start()
+	defer cluster.Cleanup()
+
+	testhelpers.WaitForActiveNode(t, cluster)
+	client := cluster.Cores[0].Client
+	_, err := client.Logical().Write("sys/quotas/config", map[string]interface{}{
+		"enable_rate_limit_response_headers": true,
+	})
+	require.NoError(t, err)
+	_, err = client.Logical().Write("sys/quotas/rate-limit/root-rlq", map[string]interface{}{
+		"name": "root-rlq",
+		"rate": 1,
+	})
+	require.NoError(t, err)
+	failed := atomic.NewBool(false)
+	wg := sync.WaitGroup{}
+	client = client.WithResponseCallbacks(func(response *api.Response) {
+		if response.Header.Get("Retry-After") == "0" {
+			failed.Store(true)
+		}
+	})
+	for i := 0; i < 20; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			client.Logical().Read("sys/mounts")
+		}()
+	}
+	wg.Wait()
+	require.False(t, failed.Load())
+}
--- a/vault/quotas/quotas_rate_limit.go
+++ b/vault/quotas/quotas_rate_limit.go
@ -350,7 +350,7 @@ func (rlq *RateLimitQuota) allow(ctx context.Context, req *Request) (Response, e
 		} else {
 			// deny the request and return early
 			resp.Allowed = false
-			retryAfter = strconv.Itoa(int(time.Until(blockedAt.Add(rlq.BlockInterval)).Seconds()))
+			retryAfter = rlq.retryAfterSeconds(time.Now(), blockedAt.Add(rlq.BlockInterval))
 			return resp, nil
 		}
 	}
@ -363,20 +363,24 @@ func (rlq *RateLimitQuota) allow(ctx context.Context, req *Request) (Response, e
 	resp.Allowed = allow
 	resp.Headers[httplimit.HeaderRateLimitLimit] = strconv.FormatUint(limit, 10)
 	resp.Headers[httplimit.HeaderRateLimitRemaining] = strconv.FormatUint(remaining, 10)
-	resp.Headers[httplimit.HeaderRateLimitReset] = strconv.Itoa(int(time.Until(time.Unix(0, int64(reset))).Seconds()))
+	resp.Headers[httplimit.HeaderRateLimitReset] = rlq.retryAfterSeconds(time.Now(), time.Unix(0, int64(reset)))
 	retryAfter = resp.Headers[httplimit.HeaderRateLimitReset]

 	// If the request is not allowed (i.e. rate limit threshold reached) and blocking
 	// is enabled, we add the client to the set of blocked clients.
 	if !resp.Allowed && rlq.purgeBlocked {
 		blockedAt := time.Now()
-		retryAfter = strconv.Itoa(int(time.Until(blockedAt.Add(rlq.BlockInterval)).Seconds()))
+		retryAfter = rlq.retryAfterSeconds(time.Now(), blockedAt.Add(rlq.BlockInterval))
 		rlq.blockedClients.Store(key, blockedAt)
 	}

 	return resp, nil
 }

+func (rlq *RateLimitQuota) retryAfterSeconds(now, waitUntil time.Time) string {
+	return strconv.Itoa(int(math.Ceil(waitUntil.Sub(now).Seconds())))
+}
+
 // close stops the current running client purge loop.
 // It should be called with the write lock held.
 func (rlq *RateLimitQuota) close(ctx context.Context) error {
--- a/vault/quotas/quotas_rate_limit_test.go
+++ b/vault/quotas/quotas_rate_limit_test.go
@ -234,3 +234,21 @@ func TestRateLimitQuota_Update(t *testing.T) {
 	require.Nil(t, quota.close(context.Background()))
 	require.Nil(t, quotaUpdate.close(context.Background()))
 }
+
+// TestRateLimitQuota_retryAfterSeconds tests the that retryAfterSeconds rounds
+// up the number of seconds until the block ends
+func TestRateLimitQuota_retryAfterSeconds(t *testing.T) {
+	quota := NewRateLimitQuota("quota1", "", "", "", "", GroupByIp, false, time.Second, 0, 10, 0)
+	now := time.Now()
+	t.Run("less than 1", func(t *testing.T) {
+		blockedUntil := time.Now().Add(200 * time.Millisecond)
+		retryAfter := quota.retryAfterSeconds(now, blockedUntil)
+		require.Equal(t, "1", retryAfter)
+	})
+
+	t.Run("more than 1", func(t *testing.T) {
+		blockedUntil := time.Now().Add(1300 * time.Millisecond)
+		retryAfter := quota.retryAfterSeconds(now, blockedUntil)
+		require.Equal(t, "2", retryAfter)
+	})
+}