Avoid poisoning the trusted certificate cache on error (#9457) (#9496)

The code that loads the trusted certificate cache for cert-based
authentication ignores any error that occurs while attempting to load
any of the certificates that it finds. Undoubtedly some deployments
have broken certificates or other non-certificate files stored in
their respective back-ends, and so this is important behavior: we
don't want to fail authentication just because `README.md` is not a
valid certificate!

In addition, because listing files and loading certificates is
expensive, the server maintains a cache of trusted certificates. This
cache is populated the first time it's needed, and then used for the
lifetime of the process. If a file fails to load as a certificate,
then it is simply not included in the cache.

These two things lead to a problem when using a backend that might be
subject to transient failures: a hiccough in the certificate loading
process can cause the server to establish a cache that is missing an
otherwise valid certificate. This can then lead to clients failing to
authenticate to the server, until such time as the server is restarted
and the cache reloaded.

This change makes the certificate cache more resilient to loading
failures, by caching partial successes. With this patch, the cache
behavior becomes:

- If the cache exists *and* is either complete or it is not yet time
  to attempt to reload the certificates, then the cached results are
  used without reservation.

- Otherwise we attempt to load the certificates from storage:

  - If the cache does not already exist then a new, empty cache is
    created.

  - The storage is listed, we attempt to load everything in storage,
    skipping things that we have already successfully loaded, and
    skipping things that we cannot load, as usual.

  - Once we have attempted to load everything from storage, if there
    were any errors, we compute a deadline for retrying the load, with
    an exponentially increasing delay. If there were no errors, then
    the cache is considered complete, and there will be no retry.

This has the nice behavior that we recover from transient failures
eventually, while the exponential back-off ensures that we don't waste
too much time attempting to load certificates that can never be
loaded.

Co-authored-by: John Doty <john.doty@databricks.com>
Co-authored-by: Steven Clark <steven.clark@hashicorp.com>
This commit is contained in:
Vault Automation 2025-09-19 15:55:37 -04:00 committed by GitHub
parent ab62e44b3a
commit f66dc5a921
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 206 additions and 45 deletions

View file

@ -8,7 +8,9 @@ import (
"crypto/x509"
"fmt"
"io"
"maps"
"net/http"
"slices"
"strings"
"sync"
"sync/atomic"
@ -73,11 +75,40 @@ func Backend() *backend {
return &b
}
// trustedRetry tracks the backoff state for re-attempting a trusted
// certificate load after a partial failure.
type trustedRetry struct {
	// attempt counts consecutive failed load passes; it drives the
	// exponentially increasing retry delay.
	attempt int
	// deadline is the time before which the cached (partial) results are
	// used as-is; after it passes, a reload from storage is attempted.
	deadline time.Time
}
// trusted is the cached result of loading the trusted certificates from
// storage for cert-based authentication.
type trusted struct {
	// pool holds the CA certificates for chain verification.
	pool *x509.CertPool
	// trusted are the parsed CA certificate entries.
	trusted []*ParsedCert
	// trustedNonCAs are the parsed non-CA certificate entries.
	trustedNonCAs []*ParsedCert
	// ocspConf aggregates the OCSP verification settings.
	ocspConf *ocsp.VerifyConfig
	// loaded records the storage names that were successfully loaded, so a
	// retry pass can skip them and only re-attempt the failures.
	loaded map[string]struct{}
	// retry is non-nil when the last load was only partially successful;
	// nil means the cache is complete and needs no reload.
	retry *trustedRetry
}
// clone returns a copy of t that a reload pass can mutate without
// affecting readers of the original: the pool, the entry slices, the OCSP
// config, the loaded-name set, and the retry state are all copied. The
// individual certificate/entry pointers are shared between the copies.
func (t *trusted) clone() *trusted {
	c := &trusted{
		pool:          t.pool.Clone(),
		trusted:       slices.Clone(t.trusted),
		trustedNonCAs: slices.Clone(t.trustedNonCAs),
		loaded:        maps.Clone(t.loaded),
	}
	if t.ocspConf != nil {
		c.ocspConf = &ocsp.VerifyConfig{
			OcspEnabled:          t.ocspConf.OcspEnabled,
			ExtraCas:             slices.Clone(t.ocspConf.ExtraCas),
			OcspServersOverride:  slices.Clone(t.ocspConf.OcspServersOverride),
			OcspFailureMode:      t.ocspConf.OcspFailureMode,
			QueryAllServers:      t.ocspConf.QueryAllServers,
			OcspThisUpdateMaxAge: t.ocspConf.OcspThisUpdateMaxAge,
			OcspMaxRetries:       t.ocspConf.OcspMaxRetries,
		}
	}
	// retry is nil once the cache is complete; guard it rather than
	// panicking on the dereference as the unconditional copy would.
	if t.retry != nil {
		c.retry = &trustedRetry{
			attempt:  t.retry.attempt,
			deadline: t.retry.deadline,
		}
	}
	return c
}
type backend struct {

View file

@ -3112,6 +3112,62 @@ func TestOcspMaxRetriesUpdate(t *testing.T) {
require.Equal(t, 4, resp.Data["ocsp_max_retries"], "ocsp config didn't match expectations on legacy entry")
}
// TestRecoverPartialLoad verifies that a transient storage failure during
// trusted-certificate loading does not permanently poison the cache: the
// partial cache is marked for retry, and a later load (after storage
// recovers and the retry deadline passes) picks up the certificate.
func TestRecoverPartialLoad(t *testing.T) {
	storage := &logical.InmemStorage{}
	ctx := context.Background()

	lb, err := Factory(ctx, &logical.BackendConfig{
		System: &logical.StaticSystemView{
			DefaultLeaseTTLVal: 300 * time.Second,
			MaxLeaseTTLVal:     1800 * time.Second,
		},
		StorageView: storage,
	})
	require.NoError(t, err, "failed creating backend")
	b, ok := lb.(*backend)
	require.True(t, ok, "somehow the backend was the wrong type")

	// There is a single certificate in storage...
	nonCACert, err := os.ReadFile(testCertPath1)
	require.NoError(t, err, "failed reading test certificate")
	entry, err := logical.StorageEntryJSON("cert/cert_a", CertEntry{
		Name:        "cert_a",
		Certificate: string(nonCACert),
	})
	require.NoError(t, err, "failed building storage entry")
	require.NoError(t, storage.Put(ctx, entry), "failed writing storage entry")

	// ...but storage doesn't actually work right now....
	storage.FailGet(true)

	// ...can we load certificates? no.
	_, _, trustedNonCAs, _ := b.getTrustedCerts(ctx, storage, "")
	require.Len(t, trustedNonCAs, 0, "should have no non CA certificates yet")

	// We should have a cache, and it should have a retry associated with it.
	trusted, _ /*complete*/ := b.getTrustedCertsFromCache("")
	require.NotNil(t, trusted, "cache didn't exist")
	require.NotNil(t, trusted.retry, "cache not marked with a retry")
	// We can't verify complete because it depends on how fast the test is
	// running (technically).

	// Force the retry deadline into the past, and repair storage. (It works now!)
	trusted.retry.deadline = time.Now().Add(-1 * time.Second)
	storage.FailGet(false)

	// Now when we get the trusted certs, it should actually work!
	_, _, trustedNonCAs, _ = b.getTrustedCerts(ctx, storage, "")
	require.Len(t, trustedNonCAs, 1, "should have recovered and loaded a non-CA cert")
	require.Equal(t, "cert_a", trustedNonCAs[0].Entry.Name, "non-CA cert name didn't match")
}
func loadCerts(t *testing.T, certFile, certKey string) tls.Certificate {
caTLS, err := tls.LoadX509KeyPair(certFile, certKey)
require.NoError(t, err, "failed reading ca/key files")

View file

@ -13,8 +13,11 @@ import (
"encoding/pem"
"errors"
"fmt"
"math"
"math/rand"
"net/url"
"strings"
"time"
"github.com/hashicorp/errwrap"
"github.com/hashicorp/go-multierror"
@ -679,8 +682,8 @@ func (b *backend) certificateExtensionsMetadata(clientCert *x509.Certificate, co
func (b *backend) getTrustedCerts(ctx context.Context, storage logical.Storage, certName string) (pool *x509.CertPool, trusted []*ParsedCert, trustedNonCAs []*ParsedCert, conf *ocsp.VerifyConfig) {
if !b.trustedCacheDisabled.Load() {
trusted, found := b.getTrustedCertsFromCache(certName)
if found {
trusted, complete := b.getTrustedCertsFromCache(certName)
if complete {
return trusted.pool, trusted.trusted, trusted.trustedNonCAs, trusted.ocspConf
}
}
@ -688,15 +691,21 @@ func (b *backend) getTrustedCerts(ctx context.Context, storage logical.Storage,
}
func (b *backend) getTrustedCertsFromCache(certName string) (*trusted, bool) {
var trusted *trusted
if certName == "" {
trusted := b.trustedCacheFull.Load()
if trusted != nil {
return trusted, true
}
} else if trusted, found := b.trustedCache.Get(certName); found {
return trusted, true
trusted = b.trustedCacheFull.Load()
} else {
trusted, _ = b.trustedCache.Get(certName)
}
return nil, false
if trusted == nil {
return nil, false
}
// We're complete (for our purposes here) if we're really complete
// (because retry is nil) or if it's just not time to retry the load yet.
complete := (trusted.retry == nil) || time.Now().Before(trusted.retry.deadline)
return trusted, complete
}
// loadTrustedCerts is used to load all the trusted certificates from the backend
@ -705,16 +714,31 @@ func (b *backend) loadTrustedCerts(ctx context.Context, storage logical.Storage,
lock.Lock()
defer lock.Unlock()
var cache *trusted
if !b.trustedCacheDisabled.Load() {
trusted, found := b.getTrustedCertsFromCache(certName)
if found {
return trusted.pool, trusted.trusted, trusted.trustedNonCAs, trusted.ocspConf
var complete bool
cache, complete = b.getTrustedCertsFromCache(certName)
if complete {
return cache.pool, cache.trusted, cache.trustedNonCAs, cache.ocspConf
}
}
pool = x509.NewCertPool()
trustedCerts = make([]*ParsedCert, 0)
trustedNonCAs = make([]*ParsedCert, 0)
if cache == nil {
cache = &trusted{
pool: x509.NewCertPool(),
trusted: make([]*ParsedCert, 0),
trustedNonCAs: make([]*ParsedCert, 0),
loaded: make(map[string]struct{}),
ocspConf: &ocsp.VerifyConfig{},
}
} else {
cache = cache.clone()
}
pool = cache.pool
trustedCerts = cache.trusted
trustedNonCAs = cache.trustedNonCAs
conf = cache.ocspConf
var names []string
if certName != "" {
@ -728,25 +752,23 @@ func (b *backend) loadTrustedCerts(ctx context.Context, storage logical.Storage,
}
}
conf = &ocsp.VerifyConfig{}
anyErrors := false
for _, name := range names {
entry, err := b.Cert(ctx, storage, strings.TrimPrefix(name, trustedCertPath))
if err != nil {
b.Logger().Error("failed to load trusted cert", "name", name, "error", err)
continue
}
if entry == nil {
// This could happen when the certName was provided and the cert doesn't exist,
// or just if between the LIST and the GET the cert was deleted.
if _, found := cache.loaded[name]; found {
continue
}
parsed := parsePEM([]byte(entry.Certificate))
if len(parsed) == 0 {
b.Logger().Error("failed to parse certificate", "name", name)
entry, parsed, ocsp_ca_certs := b.loadTrustedCert(ctx, storage, name)
if entry == nil {
anyErrors = true
continue
}
parsed = append(parsed, parsePEM([]byte(entry.OcspCaCertificates))...)
parsed = append(parsed, ocsp_ca_certs...)
// NOTE: From this point on please finish adding the cert to all the
// appropriate lists and pools and configuration! Perform any error
// checking above this line.
cache.loaded[name] = struct{}{}
if !parsed[0].IsCA {
trustedNonCAs = append(trustedNonCAs, &ParsedCert{
@ -758,12 +780,12 @@ func (b *backend) loadTrustedCerts(ctx context.Context, storage logical.Storage,
pool.AddCert(p)
}
// Create a ParsedCert entry
trustedCerts = append(trustedCerts, &ParsedCert{
Entry: entry,
Certificates: parsed,
})
}
if entry.OcspEnabled {
conf.OcspEnabled = true
conf.OcspServersOverride = append(conf.OcspServersOverride, entry.OcspServersOverride...)
@ -776,33 +798,82 @@ func (b *backend) loadTrustedCerts(ctx context.Context, storage logical.Storage,
conf.OcspThisUpdateMaxAge = entry.OcspThisUpdateMaxAge
conf.OcspMaxRetries = entry.OcspMaxRetries
if len(entry.OcspCaCertificates) > 0 {
certs, err := certutil.ParseCertsPEM([]byte(entry.OcspCaCertificates))
if err != nil {
b.Logger().Error("failed to parse ocsp_ca_certificates", "name", name, "error", err)
continue
}
conf.ExtraCas = certs
if len(ocsp_ca_certs) > 0 {
conf.ExtraCas = ocsp_ca_certs
}
}
}
if !b.trustedCacheDisabled.Load() {
entry := &trusted{
pool: pool,
trusted: trustedCerts,
trustedNonCAs: trustedNonCAs,
ocspConf: conf,
}
if certName == "" {
b.trustedCacheFull.Store(entry)
if anyErrors {
// If something went wrong then we are going to set up for an
// exponential backoff on reloading the certificates.
if cache.retry == nil {
cache.retry = &trustedRetry{}
}
// Limits are arbitrary. Max of 2^55 backoff just so that delay
// and the jitter fits into a double. Is that silly? Yes.
// 2^55 seconds is more years than the computer will keep
// running.
if cache.retry.attempt < 55 {
cache.retry.attempt += 1
}
d := 1 << cache.retry.attempt
pct := (rand.Float64() * 2.0) - 1.0 // between -100% and +100%
d += int(math.Floor(float64(d/4) * pct)) // between -25% and +25%
cache.retry.deadline = time.Now().Add(time.Duration(d) * time.Second)
} else {
b.trustedCache.Add(certName, entry)
// No problems, cache is complete, no need to retry.
cache.retry = nil
}
cache.trustedNonCAs = trustedNonCAs
cache.trusted = trustedCerts
if certName == "" {
b.trustedCacheFull.Store(cache)
} else {
b.trustedCache.Add(certName, cache)
}
}
return
}
// loadTrustedCert reads a single trusted certificate entry from storage and
// parses it. It returns the entry, the parsed certificate chain from the
// entry's Certificate field, and any certificates parsed from its
// OcspCaCertificates field. If the entry cannot be loaded, is missing, or
// its certificate fails to parse, the problem is logged and a nil entry is
// returned so the caller can treat it as a load failure. A failure to
// parse only the OCSP CA certificates is logged but not fatal, for
// compatibility.
func (b *backend) loadTrustedCert(ctx context.Context, storage logical.Storage, name string) (*CertEntry, []*x509.Certificate, []*x509.Certificate) {
	entry, err := b.Cert(ctx, storage, strings.TrimPrefix(name, trustedCertPath))
	if err != nil {
		b.Logger().Error("failed to load trusted cert", "name", name, "error", err)
		return nil, nil, nil
	}
	if entry == nil {
		// This could happen when the certName was provided and the cert doesn't exist,
		// or just if between the LIST and the GET the cert was deleted.
		b.Logger().Error("loaded a nil trusted cert", "name", name)
		return nil, nil, nil
	}
	parsed := parsePEM([]byte(entry.Certificate))
	if len(parsed) == 0 {
		b.Logger().Error("failed to parse certificate", "name", name)
		return nil, nil, nil
	}
	var ocspCACerts []*x509.Certificate
	if len(entry.OcspCaCertificates) > 0 {
		ocspCACerts, err = certutil.ParseCertsPEM([]byte(entry.OcspCaCertificates))
		if err != nil {
			// NOTE: For compatibility, failure to parse the OcspCaCertificates
			// is never actually fatal to loading the broader file
			b.Logger().Error("failed to parse ocsp_ca_certificates", "name", name, "error", err)
			ocspCACerts = nil
		}
	}
	return entry, parsed, ocspCACerts
}
func (b *backend) checkForCertInOCSP(ctx context.Context, clientCert *x509.Certificate, chain []*x509.Certificate, conf *ocsp.VerifyConfig) (bool, error) {
if !conf.OcspEnabled || len(chain) < 2 {
return true, nil

3
changelog/31438.txt Normal file
View file

@ -0,0 +1,3 @@
```release-note:bug
auth/cert: Recover from partially populated caches of trusted certificates if one or more certificates fails to load.
```