DRA upgrade/downgrade: rewrite as Go unit test

tCtx.Run and sub-tests make it much simpler to separate the different steps
than Ginkgo did: unless a test calls tCtx.Parallel (which we don't do here),
everything runs sequentially in a deterministic order.
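
Roughly, the structure now looks like this (heavily simplified sketch; the real
code builds the sub-tests from a registration map):

    package e2edra

    import (
        "testing"

        "k8s.io/kubernetes/test/utils/ktesting"
    )

    func TestUpgradeDowngrade(t *testing.T) {
        tCtx := ktesting.Init(t)
        // Phases run one after the other; sub-tests within a phase are
        // sequential as well because tCtx.Parallel is never called.
        tCtx.Run("after-cluster-creation", func(tCtx ktesting.TContext) {
            tCtx.Run("core DRA", func(tCtx ktesting.TContext) { /* step 1 */ })
        })
        tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) { /* step 2 */ })
        tCtx.Run("after-cluster-downgrade", func(tCtx ktesting.TContext) { /* step 3 */ })
    }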

Right now we get:

    ...
        localupcluster.go:285: I1210 12:24:22.067524] bring up v1.34: stopping kubelet
        localupcluster.go:285: I1210 12:24:22.067548] bring up v1.34: stopping kube-scheduler
        localupcluster.go:285: I1210 12:24:22.067570] bring up v1.34: stopping kube-controller-manager
        localupcluster.go:285: I1210 12:24:22.067589] bring up v1.34: stopping kube-apiserver
    --- PASS: TestUpgradeDowngrade (94.78s)
        --- PASS: TestUpgradeDowngrade/after-cluster-creation (2.07s)
            --- PASS: TestUpgradeDowngrade/after-cluster-creation/core_DRA (2.05s)
            --- PASS: TestUpgradeDowngrade/after-cluster-creation/ResourceClaim_device_status (0.02s)
        --- PASS: TestUpgradeDowngrade/after-cluster-upgrade (4.10s)
            --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/core_DRA (4.09s)
            --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/ResourceClaim_device_status (0.01s)
        --- PASS: TestUpgradeDowngrade/after-cluster-downgrade (1.24s)
            --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/core_DRA (1.21s)
            --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/ResourceClaim_device_status (0.02s)
    PASS

It's even possible to combine `-failfast` with
e.g. `-run=TestUpgradeDowngrade/after-cluster-creation/core_DRA`: `go test` then
runs everything up to that sub-test (or up to the first failing sub-test),
stops, and cleans up.
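
For example, combining the flags shown in the README below:

    go test -v -count=1 -timeout=1h -failfast ./test/e2e_dra \
        -run=TestUpgradeDowngrade/after-cluster-creation/core_DRA
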
Author: Patrick Ohly
Date:   2025-12-10 12:34:04 +01:00
Parent: 7c7b1e1018
Commit: de47714879
4 changed files with 197 additions and 299 deletions


@@ -1,12 +1,13 @@
This directory contains a testsuite with automatic upgrade/downgrade tests for
DRA. Conceptually this is like an integration test, in the sense that it
starts/stops cluster components and runs tests against them.
starts/stops cluster components and runs tests against them. It has its own
directory because it needs to be started differently than other integration
tests or unit tests, which makes it more like an E2E suite.
The difference is that it starts Kubernetes components by running the actual
binaries, relying on local-up-cluster.sh for the logic and configuration
steps. Because local-up-cluster.sh needs additional permissions and
preparations on the host, the test cannot run in "make test-integration" and
just skips itself there.
steps. local-up-cluster.sh needs additional permissions and
preparations on the host.
To run it:
- Make sure that hack/local-up-cluster.sh works:
@@ -23,21 +24,16 @@ To run it:
Otherwise a test tmp directory is used.
- Invoke as a Go test (no need for the ginkgo CLI), for example:
go test -v -count=1 -timeout=1h ./test/e2e_dra -args -ginkgo.v
dlv test ./test/e2e_dra -- -ginkgo.v
make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1 -args -ginkgo.v"
go test -v -count=1 -timeout=1h ./test/e2e_dra
dlv test ./test/e2e_dra -- -test.v
make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1"
`make test` instead of `make test-integration` is intentional: `local-up-cluster.sh`
itself wants to start etcd. `-count=1` ensures that the test runs each time it is invoked.
`-v` and `-ginkgo.v` make the test output visible while the test runs.
`-v`/`-test.v`/`FULL_LOG=true` make the test output visible while the test runs.
To simplify starting from scratch, `./test/e2e_dra/run.sh` cleans
up, sets permissions, and then invokes whatever command is specified on the
command line:
./test/e2e_dra/run.sh go test ./test/e2e_dra
The test is implemented as a Ginkgo suite because that allows reusing the same
helper code as in E2E tests. Long-term the goal is to port that helper code to
ktesting, support ktesting in test/e2e, and turn this test into a normal Go
test.


@@ -17,28 +17,29 @@ limitations under the License.
package e2edra
import (
"time"
"github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/kubernetes/test/utils/ktesting"
)
func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func {
namespace := f.Namespace.Name
func coreDRA(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
namespace := tCtx.Namespace()
claim := b.ExternalClaim()
pod := b.PodExternal()
b.Create(tCtx, claim, pod)
b.TestPod(tCtx, pod)
return func(tCtx ktesting.TContext) step3Func {
return func(tCtx ktesting.TContext) downgradedTestFunc {
// Remove pod prepared in step 1.
framework.ExpectNoError(f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
framework.ExpectNoError(f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, f.ClientSet, pod.Name, namespace, f.Timeouts.PodDelete))
tCtx.ExpectNoError(tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
tCtx.ExpectNoError(tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
tCtx.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, tCtx.Client(), pod.Name, namespace, 3*time.Minute))
// Create another claim and pod, this time using the latest Kubernetes.
claim = b.ExternalClaim()
@@ -56,13 +57,13 @@ func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder
// or (even weirder) with
// getting *v1.Pod: pods "tester-2" is forbidden: User "kubernetes-admin" cannot get resource "pods" in API group "" in the namespace "dra-9021"
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error {
return f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
return tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
}).Should(gomega.Succeed(), "delete claim after downgrade")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error {
return f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})
return tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})
}).Should(gomega.Succeed(), "delete pod after downgrade")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod {
pod, err := f.ClientSet.CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
pod, err := tCtx.Client().CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
return nil
}


@@ -31,15 +31,12 @@ import (
resourceapiacv1beta2 "k8s.io/client-go/applyconfigurations/resource/v1beta2"
draapiv1beta2 "k8s.io/dynamic-resource-allocation/api/v1beta2"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/utils/ktesting"
)
// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra
// and was copied from there, therefore the unit-test style with tCtx and require.
// This is the preferred style for new tests.
func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func {
namespace := f.Namespace.Name
// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra.
func resourceClaimDeviceStatus(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
namespace := tCtx.Namespace()
claimName := "claim-with-device-status"
claim := &resourceapiv1beta2.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{
@@ -111,7 +108,6 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
err = client.ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
tCtx.ExpectNoError(err, "delete claim")
}
tCtx.CleanupCtx(removeClaim)
claim, err = tCtx.Client().ResourceV1beta2().ResourceClaims(namespace).UpdateStatus(tCtx, claim, metav1.UpdateOptions{})
tCtx.ExpectNoError(err, "add allocation result")
@@ -193,7 +189,7 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
tCtx.ExpectNoError(encoder.Encode(claim))
tCtx.Logf("Final ResourceClaim:\n%s", buffer.String())
return func(tCtx ktesting.TContext) step3Func {
return func(tCtx ktesting.TContext) downgradedTestFunc {
// Update one entry, remove the other.
deviceStatusAC := resourceapiac.AllocatedDeviceStatus().
WithDriver("two").
@@ -227,8 +223,9 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
tCtx.ExpectNoError(err, "remove device status three")
require.Equal(tCtx, deviceStatus, claim.Status.Devices, "after removing device status three")
// The cleanup order is so that we have to run this explicitly now.
// The tCtx.CleanupCtx is more for the sake of completeness.
// This was created in a prior sub-test, so we have to
// clean up manually for a proper termination of the
// overall test.
removeClaim(tCtx)
}
}


@@ -19,9 +19,7 @@ package e2edra
import (
"archive/tar"
"compress/gzip"
"context"
_ "embed"
"flag"
"fmt"
"io"
"net/http"
@@ -34,19 +32,16 @@ import (
"testing"
"time"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/apimachinery/pkg/util/version"
restclient "k8s.io/client-go/rest"
"k8s.io/kubernetes/cmd/kubeadm/app/util/errors"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
"k8s.io/kubernetes/test/utils/ktesting"
"k8s.io/kubernetes/test/utils/localupcluster"
admissionapi "k8s.io/pod-security-admission/api"
)
var errHTTP404 = errors.New("resource not found (404)")
@@ -69,31 +64,23 @@
//
// The "test code" gets registered here with a single function for each
// sub-test. That function then returns the next piece of code, which then
// returns the final code.
//
// For performance reasons there is only a single `It("works")` which
// runs everything. Failures in sub-tests are reported separately *if they
// are reported via the TContext*. Failures reported via ginkgo.Fail
// currently abort the entire test. This will be addressed by converting
// everything to ktesting-based unit tests.
// returns the final code. Each callback function is executed as a sub-test.
// The builder is configured to not delete objects when that sub-test ends,
// so objects persist until the entire test is done.
//
// Each sub-test must be self-contained. They intentionally run in a random
// order. However, they share the same cluster and the 8 devices which are
// available there.
var subTests = map[string]step1Func{
var subTests = map[string]initialTestFunc{
"core DRA": coreDRA,
"ResourceClaim device status": resourceClaimDeviceStatus,
}
// step1Func is passed everything that is needed to run
type step1Func func(tCtx ktesting.TContext, f *framework.Framework, builder *drautils.Builder) step2Func
type initialTestFunc func(tCtx ktesting.TContext, builder *drautils.Builder) upgradedTestFunc
type step2Func func(tCtx ktesting.TContext) step3Func
type upgradedTestFunc func(tCtx ktesting.TContext) downgradedTestFunc
type step3Func func(tCtx ktesting.TContext)
// steps has the names for the actual ginkgo.It where sub-test results are provided.
var steps = []string{"before downgrade", "after downgrade", "after upgrade"}
type downgradedTestFunc func(tCtx ktesting.TContext)
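// For illustration only (not part of this change): a hypothetical sub-test in
// this three-stage style could look roughly like the sketch below. The builder
// helpers (ExternalClaim, PodExternal, Create, TestPod) are the ones used by
// coreDRA; the sub-test itself and its cleanup are invented for the example,
// and metav1 stands for k8s.io/apimachinery/pkg/apis/meta/v1.
//
//    func exampleSubTest(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
//        // Step 1: runs in "after-cluster-creation" against the old cluster.
//        claim := b.ExternalClaim()
//        pod := b.PodExternal()
//        b.Create(tCtx, claim, pod)
//        b.TestPod(tCtx, pod)
//        return func(tCtx ktesting.TContext) downgradedTestFunc {
//            // Step 2: runs in "after-cluster-upgrade"; objects from step 1 still
//            // exist because the builder does not clean up between sub-tests.
//            b.TestPod(tCtx, pod)
//            return func(tCtx ktesting.TContext) {
//                // Step 3: runs in "after-cluster-downgrade"; clean up explicitly.
//                tCtx.ExpectNoError(tCtx.Client().CoreV1().Pods(tCtx.Namespace()).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
//                tCtx.ExpectNoError(tCtx.Client().ResourceV1beta1().ResourceClaims(tCtx.Namespace()).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
//            }
//        }
//    }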
var repoRoot = repoRootDefault()
@@ -117,272 +104,189 @@ func repoRootDefault() string {
return "../../"
}
func TestUpgradeDowngrade(t *testing.T) {
suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
suiteConfig.RandomizeAllSpecs = false
ginkgo.RunSpecs(t, "DRA", suiteConfig, reporterConfig)
}
var _ = ginkgo.Describe("DRA upgrade/downgrade", func() {
// Initialize the default values by registering flags. We don't actually expose those flags.
var fs flag.FlagSet
framework.RegisterCommonFlags(&fs)
framework.RegisterClusterFlags(&fs)
func TestUpgradeDowngrade(t *testing.T) { testUpgradeDowngrade(ktesting.Init(t)) }
func testUpgradeDowngrade(tCtx ktesting.TContext) {
// Some other things normally done by test/e2e.
e2etestfiles.AddFileSource(e2etestfiles.RootFileSource{Root: repoRoot})
gomega.RegisterFailHandler(ginkgo.Fail)
// sub-test -> step -> failure
//
// Initially each failure string is empty.
results := make(map[string]map[string]string, len(subTests))
for subTest := range subTests {
results[subTest] = make(map[string]string, len(steps))
// Ideally we shouldn't have any code which directly calls gomega.Expect,
// but we are not there yet (e.g. e2epod.MakePod). So for now we install
// one fail handler which records failures in the main test context.
gomega.RegisterFailHandler(func(message string, callerSkip ...int) {
tCtx.Helper()
tCtx.Fatal(message)
})
envName, dir := currentBinDir()
if dir == "" {
tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName)
}
ginkgo.It("works", func(ctx context.Context) {
// TODO: replace with helper code from https://github.com/kubernetes/kubernetes/pull/122481 should that get merged.
tCtx := ktesting.Init(GinkgoContextTB())
tCtx = ktesting.WithContext(tCtx, ctx)
// Determine what we need to downgrade to.
tCtx = ktesting.Begin(tCtx, "get source code version")
gitVersion, _, err := sourceVersion(tCtx, repoRoot)
tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot)
version, err := version.ParseGeneric(gitVersion)
tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot)
major, previousMinor := version.Major(), version.Minor()-1
if strings.Contains(gitVersion, "-alpha.0") {
// All versions up to and including x.y.z-alpha.0 are treated as if we were
// still the previous minor version x.(y-1). There are two reasons for this:
//
// - During code freeze around (at?) -rc.0, the master branch already
// identifies itself as the next release with -alpha.0. Without this
// special case, we would change the version skew testing from what
// has been tested and been known to work to something else, which
// can and at least once did break.
//
// - Early in the next cycle the differences compared to the previous
// release are small, so it's more interesting to go back further.
previousMinor--
}
tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor)
tCtx = ktesting.End(tCtx)
envName, dir := currentBinDir()
if dir == "" {
tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName)
// KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts.
binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR")
if !cacheBinaries {
binDir = tCtx.TempDir()
}
haveBinaries := false
// Get the previous release.
tCtx = ktesting.Begin(tCtx, "get previous release info")
tCtx.Logf("stable release %d.%d", major, previousMinor)
previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor)
if errors.Is(err, errHTTP404) {
tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor)
previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor)
}
tCtx.ExpectNoError(err)
tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL)
tCtx = ktesting.End(tCtx)
if cacheBinaries {
binDir = path.Join(binDir, previousVersion)
_, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0])))
if err == nil {
haveBinaries = true
}
// Determine what we need to downgrade to.
tCtx = ktesting.Begin(tCtx, "get source code version")
gitVersion, _, err := sourceVersion(tCtx, repoRoot)
tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot)
version, err := version.ParseGeneric(gitVersion)
tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot)
major, previousMinor := version.Major(), version.Minor()-1
if strings.Contains(gitVersion, "-alpha.0") {
// All versions up to and including x.y.z-alpha.0 are treated as if we were
// still the previous minor version x.(y-1). There are two reasons for this:
//
// - During code freeze around (at?) -rc.0, the master branch already
// identifies itself as the next release with -alpha.0. Without this
// special case, we would change the version skew testing from what
// has been tested and been known to work to something else, which
// can and at least once did break.
//
// - Early in the next cycle the differences compared to the previous
// release are small, so it's more interesting to go back further.
previousMinor--
}
tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor)
tCtx = ktesting.End(tCtx)
// KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts.
binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR")
if !cacheBinaries {
binDir = tCtx.TempDir()
}
haveBinaries := false
// Get the previous release.
tCtx = ktesting.Begin(tCtx, "get previous release info")
tCtx.Logf("stable release %d.%d", major, previousMinor)
previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor)
if errors.Is(err, errHTTP404) {
tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor)
previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor)
}
tCtx.ExpectNoError(err)
tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL)
tCtx = ktesting.End(tCtx)
if cacheBinaries {
binDir = path.Join(binDir, previousVersion)
_, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0])))
if err == nil {
haveBinaries = true
}
if !haveBinaries {
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL))
req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil)
tCtx.ExpectNoError(err, "construct request")
response, err := http.DefaultClient.Do(req)
tCtx.ExpectNoError(err, "download")
defer func() {
_ = response.Body.Close()
}()
decompress, err := gzip.NewReader(response.Body)
tCtx.ExpectNoError(err, "construct gzip reader")
unpack := tar.NewReader(decompress)
for {
header, err := unpack.Next()
if err == io.EOF {
break
}
base := path.Base(header.Name)
if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) {
data, err := io.ReadAll(unpack)
tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name))
tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries")
tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name))
}
}
if !haveBinaries {
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL))
req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil)
tCtx.ExpectNoError(err, "construct request")
response, err := http.DefaultClient.Do(req)
tCtx.ExpectNoError(err, "download")
defer func() {
_ = response.Body.Close()
}()
decompress, err := gzip.NewReader(response.Body)
tCtx.ExpectNoError(err, "construct gzip reader")
unpack := tar.NewReader(decompress)
for {
header, err := unpack.Next()
if err == io.EOF {
break
}
base := path.Base(header.Name)
if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) {
data, err := io.ReadAll(unpack)
tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name))
tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries")
tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name))
}
}
tCtx = ktesting.End(tCtx)
}
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor))
cluster := localupcluster.New(tCtx)
localUpClusterEnv := map[string]string{
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2",
"FEATURE_GATES": "DynamicResourceAllocation=true",
// *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1",
}
cluster.Start(tCtx, binDir, localUpClusterEnv)
tCtx = ktesting.End(tCtx)
}
restConfig := cluster.LoadConfig(tCtx)
restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent())
tCtx = ktesting.WithRESTConfig(tCtx, restConfig)
// TODO: rewrite all DRA test code to use ktesting.TContext once https://github.com/kubernetes/kubernetes/pull/122481 is
// merged, then we don't need to fake a Framework instance.
f := &framework.Framework{
BaseName: "dra",
Timeouts: framework.NewTimeoutContext(),
ClientSet: tCtx.Client(),
DynamicClient: tCtx.Dynamic(),
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor))
cluster := localupcluster.New(tCtx)
localUpClusterEnv := map[string]string{
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2",
"FEATURE_GATES": "DynamicResourceAllocation=true",
// *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1",
}
cluster.Start(tCtx, binDir, localUpClusterEnv)
tCtx = ktesting.End(tCtx)
// The driver containers have to run with sufficient privileges to
// modify /var/lib/kubelet/plugins.
NamespacePodSecurityLevel: admissionapi.LevelPrivileged,
}
f.SetClientConfig(restConfig)
restConfig := cluster.LoadConfig(tCtx)
restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent())
tCtx = tCtx.WithRESTConfig(restConfig).WithNamespace("default")
namespace, err := f.CreateNamespace(tCtx, f.BaseName, map[string]string{
"e2e-framework": f.BaseName,
})
tCtx.ExpectNoError(err, "create namespace")
f.Namespace = namespace
f.UniqueName = namespace.Name
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor))
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor))
tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), 5*time.Minute), "wait for all nodes to be schedulable")
nodes := drautils.NewNodesNow(tCtx, 1, 1)
tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), f.Timeouts.NodeSchedulable), "wait for all nodes to be schedulable")
nodes := drautils.NewNodesNow(tCtx, 1, 1)
// Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver.
// We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better
// test the defaults.
driver := drautils.NewDriverInstance(tCtx)
driver.IsLocal = true
driver.Run(tCtx, "/var/lib/kubelet", nodes, drautils.DriverResourcesNow(nodes, 8))
b := drautils.NewBuilderNow(tCtx, driver)
b.SkipCleanup = true
// Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver.
// We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better
// test the defaults.
driver := drautils.NewDriverInstance(tCtx)
driver.IsLocal = true
driver.Run(tCtx, nodes, drautils.DriverResourcesNow(nodes, 8))
b := drautils.NewBuilderNow(tCtx, driver)
tCtx = ktesting.End(tCtx)
tCtx = ktesting.End(tCtx)
steps2 := make(map[string]step2Func, len(subTests))
for subTest, step1 := range subTests {
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
// This only gets set in case of success.
steps2[subTest] = step1(tCtx, f, b)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
}
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion))
// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
// For the purpose of this test we primarily care about full before/after comparisons, so this hasn't been done yet.
// TODO
restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir})
tCtx = ktesting.End(tCtx)
// The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running.
// Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices.
tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices")
ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames))))
tCtx = ktesting.End(tCtx)
steps3 := make(map[string]step3Func, len(subTests))
for subTest := range subTests {
step2 := steps2[subTest]
if step2 == nil {
continue
}
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
// This only gets set in case of success.
steps3[subTest] = step2(tCtx)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
}
// Roll back.
tCtx = ktesting.Begin(tCtx, "downgrade")
cluster.Modify(tCtx, restoreOptions)
tCtx = ktesting.End(tCtx)
// TODO: ensure that kube-controller-manager is up-and-running.
// This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed
// once a fix for that is backported.
tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string {
output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager)
return output
}).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`))
tCtx = ktesting.End(tCtx)
for subTest := range subTests {
step3 := steps3[subTest]
if step3 == nil {
continue
}
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
step3(tCtx)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
upgradedTestFuncs := make(map[string]upgradedTestFunc, len(subTests))
tCtx.Run("after-cluster-creation", func(tCtx ktesting.TContext) {
for subTest, f := range subTests {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
// This only gets set if f doesn't panic because of a fatal error,
// so below we won't continue if step 1 already failed.
// Other sub-tests are not affected.
upgradedTestFuncs[subTest] = f(tCtx, b)
})
}
})
// This runs last because by default Ginkgo does not randomize within
// a top-level container.
for subTest := range subTests {
ginkgo.Context(subTest, func() {
for _, step := range steps {
ginkgo.It(step, func() {
failure := results[subTest][step]
if failure != "" {
// Source code location will be useless here. We can't have both:
// separate test results *and* correct source code location.
// This will become better with testing.T-based unit tests.
_, _ = ginkgo.GinkgoWriter.Write([]byte("For log output see 'DRA upgrade/downgrade works'\n"))
ginkgo.Fail(failure)
}
})
}
})
}
})
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion))
// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
// For the purpose of this test we primarily care about full before/after comparisons, so this hasn't been done yet.
// TODO
restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir})
tCtx = ktesting.End(tCtx)
// The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running.
// Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices.
tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices")
ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames))))
tCtx = ktesting.End(tCtx)
downgradedTestFuncs := make(map[string]downgradedTestFunc, len(subTests))
tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) {
for subTest, f := range upgradedTestFuncs {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
downgradedTestFuncs[subTest] = f(tCtx)
})
}
})
// Roll back.
tCtx = ktesting.Begin(tCtx, "downgrade")
cluster.Modify(tCtx, restoreOptions)
tCtx = ktesting.End(tCtx)
// TODO: ensure that kube-controller-manager is up-and-running.
// This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed
// once a fix for that is backported.
tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string {
output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager)
return output
}).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`))
tCtx = ktesting.End(tCtx)
tCtx.Run("after-cluster-downgrade", func(tCtx ktesting.TContext) {
for subTest, f := range downgradedTestFuncs {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
f(tCtx)
})
}
})
}
// sourceVersion identifies the Kubernetes git version based on hack/print-workspace-status.sh.
//