DRA upgrade/downgrade: rewrite as Go unit test

tCtx.Run and sub-tests make it much simpler to separate the different steps
than Ginkgo did: unless a test calls tCtx.Parallel (which we don't do here),
everything runs sequentially in a deterministic order.
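
Roughly, the structure now looks like this (heavily simplified sketch; the real
code builds the sub-tests from a registration map):

    package e2edra

    import (
        "testing"

        "k8s.io/kubernetes/test/utils/ktesting"
    )

    func TestUpgradeDowngrade(t *testing.T) {
        tCtx := ktesting.Init(t)
        // Phases run one after the other; sub-tests within a phase are
        // sequential as well because tCtx.Parallel is never called.
        tCtx.Run("after-cluster-creation", func(tCtx ktesting.TContext) {
            tCtx.Run("core DRA", func(tCtx ktesting.TContext) { /* step 1 */ })
        })
        tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) { /* step 2 */ })
        tCtx.Run("after-cluster-downgrade", func(tCtx ktesting.TContext) { /* step 3 */ })
    }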

Right now we get:

    ...
        localupcluster.go:285: I1210 12:24:22.067524] bring up v1.34: stopping kubelet
        localupcluster.go:285: I1210 12:24:22.067548] bring up v1.34: stopping kube-scheduler
        localupcluster.go:285: I1210 12:24:22.067570] bring up v1.34: stopping kube-controller-manager
        localupcluster.go:285: I1210 12:24:22.067589] bring up v1.34: stopping kube-apiserver
    --- PASS: TestUpgradeDowngrade (94.78s)
        --- PASS: TestUpgradeDowngrade/after-cluster-creation (2.07s)
            --- PASS: TestUpgradeDowngrade/after-cluster-creation/core_DRA (2.05s)
            --- PASS: TestUpgradeDowngrade/after-cluster-creation/ResourceClaim_device_status (0.02s)
        --- PASS: TestUpgradeDowngrade/after-cluster-upgrade (4.10s)
            --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/core_DRA (4.09s)
            --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/ResourceClaim_device_status (0.01s)
        --- PASS: TestUpgradeDowngrade/after-cluster-downgrade (1.24s)
            --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/core_DRA (1.21s)
            --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/ResourceClaim_device_status (0.02s)
    PASS

It's even possible to combine `-failfast` with
e.g. `-run=TestUpgradeDowngrade/after-cluster-creation/core_DRA`: `go test` then
runs everything up to that sub-test (or up to the first failing sub-test),
stops, and cleans up.
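
For example, combining the flags shown in the README below:

    go test -v -count=1 -timeout=1h -failfast ./test/e2e_dra \
        -run=TestUpgradeDowngrade/after-cluster-creation/core_DRA
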
Author: Patrick Ohly
Date:   2025-12-10 12:34:04 +01:00
Parent: 7c7b1e1018
Commit: de47714879
4 changed files with 197 additions and 299 deletions


@@ -1,12 +1,13 @@
This directory contains a testsuite with automatic upgrade/downgrade tests for
DRA. Conceptually this is like an integration test, in the sense that it
starts/stops cluster components and runs tests against them.
starts/stops cluster components and runs tests against them. It has its own
directory because it needs to be started differently than other integration
tests or unit tests, which makes it more like an E2E suite.
The difference is that it starts Kubernetes components by running the actual
binaries, relying on local-up-cluster.sh for the logic and configuration
steps. Because local-up-cluster.sh needs additional permissions and
preparations on the host, the test cannot run in "make test-integration" and
just skips itself there.
steps. local-up-cluster.sh needs additional permissions and
preparations on the host.
To run it:
- Make sure that hack/local-up-cluster.sh works:
@@ -23,21 +24,16 @@ To run it:
Otherwise a test tmp directory is used.
- Invoke as a Go test (no need for the ginkgo CLI), for example:
go test -v -count=1 -timeout=1h ./test/e2e_dra -args -ginkgo.v
dlv test ./test/e2e_dra -- -ginkgo.v
make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1 -args -ginkgo.v"
go test -v -count=1 -timeout=1h ./test/e2e_dra
dlv test ./test/e2e_dra -- -test.v
make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1"
`make test` instead of `make test-integration` is intentional: `local-up-cluster.sh`
itself wants to start etcd. `-count=1` ensures that the test runs each time it is invoked.
`-v` and `-ginkgo.v` make the test output visible while the test runs.
`-v`/`-test.v`/`FULL_LOG=true` make the test output visible while the test runs.
To simplify starting from scratch, `./test/e2e_dra/run.sh` cleans
up, sets permissions, and then invokes whatever command is specified on the
command line:
./test/e2e_dra/run.sh go test ./test/e2e_dra
The test is implemented as a Ginkgo suite because that allows reusing the same
helper code as in E2E tests. Long-term the goal is to port that helper code to
ktesting, support ktesting in test/e2e, and turn this test into a normal Go
test.


@@ -17,28 +17,29 @@ limitations under the License.
package e2edra
import (
"time"
"github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/kubernetes/test/utils/ktesting"
)
func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func {
namespace := f.Namespace.Name
func coreDRA(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
namespace := tCtx.Namespace()
claim := b.ExternalClaim()
pod := b.PodExternal()
b.Create(tCtx, claim, pod)
b.TestPod(tCtx, pod)
return func(tCtx ktesting.TContext) step3Func {
return func(tCtx ktesting.TContext) downgradedTestFunc {
// Remove pod prepared in step 1.
framework.ExpectNoError(f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
framework.ExpectNoError(f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, f.ClientSet, pod.Name, namespace, f.Timeouts.PodDelete))
tCtx.ExpectNoError(tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
tCtx.ExpectNoError(tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
tCtx.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, tCtx.Client(), pod.Name, namespace, 3*time.Minute))
// Create another claim and pod, this time using the latest Kubernetes.
claim = b.ExternalClaim()
@@ -56,13 +57,13 @@ func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder
// or (even weirder) with
// getting *v1.Pod: pods "tester-2" is forbidden: User "kubernetes-admin" cannot get resource "pods" in API group "" in the namespace "dra-9021"
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error {
return f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
return tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
}).Should(gomega.Succeed(), "delete claim after downgrade")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error {
return f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})
return tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})
}).Should(gomega.Succeed(), "delete pod after downgrade")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod {
pod, err := f.ClientSet.CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
pod, err := tCtx.Client().CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
return nil
}


@@ -31,15 +31,12 @@ import (
resourceapiacv1beta2 "k8s.io/client-go/applyconfigurations/resource/v1beta2"
draapiv1beta2 "k8s.io/dynamic-resource-allocation/api/v1beta2"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/utils/ktesting"
)
// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra
// and was copied from there, therefore the unit-test style with tCtx and require.
// This is the preferred style for new tests.
func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func {
namespace := f.Namespace.Name
// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra.
func resourceClaimDeviceStatus(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
namespace := tCtx.Namespace()
claimName := "claim-with-device-status"
claim := &resourceapiv1beta2.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{
@@ -111,7 +108,6 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
err = client.ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})
tCtx.ExpectNoError(err, "delete claim")
}
tCtx.CleanupCtx(removeClaim)
claim, err = tCtx.Client().ResourceV1beta2().ResourceClaims(namespace).UpdateStatus(tCtx, claim, metav1.UpdateOptions{})
tCtx.ExpectNoError(err, "add allocation result")
@@ -193,7 +189,7 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
tCtx.ExpectNoError(encoder.Encode(claim))
tCtx.Logf("Final ResourceClaim:\n%s", buffer.String())
return func(tCtx ktesting.TContext) step3Func {
return func(tCtx ktesting.TContext) downgradedTestFunc {
// Update one entry, remove the other.
deviceStatusAC := resourceapiac.AllocatedDeviceStatus().
WithDriver("two").
@@ -227,8 +223,9 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b
tCtx.ExpectNoError(err, "remove device status three")
require.Equal(tCtx, deviceStatus, claim.Status.Devices, "after removing device status three")
// The cleanup order is so that we have to run this explicitly now.
// The tCtx.CleanupCtx is more for the sake of completeness.
// This was created in a prior sub-test, so we have to
// clean up manually for a proper termination of the
// overall test.
removeClaim(tCtx)
}
}


@@ -19,9 +19,7 @@ package e2edra
import (
"archive/tar"
"compress/gzip"
"context"
_ "embed"
"flag"
"fmt"
"io"
"net/http"
@@ -34,19 +32,16 @@ import (
"testing"
"time"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/apimachinery/pkg/util/version"
restclient "k8s.io/client-go/rest"
"k8s.io/kubernetes/cmd/kubeadm/app/util/errors"
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
"k8s.io/kubernetes/test/utils/ktesting"
"k8s.io/kubernetes/test/utils/localupcluster"
admissionapi "k8s.io/pod-security-admission/api"
)
var errHTTP404 = errors.New("resource not found (404)")
@@ -69,31 +64,23 @@
//
// The "test code" gets registered here with a single function for each
// sub-test. That function then returns the next piece of code, which then
// returns the final code.
//
// For performance reasons there is only a single `It("works")` which
// runs everything. Failures in sub-tests are reported separately *if they
// are reported via the TContext*. Failures reported via ginkgo.Fail
// currently abort the entire test. This will be addressed by converting
// everything to ktesting-based unit tests.
// returns the final code. Each callback function is executed as a sub-test.
// The builder is configured to not delete objects when that sub-test ends,
// so objects persist until the entire test is done.
//
// Each sub-test must be self-contained. They intentionally run in a random
// order. However, they share the same cluster and the 8 devices which are
// available there.
var subTests = map[string]step1Func{
var subTests = map[string]initialTestFunc{
"core DRA": coreDRA,
"ResourceClaim device status": resourceClaimDeviceStatus,
}
// step1Func is passed everything that is needed to run
type step1Func func(tCtx ktesting.TContext, f *framework.Framework, builder *drautils.Builder) step2Func
type initialTestFunc func(tCtx ktesting.TContext, builder *drautils.Builder) upgradedTestFunc
type step2Func func(tCtx ktesting.TContext) step3Func
type upgradedTestFunc func(tCtx ktesting.TContext) downgradedTestFunc
type step3Func func(tCtx ktesting.TContext)
// steps has the names for the actual ginkgo.It where sub-test results are provided.
var steps = []string{"before downgrade", "after downgrade", "after upgrade"}
type downgradedTestFunc func(tCtx ktesting.TContext)
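// For illustration only (not part of this change): a hypothetical sub-test in
// this three-stage style could look roughly like the sketch below. The builder
// helpers (ExternalClaim, PodExternal, Create, TestPod) are the ones used by
// coreDRA; the sub-test itself and its cleanup are invented for the example,
// and metav1 stands for k8s.io/apimachinery/pkg/apis/meta/v1.
//
//    func exampleSubTest(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
//        // Step 1: runs in "after-cluster-creation" against the old cluster.
//        claim := b.ExternalClaim()
//        pod := b.PodExternal()
//        b.Create(tCtx, claim, pod)
//        b.TestPod(tCtx, pod)
//        return func(tCtx ktesting.TContext) downgradedTestFunc {
//            // Step 2: runs in "after-cluster-upgrade"; objects from step 1 still
//            // exist because the builder does not clean up between sub-tests.
//            b.TestPod(tCtx, pod)
//            return func(tCtx ktesting.TContext) {
//                // Step 3: runs in "after-cluster-downgrade"; clean up explicitly.
//                tCtx.ExpectNoError(tCtx.Client().CoreV1().Pods(tCtx.Namespace()).Delete(tCtx, pod.Name, metav1.DeleteOptions{}))
//                tCtx.ExpectNoError(tCtx.Client().ResourceV1beta1().ResourceClaims(tCtx.Namespace()).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
//            }
//        }
//    }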
var repoRoot = repoRootDefault()
@@ -117,272 +104,189 @@ func repoRootDefault() string {
return "../../"
}
func TestUpgradeDowngrade(t *testing.T) {
suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
suiteConfig.RandomizeAllSpecs = false
ginkgo.RunSpecs(t, "DRA", suiteConfig, reporterConfig)
}
var _ = ginkgo.Describe("DRA upgrade/downgrade", func() {
// Initialize the default values by registering flags. We don't actually expose those flags.
var fs flag.FlagSet
framework.RegisterCommonFlags(&fs)
framework.RegisterClusterFlags(&fs)
func TestUpgradeDowngrade(t *testing.T) { testUpgradeDowngrade(ktesting.Init(t)) }
func testUpgradeDowngrade(tCtx ktesting.TContext) {
// Some other things normally done by test/e2e.
e2etestfiles.AddFileSource(e2etestfiles.RootFileSource{Root: repoRoot})
gomega.RegisterFailHandler(ginkgo.Fail)
// sub-test -> step -> failure
//
// Initially each failure string is empty.
results := make(map[string]map[string]string, len(subTests))
for subTest := range subTests {
results[subTest] = make(map[string]string, len(steps))
// Ideally we shouldn't have any code which directly calls gomega.Expect,
// but we are not there yet (e.g. e2epod.MakePod). So for now we install
// one fail handler which records failures in the main test context.
gomega.RegisterFailHandler(func(message string, callerSkip ...int) {
tCtx.Helper()
tCtx.Fatal(message)
})
envName, dir := currentBinDir()
if dir == "" {
tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName)
}
ginkgo.It("works", func(ctx context.Context) {
// TODO: replace with helper code from https://github.com/kubernetes/kubernetes/pull/122481 should that get merged.
tCtx := ktesting.Init(GinkgoContextTB())
tCtx = ktesting.WithContext(tCtx, ctx)
// Determine what we need to downgrade to.
tCtx = ktesting.Begin(tCtx, "get source code version")
gitVersion, _, err := sourceVersion(tCtx, repoRoot)
tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot)
version, err := version.ParseGeneric(gitVersion)
tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot)
major, previousMinor := version.Major(), version.Minor()-1
if strings.Contains(gitVersion, "-alpha.0") {
// All versions up to and including x.y.z-alpha.0 are treated as if we were
// still the previous minor version x.(y-1). There are two reasons for this:
//
// - During code freeze around (at?) -rc.0, the master branch already
// identifies itself as the next release with -alpha.0. Without this
// special case, we would change the version skew testing from what
// has been tested and been known to work to something else, which
// can and at least once did break.
//
// - Early in the next cycle the differences compared to the previous
// release are small, so it's more interesting to go back further.
previousMinor--
}
tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor)
tCtx = ktesting.End(tCtx)
envName, dir := currentBinDir()
if dir == "" {
tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName)
// KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts.
binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR")
if !cacheBinaries {
binDir = tCtx.TempDir()
}
haveBinaries := false
// Get the previous release.
tCtx = ktesting.Begin(tCtx, "get previous release info")
tCtx.Logf("stable release %d.%d", major, previousMinor)
previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor)
if errors.Is(err, errHTTP404) {
tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor)
previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor)
}
tCtx.ExpectNoError(err)
tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL)
tCtx = ktesting.End(tCtx)
if cacheBinaries {
binDir = path.Join(binDir, previousVersion)
_, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0])))
if err == nil {
haveBinaries = true
}
// Determine what we need to downgrade to.
tCtx = ktesting.Begin(tCtx, "get source code version")
gitVersion, _, err := sourceVersion(tCtx, repoRoot)
tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot)
version, err := version.ParseGeneric(gitVersion)
tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot)
major, previousMinor := version.Major(), version.Minor()-1
if strings.Contains(gitVersion, "-alpha.0") {
// All versions up to and including x.y.z-alpha.0 are treated as if we were
// still the previous minor version x.(y-1). There are two reasons for this:
//
// - During code freeze around (at?) -rc.0, the master branch already
// identifies itself as the next release with -alpha.0. Without this
// special case, we would change the version skew testing from what
// has been tested and been known to work to something else, which
// can and at least once did break.
//
// - Early in the next cycle the differences compared to the previous
// release are small, so it's more interesting to go back further.
previousMinor--
}
tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor)
tCtx = ktesting.End(tCtx)
// KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts.
binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR")
if !cacheBinaries {
binDir = tCtx.TempDir()
}
haveBinaries := false
// Get the previous release.
tCtx = ktesting.Begin(tCtx, "get previous release info")
tCtx.Logf("stable release %d.%d", major, previousMinor)
previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor)
if errors.Is(err, errHTTP404) {
tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor)
previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor)
}
tCtx.ExpectNoError(err)
tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL)
tCtx = ktesting.End(tCtx)
if cacheBinaries {
binDir = path.Join(binDir, previousVersion)
_, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0])))
if err == nil {
haveBinaries = true
}
if !haveBinaries {
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL))
req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil)
tCtx.ExpectNoError(err, "construct request")
response, err := http.DefaultClient.Do(req)
tCtx.ExpectNoError(err, "download")
defer func() {
_ = response.Body.Close()
}()
decompress, err := gzip.NewReader(response.Body)
tCtx.ExpectNoError(err, "construct gzip reader")
unpack := tar.NewReader(decompress)
for {
header, err := unpack.Next()
if err == io.EOF {
break
}
base := path.Base(header.Name)
if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) {
data, err := io.ReadAll(unpack)
tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name))
tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries")
tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name))
}
}
if !haveBinaries {
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL))
req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil)
tCtx.ExpectNoError(err, "construct request")
response, err := http.DefaultClient.Do(req)
tCtx.ExpectNoError(err, "download")
defer func() {
_ = response.Body.Close()
}()
decompress, err := gzip.NewReader(response.Body)
tCtx.ExpectNoError(err, "construct gzip reader")
unpack := tar.NewReader(decompress)
for {
header, err := unpack.Next()
if err == io.EOF {
break
}
base := path.Base(header.Name)
if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) {
data, err := io.ReadAll(unpack)
tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name))
tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries")
tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name))
}
}
tCtx = ktesting.End(tCtx)
}
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor))
cluster := localupcluster.New(tCtx)
localUpClusterEnv := map[string]string{
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2",
"FEATURE_GATES": "DynamicResourceAllocation=true",
// *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1",
}
cluster.Start(tCtx, binDir, localUpClusterEnv)
tCtx = ktesting.End(tCtx)
}
restConfig := cluster.LoadConfig(tCtx)
restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent())
tCtx = ktesting.WithRESTConfig(tCtx, restConfig)
// TODO: rewrite all DRA test code to use ktesting.TContext once https://github.com/kubernetes/kubernetes/pull/122481 is
// merged, then we don't need to fake a Framework instance.
f := &framework.Framework{
BaseName: "dra",
Timeouts: framework.NewTimeoutContext(),
ClientSet: tCtx.Client(),
DynamicClient: tCtx.Dynamic(),
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor))
cluster := localupcluster.New(tCtx)
localUpClusterEnv := map[string]string{
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2",
"FEATURE_GATES": "DynamicResourceAllocation=true",
// *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1",
}
cluster.Start(tCtx, binDir, localUpClusterEnv)
tCtx = ktesting.End(tCtx)
// The driver containers have to run with sufficient privileges to
// modify /var/lib/kubelet/plugins.
NamespacePodSecurityLevel: admissionapi.LevelPrivileged,
}
f.SetClientConfig(restConfig)
restConfig := cluster.LoadConfig(tCtx)
restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent())
tCtx = tCtx.WithRESTConfig(restConfig).WithNamespace("default")
namespace, err := f.CreateNamespace(tCtx, f.BaseName, map[string]string{
"e2e-framework": f.BaseName,
})
tCtx.ExpectNoError(err, "create namespace")
f.Namespace = namespace
f.UniqueName = namespace.Name
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor))
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor))
tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), 5*time.Minute), "wait for all nodes to be schedulable")
nodes := drautils.NewNodesNow(tCtx, 1, 1)
tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), f.Timeouts.NodeSchedulable), "wait for all nodes to be schedulable")
nodes := drautils.NewNodesNow(tCtx, 1, 1)
// Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver.
// We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better
// test the defaults.
driver := drautils.NewDriverInstance(tCtx)
driver.IsLocal = true
driver.Run(tCtx, "/var/lib/kubelet", nodes, drautils.DriverResourcesNow(nodes, 8))
b := drautils.NewBuilderNow(tCtx, driver)
b.SkipCleanup = true
// Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver.
// We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better
// test the defaults.
driver := drautils.NewDriverInstance(tCtx)
driver.IsLocal = true
driver.Run(tCtx, nodes, drautils.DriverResourcesNow(nodes, 8))
b := drautils.NewBuilderNow(tCtx, driver)
tCtx = ktesting.End(tCtx)
tCtx = ktesting.End(tCtx)
steps2 := make(map[string]step2Func, len(subTests))
for subTest, step1 := range subTests {
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
// This only gets set in case of success.
steps2[subTest] = step1(tCtx, f, b)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
}
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion))
// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
// For the purpose of this test we primarily care about full before/after comparisons, so this hasn't been done yet.
// TODO
restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir})
tCtx = ktesting.End(tCtx)
// The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running.
// Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices.
tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices")
ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames))))
tCtx = ktesting.End(tCtx)
steps3 := make(map[string]step3Func, len(subTests))
for subTest := range subTests {
step2 := steps2[subTest]
if step2 == nil {
continue
}
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
// This only gets set in case of success.
steps3[subTest] = step2(tCtx)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
}
// Roll back.
tCtx = ktesting.Begin(tCtx, "downgrade")
cluster.Modify(tCtx, restoreOptions)
tCtx = ktesting.End(tCtx)
// TODO: ensure that kube-controller-manager is up-and-running.
// This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed
// once a fix for that is backported.
tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string {
output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager)
return output
}).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`))
tCtx = ktesting.End(tCtx)
for subTest := range subTests {
step3 := steps3[subTest]
if step3 == nil {
continue
}
tCtx = ktesting.Begin(tCtx, subTest)
var result error
func() {
tCtx, finalize := ktesting.WithError(tCtx, &result)
defer finalize()
step3(tCtx)
}()
if result != nil {
results[subTest][steps[0]] = result.Error()
}
tCtx = ktesting.End(tCtx)
upgradedTestFuncs := make(map[string]upgradedTestFunc, len(subTests))
tCtx.Run("after-cluster-creation", func(tCtx ktesting.TContext) {
for subTest, f := range subTests {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
// This only gets set if f doesn't panic because of a fatal error,
// so below we won't continue if step 1 already failed.
// Other sub-tests are not affected.
upgradedTestFuncs[subTest] = f(tCtx, b)
})
}
})
// This runs last because by default Ginkgo does not randomize within
// a top-level container.
for subTest := range subTests {
ginkgo.Context(subTest, func() {
for _, step := range steps {
ginkgo.It(step, func() {
failure := results[subTest][step]
if failure != "" {
// Source code location will be useless here. We can't have both:
// separate test results *and* correct source code location.
// This will become better with testing.T-based unit tests.
_, _ = ginkgo.GinkgoWriter.Write([]byte("For log output see 'DRA upgrade/downgrade works'\n"))
ginkgo.Fail(failure)
}
})
}
})
}
})
tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion))
// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
// For the purpose of this test we primarily care about full before/after comparisons, so this hasn't been done yet.
// TODO
restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir})
tCtx = ktesting.End(tCtx)
// The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running.
// Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices.
tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices")
ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames))))
tCtx = ktesting.End(tCtx)
downgradedTestFuncs := make(map[string]downgradedTestFunc, len(subTests))
tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) {
for subTest, f := range upgradedTestFuncs {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
downgradedTestFuncs[subTest] = f(tCtx)
})
}
})
// Roll back.
tCtx = ktesting.Begin(tCtx, "downgrade")
cluster.Modify(tCtx, restoreOptions)
tCtx = ktesting.End(tCtx)
// TODO: ensure that kube-controller-manager is up-and-running.
// This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed
// once a fix for that is backported.
tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager")
ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string {
output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager)
return output
}).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`))
tCtx = ktesting.End(tCtx)
tCtx.Run("after-cluster-downgrade", func(tCtx ktesting.TContext) {
for subTest, f := range downgradedTestFuncs {
tCtx.Run(subTest, func(tCtx ktesting.TContext) {
f(tCtx)
})
}
})
}
// sourceVersion identifies the Kubernetes git version based on hack/print-workspace-status.sh.
//