kubernetes/hack/ginkgo-e2e.sh
Patrick Ohly aefd2effc5 test: automatically lower Ginkgo parallelism when using race detection
When race detection is enabled, merely running 25 e2e.test instances was too much
and the OOM killer shut down the Prow test pod because of the memory overhead.

A CI job could control that via GINKGO_PARALLEL_NODES, but we should also have
saner defaults which take this into account.
2025-09-16 20:25:53 +02:00

295 lines
10 KiB
Bash
Executable file

#!/usr/bin/env bash
# Copyright 2014 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script runs e2e tests on Google Cloud Platform.
# Usage: `hack/ginkgo-e2e.sh`.
set -o errexit
set -o nounset
set -o pipefail
KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
source "${KUBE_ROOT}/cluster/common.sh"
source "${KUBE_ROOT}/hack/lib/init.sh"
# Find the ginkgo binary build as part of the release.
ginkgo=$(kube::util::find-binary "ginkgo")
e2e_test=$(kube::util::find-binary "e2e.test")
# --- Setup some env vars.
GINKGO_PARALLEL=${GINKGO_PARALLEL:-n} # set to 'y' to run tests in parallel
GINKGO_SILENCE_SKIPS=${GINKGO_SILENCE_SKIPS:-y} # set to 'n' to see S character for each skipped test
GINKGO_FORCE_NEWLINES=${GINKGO_FORCE_NEWLINES:-$( if [ "${CI:-false}" = "true" ]; then echo "y"; else echo "n"; fi )} # set to 'y' to print a newline after each S or o character
CLOUD_CONFIG=${CLOUD_CONFIG:-""}
# If 'y', Ginkgo's reporter will not use escape sequence to color output.
#
# Since Kubernetes 1.25, the default is to use colors only when connected to
# a terminal. That is the right choice for all Prow jobs (Spyglass doesn't
# render them properly).
GINKGO_NO_COLOR=${GINKGO_NO_COLOR:-$(if [ -t 2 ]; then echo n; else echo y; fi)}
# If set, the command executed will be:
# - `dlv exec` if set to "delve"
# - `gdb` if set to "gdb"
# NOTE: for this to work the e2e.test binary has to be compiled with
# make DBG=1 WHAT=test/e2e/e2e.test
E2E_TEST_DEBUG_TOOL=${E2E_TEST_DEBUG_TOOL:-}
: "${KUBECTL:="${KUBE_ROOT}/cluster/kubectl.sh"}"
: "${KUBE_CONFIG_FILE:="config-test.sh"}"
export KUBECTL KUBE_CONFIG_FILE
source "${KUBE_ROOT}/cluster/kube-util.sh"
function detect-master-from-kubeconfig() {
export KUBECONFIG=${KUBECONFIG:-$DEFAULT_KUBECONFIG}
local cc
cc=$("${KUBE_ROOT}/cluster/kubectl.sh" config view -o jsonpath="{.current-context}")
if [[ -n "${KUBE_CONTEXT:-}" ]]; then
cc="${KUBE_CONTEXT}"
fi
local cluster
cluster=$("${KUBE_ROOT}/cluster/kubectl.sh" config view -o jsonpath="{.contexts[?(@.name == \"${cc}\")].context.cluster}")
KUBE_MASTER_URL=$("${KUBE_ROOT}/cluster/kubectl.sh" config view -o jsonpath="{.clusters[?(@.name == \"${cluster}\")].cluster.server}")
}
# ---- Do cloud-provider-specific setup
if [[ -n "${KUBERNETES_CONFORMANCE_TEST:-}" ]]; then
echo "Conformance test: not doing test setup."
KUBERNETES_PROVIDER=${KUBERNETES_CONFORMANCE_PROVIDER:-"skeleton"}
detect-master-from-kubeconfig
auth_config=(
"--kubeconfig=${KUBECONFIG}"
)
else
echo "Setting up for KUBERNETES_PROVIDER=\"${KUBERNETES_PROVIDER}\"."
prepare-e2e
detect-master >/dev/null
KUBE_MASTER_URL="${KUBE_MASTER_URL:-}"
if [[ -z "${KUBE_MASTER_URL:-}" && -n "${KUBE_MASTER_IP:-}" ]]; then
KUBE_MASTER_URL="https://${KUBE_MASTER_IP}"
fi
auth_config=(
"--kubeconfig=${KUBECONFIG:-$DEFAULT_KUBECONFIG}"
)
fi
if [[ -n "${NODE_INSTANCE_PREFIX:-}" ]]; then
NODE_INSTANCE_GROUP="${NODE_INSTANCE_PREFIX}-group"
fi
if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
set_num_migs
NODE_INSTANCE_GROUP=""
for ((i=1; i<=NUM_MIGS; i++)); do
if [[ ${i} == "${NUM_MIGS}" ]]; then
# We are assigning the same mig names as create-nodes function from cluster/gce/util.sh.
NODE_INSTANCE_GROUP="${NODE_INSTANCE_GROUP}${NODE_INSTANCE_PREFIX}-group"
else
NODE_INSTANCE_GROUP="${NODE_INSTANCE_GROUP}${NODE_INSTANCE_PREFIX}-group-${i},"
fi
done
fi
# TODO(kubernetes/test-infra#3330): Allow NODE_INSTANCE_GROUP to be
# set before we get here, which eliminates any cluster/gke use if
# KUBERNETES_CONFORMANCE_PROVIDER is set to "gke".
if [[ -z "${NODE_INSTANCE_GROUP:-}" ]] && [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
detect-node-instance-groups
NODE_INSTANCE_GROUP=$(kube::util::join , "${NODE_INSTANCE_GROUP[@]}")
fi
if [[ "${KUBERNETES_PROVIDER}" == "azure" ]]; then
if [[ ${CLOUD_CONFIG} == "" ]]; then
echo "Missing azure cloud config"
exit 1
fi
fi
# These arguments are understood by both Ginkgo test suite and CLI.
# Some arguments (like --nodes) are only supported when using the CLI.
# Those get set below when choosing the program.
ginkgo_args=(
"--poll-progress-after=${GINKGO_POLL_PROGRESS_AFTER:-60m}"
"--poll-progress-interval=${GINKGO_POLL_PROGRESS_INTERVAL:-5m}"
"--source-root=${KUBE_ROOT}"
)
# NOTE: Ginkgo's default timeout has been reduced from 24h to 1h in V2, set it manually here as "24h"
# for backward compatibility purpose.
ginkgo_args+=("--timeout=${GINKGO_TIMEOUT:-24h}")
if [[ -n "${CONFORMANCE_TEST_SKIP_REGEX:-}" ]]; then
ginkgo_args+=("--skip=${CONFORMANCE_TEST_SKIP_REGEX}")
ginkgo_args+=("--seed=1436380640")
fi
if [[ "${GINKGO_UNTIL_IT_FAILS:-}" == true ]]; then
ginkgo_args+=("--until-it-fails=true")
fi
if [[ "${GINKGO_SILENCE_SKIPS}" == "y" ]]; then
ginkgo_args+=("--silence-skips")
fi
if [[ "${GINKGO_FORCE_NEWLINES}" == "y" ]]; then
ginkgo_args+=("--force-newlines")
fi
if [[ "${GINKGO_NO_COLOR}" == "y" ]]; then
ginkgo_args+=("--no-color")
fi
# The --host setting is used only when providing --auth_config
# If --kubeconfig is used, the host to use is retrieved from the .kubeconfig
# file and the one provided with --host is ignored.
# Add path for things like running kubectl binary.
PATH=$(dirname "${e2e_test}"):"${PATH}"
export PATH
# Choose the program to execute and additional arguments for it.
#
# Note that the e2e.test binary must have been built with "make DBG=1"
# when using debuggers, otherwise debug information gets stripped.
case "${E2E_TEST_DEBUG_TOOL:-ginkgo}" in
ginkgo)
program=("${ginkgo}")
if [[ -n "${GINKGO_PARALLEL_NODES:-}" ]]; then
program+=("--nodes=${GINKGO_PARALLEL_NODES}")
elif [[ ${GINKGO_PARALLEL} =~ ^[yY]$ ]]; then
if [[ "${KUBE_RACE:-}" == "-race" ]]; then
# When race detection is enabled, merely running 25 e2e.test instances was too much
# and the OOM killer shut down the Prow test pod because of the memory overhead.
#
# A CI job could control that via GINKGO_PARALLEL_NODES, but we should also have
# saner defaults which take this into account.
program+=("--nodes=10")
else
program+=("--nodes=25")
fi
fi
program+=("${ginkgo_args[@]:+${ginkgo_args[@]}}")
;;
delve) program=("dlv" "exec") ;;
gdb) program=("gdb") ;;
*) kube::log::error_exit "Unsupported E2E_TEST_DEBUG_TOOL=${E2E_TEST_DEBUG_TOOL}" ;;
esac
# Move Ginkgo arguments that are understood by the suite when not using
# the CLI.
suite_args=()
if [ "${E2E_TEST_DEBUG_TOOL:-ginkgo}" != "ginkgo" ]; then
for arg in "${ginkgo_args[@]}"; do
suite_args+=("--ginkgo.${arg#--}")
done
fi
# Generate full dumps of the test result and progress in <report-dir>/ginkgo/,
# using the Ginkgo-specific JSON format and JUnit XML. Ignored if --report-dir
# is not used.
suite_args+=(--report-complete-ginkgo --report-complete-junit)
# When SIGTERM doesn't reach the E2E test suite binaries, ginkgo will exit
# without collecting information from about the currently running and
# potentially stuck tests. This seems to happen when Prow shuts down a test
# job because of a timeout.
#
# It's useful to print one final progress report in that case,
# so GINKGO_PROGRESS_REPORT_ON_SIGTERM (enabled by default when CI=true)
# catches SIGTERM and forwards it to all processes spawned by ginkgo.
#
# Manual invocations can trigger a similar report with `killall -USR1 e2e.test`
# without having to kill the test run.
GINKGO_CLI_PID=
signal_handler() {
if [ -n "${GINKGO_CLI_PID}" ]; then
cat <<EOF
*** $0: received $1 signal -> asking Ginkgo to stop.
***
*** Beware that a timeout may have been caused by some earlier test,
*** not necessarily the one which gets interrupted now.
*** See the "Spec runtime" for information about how long the
*** interrupted test was running.
EOF
# This goes to the process group, which is important because we
# need to reach the e2e.test processes forked by the Ginkgo CLI.
kill -TERM "-${GINKGO_CLI_PID}" || true
echo "Waiting for Ginkgo with pid ${GINKGO_CLI_PID}..."
wait "${GINKGO_CLI_PID}"
echo "Ginkgo terminated."
fi
}
case "${GINKGO_PROGRESS_REPORT_ON_SIGTERM:-${CI:-no}}" in
y|yes|true)
kube::util::trap_add "signal_handler INT" INT
kube::util::trap_add "signal_handler TERM" TERM
# Job control is needed to make the Ginkgo CLI and all workers run
# in their own process group.
set -m
;;
esac
# The following invocation is fairly complex. Let's dump it to simplify
# determining what the final options are. Enabled by default in CI
# environments like Prow.
case "${GINKGO_SHOW_COMMAND:-${CI:-no}}" in y|yes|true) set -x ;; esac
"${program[@]}" "${e2e_test}" -- \
"${auth_config[@]:+${auth_config[@]}}" \
--host="${KUBE_MASTER_URL}" \
--provider="${KUBERNETES_PROVIDER}" \
--gce-project="${PROJECT:-}" \
--gce-zone="${ZONE:-}" \
--gce-region="${REGION:-}" \
--gce-multizone="${MULTIZONE:-false}" \
--gke-cluster="${CLUSTER_NAME:-}" \
--kube-master="${KUBE_MASTER:-}" \
--cluster-tag="${CLUSTER_ID:-}" \
--cloud-config-file="${CLOUD_CONFIG:-}" \
--repo-root="${KUBE_ROOT}" \
--node-instance-group="${NODE_INSTANCE_GROUP:-}" \
--prefix="${KUBE_GCE_INSTANCE_PREFIX:-e2e}" \
--network="${KUBE_GCE_NETWORK:-${KUBE_GKE_NETWORK:-e2e}}" \
--node-tag="${NODE_TAG:-}" \
--master-tag="${MASTER_TAG:-}" \
--docker-config-file="${DOCKER_CONFIG_FILE:-}" \
--dns-domain="${KUBE_DNS_DOMAIN:-cluster.local}" \
--prepull-images="${PREPULL_IMAGES:-false}" \
${MASTER_OS_DISTRIBUTION:+"--master-os-distro=${MASTER_OS_DISTRIBUTION}"} \
${NODE_OS_DISTRIBUTION:+"--node-os-distro=${NODE_OS_DISTRIBUTION}"} \
${NUM_NODES:+"--num-nodes=${NUM_NODES}"} \
${E2E_REPORT_DIR:+"--report-dir=${E2E_REPORT_DIR}"} \
${E2E_REPORT_PREFIX:+"--report-prefix=${E2E_REPORT_PREFIX}"} \
"${suite_args[@]:+${suite_args[@]}}" \
"${@}" &
set +x
GINKGO_CLI_PID=$!
wait "${GINKGO_CLI_PID}"