fix e2e tests, especially in GitHub Actions (#1137)

2 years ago · f6838597dd
parent e238500fce
commit f6838597dd
5 changed files with 40 additions and 19 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -36,7 +36,13 @@ jobs:
        run: make test
      - name: Run E2E Tests
        timeout-minutes: 20
-        run: make e2e -e E2E_LOG_LEVEL=trace
+        # E2E tests frequently failed with "too many open files" errors, so set generous open-file and inotify limits before running them.
+        # Run with trace-level logs so failures are easier to debug, and stop at the first failing test (E2E_FAIL_FAST=true).
+        run: |
+          sudo prlimit --pid $$ --nofile=1048576:1048576
+          sudo sysctl fs.inotify.max_user_instances=1280
+          sudo sysctl fs.inotify.max_user_watches=655360
+          make e2e -e E2E_LOG_LEVEL=trace -e E2E_FAIL_FAST=true
      # Builds
      - name: Test Platform Builds
        run: make build-cross
--- a/Makefile
+++ b/Makefile
@ -55,6 +55,7 @@ E2E_KEEP ?=
 E2E_PARALLEL ?=
 E2E_DIND_VERSION ?=
 E2E_K3S_VERSION ?=
+E2E_FAIL_FAST ?=

 ########## Go Build Options ##########
 # Build targets
@ -179,7 +180,7 @@ test:

 e2e:
 	@echo "Running e2e tests"
-	LOG_LEVEL="$(E2E_LOG_LEVEL)" E2E_INCLUDE="$(E2E_INCLUDE)" E2E_EXCLUDE="$(E2E_EXCLUDE)" E2E_EXTRA="$(E2E_EXTRA)" E2E_RUNNER_START_TIMEOUT=$(E2E_RUNNER_START_TIMEOUT) E2E_HELPER_IMAGE_TAG="$(E2E_HELPER_IMAGE_TAG)" E2E_KEEP="$(E2E_KEEP)" E2E_PARALLEL="$(E2E_PARALLEL)" E2E_DIND_VERSION="$(E2E_DIND_VERSION)" E2E_K3S_VERSION="$(E2E_K3S_VERSION)" tests/dind.sh "${K3D_IMAGE_TAG}"
+	LOG_LEVEL="$(E2E_LOG_LEVEL)" E2E_INCLUDE="$(E2E_INCLUDE)" E2E_EXCLUDE="$(E2E_EXCLUDE)" E2E_EXTRA="$(E2E_EXTRA)" E2E_RUNNER_START_TIMEOUT=$(E2E_RUNNER_START_TIMEOUT) E2E_HELPER_IMAGE_TAG="$(E2E_HELPER_IMAGE_TAG)" E2E_KEEP="$(E2E_KEEP)" E2E_PARALLEL="$(E2E_PARALLEL)" E2E_DIND_VERSION="$(E2E_DIND_VERSION)" E2E_K3S_VERSION="$(E2E_K3S_VERSION)" E2E_FAIL_FAST="$(E2E_FAIL_FAST)" tests/dind.sh "${K3D_IMAGE_TAG}"

 ci-tests: fmt check e2e

--- a/tests/common.sh
+++ b/tests/common.sh
@ -45,7 +45,10 @@ failed() {
  elif [[ -n "$LOG_FILE" ]]; then
    mv "$LOG_FILE" "$LOG_FILE.failed"
  fi
-  abort "test failed"
+  abort "$CURRENT_STAGE: test failed"
+  if [[ "$E2E_FAIL_FAST" == "true" ]]; then
+    exit 1
+  fi
 }

 passed() {
@ -73,11 +76,16 @@ check_url() {
 check_clusters() {
  [ -n "$EXE" ] || abort "EXE is not defined"
  for c in "$@" ; do
-    $EXE kubeconfig merge "$c" --kubeconfig-switch-context || return 1
-    if kubectl cluster-info ; then
-      passed "cluster $c is reachable"
+    if $EXE kubeconfig merge "$c" --kubeconfig-switch-context; then
+      if kubectl cluster-info ; then
+        passed "cluster $c is reachable"
+      else
+        warn "could not obtain cluster info for $c. Kubeconfig:\n$(kubectl config view)"
+        docker ps -a
+        return 1
+      fi
    else
-      warn "could not obtain cluster info for $c. Kubeconfig:\n$(kubectl config view)"
+      warn "could not merge kubeconfig for $c."
      docker ps -a
      return 1
    fi
--- a/tests/dind.sh
+++ b/tests/dind.sh
@ -15,6 +15,9 @@ RUNNER_START_TIMEOUT=${E2E_RUNNER_START_TIMEOUT:-10}
 # Override Docker-in-Docker version
 E2E_DIND_VERSION=${E2E_DIND_VERSION:-}

+# Set to "true" to fail on the first error instead of waiting until all tests are done. Useful in CI.
+E2E_FAIL_FAST=${E2E_FAIL_FAST:-}
+
 ####################################################################################

 CURR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
@ -50,6 +53,7 @@ k3de2e=$(docker run -d \
          -e EXE="$K3D_EXE" \
          -e CI="true" \
          -e LOG_LEVEL="$LOG_LEVEL" \
+          -e E2E_FAIL_FAST="$E2E_FAIL_FAST" \
          -e E2E_INCLUDE="$E2E_INCLUDE" \
          -e E2E_EXCLUDE="$E2E_EXCLUDE" \
          -e E2E_PARALLEL="$E2E_PARALLEL" \
--- a/tests/test_config_file_from_stdin.sh
+++ b/tests/test_config_file_from_stdin.sh
@ -28,22 +28,18 @@ fi

 export CURRENT_STAGE="Test | config-file-stdin | $K3S_IMAGE"

-configfileoriginal="$CURR_DIR/assets/config_test_simple.yaml"
-configfile="/tmp/config_test_simple-tmp_$(date -u +'%Y%m%dT%H%M%SZ').yaml"
 clustername="configteststdin"

-sed -E "s/^  name:.+/  name: $clustername/g" < "$configfileoriginal" > "$configfile" # replace cluster name in config file so we can use it in this script without running into override issues
-cat "$configfile"
 highlight "[START] ConfigTest $EXTRA_TITLE"

 info "Creating cluster $clustername..."

-cat <<EOF | $EXE cluster create "$clustername" --config=-
+cat <<EOF | $EXE cluster create --config=-
 apiVersion: k3d.io/v1alpha4
 kind: Simple
 metadata:
-  name: test
-servers: 3
+  name: $clustername
+servers: 1
 agents: 2
 #image: rancher/k3s:latest
 volumes:
@ -99,8 +95,8 @@ sleep 5
 info "Checking that we have access to the cluster..."
 check_clusters "$clustername" || failed "error checking cluster"

-info "Checking that we have 5 nodes online..."
-check_multi_node "$clustername" 5 || failed "failed to verify number of nodes"
+info "Checking that we have 3 nodes online..."
+check_multi_node "$clustername" 3 || failed "failed to verify number of nodes"

 # 2. check some config settings

@ -128,10 +124,16 @@ exec_in_node "k3d-$clustername-server-0" "cat /etc/rancher/k3s/registries.yaml"

 # Cleanup

-info "Deleting cluster $clustername (using config file)..."
-$EXE cluster delete --config "$configfile" --trace || failed "could not delete the cluster $clustername"
+info "Deleting cluster $clustername (using config file from stdin)..."
+cat <<EOF | $EXE cluster delete --config=-
+apiVersion: k3d.io/v1alpha4
+kind: Simple
+metadata:
+  name: $clustername
+EOF
+
+test $? -eq 0 || failed "could not delete cluster $clustername from stdin config"

-rm "$configfile"

 highlight "[DONE] ConfigTest $EXTRA_TITLE"