Parallelize CI test runs (#5626)
Parallelize CI test runs for non-coverage dev PRs. This is backward compatible: the master-branch doc push and coverage jobs, the upstream CI workflows, and local test runs are unaffected and remain unsharded.
yeounoh authored and zpcore committed Sep 28, 2023
1 parent 67854bd commit 9b1952d
Showing 5 changed files with 158 additions and 42 deletions.
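The sharding pattern the diff below introduces is straightforward: each matrix job in the GitHub Actions test workflow exports exactly one RUN_* selector variable, and the shared test scripts run only the shard that variable names, falling back to the full suite when no selector is set. A minimal sketch of that dispatch pattern, for orientation only (the run_shard helper is a placeholder, not a function from this repository):

#!/usr/bin/env bash
# Illustrative dispatch only: run_shard stands in for the real
# run_torch_xla_python_tests / run_torch_xla_cpp_tests / run_xla_op_tests* functions.
run_shard() { echo "would run shard: $1"; }

if [[ "${RUN_PYTHON_TESTS:-}" == "python_tests" ]]; then
  run_shard "python_tests"
elif [[ "${RUN_CPP_TESTS1:-}" == "cpp_tests1" || "${RUN_CPP_TESTS2:-}" == "cpp_tests2" ]]; then
  run_shard "cpp_tests"
else
  # No RUN_* selector exported: local runs and the coverage workflow stay unsharded.
  run_shard "all"
fi

The matrix values also double as the workflow job titles, which is why they are short readable strings rather than booleans.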
72 changes: 54 additions & 18 deletions .circleci/common.sh
@@ -127,15 +127,10 @@ function build_torch_xla() {
popd
}

function run_torch_xla_tests() {
function run_torch_xla_python_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"
if [ -x "$(command -v nvidia-smi)" ]; then
export GPU_NUM_DEVICES=2
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

pushd $XLA_DIR
echo "Running Python Tests"
@@ -176,7 +171,15 @@ function run_torch_xla_tests() {
fi
fi
fi
popd
}

function run_torch_xla_cpp_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"

pushd $XLA_DIR
echo "Running C++ Tests on PJRT"
EXTRA_ARGS=""
if [ "$USE_COVERAGE" != "0" ]; then
@@ -185,25 +188,58 @@ function run_torch_xla_tests() {
if [ ! -z "$GCLOUD_SERVICE_KEY_FILE" ]; then
EXTRA_ARGS="$EXTRA_ARGS -R"
fi
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
if [ "$USE_COVERAGE" != "0" ]; then

if [ "$USE_COVERAGE" != "0" ]; then
# TODO(yeounoh) shard the coverage testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
fi
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
if [ "$USE_COVERAGE" != "0" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
fi
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
if [ "$USE_COVERAGE" != "0" ]; then
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
fi
fi
if [ "$USE_COVERAGE" != "0" ]; then
genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
else
# Shard GPU testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
fi
popd
}

function run_torch_xla_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"
RUN_CPP="${RUN_CPP_TESTS:-0}"
RUN_PYTHON="${RUN_PYTHON_TESTS:-0}"

if [ -x "$(command -v nvidia-smi)" ]; then
num_devices=$(nvidia-smi --list-gpus | wc -l)
echo "Found $num_devices GPU devices..."
export GPU_NUM_DEVICES=$num_devices
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

# TODO(yeounoh) test coverage workflow is not parallelized.
if [[ -z "$RUN_CPP_TESTS1" && -z "$RUN_CPP_TESTS2" && -z "$RUN_PYTHON_TESTS" || "$USE_COVERAGE" != "0" ]]; then
run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
else
# run python and cpp tests separately.
if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
else
run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
fi
fi
}
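As a usage sketch, a single shard of the new entry point can be exercised the same way the CI container does it; the checkout paths below are placeholders and a working torch/torch_xla build is assumed:

# Run only the Python half of the suite, selecting the first xla op shard, without coverage.
export RUN_PYTHON_TESTS=python_tests
export RUN_XLA_OP_TESTS1=xla_op1
source .circleci/common.sh
run_torch_xla_tests /path/to/pytorch /path/to/pytorch/xla 0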
27 changes: 25 additions & 2 deletions .github/workflows/_test.yml
@@ -45,6 +45,21 @@ on:
jobs:
test:
runs-on: ${{ inputs.runner }}
strategy:
fail-fast: false
matrix:
include:
# Use readable strings as they define the workflow titles.
- run_cpp_tests1: 'cpp_tests1'
- run_cpp_tests2: 'cpp_tests2'
- run_python_tests: 'python_tests'
run_xla_op_tests1: 'xla_op1'
- run_python_tests: 'python_tests'
run_xla_op_tests2: 'xla_op2'
- run_python_tests: 'python_tests'
run_xla_op_tests3: 'xla_op3'
- run_python_tests: 'python_tests'
run_torch_mp_op_tests: 'torch_mp_op'
timeout-minutes: ${{ inputs.timeout-minutes }}
env:
DOCKER_IMAGE: ${{ inputs.docker-image }}
@@ -54,6 +69,13 @@ jobs:
XLA_SKIP_XRT_TESTS: ${{ inputs.disable-xrt }}
XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
steps:
- name: Setup Linux
uses: pytorch/test-infra/.github/actions/setup-linux@main
@@ -88,12 +110,13 @@ jobs:
run: |
echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
docker pull "${DOCKER_IMAGE}"
pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -e RUN_CPP_TESTS1 -e RUN_CPP_TESTS2 -e RUN_PYTHON_TESTS -e RUN_XLA_OP_TESTS1 -e RUN_XLA_OP_TESTS2 -e RUN_XLA_OP_TESTS3 -e RUN_TORCH_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
echo "pid=${pid}" >> "${GITHUB_ENV}"
- name: Test
shell: bash
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test_xrt.yml
@@ -42,7 +42,7 @@ jobs:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 180
timeout-minutes: 300
disable-xrt: 0
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
29 changes: 24 additions & 5 deletions test/cpp/run_tests.sh
@@ -85,9 +85,28 @@ if [[ "$BAZEL_VERB" == "coverage" ]]; then
EXTRA_FLAGS="$EXTRA_FLAGS --remote_download_outputs=all" # for lcov symlink
fi


if [ "$LOGFILE" != "" ]; then
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:all --test_timeout 1000 ${FILTER:+"$FILTER"} 2> $LOGFILE
else
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:all --test_timeout 1000 ${FILTER:+"$FILTER"}
test_names=("all")
if [[ "$RUN_CPP_TESTS1" == "cpp_tests1" ]]; then
test_names=("test_aten_xla_tensor_1"
"test_aten_xla_tensor_2"
"test_aten_xla_tensor_3"
"test_aten_xla_tensor_4")
elif [[ "$RUN_CPP_TESTS2" == "cpp_tests2" ]]; then
test_names=("test_aten_xla_tensor_5"
"test_aten_xla_tensor_6"
"test_ir"
"test_lazy"
"test_replication"
"test_tensor"
"test_xla_backend_intf"
"test_xla_sharding")
fi
for name in "${test_names[@]}"; do
echo "Running $name cpp test..."
if [ "$LOGFILE" != "" ]; then
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:${name} --test_timeout 1000 ${FILTER:+"$FILTER"} 2> $LOGFILE
else
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:${name} --test_timeout 1000 ${FILTER:+"$FILTER"}
fi
done
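For illustration, the C++ shard selectors map to fixed lists of Bazel targets: cpp_tests1 covers test_aten_xla_tensor_1 through test_aten_xla_tensor_4, cpp_tests2 covers the remaining C++ suites, and anything else keeps the old //test/cpp:all behavior. A local run of the first shard on CPU could look like the following sketch, mirroring the CI invocation with the extra coverage and remote-cache flags omitted:

# Select the first C++ shard and run it on CPU, as run_torch_xla_cpp_tests does in CI.
export RUN_CPP_TESTS1=cpp_tests1
PJRT_DEVICE=CPU test/cpp/run_tests.sh -L""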

70 changes: 54 additions & 16 deletions test/run_tests.sh
@@ -135,7 +135,12 @@ function run_torch_op_tests {
run_dynamic "$CDIR/../../test/test_type_promotion.py" "$@" -v TestTypePromotionXLA
}

function run_xla_op_tests {
#######################################################################################
################################# XLA OP TESTS SHARDS #################################
#######################################################################################

# DO NOT MODIFY
function run_xla_op_tests1 {
run_dynamic "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_dynamic "$CDIR/ds/test_dynamic_shapes.py"
run_dynamic "$CDIR/ds/test_dynamic_shape_models.py" "$@" --verbosity=$VERBOSITY
@@ -144,8 +149,13 @@ function run_xla_op_tests {
run_test "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_test_without_functionalization "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_test "$CDIR/test_async_closures.py"
run_test "$CDIR/test_autocast.py"
run_test "$CDIR/test_profiler.py"
run_test "$CDIR/pjrt/test_runtime.py"
run_test "$CDIR/pjrt/test_runtime_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_cpu.py"
run_test "$CDIR/pjrt/test_internal_tpu.py"
run_test "$CDIR/pjrt/test_ddp.py"
run_test "$CDIR/pjrt/test_mesh_service.py"
run_test "$CDIR/test_ops.py"
run_test "$CDIR/test_metrics.py"
run_test "$CDIR/test_zero1.py"
@@ -154,37 +164,44 @@ function run_xla_op_tests {
run_test "$CDIR/dynamo/test_bridge.py"
run_test "$CDIR/dynamo/test_num_output.py"
run_save_tensor_ir "$CDIR/dynamo/test_dynamo_graph_dump.py"
run_downcast_bf16 "$CDIR/test_data_type.py"
run_use_bf16 "$CDIR/test_data_type.py"
run_xla_ir_debug "$CDIR/test_env_var_mapper.py"
run_xla_hlo_debug "$CDIR/test_env_var_mapper.py"
run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_save_load.py"
run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
}

# DO NOT MODIFY
function run_xla_op_tests2 {
run_downcast_bf16 "$CDIR/test_data_type.py"
run_test "$CDIR/test_autocast.py" # TODO(yeounoh) this is expensive on GPU
}

# All the new xla op tests should go to run_xla_op_tests3
function run_xla_op_tests3 {
# TODO(qihqi): this test requires tensorflow to run. Need to set up a separate
# CI with tf.
run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_inference.py"
run_stablehlo_compile "$CDIR/stablehlo/test_stablehlo_compile.py"
run_test "$CDIR/pjrt/test_runtime.py"
run_test "$CDIR/pjrt/test_runtime_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_cpu.py"
run_test "$CDIR/pjrt/test_internal_tpu.py"
run_test "$CDIR/pjrt/test_ddp.py"
run_test "$CDIR/pjrt/test_mesh_service.py"
run_test "$CDIR/spmd/test_xla_sharding.py"
run_test "$CDIR/spmd/test_xla_sharding_hlo.py"
run_test "$CDIR/spmd/test_xla_virtual_device.py"
run_test "$CDIR/spmd/test_dynamo_spmd.py"
run_test "$CDIR/spmd/test_xla_distributed_checkpoint.py"
run_test "$CDIR/spmd/test_xla_spmd_python_api_interaction.py"
run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
run_test "$CDIR/test_input_output_aliases.py"
run_test "$CDIR/test_torch_distributed_xla_backend.py"
}

#######################################################################################

function run_op_tests {
run_torch_op_tests
run_xla_op_tests
run_xla_op_tests1
run_xla_op_tests2
run_xla_op_tests3
}

function run_mp_op_tests {
@@ -207,12 +224,33 @@ function run_mp_op_tests {
}

function run_tests {
run_xla_op_tests
if [[ "$XLA_SKIP_TORCH_OP_TESTS" != "1" ]]; then
# RUN_ flags filter an explicit test type to run, XLA_SKIP_ flags exclude one.
if [[ "$RUN_XLA_OP_TESTS1" == "xla_op1" ]]; then
echo "Running xla op tests..."
run_xla_op_tests1
elif [[ "$RUN_XLA_OP_TESTS2" == "xla_op2" ]]; then
echo "Running xla op tests..."
run_xla_op_tests2
elif [[ "$RUN_XLA_OP_TESTS3" == "xla_op3" ]]; then
echo "Running xla op tests..."
run_xla_op_tests3
elif [[ "$RUN_TORCH_MP_OP_TESTS" == "torch_mp_op" ]]; then
echo "Running torch op tests..."
run_torch_op_tests
fi
if [[ "$XLA_SKIP_MP_OP_TESTS" != "1" ]]; then
run_mp_op_tests
else
# Run full tests without sharding, respects XLA_SKIP_*
if [[ "$XLA_SKIP_XLA_OP_TESTS" != "1" ]]; then
run_xla_op_tests1
run_xla_op_tests2
run_xla_op_tests3
fi
if [[ "$XLA_SKIP_TORCH_OP_TESTS" != "1" ]]; then
run_torch_op_tests
fi
if [[ "$XLA_SKIP_MP_OP_TESTS" != "1" ]]; then
run_mp_op_tests
fi
fi
}
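As a backward-compatibility sketch: with no RUN_* selector in the environment (the local and coverage case), run_tests falls through to the full unsharded suite and still honors the existing XLA_SKIP_* switches, assuming the script's existing entry point invokes run_tests as before. For example:

# No RUN_* shard selector exported: run everything,
# but skip the multi-process op tests, exactly as before this change.
export XLA_SKIP_MP_OP_TESTS=1
./test/run_tests.sh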

