Parallelize CI test runs (#5626)
Parallelize CI test runs for non-coverage dev PRs. This is backward compatible: the master-branch doc push and coverage jobs, the upstream CI workflows, and local test runs are unaffected and remain unsharded.
yeounoh authored and zpcore committed Sep 28, 2023
1 parent 67854bd commit 9b1952d
Showing 5 changed files with 158 additions and 42 deletions.
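The sharding pattern the diff below introduces is straightforward: each matrix job in the GitHub Actions test workflow exports exactly one RUN_* selector variable, and the shared test scripts run only the shard that variable names, falling back to the full suite when no selector is set. A minimal sketch of that dispatch pattern, for orientation only (the run_shard helper is a placeholder, not a function from this repository):

#!/usr/bin/env bash
# Illustrative dispatch only: run_shard stands in for the real
# run_torch_xla_python_tests / run_torch_xla_cpp_tests / run_xla_op_tests* functions.
run_shard() { echo "would run shard: $1"; }

if [[ "${RUN_PYTHON_TESTS:-}" == "python_tests" ]]; then
  run_shard "python_tests"
elif [[ "${RUN_CPP_TESTS1:-}" == "cpp_tests1" || "${RUN_CPP_TESTS2:-}" == "cpp_tests2" ]]; then
  run_shard "cpp_tests"
else
  # No RUN_* selector exported: local runs and the coverage workflow stay unsharded.
  run_shard "all"
fi

The matrix values also double as the workflow job titles, which is why they are short readable strings rather than booleans.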
72 changes: 54 additions & 18 deletions .circleci/common.sh
@@ -127,15 +127,10 @@ function build_torch_xla() {
popd
}

function run_torch_xla_tests() {
function run_torch_xla_python_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"
if [ -x "$(command -v nvidia-smi)" ]; then
export GPU_NUM_DEVICES=2
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

pushd $XLA_DIR
echo "Running Python Tests"
@@ -176,7 +171,15 @@ function run_torch_xla_tests() {
fi
fi
fi
popd
}

function run_torch_xla_cpp_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"

pushd $XLA_DIR
echo "Running C++ Tests on PJRT"
EXTRA_ARGS=""
if [ "$USE_COVERAGE" != "0" ]; then
@@ -185,25 +188,58 @@ function run_torch_xla_tests() {
if [ ! -z "$GCLOUD_SERVICE_KEY_FILE" ]; then
EXTRA_ARGS="$EXTRA_ARGS -R"
fi
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
if [ "$USE_COVERAGE" != "0" ]; then

if [ "$USE_COVERAGE" != "0" ]; then
# TODO(yeounoh) shard the coverage testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
fi
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
if [ "$USE_COVERAGE" != "0" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
fi
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
if [ "$USE_COVERAGE" != "0" ]; then
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
fi
fi
if [ "$USE_COVERAGE" != "0" ]; then
genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
else
# Shard GPU testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
fi
popd
}

function run_torch_xla_tests() {
PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"
RUN_CPP="${RUN_CPP_TESTS:-0}"
RUN_PYTHON="${RUN_PYTHON_TESTS:-0}"

if [ -x "$(command -v nvidia-smi)" ]; then
num_devices=$(nvidia-smi --list-gpus | wc -l)
echo "Found $num_devices GPU devices..."
export GPU_NUM_DEVICES=$num_devices
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

# TODO(yeounoh) test coverage workflow is not parallelized.
if [[ -z "$RUN_CPP_TESTS1" && -z "$RUN_CPP_TESTS2" && -z "$RUN_PYTHON_TESTS" || "$USE_COVERAGE" != "0" ]]; then
run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
else
# run python and cpp tests separately.
if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
else
run_torch_xla_cpp_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
fi
fi
}
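As a usage sketch, a single shard of the new entry point can be exercised the same way the CI container does it; the checkout paths below are placeholders and a working torch/torch_xla build is assumed:

# Run only the Python half of the suite, selecting the first xla op shard, without coverage.
export RUN_PYTHON_TESTS=python_tests
export RUN_XLA_OP_TESTS1=xla_op1
source .circleci/common.sh
run_torch_xla_tests /path/to/pytorch /path/to/pytorch/xla 0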
27 changes: 25 additions & 2 deletions .github/workflows/_test.yml
@@ -45,6 +45,21 @@ on:
jobs:
test:
runs-on: ${{ inputs.runner }}
strategy:
fail-fast: false
matrix:
include:
# Use readable strings as they define the workflow titles.
- run_cpp_tests1: 'cpp_tests1'
- run_cpp_tests2: 'cpp_tests2'
- run_python_tests: 'python_tests'
run_xla_op_tests1: 'xla_op1'
- run_python_tests: 'python_tests'
run_xla_op_tests2: 'xla_op2'
- run_python_tests: 'python_tests'
run_xla_op_tests3: 'xla_op3'
- run_python_tests: 'python_tests'
run_torch_mp_op_tests: 'torch_mp_op'
timeout-minutes: ${{ inputs.timeout-minutes }}
env:
DOCKER_IMAGE: ${{ inputs.docker-image }}
@@ -54,6 +69,13 @@ jobs:
XLA_SKIP_XRT_TESTS: ${{ inputs.disable-xrt }}
XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
steps:
- name: Setup Linux
uses: pytorch/test-infra/.github/actions/setup-linux@main
@@ -88,12 +110,13 @@ jobs:
run: |
echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
docker pull "${DOCKER_IMAGE}"
pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -e RUN_CPP_TESTS1 -e RUN_CPP_TESTS2 -e RUN_PYTHON_TESTS -e RUN_XLA_OP_TESTS1 -e RUN_XLA_OP_TESTS2 -e RUN_XLA_OP_TESTS3 -e RUN_TORCH_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
echo "pid=${pid}" >> "${GITHUB_ENV}"
- name: Test
shell: bash
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test_xrt.yml
@@ -42,7 +42,7 @@ jobs:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 180
timeout-minutes: 300
disable-xrt: 0
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
29 changes: 24 additions & 5 deletions test/cpp/run_tests.sh
@@ -85,9 +85,28 @@ if [[ "$BAZEL_VERB" == "coverage" ]]; then
EXTRA_FLAGS="$EXTRA_FLAGS --remote_download_outputs=all" # for lcov symlink
fi


if [ "$LOGFILE" != "" ]; then
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:all --test_timeout 1000 ${FILTER:+"$FILTER"} 2> $LOGFILE
else
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:all --test_timeout 1000 ${FILTER:+"$FILTER"}
test_names=("all")
if [[ "$RUN_CPP_TESTS1" == "cpp_tests1" ]]; then
test_names=("test_aten_xla_tensor_1"
"test_aten_xla_tensor_2"
"test_aten_xla_tensor_3"
"test_aten_xla_tensor_4")
elif [[ "$RUN_CPP_TESTS2" == "cpp_tests2" ]]; then
test_names=("test_aten_xla_tensor_5"
"test_aten_xla_tensor_6"
"test_ir"
"test_lazy"
"test_replication"
"test_tensor"
"test_xla_backend_intf"
"test_xla_sharding")
fi
for name in "${test_names[@]}"; do
echo "Running $name cpp test..."
if [ "$LOGFILE" != "" ]; then
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:${name} --test_timeout 1000 ${FILTER:+"$FILTER"} 2> $LOGFILE
else
bazel $BAZEL_VERB $EXTRA_FLAGS //torch_xla/csrc/runtime:all //test/cpp:${name} --test_timeout 1000 ${FILTER:+"$FILTER"}
fi
done
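For illustration, the C++ shard selectors map to fixed lists of Bazel targets: cpp_tests1 covers test_aten_xla_tensor_1 through test_aten_xla_tensor_4, cpp_tests2 covers the remaining C++ suites, and anything else keeps the old //test/cpp:all behavior. A local run of the first shard on CPU could look like the following sketch, mirroring the CI invocation with the extra coverage and remote-cache flags omitted:

# Select the first C++ shard and run it on CPU, as run_torch_xla_cpp_tests does in CI.
export RUN_CPP_TESTS1=cpp_tests1
PJRT_DEVICE=CPU test/cpp/run_tests.sh -L""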

70 changes: 54 additions & 16 deletions test/run_tests.sh
@@ -135,7 +135,12 @@ function run_torch_op_tests {
run_dynamic "$CDIR/../../test/test_type_promotion.py" "$@" -v TestTypePromotionXLA
}

function run_xla_op_tests {
#######################################################################################
################################# XLA OP TESTS SHARDS #################################
#######################################################################################

# DO NOT MODIFY
function run_xla_op_tests1 {
run_dynamic "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_dynamic "$CDIR/ds/test_dynamic_shapes.py"
run_dynamic "$CDIR/ds/test_dynamic_shape_models.py" "$@" --verbosity=$VERBOSITY
@@ -144,8 +149,13 @@ function run_xla_op_tests {
run_test "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_test_without_functionalization "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_test "$CDIR/test_async_closures.py"
run_test "$CDIR/test_autocast.py"
run_test "$CDIR/test_profiler.py"
run_test "$CDIR/pjrt/test_runtime.py"
run_test "$CDIR/pjrt/test_runtime_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_cpu.py"
run_test "$CDIR/pjrt/test_internal_tpu.py"
run_test "$CDIR/pjrt/test_ddp.py"
run_test "$CDIR/pjrt/test_mesh_service.py"
run_test "$CDIR/test_ops.py"
run_test "$CDIR/test_metrics.py"
run_test "$CDIR/test_zero1.py"
@@ -154,37 +164,44 @@ function run_xla_op_tests {
run_test "$CDIR/dynamo/test_bridge.py"
run_test "$CDIR/dynamo/test_num_output.py"
run_save_tensor_ir "$CDIR/dynamo/test_dynamo_graph_dump.py"
run_downcast_bf16 "$CDIR/test_data_type.py"
run_use_bf16 "$CDIR/test_data_type.py"
run_xla_ir_debug "$CDIR/test_env_var_mapper.py"
run_xla_hlo_debug "$CDIR/test_env_var_mapper.py"
run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_save_load.py"
run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
}

# DO NOT MODIFY
function run_xla_op_tests2 {
run_downcast_bf16 "$CDIR/test_data_type.py"
run_test "$CDIR/test_autocast.py" # TODO(yeounoh) this is expensive on GPU
}

# All the new xla op tests should go to run_xla_op_tests3
function run_xla_op_tests3 {
# TODO(qihqi): this test requires tensorflow to run. Need to set up a separate
# CI with tf.
run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_inference.py"
run_stablehlo_compile "$CDIR/stablehlo/test_stablehlo_compile.py"
run_test "$CDIR/pjrt/test_runtime.py"
run_test "$CDIR/pjrt/test_runtime_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_cpu.py"
run_test "$CDIR/pjrt/test_internal_tpu.py"
run_test "$CDIR/pjrt/test_ddp.py"
run_test "$CDIR/pjrt/test_mesh_service.py"
run_test "$CDIR/spmd/test_xla_sharding.py"
run_test "$CDIR/spmd/test_xla_sharding_hlo.py"
run_test "$CDIR/spmd/test_xla_virtual_device.py"
run_test "$CDIR/spmd/test_dynamo_spmd.py"
run_test "$CDIR/spmd/test_xla_distributed_checkpoint.py"
run_test "$CDIR/spmd/test_xla_spmd_python_api_interaction.py"
run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
run_test "$CDIR/test_input_output_aliases.py"
run_test "$CDIR/test_torch_distributed_xla_backend.py"
}

#######################################################################################

function run_op_tests {
run_torch_op_tests
run_xla_op_tests
run_xla_op_tests1
run_xla_op_tests2
run_xla_op_tests3
}

function run_mp_op_tests {
@@ -207,12 +224,33 @@ function run_mp_op_tests {
}

function run_tests {
run_xla_op_tests
if [[ "$XLA_SKIP_TORCH_OP_TESTS" != "1" ]]; then
# RUN_ flags filter an explicit test type to run, XLA_SKIP_ flags exclude one.
if [[ "$RUN_XLA_OP_TESTS1" == "xla_op1" ]]; then
echo "Running xla op tests..."
run_xla_op_tests1
elif [[ "$RUN_XLA_OP_TESTS2" == "xla_op2" ]]; then
echo "Running xla op tests..."
run_xla_op_tests2
elif [[ "$RUN_XLA_OP_TESTS3" == "xla_op3" ]]; then
echo "Running xla op tests..."
run_xla_op_tests3
elif [[ "$RUN_TORCH_MP_OP_TESTS" == "torch_mp_op" ]]; then
echo "Running torch op tests..."
run_torch_op_tests
fi
if [[ "$XLA_SKIP_MP_OP_TESTS" != "1" ]]; then
run_mp_op_tests
else
# Run full tests without sharding, respects XLA_SKIP_*
if [[ "$XLA_SKIP_XLA_OP_TESTS" != "1" ]]; then
run_xla_op_tests1
run_xla_op_tests2
run_xla_op_tests3
fi
if [[ "$XLA_SKIP_TORCH_OP_TESTS" != "1" ]]; then
run_torch_op_tests
fi
if [[ "$XLA_SKIP_MP_OP_TESTS" != "1" ]]; then
run_mp_op_tests
fi
fi
}
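As a backward-compatibility sketch: with no RUN_* selector in the environment (the local and coverage case), run_tests falls through to the full unsharded suite and still honors the existing XLA_SKIP_* switches, assuming the script's existing entry point invokes run_tests as before. For example:

# No RUN_* shard selector exported: run everything,
# but skip the multi-process op tests, exactly as before this change.
export XLA_SKIP_MP_OP_TESTS=1
./test/run_tests.sh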

