From 0ff871d0ccc6fba15ce49f265f80d4fd5d054268 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Thu, 26 Sep 2024 21:51:56 +0000 Subject: [PATCH 01/10] Pin update --- WORKSPACE | 3 ++- setup.py | 2 +- torch_xla/csrc/BUILD | 1 + torch_xla/csrc/runtime/ifrt_computation_client.h | 1 + torch_xla/csrc/runtime/pjrt_computation_client.cc | 2 +- torch_xla/csrc/xla_sharding_util.cpp | 2 +- 6 files changed, 7 insertions(+), 4 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 585891e149b..423d1f8cd08 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -50,7 +50,8 @@ new_local_repository( # curl -L https://github.com/openxla/xla/archive/.tar.gz | sha256sum # and update the sha256 with the result. -xla_hash = '32ebd694c4d0442e241d76324ff1a721831366b4' +#xla_hash = '32ebd694c4d0442e241d76324ff1a721831366b4' +xla_hash = '06bbcd1a798cd49bb811674fbed8823dfef51cc4' http_archive( name = "xla", diff --git a/setup.py b/setup.py index 778daf0cf1c..e10ff470127 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ base_dir = os.path.dirname(os.path.abspath(__file__)) -_date = '20240913' +_date = '20240926' _libtpu_version = f'0.1.dev{_date}' _libtpu_storage_path = f'https://storage.googleapis.com/libtpu-nightly-releases/wheels/libtpu-nightly/libtpu_nightly-{_libtpu_version}+nightly-py3-none-any.whl' _jax_version = f'0.4.33.dev{_date}' diff --git a/torch_xla/csrc/BUILD b/torch_xla/csrc/BUILD index 89fefda457f..1287ffbde98 100644 --- a/torch_xla/csrc/BUILD +++ b/torch_xla/csrc/BUILD @@ -151,6 +151,7 @@ ptxla_cc_library( "@xla//xla/client/lib:slicing", "@xla//xla/client/lib:sorting", "@xla//xla/client/lib:svd", + "@xla//xla/hlo/pass:hlo_pass_pipeline", "@xla//xla/stream_executor:dnn", "@tsl//tsl/platform:errors", "@tsl//tsl/profiler/lib:traceme", diff --git a/torch_xla/csrc/runtime/ifrt_computation_client.h b/torch_xla/csrc/runtime/ifrt_computation_client.h index f712a30f221..fd34021393d 100644 --- a/torch_xla/csrc/runtime/ifrt_computation_client.h +++ b/torch_xla/csrc/runtime/ifrt_computation_client.h @@ -19,6 +19,7 @@ #include "xla/python/ifrt/hlo/hlo_program.h" #include "xla/python/pjrt_ifrt/pjrt_array.h" #include "xla/python/pjrt_ifrt/pjrt_client.h" +#include "xla/python/pjrt_ifrt/pjrt_dtype.h" #include "xla/python/pjrt_ifrt/xla_compiler.h" #include "xla/shape.h" diff --git a/torch_xla/csrc/runtime/pjrt_computation_client.cc b/torch_xla/csrc/runtime/pjrt_computation_client.cc index 280a733bebe..74403b88040 100644 --- a/torch_xla/csrc/runtime/pjrt_computation_client.cc +++ b/torch_xla/csrc/runtime/pjrt_computation_client.cc @@ -1025,7 +1025,7 @@ void PjRtComputationClient::RegisterCustomCall(const std::string& fn_name, args.function_name = fn_name.c_str(); args.function_name_size = fn_name.size(); args.api_version = 0; - args.custom_call_function = function_ptr; + args.handler_execute = function_ptr; PJRT_Error* error = reinterpret_cast(next)->custom_call(&args); if (error) { diff --git a/torch_xla/csrc/xla_sharding_util.cpp b/torch_xla/csrc/xla_sharding_util.cpp index e6a10c1740b..c48eba0e970 100644 --- a/torch_xla/csrc/xla_sharding_util.cpp +++ b/torch_xla/csrc/xla_sharding_util.cpp @@ -23,9 +23,9 @@ #include "tsl/profiler/lib/traceme.h" #include "xla/execution_options_util.h" #include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_pipeline.h" #include "xla/protobuf_util.h" #include "xla/service/hlo_parser.h" -#include "xla/service/hlo_pass_pipeline.h" #include "xla/service/hlo_verifier.h" #include "xla/service/sharding_propagation.h" #include "xla/service/spmd/spmd_partitioner.h" From 324cf2c6f968296da7222fc99971f6325e3b7a42 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Fri, 27 Sep 2024 21:08:03 +0000 Subject: [PATCH 02/10] hermetic --- .bazelrc | 5 +++++ WORKSPACE | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/.bazelrc b/.bazelrc index 694cf3fd125..92fa1a0f760 100644 --- a/.bazelrc +++ b/.bazelrc @@ -53,6 +53,11 @@ build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda build:cuda --define=xla_python_enable_gpu=true build:cuda --cxxopt=-DXLA_CUDA=1 +build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2" +build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.1.1" +build:cuda --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90" +build:cuda --@local_config_cuda//cuda:include_cuda_libs=true +test:cuda --@cuda_driver//:enable_forward_compatibility=true # Coverage with cuda/gcc/nvcc requires manually setting coverage flags. coverage:cuda --per_file_copt=third_party/.*,torch_xla/.*@--coverage diff --git a/WORKSPACE b/WORKSPACE index 423d1f8cd08..50e3ed01fbb 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -137,7 +137,56 @@ load("@xla//:workspace0.bzl", "xla_workspace0") xla_workspace0() -load("@tsl//third_party/gpus:cuda_configure.bzl", "cuda_configure") +load( + "@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl", + "cuda_json_init_repository", +) + +cuda_json_init_repository() + +load( + "@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl", + "cuda_json_init_repository", +) + +cuda_json_init_repository() + +load( + "@cuda_redist_json//:distributions.bzl", + "CUDA_REDISTRIBUTIONS", + "CUDNN_REDISTRIBUTIONS", +) +load( + "@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl", + "cuda_redist_init_repositories", + "cudnn_redist_init_repository", +) + +cuda_redist_init_repositories( + cuda_redistributions = CUDA_REDISTRIBUTIONS, +) + +cudnn_redist_init_repository( + cudnn_redistributions = CUDNN_REDISTRIBUTIONS, +) + +load( + "@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl", + "cuda_configure", +) + cuda_configure(name = "local_config_cuda") -load("@tsl//third_party/nccl:nccl_configure.bzl", "nccl_configure") + +load( + "@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl", + "nccl_redist_init_repository", +) + +nccl_redist_init_repository() + +load( + "@tsl//third_party/nccl/hermetic:nccl_configure.bzl", + "nccl_configure", +) + nccl_configure(name = "local_config_nccl") From 756be912d60d05d9ef91fd972942dc12a956420a Mon Sep 17 00:00:00 2001 From: Han Qi Date: Fri, 27 Sep 2024 21:15:31 +0000 Subject: [PATCH 03/10] use jax stable version --- WORKSPACE | 3 +-- setup.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 50e3ed01fbb..ccf62e4adf4 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -50,8 +50,7 @@ new_local_repository( # curl -L https://github.com/openxla/xla/archive/.tar.gz | sha256sum # and update the sha256 with the result. -#xla_hash = '32ebd694c4d0442e241d76324ff1a721831366b4' -xla_hash = '06bbcd1a798cd49bb811674fbed8823dfef51cc4' +xla_hash = '590cd6fcd1ed24ab9cf494789a0fc524b94a4a6a' http_archive( name = "xla", diff --git a/setup.py b/setup.py index e10ff470127..2488314d2ea 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ base_dir = os.path.dirname(os.path.abspath(__file__)) -_date = '20240926' +_date = '20240916' _libtpu_version = f'0.1.dev{_date}' _libtpu_storage_path = f'https://storage.googleapis.com/libtpu-nightly-releases/wheels/libtpu-nightly/libtpu_nightly-{_libtpu_version}+nightly-py3-none-any.whl' _jax_version = f'0.4.33.dev{_date}' From 172800bcd106cc5e24288a9ca6be40c1de38947c Mon Sep 17 00:00:00 2001 From: Han Qi Date: Fri, 27 Sep 2024 21:28:25 +0000 Subject: [PATCH 04/10] test --- .bazelrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bazelrc b/.bazelrc index 92fa1a0f760..2eff9d4b64b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -55,7 +55,7 @@ build:cuda --define=xla_python_enable_gpu=true build:cuda --cxxopt=-DXLA_CUDA=1 build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2" build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.1.1" -build:cuda --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90" +#build:cuda --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90" build:cuda --@local_config_cuda//cuda:include_cuda_libs=true test:cuda --@cuda_driver//:enable_forward_compatibility=true From 447487e341d816505fcd6c195b30e275b7cbddc4 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Fri, 27 Sep 2024 21:55:09 +0000 Subject: [PATCH 05/10] test --- infra/ansible/config/apt.yaml | 1 + infra/ansible/config/env.yaml | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/infra/ansible/config/apt.yaml b/infra/ansible/config/apt.yaml index a779d8b5ec5..f80256f3159 100644 --- a/infra/ansible/config/apt.yaml +++ b/infra/ansible/config/apt.yaml @@ -16,6 +16,7 @@ apt: - clang-format - gcc-10 - g++-10 + - clang-17 - lcov - less diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml index 909e96a5189..1bfe9d46b7f 100644 --- a/infra/ansible/config/env.yaml +++ b/infra/ansible/config/env.yaml @@ -3,10 +3,10 @@ release_env: common: # Force GCC because clang/bazel has issues. - CC: gcc-10 - CXX: g++-10 - # CC: "clang-{{ clang_version }}" - # CXX: "clang++-{{ clang_version }}" + # CC: gcc-10 + #CXX: g++-10 + CC: "clang" + CXX: "clang++" LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib" tpu: @@ -24,8 +24,10 @@ build_env: # Set explicitly to 0 as setup.py defaults this flag to true if unset. BUILD_CPP_TESTS: "{{ build_cpp_tests }}" # Force GCC because clang/bazel has issues. - CC: gcc-10 - CXX: g++-10 + # CC: gcc-10 + # CXX: g++-10 + CC: "clang" + CXX: "clang++" PYTORCH_BUILD_NUMBER: 1 TORCH_XLA_VERSION: "{{ package_version }}" PYTORCH_BUILD_VERSION: "{{ package_version }}" From 6e18028cbbdfe6a9f45b782936d82eedf12556c1 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 30 Sep 2024 18:14:59 +0000 Subject: [PATCH 06/10] clang --- .bazelrc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.bazelrc b/.bazelrc index 2eff9d4b64b..9c55f82d51f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -25,8 +25,8 @@ build -c opt build --config=short_logs # Force GCC because clang/bazel has issues. -build --action_env=CC=gcc -build --action_env=CXX=g++ +build --action_env=CC=clang +build --action_env=CXX=clang++ build --spawn_strategy=standalone ########################################################################### From 235c42f78ed68efe66506c331f4dc841ac955d1a Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 30 Sep 2024 18:44:49 +0000 Subject: [PATCH 07/10] clang --- infra/ansible/config/apt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/infra/ansible/config/apt.yaml b/infra/ansible/config/apt.yaml index f80256f3159..5edc4e40159 100644 --- a/infra/ansible/config/apt.yaml +++ b/infra/ansible/config/apt.yaml @@ -17,6 +17,7 @@ apt: - gcc-10 - g++-10 - clang-17 + - clang - lcov - less From ddc40c071d281479652739cecb6d94f623be36b5 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 30 Sep 2024 19:08:59 +0000 Subject: [PATCH 08/10] install clang --- .circleci/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/build.sh b/.circleci/build.sh index 8db3598f216..60f51121898 100755 --- a/.circleci/build.sh +++ b/.circleci/build.sh @@ -53,4 +53,6 @@ export BUILD_CPP_TESTS='1' export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES" build_torch_xla $XLA_DIR +apt-get install clang-17 + popd From 753198e60fcbe3d665b90a485d4b70a045293a06 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 30 Sep 2024 19:36:01 +0000 Subject: [PATCH 09/10] clang again --- .github/workflows/_build_plugin.yml | 2 ++ .github/workflows/_build_torch_xla.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml index 8e7eadbf632..4d2230afd7a 100644 --- a/.github/workflows/_build_plugin.yml +++ b/.github/workflows/_build_plugin.yml @@ -41,6 +41,8 @@ jobs: shell: bash run: | cd pytorch/xla/infra/ansible + apt install clang-17 + echo 'which clang++' ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5,8.6 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel uses: actions/upload-artifact@v4 diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml index 6df169ed5ac..163a8253a04 100644 --- a/.github/workflows/_build_torch_xla.yml +++ b/.github/workflows/_build_torch_xla.yml @@ -47,6 +47,8 @@ jobs: shell: bash run: | cd pytorch/xla/infra/ansible + apt install clang-17 + echo 'which clang++' ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel uses: actions/upload-artifact@v4 From dc2c5fe3f7a2af523f6c5364e0df83fc800c9267 Mon Sep 17 00:00:00 2001 From: Han Qi Date: Mon, 30 Sep 2024 22:57:22 +0000 Subject: [PATCH 10/10] clang --- .github/workflows/_build_plugin.yml | 4 ++-- .github/workflows/_build_torch_xla.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml index 4d2230afd7a..f95aaf03c9d 100644 --- a/.github/workflows/_build_plugin.yml +++ b/.github/workflows/_build_plugin.yml @@ -41,8 +41,8 @@ jobs: shell: bash run: | cd pytorch/xla/infra/ansible - apt install clang-17 - echo 'which clang++' + apt install -y clang + bazel clean --expunge || true ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5,8.6 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel uses: actions/upload-artifact@v4 diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml index 163a8253a04..5c7676f7395 100644 --- a/.github/workflows/_build_torch_xla.yml +++ b/.github/workflows/_build_torch_xla.yml @@ -47,8 +47,8 @@ jobs: shell: bash run: | cd pytorch/xla/infra/ansible - apt install clang-17 - echo 'which clang++' + apt install -y clang + bazel clean --expunge || true ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel uses: actions/upload-artifact@v4