Skip to content

Commit

Permalink
Separate build and test phase for triton
Browse files Browse the repository at this point in the history
  • Loading branch information
bhavya01 committed Apr 16, 2024
1 parent cb0bb85 commit 015b1ad
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 87 deletions.
7 changes: 1 addition & 6 deletions .circleci/triton.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,4 @@ python setup.py install
XLA_DIR=$PYTORCH_DIR/xla
export TF_CUDA_COMPUTE_CAPABILITIES="compute_86"
export XLA_CUDA=1
build_torch_xla $XLA_DIR

export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
export TRITON_PTXAS_PATH='/usr/local/cuda/bin/ptxas'
python3 $XLA_DIR/test/test_triton.py
build_torch_xla $XLA_DIR
6 changes: 3 additions & 3 deletions .github/workflows/_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
required: false
type: string
description: Runner type for the test
default: linux.24xlarge
default: linux.12xlarge
cuda:
required: false
type: string
Expand Down Expand Up @@ -68,8 +68,8 @@ jobs:
# if image layers are not present in the repo.
# Note: disable the following 2 lines while testing a new image, so we do not
# push to the upstream.
# docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
# docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
- name: Start the container
shell: bash
run: |
Expand Down
76 changes: 0 additions & 76 deletions .github/workflows/_triton.yml

This file was deleted.

44 changes: 42 additions & 2 deletions .github/workflows/triton.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ concurrency:
cancel-in-progress: true

jobs:
test-triton:
runs-on: linux.g5.4xlarge.nvidia.gpu
build-triton:
  # CPU-only build job: compiles torch_xla with Triton support inside the
  # base image, then publishes the resulting container as an image that the
  # GPU test job can pull.
  runs-on: linux.24xlarge
  timeout-minutes: 300
  # Step outputs are not visible to other jobs on their own; they must be
  # re-exported here so that `needs.build-triton.outputs.docker-image`
  # resolves to the image tag written by the `upload-docker-image` step.
  outputs:
    docker-image: ${{ steps.upload-docker-image.outputs.docker-image }}
  env:
    # Base image the build container is started from.
    DOCKER_IMAGE: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
    # Repository the committed build image is tagged with and pushed to.
    ECR_DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
    # Working directory used inside the container.
    WORKDIR: /triton_dir
  steps:
- name: Setup Linux
Expand Down Expand Up @@ -52,6 +53,45 @@ jobs:
shell: bash
run: |
docker exec --privileged -u jenkins "${pid}" bash -c ".circleci/triton.sh"
# Snapshot the build container (whose id is held in ${pid}) as an image tagged
# `triton-<commit sha>` under ECR_DOCKER_IMAGE_BASE, push it, and publish the
# full image reference as this step's `docker-image` output.
# NOTE(review): the test-triton job reads
# `needs.build-triton.outputs.docker-image`; a step output only reaches other
# jobs if the job maps it under its job-level `outputs:` - confirm that mapping
# exists for this step id.
- name: Push built docker image to ECR
id: upload-docker-image
shell: bash
run: |
export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:triton-${GITHUB_SHA}"
time docker commit "${pid}" "${COMMIT_DOCKER_IMAGE}"
time docker push "${COMMIT_DOCKER_IMAGE}"
echo "docker-image=${COMMIT_DOCKER_IMAGE}" >> "${GITHUB_OUTPUT}"
# Runs the shared teardown action; `if: always()` makes it execute even when
# an earlier step in the job failed or was cancelled.
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
test-triton:
  # GPU test job: pulls the image produced by build-triton and runs the
  # Triton test inside a container started from it.
  runs-on: linux.g5.4xlarge.nvidia.gpu
  timeout-minutes: 300
  needs: build-triton
  env:
    # Image reference published by the build job's `docker-image` output.
    DOCKER_IMAGE: ${{ needs.build-triton.outputs.docker-image }}
    # Working directory used inside the container.
    WORKDIR: /triton_dir
  steps:
    - name: Setup Linux
      uses: pytorch/test-infra/.github/actions/setup-linux@main
    - name: Setup SSH (Click me for login details)
      uses: pytorch/test-infra/.github/actions/setup-ssh@main
      with:
        github-secret: ${{ secrets.GITHUB_TOKEN }}
        instructions: |
          Tests are done inside the container, to start an interactive session run:
          docker exec -it $(docker container ps --format '{{.ID}}') bash
    # The image is the one the build job tagged with ECR_DOCKER_IMAGE_BASE,
    # so it is pulled from ECR rather than GCR.
    - name: Download and run docker image from ECR
      shell: bash
      run: |
        echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
        # Fail fast with a readable message if the build job did not export
        # the image reference, instead of an opaque `docker pull ""` error.
        if [[ -z "${DOCKER_IMAGE}" ]]; then
          echo "::error::DOCKER_IMAGE is empty; build-triton did not export a docker-image output"
          exit 1
        fi
        docker pull "${DOCKER_IMAGE}"
        # GPU_FLAG is intentionally unquoted so that a multi-word value is
        # split into separate docker arguments.
        # NOTE(review): GPU_FLAG is not set in this job's env; unless an
        # earlier step exports it, the container starts without GPU flags -
        # confirm.
        pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
        # Persist the container id for the following steps.
        echo "pid=${pid}" >> "${GITHUB_ENV}"
    - name: Test
      shell: bash
      run: |
        docker exec --privileged -u jenkins "${pid}" bash -c 'TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python test/test_triton.py'
    # `if: always()` runs the teardown even when a previous step failed.
    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always()

0 comments on commit 015b1ad

Please sign in to comment.