Nightly Docker Release #5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Nightly Docker Release

# Triggers: one daily scheduled run plus a manual dispatch whose inputs can
# override the repositories, commits, base images and runner used by the job.
on:
  schedule:
    # 14:00 UTC, i.e. every day at 22:00 Beijing Time (UTC+8)
    - cron: '0 14 * * *'
  workflow_dispatch:
    inputs:
      # --- ATOM image build ---
      base_image:
        description: 'Base Docker image for ATOM build'
        type: choice
        default: 'rocm/pytorch:latest'
        options:
          - rocm/pytorch:latest
          - rocm/pytorch-nightly:latest
          - rocm/pytorch:rocm7.1.1_ubuntu24.04_py3.12_pytorch_release_2.10.0
      atom_repo:
        description: 'ATOM repo'
        default: 'https://github.com/ROCm/ATOM.git'
      atom_commit:
        description: 'ATOM commit'
        default: 'HEAD'
      aiter_repo:
        description: 'Aiter repo'
        default: 'https://github.com/ROCm/aiter.git'
      aiter_commit:
        description: 'Aiter commit'
        default: 'HEAD'
      rccl_repo:
        description: 'RCCL repo'
        default: 'https://github.com/ROCm/rccl.git'
      # NOTE(review): the default looks like a full commit SHA rather than a
      # branch name; quoted so it is always read as a string.
      rccl_branch:
        description: 'RCCL branch'
        default: '29e1567b95e28823b0beb1a988adc587bfab5b4f'

      # --- Execution options ---
      runner:
        description: 'Runner label to use'
        type: choice
        default: 'linux-atom-mi355-1'
        options:
          - linux-atom-mi355-1
          - build-only-atom
          - atom-mi355-8gpu.predownload
      skip_tests:
        description: 'Skip test step'
        type: boolean
        default: false

      # --- OOT vLLM image ---
      build_oot_image:
        description: 'Build OOT vLLM image for manual runs (scheduled nightly runs always build OOT)'
        type: boolean
        default: false
      oot_base_image:
        description: 'Base ATOM image for OOT vLLM (empty means the atom_release:ci image built in this job)'
        default: ''
      vllm_commit:
        description: 'vLLM commit for OOT image'
        default: 'b31e9326a7d9394aab8c767f8ebe225c65594b60'
      # Quoted so the version label stays a string.
      vllm_version:
        description: 'vLLM version label for OOT image tags'
        default: '0.17.0'
jobs:
  # Builds the ATOM image, optionally smoke-tests it against the golden
  # outputs, pushes it to rocm/atom-dev, then (on scheduled runs or when
  # requested) builds and pushes the OOT vLLM image and finally cleans up.
  docker-release:
    name: Nightly Docker Release
    # Falls back to the default runner label when no `runner` input is given.
    runs-on: ${{ inputs.runner || 'linux-atom-mi355-1' }}
    strategy:
      fail-fast: false
      matrix:
        base_image:
          - ${{ inputs.base_image || 'rocm/pytorch:latest' }}
    env:
      # Fallback values used wherever the matching workflow input is empty
      # (see the `inputs.<name> || env.<NAME>` expressions in the steps).
      ATOM_REPO: 'https://github.com/ROCm/ATOM.git'
      ATOM_COMMIT: 'HEAD'
      AITER_REPO: 'https://github.com/ROCm/aiter.git'
      AITER_COMMIT: 'HEAD'
      RCCL_REPO: 'https://github.com/ROCm/rccl.git'
      RCCL_BRANCH: '29e1567b95e28823b0beb1a988adc587bfab5b4f'
      # Forwarded to the image build as the GPU_ARCH build argument.
      GPU_ARCH: 'gfx942;gfx950'
      VLLM_COMMIT: 'b31e9326a7d9394aab8c767f8ebe225c65594b60'
      VLLM_VERSION: '0.17.0'
    steps:
      # Keeps an HF_TOKEN already present in the runner environment; otherwise
      # uses the AMD_HF_TOKEN secret. Exported for all later steps.
      - name: Set HF_TOKEN
        run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> $GITHUB_ENV

      - name: Checkout ATOM repo
        uses: actions/checkout@v6

      # Credentials are handed over as environment variables instead of being
      # pasted into the script text, so quotes, `$`, backticks or backslashes
      # in a secret cannot break (or be interpreted by) the shell command.
      - name: Login to Docker Hub
        env:
          DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin

      # Logs the effective repo/commit values (input if given, else default).
      - name: Echo environment variables
        run: |
          echo "ATOM_REPO: ${{ inputs.atom_repo || env.ATOM_REPO }}"
          echo "ATOM_COMMIT: ${{ inputs.atom_commit || env.ATOM_COMMIT }}"
          echo "AITER_REPO: ${{ inputs.aiter_repo || env.AITER_REPO }}"
          echo "AITER_COMMIT: ${{ inputs.aiter_commit || env.AITER_COMMIT }}"
          echo "RCCL_REPO: ${{ inputs.rccl_repo || env.RCCL_REPO }}"
          echo "RCCL_BRANCH: ${{ inputs.rccl_branch || env.RCCL_BRANCH }}"

      # Builds the `atom_image` target of docker/Dockerfile as atom_release:ci.
      - name: Build Docker image
        timeout-minutes: 180
        run: |
          # CACHEBUST receives the current epoch seconds, so its value differs
          # on every run.
          DOCKER_BUILDKIT=1 docker build --pull --network=host -t atom_release:ci \
            --target atom_image \
            --build-arg BASE_IMAGE="${{ matrix.base_image }}" \
            --build-arg GPU_ARCH="${{ env.GPU_ARCH }}" \
            --build-arg ATOM_REPO="${{ inputs.atom_repo || env.ATOM_REPO }}" \
            --build-arg ATOM_COMMIT="${{ inputs.atom_commit || env.ATOM_COMMIT }}" \
            --build-arg AITER_REPO="${{ inputs.aiter_repo || env.AITER_REPO }}" \
            --build-arg AITER_COMMIT="${{ inputs.aiter_commit || env.AITER_COMMIT }}" \
            --build-arg RCCL_REPO="${{ inputs.rccl_repo || env.RCCL_REPO }}" \
            --build-arg RCCL_BRANCH="${{ inputs.rccl_branch || env.RCCL_BRANCH }}" \
            --build-arg CACHEBUST=$(date +%s) \
            -f docker/Dockerfile .

          docker inspect atom_release:ci

      # Runs the simple_inference example inside the freshly built image and
      # compares its Prompt/Completion lines with the checked-in golden file.
      - name: Test Docker image
        if: ${{ inputs.skip_tests != true }}
        timeout-minutes: 60
        run: |
          echo "Clean up containers..."
          docker ps -aq -f name=atom_release_ci | xargs -r docker stop | xargs -r docker rm

          # Use the device flags provided in /etc/podinfo/gha-render-devices
          # when that file exists; otherwise expose /dev/dri.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi

          echo "Starting container: atom_release_ci (image: atom_release:ci)"
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
            --ipc=host --group-add video \
            --shm-size 16g \
            --cap-add=SYS_PTRACE \
            -e HF_TOKEN="${HF_TOKEN:-}" \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            --name atom_release_ci \
            atom_release:ci

          # Remove any cached Hugging Face token files, then keep only the
          # Prompt/Completion lines of the example's output.
          docker exec atom_release_ci bash -c \
            "rm -f ~/.cache/huggingface/token ~/.cache/huggingface/stored_tokens 2>/dev/null; \
            python3 -m atom.examples.simple_inference \
            --model meta-llama/Meta-Llama-3-8B-Instruct \
            --temperature 0 | grep -E '^Prompt: |^Completion:'" > atom_release_output.txt

          echo ""
          echo "========== Showing release output below =========="
          cat atom_release_output.txt

          echo ""
          echo "========== Comparing output with golden outputs =========="
          # Ignore whitespace and EOL differences in the diff.
          # diff's status is captured on the failure branch only: appending
          # `|| true` and reading $? afterwards would always yield 0 and hide
          # every mismatch, while a bare failing diff would abort an
          # errexit (`bash -e`) shell before the message below is printed.
          DIFF_EXIT_CODE=0
          diff -u -B -w --strip-trailing-cr \
            atom_release_output.txt .github/workflows/golden_outputs.txt \
            || DIFF_EXIT_CODE=$?
          if [ $DIFF_EXIT_CODE -ne 0 ]; then
            echo "Output does not match golden outputs."
            exit 1
          else
            echo "Output matches golden outputs."
          fi

      - name: Push Docker image
        if: ${{ success() }}
        run: |
          # Push both the versioned tag and update the 'latest' tag for the Docker image
          TAG=nightly_$(date +%Y%m%d%H%M)
          docker tag atom_release:ci rocm/atom-dev:latest
          docker push rocm/atom-dev:latest
          docker tag atom_release:ci rocm/atom-dev:${TAG}
          docker push rocm/atom-dev:${TAG}

      # Runs on every scheduled build and on manual runs that set
      # build_oot_image. Builds the `atom_oot` target as atom_oot_release:ci.
      - name: Build OOT Docker image
        if: ${{ success() && (github.event_name == 'schedule' || inputs.build_oot_image == true) }}
        timeout-minutes: 180
        run: |
          # An explicit oot_base_image is pulled first; otherwise the image
          # built earlier in this job (atom_release:ci) is used as the base.
          if [ -n "${{ inputs.oot_base_image }}" ]; then
            OOT_BASE_IMAGE="${{ inputs.oot_base_image }}"
            docker pull "${OOT_BASE_IMAGE}"
          else
            OOT_BASE_IMAGE="atom_release:ci"
          fi
          echo "Using OOT base image: ${OOT_BASE_IMAGE}"

          docker build --network=host -t atom_oot_release:ci \
            --target atom_oot \
            --build-arg OOT_BASE_IMAGE="${OOT_BASE_IMAGE}" \
            --build-arg MAX_JOBS=64 \
            --build-arg VLLM_COMMIT="${{ inputs.vllm_commit || env.VLLM_COMMIT }}" \
            --build-arg INSTALL_LM_EVAL=1 \
            --build-arg INSTALL_FASTSAFETENSORS=1 \
            -f docker/Dockerfile .

          docker inspect atom_oot_release:ci

      # Publishes the OOT image under a dated, versioned tag and `vllm-latest`.
      - name: Push OOT Docker image
        if: ${{ success() && (github.event_name == 'schedule' || inputs.build_oot_image == true) }}
        run: |
          VLLM_VER="${{ inputs.vllm_version || env.VLLM_VERSION }}"
          OOT_TAG="vllm-v${VLLM_VER}-nightly_$(date +%Y%m%d)"
          OOT_LATEST_TAG="vllm-latest"
          docker tag atom_oot_release:ci rocm/atom-dev:${OOT_TAG}
          docker push rocm/atom-dev:${OOT_TAG}
          docker tag atom_oot_release:ci rocm/atom-dev:${OOT_LATEST_TAG}
          docker push rocm/atom-dev:${OOT_LATEST_TAG}

      # Best-effort cleanup: runs even after a failure, and every command
      # tolerates the container or image not existing.
      - name: Clean Up
        if: always()
        run: |
          docker stop atom_release_ci || true
          docker rm atom_release_ci || true
          # Remove build and tagged images to free disk space
          docker rmi atom_release:ci || true
          docker rmi rocm/atom-dev:latest || true
          docker rmi atom_oot_release:ci || true
          # Remove nightly tagged image if it exists
          docker images "rocm/atom-dev:nightly_*" -q | xargs -r docker rmi || true
          docker images "rocm/atom-dev:vllm-v*-nightly_*" -q | xargs -r docker rmi || true
          docker rmi rocm/atom-dev:vllm-latest || true