# Nightly Docker Release #5 (web page header captured with this workflow; not part of the YAML)
# Workflow triggers and manual-run inputs for the nightly ATOM Docker release.
name: Nightly Docker Release

on:
  schedule:
    # 14:00 UTC every day, i.e. 22:00 Beijing Time (UTC+8).
    - cron: '0 14 * * *'
  # Manual runs: each input is optional and falls back to the default shown.
  # On scheduled runs `inputs.*` is empty, so the job applies its own
  # fallbacks through `inputs.x || <fallback>` expressions.
  workflow_dispatch:
    inputs:
      base_image:
        description: "Base Docker image for ATOM build"
        type: choice
        default: "rocm/pytorch:latest"
        options:
          - rocm/pytorch:latest
          - rocm/pytorch-nightly:latest
          - rocm/pytorch:rocm7.1.1_ubuntu24.04_py3.12_pytorch_release_2.10.0
      atom_repo:
        description: "ATOM repo"
        type: string
        default: "https://github.com/ROCm/ATOM.git"
      atom_commit:
        description: "ATOM commit"
        type: string
        default: "HEAD"
      aiter_repo:
        description: "Aiter repo"
        type: string
        default: "https://github.com/ROCm/aiter.git"
      aiter_commit:
        description: "Aiter commit"
        type: string
        default: "HEAD"
      rccl_repo:
        description: "RCCL repo"
        type: string
        default: "https://github.com/ROCm/rccl.git"
      # NOTE(review): named "branch", but the default is a full commit SHA;
      # the value is forwarded unchanged as the RCCL_BRANCH build argument.
      rccl_branch:
        description: "RCCL branch"
        type: string
        default: "29e1567b95e28823b0beb1a988adc587bfab5b4f"
      runner:
        description: "Runner label to use"
        type: choice
        default: "linux-atom-mi355-1"
        options:
          - linux-atom-mi355-1
          - build-only-atom
          - atom-mi355-8gpu.predownload
      skip_tests:
        description: "Skip test step"
        type: boolean
        default: false
      build_oot_image:
        description: "Build OOT vLLM image for manual runs (scheduled nightly runs always build OOT)"
        type: boolean
        default: false
      oot_base_image:
        description: "Base ATOM image for OOT vLLM (empty means the atom_release:ci image built in this job)"
        type: string
        default: ""
      # Quoted so the SHA and the version label stay strings.
      vllm_commit:
        description: "vLLM commit for OOT image"
        type: string
        default: "b31e9326a7d9394aab8c767f8ebe225c65594b60"
      vllm_version:
        description: "vLLM version label for OOT image tags"
        type: string
        default: "0.17.0"
jobs:
  # Builds the ATOM image, smoke-tests it against golden outputs, pushes it,
  # optionally builds and pushes the OOT vLLM image on top of it, and finally
  # removes the containers and images it created on the runner.
  docker-release:
    name: Nightly Docker Release
    # Scheduled runs have no inputs, hence the literal fallback.
    runs-on: ${{ inputs.runner || 'linux-atom-mi355-1' }}
    strategy:
      fail-fast: false
      matrix:
        base_image:
          - ${{ inputs.base_image || 'rocm/pytorch:latest' }}
    # Fallback values used when the matching workflow_dispatch input is empty
    # (i.e. on scheduled runs). Keep them in sync with the input defaults.
    env:
      ATOM_REPO: "https://github.com/ROCm/ATOM.git"
      ATOM_COMMIT: "HEAD"
      AITER_REPO: "https://github.com/ROCm/aiter.git"
      AITER_COMMIT: "HEAD"
      RCCL_REPO: "https://github.com/ROCm/rccl.git"
      RCCL_BRANCH: "29e1567b95e28823b0beb1a988adc587bfab5b4f"
      GPU_ARCH: "gfx942;gfx950"
      VLLM_COMMIT: "b31e9326a7d9394aab8c767f8ebe225c65594b60"
      VLLM_VERSION: "0.17.0"
    steps:
      # Export HF_TOKEN for later steps: keep a value already present in the
      # runner environment, otherwise use the repository secret. The secret is
      # passed through `env` so it is never spliced into the script text.
      - name: Set HF_TOKEN
        env:
          AMD_HF_TOKEN: ${{ secrets.AMD_HF_TOKEN }}
        run: echo "HF_TOKEN=${HF_TOKEN:-${AMD_HF_TOKEN}}" >> "$GITHUB_ENV"

      - name: Checkout ATOM repo
        uses: actions/checkout@v6

      # Credentials go through `env` and are quoted, so special characters in
      # either value cannot break or alter the shell command.
      - name: Login to Docker Hub
        env:
          DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
        run: |
          echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin

      # Log the effective (input-or-fallback) source coordinates of the build.
      - name: Echo environment variables
        run: |
          echo "ATOM_REPO: ${{ inputs.atom_repo || env.ATOM_REPO }}"
          echo "ATOM_COMMIT: ${{ inputs.atom_commit || env.ATOM_COMMIT }}"
          echo "AITER_REPO: ${{ inputs.aiter_repo || env.AITER_REPO }}"
          echo "AITER_COMMIT: ${{ inputs.aiter_commit || env.AITER_COMMIT }}"
          echo "RCCL_REPO: ${{ inputs.rccl_repo || env.RCCL_REPO }}"
          echo "RCCL_BRANCH: ${{ inputs.rccl_branch || env.RCCL_BRANCH }}"

      # Build the `atom_image` target and tag it locally as atom_release:ci.
      # CACHEBUST receives the current epoch seconds on every run.
      - name: Build Docker image
        timeout-minutes: 180
        run: |
          DOCKER_BUILDKIT=1 docker build --pull --network=host -t atom_release:ci \
            --target atom_image \
            --build-arg BASE_IMAGE="${{ matrix.base_image }}" \
            --build-arg GPU_ARCH="${{ env.GPU_ARCH }}" \
            --build-arg ATOM_REPO="${{ inputs.atom_repo || env.ATOM_REPO }}" \
            --build-arg ATOM_COMMIT="${{ inputs.atom_commit || env.ATOM_COMMIT }}" \
            --build-arg AITER_REPO="${{ inputs.aiter_repo || env.AITER_REPO }}" \
            --build-arg AITER_COMMIT="${{ inputs.aiter_commit || env.AITER_COMMIT }}" \
            --build-arg RCCL_REPO="${{ inputs.rccl_repo || env.RCCL_REPO }}" \
            --build-arg RCCL_BRANCH="${{ inputs.rccl_branch || env.RCCL_BRANCH }}" \
            --build-arg CACHEBUST=$(date +%s) \
            -f docker/Dockerfile .
          docker inspect atom_release:ci

      # Smoke test: run the example inference inside the freshly built image
      # and compare its Prompt/Completion lines with the checked-in golden
      # file. Skipped only when the manual input `skip_tests` is true.
      - name: Test Docker image
        if: ${{ inputs.skip_tests != true }}
        timeout-minutes: 60
        run: |
          echo "Clean up containers..."
          docker ps -aq -f name=atom_release_ci | xargs -r docker stop | xargs -r docker rm
          # DEVICE_FLAG is expanded unquoted below on purpose: it holds one or
          # more whitespace-separated docker options.
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          echo "Starting container: atom_release_ci (image: atom_release:ci)"
          docker run -dt --device=/dev/kfd $DEVICE_FLAG \
            -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
            --ipc=host --group-add video \
            --shm-size 16g \
            --cap-add=SYS_PTRACE \
            -e HF_TOKEN="${HF_TOKEN:-}" \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            --name atom_release_ci \
            atom_release:ci
          docker exec atom_release_ci bash -c \
            "rm -f ~/.cache/huggingface/token ~/.cache/huggingface/stored_tokens 2>/dev/null; \
            python3 -m atom.examples.simple_inference \
            --model meta-llama/Meta-Llama-3-8B-Instruct \
            --temperature 0 | grep -E '^Prompt: |^Completion:'" > atom_release_output.txt
          echo ""
          echo "========== Showing release output below =========="
          cat atom_release_output.txt
          echo ""
          echo "========== Comparing output with golden outputs =========="
          # Ignore whitespace and EOL differences in the diff.
          # Record diff's exit status rather than discarding it: the former
          # `|| true` reset `$?` to 0, so a mismatch could never fail the step.
          # The `||` form still prevents the default errexit shell from
          # aborting before the result message is printed.
          DIFF_EXIT_CODE=0
          diff -u -B -w --strip-trailing-cr atom_release_output.txt .github/workflows/golden_outputs.txt || DIFF_EXIT_CODE=$?
          if [ "$DIFF_EXIT_CODE" -ne 0 ]; then
            echo "Output does not match golden outputs."
            exit 1
          else
            echo "Output matches golden outputs."
          fi

      - name: Push Docker image
        if: ${{ success() }}
        run: |
          # Push both the versioned tag and update the 'latest' tag for the Docker image
          TAG=nightly_$(date +%Y%m%d%H%M)
          docker tag atom_release:ci rocm/atom-dev:latest
          docker push rocm/atom-dev:latest
          docker tag atom_release:ci "rocm/atom-dev:${TAG}"
          docker push "rocm/atom-dev:${TAG}"

      # OOT vLLM image: always built on scheduled runs, and on manual runs
      # only when `build_oot_image` is ticked. An explicit `oot_base_image`
      # is pulled; otherwise the atom_release:ci image built above is used.
      - name: Build OOT Docker image
        if: ${{ success() && (github.event_name == 'schedule' || inputs.build_oot_image == true) }}
        timeout-minutes: 180
        run: |
          if [ -n "${{ inputs.oot_base_image }}" ]; then
            OOT_BASE_IMAGE="${{ inputs.oot_base_image }}"
            docker pull "${OOT_BASE_IMAGE}"
          else
            OOT_BASE_IMAGE="atom_release:ci"
          fi
          echo "Using OOT base image: ${OOT_BASE_IMAGE}"
          docker build --network=host -t atom_oot_release:ci \
            --target atom_oot \
            --build-arg OOT_BASE_IMAGE="${OOT_BASE_IMAGE}" \
            --build-arg MAX_JOBS=64 \
            --build-arg VLLM_COMMIT="${{ inputs.vllm_commit || env.VLLM_COMMIT }}" \
            --build-arg INSTALL_LM_EVAL=1 \
            --build-arg INSTALL_FASTSAFETENSORS=1 \
            -f docker/Dockerfile .
          docker inspect atom_oot_release:ci

      # Publish the OOT image under a dated, version-labelled tag and move
      # the rolling `vllm-latest` tag to it.
      - name: Push OOT Docker image
        if: ${{ success() && (github.event_name == 'schedule' || inputs.build_oot_image == true) }}
        run: |
          VLLM_VER="${{ inputs.vllm_version || env.VLLM_VERSION }}"
          OOT_TAG="vllm-v${VLLM_VER}-nightly_$(date +%Y%m%d)"
          OOT_LATEST_TAG="vllm-latest"
          docker tag atom_oot_release:ci "rocm/atom-dev:${OOT_TAG}"
          docker push "rocm/atom-dev:${OOT_TAG}"
          docker tag atom_oot_release:ci "rocm/atom-dev:${OOT_LATEST_TAG}"
          docker push "rocm/atom-dev:${OOT_LATEST_TAG}"

      # Best-effort teardown that runs even after failures; every command is
      # allowed to fail because the container or image may not exist.
      - name: Clean Up
        if: always()
        run: |
          docker stop atom_release_ci || true
          docker rm atom_release_ci || true
          # Remove build and tagged images to free disk space
          docker rmi atom_release:ci || true
          docker rmi rocm/atom-dev:latest || true
          docker rmi atom_oot_release:ci || true
          # Remove nightly tagged image if it exists
          docker images "rocm/atom-dev:nightly_*" -q | xargs -r docker rmi || true
          docker images "rocm/atom-dev:vllm-v*-nightly_*" -q | xargs -r docker rmi || true
          docker rmi rocm/atom-dev:vllm-latest || true