# Workflow file captured from the GitHub Actions web UI
# ("Workflow file for this run") for:
#   [Diffusion] Enable vLLM-Omni Plugin for Diffusion Model (#6)
---
name: ATOM Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]  # Triggers on PRs targeting `main`
    types: [opened, synchronize, reopened, ready_for_review]
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  workflow_dispatch:
    inputs:
      aiter_branch:
        description: 'ROCm/aiter branch to build inside the CI image'
        required: false
        default: 'main'
        type: string

# One run per ref; in-flight runs are cancelled on non-main refs only.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  ATOM_BASE_IMAGE: rocm/atom-dev:latest
  # For PRs, build from the PR head repo/sha; otherwise fall back to upstream.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
  # workflow_dispatch: inputs.aiter_branch; otherwise main (matches previous default-branch shallow clone)
  AITER_GIT_REF: ${{ github.event_name == 'workflow_dispatch' && inputs.aiter_branch || 'main' }}

jobs:
  # Gate: verify the pre-checkin signal before running any expensive GPU jobs.
  check-signal:
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    name: Check Pre Checkin Signal
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
    steps:
      - name: Checkout ATOM repo
        if: ${{ github.event_name != 'workflow_dispatch' }}
        uses: actions/checkout@v6
      - name: Download and check pre-checkin signal
        if: ${{ github.event_name != 'workflow_dispatch' }}
        run: bash ./.github/scripts/check_signal.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_SHA: ${{ github.sha }}

  # Fetch the most recent non-expired aiter wheel artifact from ROCm/aiter's
  # "Aiter Test" workflow and re-publish it as a short-lived artifact of this run.
  download_aiter_wheel:
    if: ${{ needs.check-signal.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }}
    needs: [check-signal]
    name: Download aiter wheel
    runs-on: ubuntu-latest
    steps:
      - name: Find and download latest aiter wheel
        run: |
          set -euo pipefail
          echo "=== Finding latest aiter-whl-main artifact from ROCm/aiter ==="
          API_URL="https://api.github.com"
          AUTH_HEADER="Authorization: token ${{ secrets.GITHUB_TOKEN }}"
          AITER_TEST_WORKFLOW_ID=179476100
          RUNS=$(curl -s -H "$AUTH_HEADER" \
            "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push")
          ARTIFACT_ID=""
          ARTIFACT_NAME=""
          # Walk runs newest-first; take the first run that still has a live artifact.
          for RUN_ID in $(echo "$RUNS" | jq -r '.workflow_runs[].id'); do
            ARTIFACT_JSON=$(curl -s -H "$AUTH_HEADER" \
              "$API_URL/repos/ROCm/aiter/actions/runs/$RUN_ID/artifacts" \
              | jq '[.artifacts[] | select(.name | startswith("aiter-whl-main")) | select(.expired == false)] | first')
            if [ "$ARTIFACT_JSON" != "null" ] && [ -n "$ARTIFACT_JSON" ]; then
              ARTIFACT_ID=$(echo "$ARTIFACT_JSON" | jq -r '.id')
              ARTIFACT_NAME=$(echo "$ARTIFACT_JSON" | jq -r '.name')
              echo "Found artifact in run $RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID)"
              break
            fi
          done
          if [ -z "$ARTIFACT_ID" ] || [ "$ARTIFACT_ID" = "null" ]; then
            echo "ERROR: No aiter-whl-main artifact found in recent Aiter Test runs"
            exit 1
          fi
          echo "=== Downloading artifact ==="
          mkdir -p aiter-whl
          curl -s -L -H "$AUTH_HEADER" \
            "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \
            -o aiter-whl.zip
          unzip -o aiter-whl.zip -d aiter-whl
          rm -f aiter-whl.zip
          AITER_WHL=$(ls -t aiter-whl/amd_aiter*.whl 2>/dev/null | head -1)
          if [ -z "$AITER_WHL" ]; then
            echo "ERROR: No amd_aiter wheel found in artifact"
            ls -la aiter-whl/
            exit 1
          fi
          echo "Downloaded wheel: $AITER_WHL"
      - name: Upload aiter wheel
        uses: actions/upload-artifact@v4
        with:
          name: aiter-whl
          path: aiter-whl/amd_aiter*.whl
          retention-days: 1

  # Produce the test matrix: filter models_accuracy.json by the test level that
  # matches the triggering event (pr < main < nightly).
  load-test-models:
    name: Load test model configs
    runs-on: ubuntu-latest
    outputs:
      models_json: ${{ steps.load.outputs.models_json }}
    steps:
      - uses: actions/checkout@v6
      - id: load
        env:
          EVENT_NAME: ${{ github.event_name }}
        run: |
          python3 << 'PY'
          import json, os
          event = os.environ["EVENT_NAME"]
          # pr → pr models only; push to main → pr+main; schedule/dispatch → all
          level_map = {"schedule": "nightly", "workflow_dispatch": "nightly", "push": "main"}
          current = level_map.get(event, "pr")
          allowed = {"pr": {"pr"}, "main": {"pr", "main"}, "nightly": {"pr", "main", "nightly"}}[current]
          models = json.load(open(".github/benchmark/models_accuracy.json", encoding="utf-8"))
          filtered = [m for m in models if m.get("test_level", "nightly") in allowed]
          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
              f.write(f"models_json={json.dumps(filtered)}\n")
          print(f"Event={event} level={current}: {len(filtered)}/{len(models)} models")
          print(f"{'Model':<45} {'Level':<10} {'Runner'}")
          print("-" * 80)
          for m in models:
              enabled = "✓" if m in filtered else "·"
              print(f" {enabled} {m['model_name']:<43} {m.get('test_level','?'):<10} {m['runner']}")
          PY

  # Main test job: one matrix entry per model config; runs inference/accuracy
  # inside a ROCm container on the model's designated GPU runner.
  atom-test:
    needs: [download_aiter_wheel, load-test-models]
    name: ATOM Test
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.load-test-models.outputs.models_json) }}
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    runs-on: ${{ matrix.runner }}
    env:
      CONTAINER_NAME: atom_test_${{ strategy.job-index }}
    steps:
      - name: Set HF_TOKEN
        run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> $GITHUB_ENV
      - name: Kill all Docker containers and clean up workspace
        if: matrix.runner == 'atom-mi355-8gpu.predownload'
        run: |
          echo "=== Cleaning up containers on $(hostname) ==="
          containers=$(docker ps -q)
          if [ -n "$containers" ]; then
            docker kill $containers || true
          fi
          # Clean via a root container: leftover __pycache__ files are root-owned.
          docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true
      - name: Show Docker containers
        if: matrix.runner == 'atom-mi355-8gpu.predownload'
        run: docker ps -a
      - name: Show ROCm memory usage
        if: matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showmemuse
      - name: Show ROCm GPU processes
        if: matrix.runner == 'atom-mi355-8gpu.predownload'
        run: rocm-smi --showpidgpus
      - name: Checkout ATOM repo
        uses: actions/checkout@v6
      - name: Docker Login
        if: ${{ !github.event.pull_request.head.repo.fork }}
        run: |
          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u ${{ secrets.DOCKER_USERNAME }} --password-stdin
      # FIX: the FROM line previously referenced env.ATOM_BASE_NIGHTLY_IMAGE, which
      # is never declared (top-level env only defines ATOM_BASE_IMAGE) and would
      # render an empty FROM. Use the declared variable.
      - name: Generate Dockerfile for forked repo
        if: ${{ github.event.pull_request.head.repo.fork }}
        run: |
          cat <<EOF > Dockerfile.mod
          FROM ${{ env.ATOM_BASE_IMAGE }}
          RUN pip install -U lm-eval[api]
          RUN pip show lm-eval || true
          RUN pip install hf_transfer
          RUN pip show hf_transfer || true
          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
          RUN pip uninstall -y amd-aiter
          RUN pip install --upgrade "pybind11>=3.0.1"
          RUN pip show pybind11
          RUN rm -rf /app/aiter-test
          RUN git clone --depth 1 -b ${{ env.AITER_GIT_REF }} https://github.com/ROCm/aiter.git /app/aiter-test && \\
              cd /app/aiter-test && \\
              git submodule sync && git submodule update --init --recursive && \\
              MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
          RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
          RUN pip uninstall -y atom
          RUN rm -rf /app/ATOM
          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
              cd /app/ATOM && \\
              git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
              pip install -e .
          RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
          EOF
      - name: Download aiter wheel
        uses: actions/download-artifact@v4
        with:
          name: aiter-whl
          path: /tmp/aiter-whl
      - name: Start CI container
        run: |
          echo "Clean up containers..."
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true
          (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true
          if [ -f "/etc/podinfo/gha-render-devices" ]; then
            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
          else
            DEVICE_FLAG="--device /dev/dri"
          fi
          if [ -d "/models" ]; then
            MODEL_MOUNT="-v /models:/models"
          else
            echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization."
            MODEL_MOUNT=""
          fi
          # Write env_vars via env block (avoids expression injection)
          printenv MODEL_ENV_VARS | grep -v '^$' > /tmp/env_file.txt || true
          IMAGE_TAG=${{ env.ATOM_BASE_IMAGE }}
          echo "Starting container with image: $IMAGE_TAG"
          echo "Model-specific environment variables:"
          cat /tmp/env_file.txt
          # FIX: the /workspace volume and -w workdir were previously passed twice
          # (via GITHUB_WORKSPACE and again via the github.workspace expression);
          # a single mount/workdir pair is sufficient.
          docker run -dt --pull always --device=/dev/kfd $DEVICE_FLAG \
            -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \
            $MODEL_MOUNT \
            -w /workspace \
            --ipc=host --group-add video \
            --shm-size=16G \
            --privileged \
            --cap-add=SYS_PTRACE \
            -e HF_TOKEN="${HF_TOKEN:-}" \
            --env-file /tmp/env_file.txt \
            --security-opt seccomp=unconfined \
            --ulimit memlock=-1 \
            --ulimit stack=67108864 \
            -e ATOM_DISABLE_MMAP=true \
            --name "$CONTAINER_NAME" \
            $IMAGE_TAG
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
          MODEL_ENV_VARS: ${{ matrix.env_vars }}
      - name: Check shm size
        run: |
          docker exec "$CONTAINER_NAME" df -h /dev/shm
      - name: Install aiter from wheel
        run: |
          AITER_WHL=$(ls -t /tmp/aiter-whl/amd_aiter*.whl 2>/dev/null | head -1)
          if [ -z "$AITER_WHL" ]; then
            echo "ERROR: No amd_aiter wheel found"
            ls -la /tmp/aiter-whl/
            exit 1
          fi
          echo "=== Copying wheel into container ==="
          WHL_NAME=$(basename "$AITER_WHL")
          docker cp "$AITER_WHL" "$CONTAINER_NAME:/tmp/$WHL_NAME"
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            echo '=== Uninstalling existing amd-aiter ==='
            pip uninstall -y amd-aiter || true
            echo '=== Installing amd-aiter from wheel ==='
            pip install /tmp/$WHL_NAME
            echo '=== Installed amd-aiter version ==='
            pip show amd-aiter
          "
      - name: Install ATOM and dependencies
        run: |
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            pip install --timeout 60 --retries 10 -U 'lm-eval[api]'
            pip install --timeout 60 --retries 10 hf_transfer
            pip install --timeout 60 --retries 10 --upgrade 'pybind11>=3.0.1'
            echo '=== Installing ATOM ==='
            cd /workspace
            git config --global --add safe.directory /workspace
            pip install -e .
            echo '=== Installed package versions ==='
            pip show amd-aiter | grep -E '^(Name|Version):'
            pip show atom | grep -E '^(Name|Version):'
            pip show triton | grep -E '^(Name|Version):'
            pip show torch | grep -E '^(Name|Version):'
          "
      - name: Download models
        run: |
          if [ -d "/models" ]; then
            echo "/models directory found, downloading model to /models/${{ matrix.model_path }}"
            if ! docker exec -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"; then
              echo "Model download failed for '${{ matrix.model_path }}'. Aborting."
              exit 1
            fi
          else
            echo "/models directory not found, skipping model download"
          fi
      - name: Run ATOM simple inference
        # Skip simple inference; accuracy test already validates correctness
        if: false
        timeout-minutes: 30
        run: |
          # Run the inference and capture output
          set -euo pipefail
          echo ""
          echo "========== Running test =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          echo "Model path: $model_path"
          ls -la $model_path || true
          # Print debug logs
          echo "========= Runner debug logs ==============="
          ps aux
          rocm-smi --showmemuse
          rocm-smi --showpids
          docker ps -a
          echo "========= End runner debug logs ==============="
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            python3 -m atom.examples.simple_inference \
              --model \"$model_path\" \
              ${{ matrix.extraArgs }} \
              --temperature 0 \
              | grep -E '^Prompt: |^Completion:'
          " > atom_test_output.txt
          echo ""
          echo "========== Showing test output below =========="
          cat atom_test_output.txt
      - name: Compare output with golden outputs
        if: false
        timeout-minutes: 30
        # TODO: skip for all test until it's fixed
        run: |
          echo "========== Comparing output with golden outputs =========="
          if ! diff -u -B -w --strip-trailing-cr \
            atom_test_output.txt \
            ".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"; then
            echo "Failed: Output does not match golden outputs."
            exit 1
          else
            echo "Success: Output matches golden outputs."
          fi
      - name: Run ATOM accuracy test
        timeout-minutes: 30
        run: |
          set -euo pipefail
          echo ""
          echo "========== Launching ATOM server =========="
          if [ -d "/models" ]; then
            model_path="/models/${{ matrix.model_path }}"
          else
            model_path="${{ matrix.model_path }}"
          fi
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }}
          "
          echo ""
          echo "========== Running accuracy test =========="
          docker exec "$CONTAINER_NAME" bash -lc "
            .github/scripts/atom_test.sh accuracy $model_path
          " 2>&1 | tee atom_accuracy_output.txt
      - name: Check accuracy test results
        if: success()
        env:
          MODEL_NAME: ${{ matrix.model_name }}
        run: |
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then
            echo "ERROR: No results JSON file found in accuracy_test_results/"
            exit 2
          else
            echo "RESULT_FILE: $result_file"
          fi
          flexible_extract_value=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file")
          echo "Flexible extract value: $flexible_extract_value"
          # Read threshold from models_accuracy.json (via env var to avoid shell injection)
          threshold=$(python3 -c "
          import json, os
          models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8'))
          name = os.environ['MODEL_NAME']
          t = next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0)
          print(t)
          ")
          echo "Accuracy test threshold: $threshold"
          # awk handles the float comparison (shell test only does integers)
          result=$(awk -v val="$flexible_extract_value" -v threshold="$threshold" 'BEGIN {print (val < threshold) ? 1 : 0}')
          if [ "$result" -eq 1 ]; then
            echo "Accuracy test failed: $flexible_extract_value < $threshold"
            exit 1
          else
            echo "Accuracy test passed: $flexible_extract_value >= $threshold"
          fi
      - name: Collect Test Summary
        if: success()
        env:
          MODEL_NAME: ${{ matrix.model_name }}
        run: |
          # Read threshold and score for summary
          threshold=$(python3 -c "
          import json, os
          models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8'))
          name = os.environ['MODEL_NAME']
          print(next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0))
          ")
          result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1)
          score=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file" 2>/dev/null || echo "N/A")
          echo "Accuracy Test Summary for ${{ matrix.model_name }} (threshold: ${threshold}, score: ${score}):" >> $GITHUB_STEP_SUMMARY
          awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> $GITHUB_STEP_SUMMARY
      # FIX: upload-artifact@v7 does not exist (latest major is v4; the
      # "Upload aiter wheel" step above already uses v4).
      - name: Upload output
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model_name }}_atom_test_output.txt
          path: atom_test_output.txt
      - name: Upload accuracy results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: accuracy-${{ matrix.model_name }}
          path: accuracy_test_results/*.json
          if-no-files-found: ignore
      - name: Clean Up
        if: always()
        run: |
          # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root.
          # We should use non-root user to run the test to avoid this issue.
          set -x
          echo "========== Cleaning up workspace =========="
          # Quoted comparison: matrix.runner could contain spaces/globs.
          if [[ "${{ matrix.runner }}" == "atom-mi355-8gpu.predownload" ]]; then
            docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true
          fi
          docker stop "$CONTAINER_NAME" || true
          docker rm "$CONTAINER_NAME" || true
          # Remove the pre-built image to free disk space on the runner
          docker rmi "rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}" || true

  # ---------- Push accuracy data to benchmark dashboard ----------
  accuracy-dashboard:
    name: Update accuracy dashboard
    needs: [atom-test]
    if: always() && github.ref == 'refs/heads/main' && (github.event_name == 'push' || github.event_name == 'schedule')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      # FIX: download-artifact@v8 does not exist (latest major is v4; the
      # atom-test job already uses v4).
      - name: Download accuracy artifacts
        uses: actions/download-artifact@v4
        with:
          path: /tmp/accuracy-results
          pattern: accuracy-*
      - name: List downloaded artifacts
        run: |
          echo "=== Downloaded accuracy artifacts ==="
          find /tmp/accuracy-results -type f -name '*.json' | head -20 || echo "No JSON files found"
      - name: Transform accuracy results for dashboard
        run: |
          python3 .github/scripts/accuracy_to_dashboard.py \
            /tmp/accuracy-results \
            --output accuracy-benchmark-input.json \
            --models .github/benchmark/models_accuracy.json \
            --run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          echo "=== Generated entries ==="
          cat accuracy-benchmark-input.json
      - name: Store accuracy result to dashboard
        if: hashFiles('accuracy-benchmark-input.json') != ''
        uses: benchmark-action/github-action-benchmark@v1
        with:
          tool: customBiggerIsBetter
          output-file-path: accuracy-benchmark-input.json
          gh-pages-branch: gh-pages
          benchmark-data-dir-path: benchmark-dashboard
          auto-push: true
          max-items-in-chart: 90
          github-token: ${{ secrets.GITHUB_TOKEN }}