[Diffusion] Enable vLLM-Omni Plugin for Diffusion Model #6
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: ATOM Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]  # Triggers on PRs targeting `main`
    types: [opened, synchronize, reopened, ready_for_review]
    # Documentation-only changes do not need a GPU test run.
    paths-ignore:
      - '**/*.md'
      - 'docs/**'
      - 'LICENSE'
      - '.gitignore'
  schedule:
    # Nightly at 00:00 Beijing time (16:00 UTC)
    - cron: '0 16 * * *'
  workflow_dispatch:
    inputs:
      aiter_branch:
        description: 'ROCm/aiter branch to build inside the CI image'
        required: false
        default: 'main'
        type: string

# One active run per ref; in-flight runs are cancelled except on main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  ATOM_BASE_IMAGE: rocm/atom-dev:latest
  # For PRs, clone the contributor's repo; otherwise the canonical repo.
  GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/ATOM.git' }}
  GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }}
  # workflow_dispatch: inputs.aiter_branch; otherwise main (matches previous default-branch shallow clone)
  AITER_GIT_REF: ${{ github.event_name == 'workflow_dispatch' && inputs.aiter_branch || 'main' }}
jobs:
  # Gate job: verify the pre-checkin signal before any heavy GPU jobs start.
  check-signal:
    name: Check Pre Checkin Signal
    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
    steps:
      # Manual dispatch runs bypass the signal check entirely.
      - name: Checkout ATOM repo
        if: ${{ github.event_name != 'workflow_dispatch' }}
        uses: actions/checkout@v6
      - name: Download and check pre-checkin signal
        if: ${{ github.event_name != 'workflow_dispatch' }}
        run: bash ./.github/scripts/check_signal.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_SHA: ${{ github.sha }}
| download_aiter_wheel: | |
| if: ${{ needs.check-signal.result == 'success' && (!github.event.pull_request || github.event.pull_request.draft == false) }} | |
| needs: [check-signal] | |
| name: Download aiter wheel | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Find and download latest aiter wheel | |
| run: | | |
| set -euo pipefail | |
| echo "=== Finding latest aiter-whl-main artifact from ROCm/aiter ===" | |
| API_URL="https://api.github.com" | |
| AUTH_HEADER="Authorization: token ${{ secrets.GITHUB_TOKEN }}" | |
| AITER_TEST_WORKFLOW_ID=179476100 | |
| RUNS=$(curl -s -H "$AUTH_HEADER" \ | |
| "$API_URL/repos/ROCm/aiter/actions/workflows/$AITER_TEST_WORKFLOW_ID/runs?per_page=100&branch=main&event=push") | |
| ARTIFACT_ID="" | |
| ARTIFACT_NAME="" | |
| for RUN_ID in $(echo "$RUNS" | jq -r '.workflow_runs[].id'); do | |
| ARTIFACT_JSON=$(curl -s -H "$AUTH_HEADER" \ | |
| "$API_URL/repos/ROCm/aiter/actions/runs/$RUN_ID/artifacts" \ | |
| | jq '[.artifacts[] | select(.name | startswith("aiter-whl-main")) | select(.expired == false)] | first') | |
| if [ "$ARTIFACT_JSON" != "null" ] && [ -n "$ARTIFACT_JSON" ]; then | |
| ARTIFACT_ID=$(echo "$ARTIFACT_JSON" | jq -r '.id') | |
| ARTIFACT_NAME=$(echo "$ARTIFACT_JSON" | jq -r '.name') | |
| echo "Found artifact in run $RUN_ID: $ARTIFACT_NAME (ID: $ARTIFACT_ID)" | |
| break | |
| fi | |
| done | |
| if [ -z "$ARTIFACT_ID" ] || [ "$ARTIFACT_ID" = "null" ]; then | |
| echo "ERROR: No aiter-whl-main artifact found in recent Aiter Test runs" | |
| exit 1 | |
| fi | |
| echo "=== Downloading artifact ===" | |
| mkdir -p aiter-whl | |
| curl -s -L -H "$AUTH_HEADER" \ | |
| "$API_URL/repos/ROCm/aiter/actions/artifacts/$ARTIFACT_ID/zip" \ | |
| -o aiter-whl.zip | |
| unzip -o aiter-whl.zip -d aiter-whl | |
| rm -f aiter-whl.zip | |
| AITER_WHL=$(ls -t aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) | |
| if [ -z "$AITER_WHL" ]; then | |
| echo "ERROR: No amd_aiter wheel found in artifact" | |
| ls -la aiter-whl/ | |
| exit 1 | |
| fi | |
| echo "Downloaded wheel: $AITER_WHL" | |
| - name: Upload aiter wheel | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: aiter-whl | |
| path: aiter-whl/amd_aiter*.whl | |
| retention-days: 1 | |
| load-test-models: | |
| name: Load test model configs | |
| runs-on: ubuntu-latest | |
| outputs: | |
| models_json: ${{ steps.load.outputs.models_json }} | |
| steps: | |
| - uses: actions/checkout@v6 | |
| - id: load | |
| env: | |
| EVENT_NAME: ${{ github.event_name }} | |
| run: | | |
| python3 << 'PY' | |
| import json, os | |
| event = os.environ["EVENT_NAME"] | |
| # pr → pr models only; push to main → pr+main; schedule/dispatch → all | |
| level_map = {"schedule": "nightly", "workflow_dispatch": "nightly", "push": "main"} | |
| current = level_map.get(event, "pr") | |
| allowed = {"pr": {"pr"}, "main": {"pr", "main"}, "nightly": {"pr", "main", "nightly"}}[current] | |
| models = json.load(open(".github/benchmark/models_accuracy.json", encoding="utf-8")) | |
| filtered = [m for m in models if m.get("test_level", "nightly") in allowed] | |
| with open(os.environ["GITHUB_OUTPUT"], "a") as f: | |
| f.write(f"models_json={json.dumps(filtered)}\n") | |
| print(f"Event={event} level={current}: {len(filtered)}/{len(models)} models") | |
| print(f"{'Model':<45} {'Level':<10} {'Runner'}") | |
| print("-" * 80) | |
| for m in models: | |
| enabled = "✓" if m in filtered else "·" | |
| print(f" {enabled} {m['model_name']:<43} {m.get('test_level','?'):<10} {m['runner']}") | |
| PY | |
| atom-test: | |
| needs: [download_aiter_wheel, load-test-models] | |
| name: ATOM Test | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: ${{ fromJson(needs.load-test-models.outputs.models_json) }} | |
| if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} | |
| runs-on: ${{ matrix.runner }} | |
| env: | |
| CONTAINER_NAME: atom_test_${{ strategy.job-index }} | |
| steps: | |
| - name: Set HF_TOKEN | |
| run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> $GITHUB_ENV | |
| - name: Kill all Docker containers and clean up workspace | |
| if: matrix.runner == 'atom-mi355-8gpu.predownload' | |
| run: | | |
| echo "=== Cleaning up containers on $(hostname) ===" | |
| containers=$(docker ps -q) | |
| if [ -n "$containers" ]; then | |
| docker kill $containers || true | |
| fi | |
| docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true | |
| - name: Show Docker containers | |
| if: matrix.runner == 'atom-mi355-8gpu.predownload' | |
| run: docker ps -a | |
| - name: Show ROCm memory usage | |
| if: matrix.runner == 'atom-mi355-8gpu.predownload' | |
| run: rocm-smi --showmemuse | |
| - name: Show ROCm GPU processes | |
| if: matrix.runner == 'atom-mi355-8gpu.predownload' | |
| run: rocm-smi --showpidgpus | |
| - name: Checkout ATOM repo | |
| uses: actions/checkout@v6 | |
| - name: Docker Login | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u ${{ secrets.DOCKER_USERNAME }} --password-stdin | |
| - name: Generate Dockerfile for forked repo | |
| if: ${{ github.event.pull_request.head.repo.fork }} | |
| run: | | |
| cat <<EOF > Dockerfile.mod | |
| FROM ${{ env.ATOM_BASE_NIGHTLY_IMAGE }} | |
| RUN pip install -U lm-eval[api] | |
| RUN pip show lm-eval || true | |
| RUN pip install hf_transfer | |
| RUN pip show hf_transfer || true | |
| RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true | |
| RUN pip uninstall -y amd-aiter | |
| RUN pip install --upgrade "pybind11>=3.0.1" | |
| RUN pip show pybind11 | |
| RUN rm -rf /app/aiter-test | |
| RUN git clone --depth 1 -b ${{ env.AITER_GIT_REF }} https://github.com/ROCm/aiter.git /app/aiter-test && \\ | |
| cd /app/aiter-test && \\ | |
| git submodule sync && git submodule update --init --recursive && \\ | |
| MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop | |
| RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true | |
| RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true | |
| RUN pip uninstall -y atom | |
| RUN rm -rf /app/ATOM | |
| RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\ | |
| cd /app/ATOM && \\ | |
| git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\ | |
| pip install -e . | |
| RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true | |
| EOF | |
| - name: Download aiter wheel | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: aiter-whl | |
| path: /tmp/aiter-whl | |
| - name: Start CI container | |
| run: | | |
| echo "Clean up containers..." | |
| (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker stop) || true | |
| (docker ps -aq -f name="^${CONTAINER_NAME}$" | xargs -r docker rm) || true | |
| if [ -f "/etc/podinfo/gha-render-devices" ]; then | |
| DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) | |
| else | |
| DEVICE_FLAG="--device /dev/dri" | |
| fi | |
| if [ -d "/models" ]; then | |
| MODEL_MOUNT="-v /models:/models" | |
| else | |
| echo "Warning: /models directory not found on runner; skipping /models mount and disabling model pre-download optimization." | |
| MODEL_MOUNT="" | |
| fi | |
| # Write env_vars via env block (avoids expression injection) | |
| printenv MODEL_ENV_VARS | grep -v '^$' > /tmp/env_file.txt || true | |
| IMAGE_TAG=${{ env.ATOM_BASE_IMAGE }} | |
| echo "Starting container with image: $IMAGE_TAG" | |
| echo "Model-specific environment variables:" | |
| cat /tmp/env_file.txt | |
| docker run -dt --pull always --device=/dev/kfd $DEVICE_FLAG \ | |
| -v "${GITHUB_WORKSPACE:-$PWD}":/workspace \ | |
| $MODEL_MOUNT \ | |
| -w /workspace \ | |
| --ipc=host --group-add video \ | |
| --shm-size=16G \ | |
| --privileged \ | |
| --cap-add=SYS_PTRACE \ | |
| -e HF_TOKEN="${HF_TOKEN:-}" \ | |
| --env-file /tmp/env_file.txt \ | |
| --security-opt seccomp=unconfined \ | |
| --ulimit memlock=-1 \ | |
| --ulimit stack=67108864 \ | |
| -e ATOM_DISABLE_MMAP=true \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| --name "$CONTAINER_NAME" \ | |
| $IMAGE_TAG | |
| env: | |
| GITHUB_WORKSPACE: ${{ github.workspace }} | |
| MODEL_ENV_VARS: ${{ matrix.env_vars }} | |
| - name: Check shm size | |
| run: | | |
| docker exec "$CONTAINER_NAME" df -h /dev/shm | |
| - name: Install aiter from wheel | |
| run: | | |
| AITER_WHL=$(ls -t /tmp/aiter-whl/amd_aiter*.whl 2>/dev/null | head -1) | |
| if [ -z "$AITER_WHL" ]; then | |
| echo "ERROR: No amd_aiter wheel found" | |
| ls -la /tmp/aiter-whl/ | |
| exit 1 | |
| fi | |
| echo "=== Copying wheel into container ===" | |
| WHL_NAME=$(basename "$AITER_WHL") | |
| docker cp "$AITER_WHL" "$CONTAINER_NAME:/tmp/$WHL_NAME" | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| set -euo pipefail | |
| echo '=== Uninstalling existing amd-aiter ===' | |
| pip uninstall -y amd-aiter || true | |
| echo '=== Installing amd-aiter from wheel ===' | |
| pip install /tmp/$WHL_NAME | |
| echo '=== Installed amd-aiter version ===' | |
| pip show amd-aiter | |
| " | |
| - name: Install ATOM and dependencies | |
| run: | | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| set -euo pipefail | |
| pip install --timeout 60 --retries 10 -U 'lm-eval[api]' | |
| pip install --timeout 60 --retries 10 hf_transfer | |
| pip install --timeout 60 --retries 10 --upgrade 'pybind11>=3.0.1' | |
| echo '=== Installing ATOM ===' | |
| cd /workspace | |
| git config --global --add safe.directory /workspace | |
| pip install -e . | |
| echo '=== Installed package versions ===' | |
| pip show amd-aiter | grep -E '^(Name|Version):' | |
| pip show atom | grep -E '^(Name|Version):' | |
| pip show triton | grep -E '^(Name|Version):' | |
| pip show torch | grep -E '^(Name|Version):' | |
| " | |
| - name: Download models | |
| run: | | |
| if [ -d "/models" ]; then | |
| echo "/models directory found, downloading model to /models/${{ matrix.model_path }}" | |
| if ! docker exec -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} "$CONTAINER_NAME" bash -lc "hf download ${{ matrix.model_path }} --local-dir /models/${{ matrix.model_path }}"; then | |
| echo "Model download failed for '${{ matrix.model_path }}'. Aborting." | |
| exit 1 | |
| fi | |
| else | |
| echo "/models directory not found, skipping model download" | |
| fi | |
| - name: Run ATOM simple inference | |
| # Skip simple inference; accuracy test already validates correctness | |
| if: false | |
| timeout-minutes: 30 | |
| run: | | |
| # Run the inference and capture output | |
| set -euo pipefail | |
| echo "" | |
| echo "========== Running test ==========" | |
| if [ -d "/models" ]; then | |
| model_path="/models/${{ matrix.model_path }}" | |
| else | |
| model_path="${{ matrix.model_path }}" | |
| fi | |
| echo "Model path: $model_path" | |
| ls -la $model_path || true | |
| # Print debug logs | |
| echo "========= Runner debug logs ===============" | |
| ps aux | |
| rocm-smi --showmemuse | |
| rocm-smi --showpids | |
| docker ps -a | |
| echo "========= End runner debug logs ===============" | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| set -euo pipefail | |
| python3 -m atom.examples.simple_inference \ | |
| --model \"$model_path\" \ | |
| ${{ matrix.extraArgs }} \ | |
| --temperature 0 \ | |
| | grep -E '^Prompt: |^Completion:' | |
| " > atom_test_output.txt | |
| echo "" | |
| echo "========== Showing test output below ==========" | |
| cat atom_test_output.txt | |
| - name: Compare output with golden outputs | |
| if: false | |
| timeout-minutes: 30 | |
| # TODO: skip for all test until it's fixed | |
| run: | | |
| echo "========== Comparing output with golden outputs ==========" | |
| if ! diff -u -B -w --strip-trailing-cr \ | |
| atom_test_output.txt \ | |
| ".github/workflows/golden_outputs/${{ matrix.model_name }}_golden_output.txt"; then | |
| echo "Failed: Output does not match golden outputs." | |
| exit 1 | |
| else | |
| echo "Success: Output matches golden outputs." | |
| fi | |
| - name: Run ATOM accuracy test | |
| timeout-minutes: 30 | |
| run: | | |
| set -euo pipefail | |
| echo "" | |
| echo "========== Launching ATOM server ==========" | |
| if [ -d "/models" ]; then | |
| model_path="/models/${{ matrix.model_path }}" | |
| else | |
| model_path="${{ matrix.model_path }}" | |
| fi | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| .github/scripts/atom_test.sh launch $model_path ${{ matrix.extraArgs }} | |
| " | |
| echo "" | |
| echo "========== Running accuracy test ==========" | |
| docker exec "$CONTAINER_NAME" bash -lc " | |
| .github/scripts/atom_test.sh accuracy $model_path | |
| " 2>&1 | tee atom_accuracy_output.txt | |
| - name: Check accuracy test results | |
| if: success() | |
| env: | |
| MODEL_NAME: ${{ matrix.model_name }} | |
| run: | | |
| result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1) | |
| if [ -z "$result_file" ] || [ ! -f "$result_file" ]; then | |
| echo "ERROR: No results JSON file found in accuracy_test_results/" | |
| exit 2 | |
| else | |
| echo "RESULT_FILE: $result_file" | |
| fi | |
| flexible_extract_value=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file") | |
| echo "Flexible extract value: $flexible_extract_value" | |
| # Read threshold from models_accuracy.json (via env var to avoid shell injection) | |
| threshold=$(python3 -c " | |
| import json, os | |
| models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8')) | |
| name = os.environ['MODEL_NAME'] | |
| t = next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0) | |
| print(t) | |
| ") | |
| echo "Accuracy test threshold: $threshold" | |
| result=$(awk -v val="$flexible_extract_value" -v threshold="$threshold" 'BEGIN {print (val < threshold) ? 1 : 0}') | |
| if [ "$result" -eq 1 ]; then | |
| echo "Accuracy test failed: $flexible_extract_value < $threshold" | |
| exit 1 | |
| else | |
| echo "Accuracy test passed: $flexible_extract_value >= $threshold" | |
| fi | |
| - name: Collect Test Summary | |
| if: success() | |
| env: | |
| MODEL_NAME: ${{ matrix.model_name }} | |
| run: | | |
| # Read threshold and score for summary | |
| threshold=$(python3 -c " | |
| import json, os | |
| models = json.load(open('.github/benchmark/models_accuracy.json', encoding='utf-8')) | |
| name = os.environ['MODEL_NAME'] | |
| print(next((m.get('accuracy_threshold', 0) for m in models if m['model_name'] == name), 0)) | |
| ") | |
| result_file=$(ls -1t accuracy_test_results/*.json 2>/dev/null | head -n 1) | |
| score=$(jq '.results.gsm8k["exact_match,flexible-extract"]' "$result_file" 2>/dev/null || echo "N/A") | |
| echo "Accuracy Test Summary for ${{ matrix.model_name }} (threshold: ${threshold}, score: ${score}):" >> $GITHUB_STEP_SUMMARY | |
| awk '/\|Tasks\|Version\|/,/^$/ { if (NF > 0) print }' atom_accuracy_output.txt >> $GITHUB_STEP_SUMMARY | |
| - name: Upload output | |
| if: always() | |
| uses: actions/upload-artifact@v7 | |
| with: | |
| name: ${{ matrix.model_name }}_atom_test_output.txt | |
| path: atom_test_output.txt | |
| - name: Upload accuracy results | |
| if: always() | |
| uses: actions/upload-artifact@v7 | |
| with: | |
| name: accuracy-${{ matrix.model_name }} | |
| path: accuracy_test_results/*.json | |
| if-no-files-found: ignore | |
| - name: Clean Up | |
| if: always() | |
| run: | | |
| # TODO: run a separate container for cleanup of the workspace due to permission issue to remove some pyc files under __pycache__ whose owners are root. | |
| # We should use non-root user to run the test to avoid this issue. | |
| set -x | |
| echo "========== Cleaning up workspace ==========" | |
| if [[ ${{ matrix.runner }} == atom-mi355-8gpu.predownload ]]; then | |
| docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "ls -la /workspace/ && find /workspace -mindepth 1 -delete" || true | |
| fi | |
| docker stop "$CONTAINER_NAME" || true | |
| docker rm "$CONTAINER_NAME" || true | |
| # Remove the pre-built image to free disk space on the runner | |
| docker rmi "rocm/atom-dev:pre-build-${{ env.GITHUB_COMMIT_SHA }}" || true | |
| # ---------- Push accuracy data to benchmark dashboard ---------- | |
| accuracy-dashboard: | |
| name: Update accuracy dashboard | |
| needs: [atom-test] | |
| if: always() && github.ref == 'refs/heads/main' && (github.event_name == 'push' || github.event_name == 'schedule') | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v6 | |
| - uses: actions/setup-python@v6 | |
| with: | |
| python-version: '3.12' | |
| - name: Download accuracy artifacts | |
| uses: actions/download-artifact@v8 | |
| with: | |
| path: /tmp/accuracy-results | |
| pattern: accuracy-* | |
| - name: List downloaded artifacts | |
| run: | | |
| echo "=== Downloaded accuracy artifacts ===" | |
| find /tmp/accuracy-results -type f -name '*.json' | head -20 || echo "No JSON files found" | |
| - name: Transform accuracy results for dashboard | |
| run: | | |
| python3 .github/scripts/accuracy_to_dashboard.py \ | |
| /tmp/accuracy-results \ | |
| --output accuracy-benchmark-input.json \ | |
| --models .github/benchmark/models_accuracy.json \ | |
| --run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| echo "=== Generated entries ===" | |
| cat accuracy-benchmark-input.json | |
| - name: Store accuracy result to dashboard | |
| if: hashFiles('accuracy-benchmark-input.json') != '' | |
| uses: benchmark-action/github-action-benchmark@v1 | |
| with: | |
| tool: customBiggerIsBetter | |
| output-file-path: accuracy-benchmark-input.json | |
| gh-pages-branch: gh-pages | |
| benchmark-data-dir-path: benchmark-dashboard | |
| auto-push: true | |
| max-items-in-chart: 90 | |
| github-token: ${{ secrets.GITHUB_TOKEN }} | |